# Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, KBinsDiscretizer, OneHotEncoder, StandardScaler
from scipy.stats import randint
import string
from sklearn.metrics import accuracy_score, f1_score, ConfusionMatrixDisplay, mean_absolute_error, mean_squared_error, \
    mean_squared_log_error, r2_score

# Functions

In [None]:
def evaluate_classifier(model, x_test, y_test, title="", average="binary"):
    """Print accuracy and F1 on a test set and plot a normalized confusion matrix.

    Args:
        model: Fitted estimator exposing ``predict``; it is also handed to
            ``ConfusionMatrixDisplay.from_estimator``.
        x_test: Test features passed to ``model.predict``.
        y_test: True labels matching ``x_test``.
        title: Optional label for what is being evaluated. It is title-cased
            in the printed header and appended verbatim to the plot title.
            When empty, both the header and the plot title omit it.
        average: Averaging mode forwarded to ``f1_score``. The default
            ``"binary"`` is scikit-learn's own default; pass e.g. ``"macro"``
            or ``"weighted"`` when the target has more than two classes.

    Returns:
        None. Results are printed and the figure is shown with ``plt.show()``.
    """
    y_pred = model.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average=average)

    print("")
    if title:
        print("Test set results on {}:".format(title.title()))
    else:
        print("Test set results:")
    print(f"  - Accuracy: {accuracy}")
    print(f"  - F1 score: {f1}")
    print("")

    # normalize='true' normalizes the matrix over the true labels (rows).
    disp = ConfusionMatrixDisplay.from_estimator(model, x_test, y_test,
                                                 cmap=plt.cm.Blues, normalize='true')
    # Avoid a dangling "Confusion Matrix for " when no title was supplied.
    plot_title = "Confusion Matrix for " + title if title else "Confusion Matrix"
    disp.ax_.set_title(plot_title)
    plt.show()

def evaluate_regressor(model, x_test, y_test):
    """Print MAE, MSE, MSLE and R2 for a fitted regressor on a test set.

    MAE, MSE and R2 are computed on the model's raw predictions, so negative
    predictions are penalised as the errors they are. Only MSLE, which does
    not accept negative values, is computed on predictions clipped at zero.

    Args:
        model: Fitted estimator exposing ``predict``.
        x_test: Test features passed to ``model.predict``.
        y_test: True target values matching ``x_test``.

    Returns:
        None. The metrics are printed to stdout.
    """
    y_pred = model.predict(x_test)

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Taking abs() of the predictions would turn a prediction of -5 for a
    # target of 5 into a perfect score; clipping at zero keeps MSLE
    # computable without rewarding sign errors.
    non_negative_pred = np.clip(y_pred, 0, None)
    msle = mean_squared_log_error(y_test, non_negative_pred)

    print("")
    print("Test set results:")
    print("  - MAE:", mae)
    print("  - MSE:", mse)
    print("  - MSLE:", msle)
    print("  - R2:", r2)
    print("")
    print("Remember: R2 score is in [-inf, 1]. R2<0 --> BAD MODEL.")

def get_categorical_features(df):
    """Return the labels of the columns in ``df`` whose dtype is ``object``.

    Object dtype is used here as the marker for categorical features; columns
    of any other dtype are not reported.

    Args:
        df: The pandas DataFrame to inspect.

    Returns:
        A pandas Index holding the matching column labels, in frame order.
    """
    object_columns = df.select_dtypes(include=['object'])
    return object_columns.columns

def get_k_correlated(dataset, y_name, k=5):
    """Return the names of the ``k`` columns most correlated with ``y_name``.

    Columns are ranked by the absolute value of their correlation with the
    target column, strongest first. The target itself is never part of the
    result.

    Args:
        dataset: pandas DataFrame containing the target and candidate columns.
            It is passed to ``DataFrame.corr()`` as-is, so how non-numeric
            columns are treated depends on the installed pandas version.
        y_name: Label of the target column.
        k: Maximum number of column names to return; values below zero are
            treated as zero.

    Returns:
        A list of at most ``k`` column labels.
    """
    # Remove the target by label instead of assuming it sorts first: another
    # column with |correlation| == 1 can tie with the target's
    # self-correlation and take the first position.
    y_corr = dataset.corr()[y_name].drop(y_name).abs()
    ranked = y_corr.sort_values(ascending=False)
    return ranked.head(max(k, 0)).index.tolist()

def clean_text(text):
    """Normalize a string for text processing.

    The text is lowercased, newlines/tabs/carriage returns become spaces,
    every character in ``string.punctuation`` is removed, and runs of
    whitespace are collapsed to single spaces with the ends trimmed.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    # A single translation table handles both jobs: the three control
    # whitespace characters map to a space, punctuation maps to nothing.
    table = str.maketrans('\n\t\r', '   ', string.punctuation)
    lowered = text.lower()
    without_punctuation = lowered.translate(table)
    words = without_punctuation.split()
    return ' '.join(words)