Import Libraries

In [None]:
import pymysql
from sqlalchemy import create_engine, text
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler, StandardScaler, PowerTransformer, QuantileTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, cohen_kappa_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

Load the data

In [None]:
# Load the data
data = pd.read_csv('./data.csv')

# database connection
import getpass
from urllib.parse import quote_plus

password = getpass.getpass()
# URL-encode the password: characters such as '@', ':' or '/' would otherwise
# be interpreted as URL delimiters and corrupt the connection string.
connection_string = 'mysql+pymysql://root:' + quote_plus(password) + '@localhost/database'
engine = create_engine(connection_string)
# Query the database (placeholder query: replace table/x/y with real names)
query = '''SELECT * FROM table WHERE x GROUP BY y;'''
pd.read_sql(query, engine)

# Data cleaning/wrangling

In [None]:
# Remove exact duplicate rows
data = data.drop_duplicates()

# Normalise the headers: lowercase, trimmed, spaces replaced by underscores
data.columns = (
    data.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '_')
)

# Put the columns in alphabetical order, then renumber the rows from 0
data = data.reindex(columns=sorted(data.columns))
data = data.reset_index(drop=True)

# Cast columns to appropriate data types
def cast_column(col):
    """Cast a column to the narrowest numeric dtype that loses no information.

    Parameters
    ----------
    col : pandas.Series
        The column to convert.

    Returns
    -------
    pandas.Series
        An integer column when every value is a whole, non-missing number,
        a float column when the values are numeric but fractional or missing,
        and the unchanged input when the values cannot be read as numbers.
    """
    try:
        as_float = col.astype(float)
    except (ValueError, TypeError):
        return col  # Not numeric: leave the column as it is

    # Only go to integer when nothing would be truncated; a plain
    # ``astype(int)`` silently turns 1.5 into 1.
    is_whole = as_float.notna().all() and (as_float == as_float.round()).all()
    if not is_whole:
        return as_float
    try:
        return col.astype(int)
    except (ValueError, TypeError, OverflowError):
        return as_float  # e.g. infinite values cannot be stored as integers

# Keep the dtypes as they were before casting, for later comparison
default_dtypes = data.dtypes
# Run cast_column over every column (column-wise is the default direction)
data = data.apply(cast_column)

# Clean data (handle null values)
def clean_data(data):
    """Drop or impute columns according to their share of missing values.

    * more than 80% missing: the column is dropped;
    * more than 30% and up to 80% missing: missing values are filled with the
      median (numeric columns) or the most frequent value (object, string and
      category columns);
    * 30% or less missing: the column is left untouched.

    Parameters
    ----------
    data : pandas.DataFrame
        The frame to clean. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A cleaned copy of ``data``.
    """
    data = data.copy()  # Never write into the caller's frame
    null_perc = data.isnull().sum() / len(data) * 100  # Percentage of nulls per column
    for column in null_perc.index:
        share = null_perc[column]
        if share > 80:
            data = data.drop(columns=column)
        elif share > 30:
            dtype = data[column].dtype
            # Category and string columns have no median, so they are
            # imputed with the mode just like object columns.
            is_categorical = (
                pd.api.types.is_object_dtype(dtype)
                or pd.api.types.is_string_dtype(dtype)
                or isinstance(dtype, pd.CategoricalDtype)
            )
            if is_categorical:
                fill_value = data[column].mode()[0]
            else:
                fill_value = data[column].median()
            data[column] = data[column].fillna(fill_value)
    return data

# Clean the dataframe, then discard every row that still holds a null value
data = clean_data(data).dropna()

# Exploratory Data Analysis (EDA)

In [None]:
# Separate numerical and categorical features.
# 'number' selects every numeric dtype; listing only int64/float64 would miss
# e.g. the int32 columns that ``astype(int)`` produces on some platforms.
num_features = data.select_dtypes(include='number').columns
cat_features = data.select_dtypes(include=['object', 'category']).columns

# Plot histograms of numerical features to check distributions
data[num_features].hist(bins=30, figsize=(14, 10))
plt.tight_layout()
plt.show()

# Plot a heatmap of the pairwise correlations between numerical features
plt.figure(figsize=(10, 8))
sns.heatmap(data[num_features].corr(), annot=True, cmap='coolwarm')
plt.show()

# Preprocessing and Feature Engineering (PFE)

In [None]:
# IQR rule: drop every row that has at least one numerical value lying more
# than 1.5 * IQR below the first quartile or above the third quartile
Q1, Q3 = (data[num_features].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1
data = data.loc[
    ~(
        data[num_features].lt(Q1 - 1.5 * IQR)
        | data[num_features].gt(Q3 + 1.5 * IQR)
    ).any(axis=1)
]

# Remove outliers with nearest neighbors/whiskers

# Replace outliers by a known value/median/mode 

# High cardinality in categorical features: group values into bigger groups (ordered by frequency)

# Variance threshold method (discard features with low variance)
# SelectKBest method (select features with highest relationships with a stat test) 
# Recursive Feature Elimination (RFE) method (select features from subset of features recursively)
# Regularization (Lasso -> L1, Ridge -> L2) method (select features with highest coefficients)

Test Train Split

In [None]:
# Train-test split
# Replace 'target' with the name of the target variable
X = data.drop(columns=['target'])
y = data['target']

# The feature lists were built while the target was still part of `data`.
# Keep only the names that are real columns of X, otherwise
# X_train[num_features] / X_train[cat_features] raise a KeyError.
num_features = num_features[num_features.isin(X.columns)]
cat_features = cat_features[cat_features.isin(X.columns)]

# Hold out 20% of the rows for testing; fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Transforming and Scaling

In [None]:
# Categorical encoding (for making categorical values to numerical values)

# One-Hot encoding (for nominal features)
# Replace 'column_name' with the name of the nominal column.
# NOTE(review): this encodes `data`, not X_train/X_test — confirm where in the
# workflow the encoding is meant to be applied.
one_hot_encoder = OneHotEncoder(drop='first') # Drop the first column to avoid multicollinearity/dummy variable trap
one_hot_data = one_hot_encoder.fit_transform(data[['column_name']]).toarray()
one_hot_columns = one_hot_encoder.get_feature_names_out(['column_name'])
# Reuse data's index: concat(axis=1) aligns on index labels, and `data` no
# longer has a contiguous 0..n-1 index once rows have been filtered out.
# A fresh RangeIndex would misalign the rows and introduce NaNs.
one_hot_df = pd.DataFrame(one_hot_data, columns=one_hot_columns, index=data.index)
data = data.drop(columns=['column_name'])
data = pd.concat([data, one_hot_df], axis=1)

# Label encoding (for ordinal features): learn the distinct values of the
# column, then replace each value by its integer code
label_encoder = LabelEncoder()
data['column_name'] = label_encoder.fit(data['column_name']).transform(data['column_name'])


# Numerical transformations (for normalizing skewed distributions)

# The transformers below are alternatives: each assignment to `num_transformer`
# (and to the X_*_num_transformed* results) overwrites the previous one, so
# keep only the variant you need.

# PowerTransformer (Yeo-Johnson, some values can be negative)
# NOTE: this instance is replaced by the Box-Cox one on the next statement
# before it is ever fitted.
num_transformer = PowerTransformer()
# Box-Cox transformation (only when values are positive)
num_transformer = PowerTransformer(method='box-cox')
# Fit on the training data only, then apply the fitted transformer to the test data
X_train_num_transformed = num_transformer.fit_transform(X_train[num_features])
X_test_num_transformed = num_transformer.transform(X_test[num_features])
# Wrap the results in DataFrames labelled with the numerical feature names
X_train_num_transformed_df = pd.DataFrame(X_train_num_transformed, columns=num_features)
X_test_num_transformed_df = pd.DataFrame(X_test_num_transformed, columns=num_features)

# QuantileTransformer (here mapping to a 'gaussian' output distribution)
num_transformer = QuantileTransformer(output_distribution='gaussian')
# Same pattern: fit on train, transform test, then wrap in DataFrames
X_train_num_transformed = num_transformer.fit_transform(X_train[num_features])
X_test_num_transformed = num_transformer.transform(X_test[num_features])
X_train_num_transformed_df = pd.DataFrame(X_train_num_transformed, columns=num_features)
X_test_num_transformed_df = pd.DataFrame(X_test_num_transformed, columns=num_features)

# Log transformation: log(1 + x), so that zeros map to 0 instead of -inf.
# Only the numerical columns are transformed; a logarithm of the
# categorical columns is undefined.
log_transformer = np.log1p
X_train_num_transformed = log_transformer(X_train[num_features])
X_test_num_transformed = log_transformer(X_test[num_features])

# Scaling (to get values on the same scale/range)

# The two scalers below are alternatives; the second one overwrites the
# results of the first.

# StandardScaler: fitted on the training data only, then applied to both sets
scaler = StandardScaler().fit(X_train_num_transformed)
X_train_scaled = scaler.transform(X_train_num_transformed)
X_test_scaled = scaler.transform(X_test_num_transformed)
X_train_scaled_df, X_test_scaled_df = (
    pd.DataFrame(values, columns=num_features)
    for values in (X_train_scaled, X_test_scaled)
)

# MinMaxScaler (0 to 1): same procedure
scaler = MinMaxScaler().fit(X_train_num_transformed)
X_train_scaled = scaler.transform(X_train_num_transformed)
X_test_scaled = scaler.transform(X_test_num_transformed)
X_train_scaled_df, X_test_scaled_df = (
    pd.DataFrame(values, columns=num_features)
    for values in (X_train_scaled, X_test_scaled)
)


Concatenating 

In [None]:
# Put the scaled numerical block and the categorical columns side by side
# (column-wise); the results are plain NumPy arrays.
# NOTE(review): X_train[cat_features] is taken as-is here — confirm the
# categorical columns have been encoded before this step.
X_train = np.hstack([X_train_scaled, X_train[cat_features]])
X_test = np.hstack([X_test_scaled, X_test[cat_features]])

# Model fitting and scoring

In [None]:
# Regression
# Linear regression: fit on the scaled training features, predict the test set
reg = LinearRegression().fit(X_train_scaled, y_train)
y_pred = reg.predict(X_test_scaled)

# K-NN (K-Nearest Neighbors) regression
# The best K is found by trial and error (e.g. the elbow method); 5 is the default.
# The distance is Minkowski with p=2 (Euclidean) by default; p=1 gives Manhattan.
reg = KNeighborsRegressor(n_neighbors=5)

# Fit one model per K in 2..14, then score each of them on the test set
knn_models = [
    KNeighborsRegressor(n_neighbors=k).fit(X_train, y_train)
    for k in range(2, 15)
]
scores = [model.score(X_test, y_test) for model in knn_models]
for index, score in enumerate(scores):
    # scores[0] belongs to K=2, hence the offset
    print("R2 of k-nn model with {} neighbours on TEST set was: {:.2f}".format(index+2,score))

# Regression scores (computed for the linear-regression predictions in y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"R^2 score: {r2:.2f}")
print(f"MAE: {mae:.2f}")
print(f"MSE: {mse:.2f}")
print(f"RMSE: {rmse:.2f}")

In [None]:
# Classification
# NOTE(review): the `multi_class` argument mentioned in older tutorials is
# deprecated in recent scikit-learn releases — check your version before using it.
clf = LogisticRegression()
clf.fit(X_train_scaled, y_train)
y_pred = clf.predict(X_test_scaled)

# In case of imbalanced classes, use SMOTE to oversample the minority class
smote = SMOTE(k_neighbors=5, random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)
# In case of imbalanced classes, use RandomUnderSampler to undersample the majority class
rus = RandomUnderSampler(random_state=42)
X_train_under, y_train_under = rus.fit_resample(X_train_scaled_df, y_train)

# K-NN (K-Nearest Neighbors) classifier (the regressor variant would predict
# continuous values instead of class labels)
knn_clf = KNeighborsClassifier(n_neighbors=5)

# Classification scores ('weighted' averages per-class scores by class support)
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
kappa = cohen_kappa_score(y_test, y_pred)
print(f"Accuracy: {acc:.2f}")
print(f"Precision: {prec:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"Cohen's Kappa: {kappa:.2f}")

# Classification report
report = classification_report(y_test, y_pred)
print(report)

# Confusion matrix (rows: actual classes, columns: predicted classes)
cm = confusion_matrix(y_test, y_pred)
# Visualize confusion matrix
is_binary = cm.shape == (2, 2)
if is_binary:
    # Label each cell with its role (TN/FP/FN/TP) above the count
    labels = np.array([['TN', 'FP'], ['FN', 'TP']])
    annot = np.array(
        [[f'{labels[i, j]}\n{cm[i, j]}' for j in range(2)] for i in range(2)]
    )
else:
    # The TN/FP/FN/TP names only make sense for two classes; show plain counts
    annot = True

sns.heatmap(cm, annot=annot, fmt='' if is_binary else 'd', cmap='coolwarm', cbar=False)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()


In [None]:
# Feature importance: pair every feature name with its logistic-regression
# coefficient. The names come from the DataFrame version of the scaled data
# (the scaler output itself is a NumPy array without column labels), and the
# coefficients from the fitted classifier `clf`.
coef_list = list(zip(X_train_scaled_df.columns, clf.coef_[0]))
coef_df = pd.DataFrame(coef_list, columns=['Feature', 'Coefficient'])
# Largest positive coefficients first
coef_df_sorted = coef_df.sort_values(by='Coefficient', ascending=False)
print(coef_df_sorted)

Save model and transformers/encoders

In [None]:
# Save for later use (avoid rerunning) 


In [None]:
# Hypothesis testing
# H0, null hypothesis (basis assumption we want to validate/reject)
# H1, alternative hypothesis
# p-value, probability of getting the observed result if the null hypothesis is true 
# alpha, significance level (0.05, 0.01, 0.001) / 1 - confidence level (95%, 99%, 99.9%)
# If p-value < alpha, reject the null hypothesis 
# If p-value > alpha, fail to reject the null hypothesis
    # Second approach for testing:
    # compute test statistic from sample and compare to critical value
    # if test statistic is greater than critical value, reject the null hypothesis
    # if test statistic is less than critical value, fail to reject the null hypothesis

# Steps: 
# Fix the significance level (alpha)
# compute the test statistic
# compute the critical value of the test's distribution
# If the test statistic falls in the acceptance region, fail to reject H0; otherwise reject it. (The size of the acceptance region is determined by the confidence level, whereas the location of the rejection region is determined by the alternative hypothesis.)
# The allowed region depends on the type of test 


# Statistics scoring 
# Chi-2 (chi-squared) test (test for independence)
# Student's t-test (test for a difference in means); well suited to sample sizes below 30
# 

# One tailed test (H1: mean > 0) / Two tailed test (H1: mean != 0)
# Two sample t-test (H0: mean1 = mean2) / Paired t-test (H0: mean1 = mean2)
