##### Machine Learning Course Project
# Modelling
##### Darryl Abraham, Riccardo Paciello

### Importing Libraries

In [38]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import os
import csv
import warnings

# Suppress
warnings.filterwarnings("ignore")

### Load Data

In [76]:
df = pd.read_csv('./data/preprocessed_data.csv', sep = ',', chunksize=10000, low_memory=False)
df = pd.concat(df, axis='rows')
df = df.rename(columns={'Unnamed: 0': 'idx'})
df = df.set_index('idx')
df.head()

Unnamed: 0_level_0,OSOURCE,TCODE,STATE,MAILCODE,RECINHSE,RECP3,CLUSTER,WEALTH1,SOLIH,WEALTH2,...,PC6,PC7,PC8,PC9,PC10,PC11,PC12,PC13,TARGET_B,TARGET_D
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20,20,2,10,0,0,0,12,8.0,99.0,4.0,...,-0.56004,-0.508209,0.516819,-0.086505,0.588516,-0.07223,0.361932,-0.464355,1,4.0
30,22,0,14,0,0,0,35,6.0,99.0,5.0,...,0.362351,-1.191198,-0.102913,0.03573,0.221015,0.078312,-0.61123,-0.906074,1,7.0
45,6,0,7,0,0,0,24,9.0,99.0,9.0,...,-0.656215,0.384504,-0.686458,-0.154291,1.135503,-0.115429,1.36323,0.313729,1,5.0
78,54,0,1,0,0,0,13,7.0,99.0,9.0,...,-1.30696,-0.273782,0.89927,0.447165,1.858439,0.358546,-0.54036,0.066545,1,13.0
93,23,1,18,0,0,0,18,7.0,99.0,7.0,...,1.365392,-0.49822,-1.371105,0.960526,-0.823202,-0.144125,-1.227357,-0.045361,1,10.0


In [40]:
feats_to_encode = ['OSOURCE', 'TCODE', 'STATE', 'CLUSTER', 'CLUSTER2']

### Train-Test Split    

In [41]:
X = df.drop(columns=['TARGET_B', 'TARGET_D'])
y = df['TARGET_B']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Modelling

Encoding pipeline set up.

In [42]:
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, feats_to_encode)
    ])

*Logistic Regression*

In [90]:
cs = [0.0001, 0.001, 0.01, 0.5, 0.1, 1, 1.5, 2]
l1ratios = [0.33, 0.5, 0.66]
solvers = {'newton-cg': ['l2', None], 
           'lbfgs': ['l2', None], 
           'liblinear': ['l1', 'l2'], 
           'sag': ['l2', None], 
           'saga': ['l1', 'l2', 'elasticnet', None], 
           'newton-cholesky': ['l2', None]
           }

csv_file_path = './eval/logistic_regression_eval.csv'

# Check if the file exists
if not os.path.exists(csv_file_path):
    os.makedirs(os.path.dirname(csv_file_path), exist_ok=True)
    # If the file doesn't exist, create a new file and write the header
    with open(csv_file_path, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["Solver", "Regularizer", "C", "L1 Ratio", "CV scores", "Mean CV Accuracy", "Std Dev CV Accuracy"])

# Now open the CSV file in append mode
with open(csv_file_path, mode='a', newline='') as file:
    
    writer = csv.writer(file)

    for solver, regs in solvers.items():
        for reg in regs:
            if reg:
                if reg == 'elasticnet':
                    for c in cs:
                        for l1ratio in l1ratios:
                            model = LogisticRegression(penalty=reg, C=c, solver=solver, max_iter=1000, n_jobs=-1, l1_ratio=l1ratio, random_state=0)                                
                            clf = Pipeline(steps=[('preprocessor', preprocessor),
                                                  ('classifier', model)])
                            cv_scores = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy')
                            
                            print(f"Solver: {solver}, Regularizer: {reg}, C: {c}, L1 Ratio: {l1ratio}")
                            print("Cross-validation scores:", cv_scores)
                            print("Mean CV Accuracy:", cv_scores.mean())
                            print("Standard Deviation of CV Accuracy:", cv_scores.std())
                            
                            row = [solver, reg, c, l1ratio, cv_scores, cv_scores.mean(), cv_scores.std()]
                            writer.writerow(row)
                            
                else:
                    for c in cs:
                        model = LogisticRegression(penalty=reg, C=c, solver=solver, max_iter=1000, n_jobs=-1, random_state=0)                                
                        clf = Pipeline(steps=[('preprocessor', preprocessor),
                                              ('classifier', model)])
                        cv_scores = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy')
                        
                        print(f"Solver: {solver}, Regularizer: {reg}, C: {c}, L1 Ratio: None")
                        print("Cross-validation scores:", cv_scores)
                        print("Mean CV Accuracy:", cv_scores.mean())
                        print("Standard Deviation of CV Accuracy:", cv_scores.std())
                        
                        row = [solver, reg, c, None, cv_scores, cv_scores.mean(), cv_scores.std()]
                        writer.writerow(row)
                        
            else:
                model = LogisticRegression(penalty=reg, solver=solver, max_iter=10000, n_jobs=-1, random_state=0)
                clf = Pipeline(steps=[('preprocessor', preprocessor),
                                      ('classifier', model)])               
                cv_scores = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy')
                
                print(f"Solver: {solver}, Regularizer: None, C: None, L1 Ratio: None")
                print("Cross-validation scores:", cv_scores)
                print("Mean CV Accuracy:", cv_scores.mean())
                print("Standard Deviation of CV Accuracy:", cv_scores.std())
                
                row = [solver, None, None, None, cv_scores, cv_scores.mean(), cv_scores.std()]
                writer.writerow(row)

Solver: newton-cg, Regularizer: l2, C: 0.0001, L1 Ratio: None
Cross-validation scores: [0.76883385 0.76883385 0.76883385 0.76883385 0.76883385 0.76883385
 0.7696281  0.7696281  0.76859504 0.76859504]
Mean CV Accuracy: 0.7689449376966968
Standard Deviation of CV Accuracy: 0.00035388104002423963
Solver: newton-cg, Regularizer: l2, C: 0.001, L1 Ratio: None
Cross-validation scores: [0.76883385 0.76883385 0.76883385 0.76883385 0.76883385 0.76883385
 0.7696281  0.7696281  0.76859504 0.76859504]
Mean CV Accuracy: 0.7689449376966968
Standard Deviation of CV Accuracy: 0.00035388104002423963
Solver: newton-cg, Regularizer: l2, C: 0.01, L1 Ratio: None
Cross-validation scores: [0.76883385 0.76883385 0.76883385 0.76883385 0.76883385 0.76883385
 0.7696281  0.7696281  0.76859504 0.76859504]
Mean CV Accuracy: 0.7689449376966968
Standard Deviation of CV Accuracy: 0.00035388104002423963
Solver: newton-cg, Regularizer: l2, C: 0.5, L1 Ratio: None
Cross-validation scores: [0.77089783 0.76883385 0.76883385 

In [96]:
# Get best model from eval file
eval_df = pd.read_csv(csv_file_path)
eval_df.drop_duplicates(inplace=True)
best_model = eval_df.loc[eval_df['Mean CV Accuracy'].idxmax()]
best_model

Solver                                                         newton-cg
Regularizer                                                           l2
C                                                                    0.1
L1 Ratio                                                             NaN
CV scores              [0.76883385 0.76883385 0.76986584 0.76883385 0...
Mean CV Accuracy                                                0.769048
Std Dev CV Accuracy                                             0.000445
Name: 4, dtype: object

In [99]:
# Get full model
solver = best_model['Solver']
reg = best_model['Regularizer']
c = best_model['C']
l1ratio = best_model['L1 Ratio']

clf = LogisticRegression(penalty=reg, C=c, solver=solver, max_iter=1000, n_jobs=-1, random_state=0) #, l1_ratio=l1ratio)

clf.fit(X_train, y_train)

#Intercept
intercept = clf.intercept_
print(f"Intercept: {intercept}")

# Coefficients
coefs = clf.coef_
coefs = pd.DataFrame(coefs, columns=X_train.columns).T.rename(columns={0: 'Coefficient'})
coefs.sort_values(by='Coefficient', ascending=False)

Intercept: [0.52522966]


Unnamed: 0,Coefficient
RECP3,0.510737
RECINHSE,0.463332
PC12,0.272702
PC13,0.156191
PC8,0.114003
CRAFTS,0.111391
VETERANS,0.076029
PC3,0.062658
RFA_2F,0.056892
WEALTH1,0.029335


In [98]:
# Evaluate model on test data
test_score = clf.score(X_test, y_test)
print("Test Accuracy:", test_score)

Test Accuracy: 0.7616099071207431


*Linear Regression*

In [80]:
y = df['TARGET_D']

model = LinearRegression()

regressor = Pipeline(steps=[('preprocessor', preprocessor),
                            ('regressor', model)])

model.fit(X, y)

# Perform cross-validation
cv_scores = cross_val_score(regressor, X, y, cv=10, scoring='neg_mean_squared_error')

# Convert scores to positive values and take the square root to get RMSE
cv_scores = np.sqrt(-cv_scores)

print("Cross-validation scores:", cv_scores)
print("Mean CV RMSE:", cv_scores.mean())
print("Standard Deviation of CV RMSE:", cv_scores.std())

# Intercept
intercept = regressor.named_steps['regressor'].intercept_
print(f"Intercept: {intercept}")
# Coefficients
coefs = regressor.named_steps['regressor'].coef_
coefs = pd.DataFrame(coefs, index=X.columns, columns=['Coefficient'])
coefs.sort_values(by='Coefficient', ascending=False)

Cross-validation scores: [12.03396845 13.24932336 12.08261031 14.73766101 12.21717645 12.8184523
 11.72816914 12.94476236 12.03911351 13.03905687]
Mean CV RMSE: 12.689029375140473
Standard Deviation of CV RMSE: 0.8414646113878047
Intercept: 13.928636063659482


Unnamed: 0,Coefficient
PC5,2.377209
RECP3,1.751809
PC4,1.152718
PC12,0.858257
PC13,0.809571
VETERANS,0.466263
PC8,0.438994
PC7,0.21079
RFA_2A,0.177662
WEALTH1,0.11853
