##### Machine Learning Course Project
# Modelling
##### Darryl Abraham, Riccardo Paciello

### Importing Libraries

In [32]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

### Load Data

In [33]:
# Parse the preprocessed training CSV in 10,000-row chunks, stitch the chunks
# back into a single frame, and use the 'Unnamed: 0' column (renamed to 'idx')
# as the index. Presumably that column is the row index written out by the
# preprocessing step -- TODO confirm.
df = (
    pd.concat(
        pd.read_csv('./data/train_preprocessed.csv', sep=',', chunksize=10000, low_memory=False),
        axis='rows',
    )
    .rename(columns={'Unnamed: 0': 'idx'})
    .set_index('idx')
)
df.head()

Unnamed: 0_level_0,OSOURCE,TCODE,STATE,MAILCODE,RECINHSE,RECP3,CLUSTER,WEALTH1,SOLIH,WEALTH2,...,PC6,PC7,PC8,PC9,PC10,PC11,PC12,PC13,TARGET_B,TARGET_D
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20,20,2,10,0,0,0,12,8.0,99.0,4.0,...,-0.56004,-0.508209,0.516819,-0.086505,0.588516,-0.07223,0.361932,-0.464355,1,4.0
30,22,0,14,0,0,0,35,6.0,99.0,5.0,...,0.362351,-1.191198,-0.102913,0.03573,0.221015,0.078312,-0.61123,-0.906074,1,7.0
45,6,0,7,0,0,0,24,9.0,99.0,9.0,...,-0.656215,0.384504,-0.686458,-0.154291,1.135503,-0.115429,1.36323,0.313729,1,5.0
78,54,0,1,0,0,0,13,7.0,99.0,9.0,...,-1.30696,-0.273782,0.89927,0.447165,1.858439,0.358546,-0.54036,0.066545,1,13.0
93,23,1,18,0,0,0,18,7.0,99.0,7.0,...,1.365392,-0.49822,-1.371105,0.960526,-0.823202,-0.144125,-1.227357,-0.045361,1,10.0


In [34]:
# Categorical columns that the preprocessing step one-hot encodes.
feats_to_encode = [
    'OSOURCE',
    'TCODE',
    'STATE',
    'CLUSTER',
    'CLUSTER2',
]

### Modelling

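Set up the encoding pipeline: the categorical features listed in `feats_to_encode` are one-hot encoded, and categories not seen during fitting are ignored. Note that a `ColumnTransformer` drops every column it is not given by default (`remainder='drop'`), so only these encoded features reach the models below.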

In [35]:
# One-hot encode the categorical columns. With handle_unknown='ignore', a
# category that was not seen during fit is encoded as all zeros instead of
# raising an error at transform time.
categorical_transformer = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Apply the encoder to the categorical columns only.
# NOTE(review): remainder='drop' is the ColumnTransformer default, spelled out
# here so it is visible: every column outside feats_to_encode is discarded
# before the estimator sees it. Confirm this is intended.
preprocessor = ColumnTransformer(
    transformers=[('cat', categorical_transformer, feats_to_encode)],
    remainder='drop',
)

*Logistic Regression*

In [36]:
# Classification pipeline: the shared preprocessing step followed by a
# logistic regression with scikit-learn's default hyper-parameters.
model = LogisticRegression()
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', model),
])

# Features are every column except the two targets; the label is TARGET_B.
X_train = df.drop(['TARGET_B', 'TARGET_D'], axis='columns')
y_train = df['TARGET_B']

# Fit on the full training set.
clf.fit(X_train, y_train)

# 10-fold cross-validation with the estimator's default score (accuracy for a
# classifier). cross_val_score fits clones, so the fitted `clf` is untouched.
cv_scores = cross_val_score(clf, X_train, y_train, cv=10)

mean_accuracy = cv_scores.mean()
std_accuracy = cv_scores.std()

print("Cross-validation scores:", cv_scores)
print("Mean CV Accuracy:", mean_accuracy)
print("Standard Deviation of CV Accuracy:", std_accuracy)

Cross-validation scores: [0.76986584 0.76883385 0.76883385 0.76883385 0.76780186 0.76676987
 0.76756198 0.76859504 0.76342975 0.76446281]
Mean CV Accuracy: 0.7674988699263959
Standard Deviation of CV Accuracy: 0.001963498573816208


*Linear Regression*

In [37]:
# Regression pipeline: the shared preprocessing step followed by an ordinary
# least-squares linear model.
model = LinearRegression()
regressor = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', model),
])

# Features are every column except the two targets; the label is TARGET_D.
X_train = df.drop(['TARGET_B', 'TARGET_D'], axis='columns')
y_train = df['TARGET_D']

# Fit on the full training set.
regressor.fit(X_train, y_train)

# 10-fold cross-validation. scikit-learn reports the mean squared error
# negated ('neg_mean_squared_error') so that larger scores are better.
neg_mse_per_fold = cross_val_score(
    regressor, X_train, y_train, cv=10, scoring='neg_mean_squared_error'
)

# Undo the sign flip and take the square root to get the RMSE of each fold.
cv_scores = np.sqrt(-neg_mse_per_fold)

mean_rmse = cv_scores.mean()
std_rmse = cv_scores.std()

print("Cross-validation scores:", cv_scores)
print("Mean CV RMSE:", mean_rmse)
print("Standard Deviation of CV RMSE:", std_rmse)

Cross-validation scores: [12.03396845 13.24932336 12.08261031 14.73766101 12.21717645 12.8184523
 11.72816914 12.94476236 12.03911351 13.03905687]
Mean CV RMSE: 12.689029375140473
Standard Deviation of CV RMSE: 0.8414646113878047
