In [1]:
import pandas as pd

In [16]:
# Load the cleaned dataset. 'has_children' and 'direction_same' are read as
# strings instead of letting pandas infer a dtype for them
# (presumably so they are treated as categorical later -- confirm).
df = pd.read_csv(
    'clean_df.csv',
    dtype=dict.fromkeys(('has_children', 'direction_same'), str),
)

In [17]:
from sklearn.model_selection import train_test_split, KFold
from sklearn import base
import numpy as np
# One seed shared by the split and by the seeded estimators/searches.
RANDOM_SEED = 2021

# Column 'Y' is the target; every other column is a feature.
y = df['Y']
x = df.drop(columns=['Y'])

# Hold out 20% of the rows as the test split.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=RANDOM_SEED
)

In [19]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [20]:
# Split the training columns by dtype: int64/float64 columns go through the
# numeric pipeline, object (string) columns are one-hot encoded.
num_features_a = X_train.select_dtypes(['int64', 'float64']).columns
cat_features_a = X_train.select_dtypes(['object']).columns

# Numeric columns: fill missing values with the most frequent value, then
# standardise.
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('scaler', StandardScaler())
])

# handle_unknown='ignore': a category that only occurs in the test split is
# encoded as all zeros instead of making transform() raise. The output is
# unchanged whenever every test category was also seen during fit.
cat_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor_a = ColumnTransformer(transformers=[
    ('num', num_transformer, num_features_a),
    ('cat', cat_transformer, cat_features_a)
])

# Fit on the training split only, then apply the fitted transform to the test
# split so no test-set statistics leak into preprocessing.
# NOTE: X_train / X_test are rebound to the transformed matrices, so this cell
# cannot be re-run without first re-running the train/test split cell.
X_train = preprocessor_a.fit_transform(X_train)
X_test = preprocessor_a.transform(X_test)

In [21]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC, SVC

In [22]:
# Untuned base estimators; hyper-parameters are searched separately.
logreg_clf = LogisticRegression(
    solver='saga',
    max_iter=500,
    random_state=RANDOM_SEED,
)
dt_clf = DecisionTreeClassifier(random_state=RANDOM_SEED)
bnb_clf = BernoulliNB()
knn_clf = KNeighborsClassifier()
lsvm_clf = LinearSVC(
    max_iter=5000,
    dual=False,
)

In [23]:
from sklearn.utils.fixes import loguniform

In [24]:
# Search spaces for the randomized search, one dict per classifier.
# Continuous parameters are sampled log-uniformly; lists are sampled uniformly.
logreg_params = {
    'C': loguniform(1e-1, 1e2),
    'penalty': ['l1', 'l2'],
}
dt_params = {
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [2, 4, 6, 8, 10],
    'max_depth': [2, 4, 6, 8, 10],
}
bnb_params = {
    'alpha': loguniform(1e-1, 1e0),
}
knn_params = {
    'n_neighbors': [2, 4, 6, 8, 10, 12, 14, 20],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan'],
}
lsvm_params = {
    'C': loguniform(1e-1, 1e2),
}

In [25]:
# Three parallel lists: position i of each list describes the same model.
clf_list = [
    logreg_clf,
    dt_clf,
    bnb_clf,
    knn_clf,
    lsvm_clf,
]
params_list = [
    logreg_params,
    dt_params,
    bnb_params,
    knn_params,
    lsvm_params,
]
# Display labels used in the result tables.
model_names = [
    'Logistic Regression',
    'Decison Tree',
    'Bernoulli Naive Bayes',
    'KNN Classifier',
    'Linear SVM',
]

In [26]:
from sklearn.model_selection import RandomizedSearchCV

In [27]:
def train_model(X, y, model_suffix, clf_list=clf_list, params_list=params_list):
    """Run a randomized hyper-parameter search for every classifier.

    For each position ``i`` the estimator ``clf_list[i]`` is searched over
    ``params_list[i]`` with ``RandomizedSearchCV`` (all cores, seeded with
    ``RANDOM_SEED``, otherwise default settings) and fitted on ``X``, ``y``.

    Nothing is returned. Results are published as module-level globals named
    after ``model_suffix[i]``:

    * ``rscv<suffix>`` -- ``cv_results_`` of the search as a DataFrame
    * ``best<suffix>`` -- the fitted ``RandomizedSearchCV`` object itself
    """
    for idx, estimator in enumerate(clf_list):
        search = RandomizedSearchCV(
            estimator=estimator,
            param_distributions=params_list[idx],
            n_jobs=-1,
            random_state=RANDOM_SEED,
        )
        search.fit(X, y)

        namespace = globals()
        # per-candidate cross-validation table
        namespace['rscv%s' % model_suffix[idx]] = pd.DataFrame(search.cv_results_)
        # fitted search object (it predicts/scores with its refit best estimator)
        namespace['best%s' % model_suffix[idx]] = search

In [28]:
def record_best_result(model_list, model_suffix):
    """Extract the top-ranked cross-validation rows of each search result.

    Parameters
    ----------
    model_list : list of pandas.DataFrame
        Cross-validation result tables; each must contain the columns
        'rank_test_score', 'params', 'mean_test_score' and 'std_test_score'.
    model_suffix : list of str
        ``model_suffix[i]`` names the output for ``model_list[i]``.

    Nothing is returned. For every table, the rows with
    ``rank_test_score == 1`` (several rows when candidates tie) are stored in
    the module-level global ``df<suffix>`` with the columns 'params',
    'mean_test_score' and 'std_test_score'.
    """
    best_columns = ['params', 'mean_test_score', 'std_test_score']
    for i, cv_results in enumerate(model_list):
        top_ranked = cv_results.loc[cv_results['rank_test_score'] == 1, best_columns]
        # Store an independent copy, not a slice of cv_results: callers add
        # columns to these frames, which would otherwise trigger pandas'
        # SettingWithCopyWarning.
        globals()['df%s' % model_suffix[i]] = top_ranked.copy()

In [29]:
from sklearn import metrics
from sklearn.metrics import silhouette_samples, silhouette_score

In [30]:
def model_eval(clf_list, model_names, X_test, y_test):
    """Score fitted classifiers on the held-out data.

    For every classifier in ``clf_list`` this records
    ``clf.score(X_test, y_test)`` and ``metrics.f1_score`` (default
    arguments) of its predictions on ``X_test``.

    Returns a DataFrame with the columns 'model' (taken from
    ``model_names``), 'test_acc' and 'f1_score', one row per classifier.
    """
    # One (score, f1) pair per classifier; score is evaluated before predict.
    per_model = [
        (fitted.score(X_test, y_test),
         metrics.f1_score(y_test, fitted.predict(X_test)))
        for fitted in clf_list
    ]
    summary = {
        'model': model_names,
        'test_acc': [acc for acc, _ in per_model],
        'f1_score': [f1 for _, f1 in per_model],
    }
    return pd.DataFrame(data=summary)

In [32]:
# Train models. train_model() publishes, for every suffix, the globals
# rscv<suffix> (cross-validation table) and best<suffix> (fitted search).
model_suffix_a = ['_logreg_a', '_dt_a', '_bnb_a', '_knn_a', '_lsvm_a']
train_model(X_train, y_train, model_suffix_a)

# Record the best cross-validation results; record_best_result() publishes
# one df<suffix> global per model.
rscv_list_a = [rscv_logreg_a, rscv_dt_a, rscv_bnb_a, rscv_knn_a, rscv_lsvm_a]
record_best_result(rscv_list_a, model_suffix_a)

# Label each best-result frame with its model name and stack them.
# The loop variable is deliberately NOT called `df`: that name holds the
# dataset loaded at the top of the notebook and a loop over `df` would
# silently rebind it to the last result frame.
df_list_a = [df_logreg_a, df_dt_a, df_bnb_a, df_knn_a, df_lsvm_a]
for best_df, model in zip(df_list_a, model_names):
    best_df['model'] = model
result_df_a = pd.concat(df_list_a)

# Check test scores on the held-out split.
best_clfs_a = [best_logreg_a, best_dt_a, best_bnb_a, best_knn_a, best_lsvm_a]
test_result_a = model_eval(best_clfs_a, model_names, X_test, y_test)

In [35]:
# Display the fitted randomized-search object for logistic regression.
best_logreg_a

RandomizedSearchCV(estimator=LogisticRegression(max_iter=500, random_state=2021,
                                                solver='saga'),
                   n_jobs=-1,
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000192820709A0>,
                                        'penalty': ['l1', 'l2']},
                   random_state=2021)

In [33]:
# Top-ranked cross-validation rows per model (tied candidates appear as several rows).
result_df_a

Unnamed: 0,params,mean_test_score,std_test_score,model
4,"{'C': 0.344333532380571, 'penalty': 'l1'}",0.683949,0.009104,Logistic Regression
9,"{'min_samples_split': 2, 'max_depth': 8, 'crit...",0.692747,0.005682,Decison Tree
0,{'alpha': 0.40362520519080136},0.656524,0.006633,Bernoulli Naive Bayes
1,{'alpha': 0.5412144223247399},0.656524,0.006633,Bernoulli Naive Bayes
7,{'alpha': 0.5661420703684213},0.656524,0.006633,Bernoulli Naive Bayes
8,{'alpha': 0.4593677628057311},0.656524,0.006633,Bernoulli Naive Bayes
9,{'alpha': 0.6085694290112444},0.656524,0.006633,Bernoulli Naive Bayes
6,"{'weights': 'distance', 'n_neighbors': 12, 'me...",0.695539,0.00803,KNN Classifier
5,{'C': 0.24237461166526628},0.685087,0.008189,Linear SVM


In [34]:
# Held-out test score and F1 for each tuned model.
test_result_a


Unnamed: 0,model,test_acc,f1_score
0,Logistic Regression,0.687086,0.743555
1,Decison Tree,0.688742,0.742642
2,Bernoulli Naive Bayes,0.666391,0.714993
3,KNN Classifier,0.692053,0.742025
4,Linear SVM,0.687914,0.743537
