In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import  make_pipeline
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, ParameterGrid
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
from sklearn.metrics import log_loss
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from sklearn.multiclass import OneVsRestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import ConstantKernel, RBF

from xgboost import XGBClassifier
from sklearn.feature_selection import f_classif, mutual_info_classif, SelectKBest

In [3]:
df = pd.read_csv('../data/multilabel_raw_data.csv')
features = pd.read_csv('../data/features_plus_descriptions.csv')
features.set_index('Feature Type and Number', inplace=True)
features.drop(['S5', 'D21'], axis=0, inplace=True)
H_Best = ['H0_Best', 'H1_Best', 'H2_Best', 'H3_Best', 'H4_Best', 'H5_Best']
y = df[H_Best]
X = df.loc[:,features.index]
y

Unnamed: 0,H0_Best,H1_Best,H2_Best,H3_Best,H4_Best,H5_Best
0,1,0,0,0,0,0
1,0,1,1,0,1,1
2,1,0,0,0,0,0
3,1,0,0,0,0,0
4,1,0,0,0,0,0
...,...,...,...,...,...,...
6113,1,0,0,0,0,0
6114,1,0,0,0,0,0
6115,1,0,0,0,0,0
6116,1,0,0,0,0,0


In [3]:
df2 = pd.read_csv('../data/raw_data_plus_labeled_targets.csv')

X2 = df2.loc[:,features.index]
y2 = df2['Best Heuristic']

In [4]:
tell_types = dict.fromkeys(features.index, 'lengthlike')
fractionlike = ['S1','S3', 'S4', 'S6', 'S8','S11', 'S12', 'D3', 'D39']

for feat in fractionlike:
    tell_types[feat] = 'fractionlike'

In [5]:
minmax_feats = []
std_feats = []

for feat in tell_types.keys():
    if tell_types[feat] == 'lengthlike':
        std_feats.append(feat)
    elif tell_types[feat] == 'fractionlike':
        minmax_feats.append(feat)

print('MinMax Scaler Features: ', minmax_feats)
print('Standard Scaler Features: ', std_feats)

preprocessor = ColumnTransformer(
    transformers=[
        ('mm_scaler', MinMaxScaler(), minmax_feats),
        ('std_scaler', StandardScaler(), std_feats)])



MinMax Scaler Features:  ['S1', 'S3', 'S4', 'S6', 'S8', 'S11', 'S12', 'D3', 'D39']
Standard Scaler Features:  ['S2', 'S7', 'S9', 'S10', 'S13', 'S14', 'D1', 'D2', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9', 'D10', 'D11', 'D12', 'D13', 'D14', 'D15', 'D16', 'D17', 'D18', 'D19', 'D20', 'D22', 'D23', 'D24', 'D25', 'D26', 'D27', 'D28', 'D29', 'D30', 'D31', 'D32', 'D33', 'D34', 'D35', 'D36', 'D37', 'D38']


In [8]:
def ML_pipeline_KFold_f1_macro(X, y, preprocessor, ML_algo, param_grid):
    """
    This function splits the data into other and test sets (80-20 split) and
    then applies KFold with 4 folds to other set.
    
    f1 score 
    """
    test_scores = []
    best_models = []
    for i in range(1,11):
        random_state = 431 * i
        X_other, X_test, y_other, y_test = train_test_split(X, y, test_size = 0.2, random_state=random_state)
        kf = KFold(n_splits=4, shuffle=True, random_state=random_state)
        clf = make_pipeline(preprocessor, OneVsRestClassifier(ML_algo))
        grid = GridSearchCV(clf, param_grid=param_grid, scoring='f1_macro', cv=kf, return_train_score=True)   
        grid.fit(X_other, y_other)
        best_models.append(grid.best_params_)
        y_pred = grid.predict(X_test)
        f1 = f1_score(y_test, y_pred, average="macro")
        print("best params", grid.best_params_, "score", f1)
        test_scores.append(f1)
    return best_models, test_scores

In [9]:
def ML_pipeline_KFold_f1_micro(X, y, preprocessor, ML_algo, param_grid):
    """
    This function splits the data into other and test sets (80-20 split) and
    then applies KFold with 4 folds to other set. The log-loss is minimized 
    in cross validation.
    """
    test_scores = []
    best_models = []
    for i in range(1,3):
        random_state = 431 * i
        X_other, X_test, y_other, y_test = train_test_split(X, y, test_size = 0.2, random_state=random_state)
        kf = KFold(n_splits=4, shuffle=True, random_state=random_state)
        clf = make_pipeline(preprocessor, OneVsRestClassifier(ML_algo))
        grid = GridSearchCV(clf, param_grid=param_grid, scoring='f1_micro', cv=kf, return_train_score=True)   
        grid.fit(X_other, y_other)
        best_models.append(grid.best_params_)
        y_pred = grid.predict(X_test)
        f1 = f1_score(y_test, y_pred, average="micro")
        print("best params", grid.best_params_, "score", f1)
        test_scores.append(f1)
    return best_models, test_scores


In [None]:
n_neighbors = [5,10,15,20,25]
ML_algo = KNeighborsClassifier()

In [None]:
n_neighbors = [5,10,15,20,30,40,50,60,70,75]#, 100, 125, 150, 175, 200] # 3, 5, 10, 15, 20, 25, 30, 40, 175, 200, 250, 300

ML_algo = 
param_grid = {'kneighborsclassifier__n_neighbors': n_neighbors, 'kneighborsclassifier__weights': ['distance', 'uniform']}

models, scores = ML_pipeline_KFold_f1_weighted(X, y, preprocessor, ML_algo, param_grid)
print('KNN f1:', scores, np.mean(scores), np.std(scores))
print('Best Models:')
for m in models:
    print(m)

In [19]:
test_scores = []
best_models = []
n_neighbors = [1,2,3,4,5,6,7,8,9,10]# 15, 20, 30, 50, 75]

param_grid = {'n_neighbors': n_neighbors, 'weights': ['distance', 'uniform']}
pg = ParameterGrid(param_grid)
for j in range(len(pg)):
    params = pg[j]
    for i in range(1,2):
        random_state = 431 * i
        X_other, X_test, y_other, y_test = train_test_split(X, y, test_size = 0.2, random_state=random_state)
        kf = KFold(n_splits=4, shuffle=True, random_state=random_state)
        KNN = KNeighborsClassifier(**params)
       # KNN.set_params()
        OVR = OneVsRestClassifier(KNN)
        clf = make_pipeline(preprocessor, OVR)
        for train_index, val_index in kf.split(X_other, y_other):
            X_train = X_other.iloc[train_index]
            X_val = X_other.iloc[val_index]
            y_train = y_other.iloc[train_index]
            y_val = y_other.iloc[val_index]
            X_train_prep = clf.fit_transform(X_train)
            X_val_prep = clf.transform(X_val)
            train_score = f1_score(y_train, clf.predict(X_train_prep), average="macro")
            val_score = f1_score(y_val, clf.predict(X_val_prep), average="macro") 
            print("val score", val_score)
        X_test_prep = clf.transform(X_test)
        y_pred = clf.predict(X_test_prep)
        f1 = f1_score(y_test, y_pred, average="macro")
        print(params, "test score", f1)
        test_scores.append(f1)



ValueError: Expected array-like (array or non-string sequence), got None

In [32]:
test_scores = []
best_models = []
for i in range(1,3):
    random_state = 431 * i
    X_other, X_test, y_other, y_test = train_test_split(X, y, test_size = 0.2, random_state=random_state)
    kf = KFold(n_splits=4, shuffle=True, random_state=random_state)
    clf = make_pipeline(preprocessor, ML_algo)
    grid = GridSearchCV(clf, param_grid=param_grid,
                            scoring='f1_micro', cv=kf, return_train_score=True)   
    grid.fit(X_other, y_other)
    best_models.append(grid.best_params_)
    y_pred = grid.predict(X_test)
    acc_score = accuracy_score(y_test, y_pred)
    print("best params", grid.best_params_, "score", acc_score)
    test_scores.append(acc_score)

NameError: name 'ML_algo' is not defined

In [31]:
test_scores = []
best_models = []
n_neighbors = [1,2,3,4,5,6,7,8,9,10]# 15, 20, 30, 50, 75]

for n in n_neighbors:
    for i in range(1,3):
        random_state = 431 * i
        X_other, X_test, y_other, y_test = train_test_split(X, y, test_size = 0.2, random_state=random_state)
        kf = KFold(n_splits=4, shuffle=True, random_state=random_state)
        clf = make_pipeline(preprocessor, OneVsRestClassifier(KNeighborsClassifier(n_neighbors=n)))
        for train_index, val_index in kf.split(X_other, y_other):
            X_train = X_other.iloc[train_index]
            X_val = X_other.iloc[val_index]
            y_train = y_other.iloc[train_index]
            y_val = y_other.iloc[val_index]
            X_train_prep = clf.fit(X_train)
            X_val_prep = clf.transform(X_val)
            val_score = f1_score(y_val, clf.predict(X_val_prep)) 
            print("val score", val_score)
        X_test_prep = clf.transform(X_test)
        y_pred = clf.predict(X_test_prep)
        f1 = f1_score(y_test, y_pred, average="micro")
        print(n, "test score", f1)
        test_scores.append(f1)


ValueError: Expected array-like (array or non-string sequence), got None