# DATA 1030 Project Notebook - Part IV, Testing ML Algorithms

In [44]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import  make_pipeline
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler

from sklearn.metrics import log_loss
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import ConstantKernel, RBF

from xgboost import XGBClassifier
from sklearn.feature_selection import f_classif, mutual_info_classif, SelectKBest



In [10]:
# https://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
import numpy as np
import matplotlib.pyplot as plt
from sklearn.utils.multiclass import unique_labels

def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """
    Draw the confusion matrix of `y_pred` against `y_true` as an annotated heat map.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted labels. The labels are used to index into
        `classes`, so they must be integer positions into that sequence.
    classes : sequence
        Display name for every possible label; only the entries whose label
        occurs in `y_true` or `y_pred` are shown on the axes.
    normalize : bool, default False
        If True, each row is divided by its total so cells show the fraction
        of that true class. Rows with no true samples are shown as 0 instead
        of NaN.
    title : str or None, default None
        Figure title; a default depending on `normalize` is used when empty.
    cmap : matplotlib colormap, default plt.cm.Blues
        Colormap passed to `imshow`.

    Returns
    -------
    ax : matplotlib.axes.Axes
        The axes holding the plot.
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    # Only use the labels that appear in the data
    classes = np.array(classes)
    classes = classes[unique_labels(y_true, y_pred)]
    if normalize:
        # A class that is predicted but never occurs in y_true has a row total
        # of zero; leave that row at 0.0 rather than dividing by zero.
        row_totals = cm.sum(axis=1, keepdims=True)
        cm = np.divide(cm, row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    fmt = '.2f' if normalize else 'd'
    # Cells above half the maximum get white text, the rest black.
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax

In [4]:
# Raw observations together with the labelled target column.
df = pd.read_csv('../data/raw_data_plus_labeled_targets.csv')

# Feature catalogue indexed by the 'Feature Type and Number' column,
# with the rows for S5 and D21 dropped.
features = (
    pd.read_csv('../data/features_plus_descriptions.csv')
    .set_index('Feature Type and Number')
    .drop(['S5', 'D21'], axis=0)
)

# Model inputs are the columns named in the catalogue; the target is 'Best Heuristic'.
X = df.loc[:, features.index]
y = df['Best Heuristic']


In [5]:
# Features tagged 'fractionlike'; every other catalogue entry is 'lengthlike'.
fractionlike = ['S1', 'S3', 'S4', 'S6', 'S8', 'S11', 'S12', 'D3', 'D39']

# Start with every feature as 'lengthlike', then overwrite the listed ones.
tell_types = dict.fromkeys(features.index, 'lengthlike')
tell_types.update(dict.fromkeys(fractionlike, 'fractionlike'))

In [7]:
# Route each feature to a scaler by its tag:
# 'fractionlike' -> MinMaxScaler, 'lengthlike' -> StandardScaler.
minmax_feats = [name for name, kind in tell_types.items() if kind == 'fractionlike']
std_feats = [name for name, kind in tell_types.items() if kind == 'lengthlike']

print('MinMax Scaler Features: ', minmax_feats)
print('Standard Scaler Features: ', std_feats)

scaler_steps = [
    ('mm_scaler', MinMaxScaler(), minmax_feats),
    ('std_scaler', StandardScaler(), std_feats),
]
preprocessor = ColumnTransformer(transformers=scaler_steps)

MinMax Scaler Features:  ['S1', 'S3', 'S4', 'S6', 'S8', 'S11', 'S12', 'D3', 'D39']
Standard Scaler Features:  ['S2', 'S7', 'S9', 'S10', 'S13', 'S14', 'D1', 'D2', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9', 'D10', 'D11', 'D12', 'D13', 'D14', 'D15', 'D16', 'D17', 'D18', 'D19', 'D20', 'D22', 'D23', 'D24', 'D25', 'D26', 'D27', 'D28', 'D29', 'D30', 'D31', 'D32', 'D33', 'D34', 'D35', 'D36', 'D37', 'D38']


In [35]:
def ML_pipeline_KFold_log_loss(X, y, preprocessor, ML_algo, param_grid,
                               n_runs=10, test_size=0.2, n_splits=4,
                               seed_step=431, labels=(0, 1, 2, 3, 4, 5)):
    """
    Repeatedly tune `ML_algo` by cross-validated log-loss and score it on a held-out test set.

    For each run the data is split into "other" and test sets, a shuffled
    KFold grid search over `param_grid` is fitted on the "other" set with
    `scoring='neg_log_loss'` (i.e. log-loss is minimized), and the log-loss
    of the selected model's predicted probabilities is computed on the test
    set and printed.

    Parameters
    ----------
    X, y : feature table and target passed to `train_test_split`.
    preprocessor : transformer placed in front of `ML_algo` in the pipeline.
    ML_algo : classifier exposing `predict_proba`.
    param_grid : dict or list of dicts for `GridSearchCV`; keys are prefixed
        with the pipeline step name (e.g. 'kneighborsclassifier__...').
    n_runs : int, default 10
        Number of independent splits; run ``i`` (1-based) uses
        ``random_state = seed_step * i`` for both the split and the KFold.
    test_size : float, default 0.2
        Fraction of the data held out as the test set.
    n_splits : int, default 4
        Number of KFold folds on the "other" set.
    seed_step : int, default 431
        Multiplier that turns the run number into a random state.
    labels : sequence, default (0, 1, 2, 3, 4, 5)
        Class labels passed to `log_loss`, so a class missing from a test
        split is still accounted for.

    Returns
    -------
    best_models : list of dict
        `best_params_` of the grid search for every run.
    test_scores : list of float
        Test-set log-loss for every run.
    """
    test_scores = []
    best_models = []
    for run in range(1, n_runs + 1):
        random_state = seed_step * run
        X_other, X_test, y_other, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state)
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
        clf = make_pipeline(preprocessor, ML_algo)
        # Train-fold scores are never read here, so they are not requested.
        grid = GridSearchCV(clf, param_grid=param_grid,
                            scoring='neg_log_loss', cv=kf)
        grid.fit(X_other, y_other)
        best_models.append(grid.best_params_)
        y_pred = grid.predict_proba(X_test)
        logloss_score = log_loss(y_test, y_pred, labels=list(labels))
        print("best params", grid.best_params_, "score", logloss_score)
        test_scores.append(logloss_score)
    return best_models, test_scores


In [36]:
def ML_pipeline_KFold_accuracy(X, y, preprocessor, ML_algo, param_grid,
                               n_runs=10, test_size=0.2, n_splits=4,
                               seed_step=431):
    """
    Repeatedly tune `ML_algo` by cross-validated accuracy and score it on a held-out test set.

    For each run the data is split into "other" and test sets, a shuffled
    KFold grid search over `param_grid` is fitted on the "other" set with
    `scoring='accuracy'` (accuracy is maximized, not log-loss), and the
    accuracy of the selected model's predictions is computed on the test set
    and printed.

    Parameters
    ----------
    X, y : feature table and target passed to `train_test_split`.
    preprocessor : transformer placed in front of `ML_algo` in the pipeline.
    ML_algo : classifier to tune.
    param_grid : dict or list of dicts for `GridSearchCV`; keys are prefixed
        with the pipeline step name (e.g. 'kneighborsclassifier__...').
    n_runs : int, default 10
        Number of independent splits; run ``i`` (1-based) uses
        ``random_state = seed_step * i`` for both the split and the KFold.
    test_size : float, default 0.2
        Fraction of the data held out as the test set.
    n_splits : int, default 4
        Number of KFold folds on the "other" set.
    seed_step : int, default 431
        Multiplier that turns the run number into a random state.

    Returns
    -------
    best_models : list of dict
        `best_params_` of the grid search for every run.
    test_scores : list of float
        Test-set accuracy for every run.
    """
    test_scores = []
    best_models = []
    for run in range(1, n_runs + 1):
        random_state = seed_step * run
        X_other, X_test, y_other, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state)
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
        clf = make_pipeline(preprocessor, ML_algo)
        # Train-fold scores are never read here, so they are not requested.
        grid = GridSearchCV(clf, param_grid=param_grid,
                            scoring='accuracy', cv=kf)
        grid.fit(X_other, y_other)
        best_models.append(grid.best_params_)
        y_pred = grid.predict(X_test)
        acc_score = accuracy_score(y_test, y_pred)
        print("best params", grid.best_params_, "score", acc_score)
        test_scores.append(acc_score)
    return best_models, test_scores


In [43]:
def ML_pipeline_KFold_classif_report_accuracy(X, y, preprocessor, ML_algo, param_grid,
                                              target_names=None, n_runs=10,
                                              test_size=0.2, n_splits=4,
                                              seed_step=431):
    """
    Repeatedly tune `ML_algo` by cross-validated accuracy and collect test-set classification reports.

    For each run the data is split into "other" and test sets, a shuffled
    KFold grid search over `param_grid` is fitted on the "other" set with
    `scoring='accuracy'`, and a `classification_report` of the selected
    model's predictions on the test set is stored.

    Parameters
    ----------
    X, y : feature table and target passed to `train_test_split`.
    preprocessor : transformer placed in front of `ML_algo` in the pipeline.
    ML_algo : classifier to tune.
    param_grid : dict or list of dicts for `GridSearchCV`; keys are prefixed
        with the pipeline step name (e.g. 'kneighborsclassifier__...').
    target_names : list of str or None, default None
        Display names passed straight through to `classification_report`;
        with None the report uses the label values themselves.
    n_runs : int, default 10
        Number of independent splits; run ``i`` (1-based) uses
        ``random_state = seed_step * i`` for both the split and the KFold.
    test_size : float, default 0.2
        Fraction of the data held out as the test set.
    n_splits : int, default 4
        Number of KFold folds on the "other" set.
    seed_step : int, default 431
        Multiplier that turns the run number into a random state.

    Returns
    -------
    best_models : list of dict
        `best_params_` of the grid search for every run.
    reports : list of str
        Text classification report for every run.
    """
    reports = []
    best_models = []
    for run in range(1, n_runs + 1):
        random_state = seed_step * run
        X_other, X_test, y_other, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state)
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
        clf = make_pipeline(preprocessor, ML_algo)
        # Train-fold scores are never read here, so they are not requested.
        grid = GridSearchCV(clf, param_grid=param_grid,
                            scoring='accuracy', cv=kf)
        grid.fit(X_other, y_other)
        best_models.append(grid.best_params_)
        y_pred = grid.predict(X_test)
        # Compare against this run's held-out labels (y_test); the previous
        # code referenced an undefined name here.
        report = classification_report(y_test, y_pred, target_names=target_names)
        reports.append(report)
    return best_models, reports



### K Neighbors Classifier

In [34]:
# Tune KNN on log-loss over the neighbourhood sizes below and both weighting schemes.
# The trailing comment lists further candidate values (presumably from other runs -- TODO confirm).
# NOTE(review): the saved output under this cell reports n_neighbors=200, which is not in
# this list, so that output was produced by a different version of the grid.
n_neighbors = [50, 75, 100, 125, 150] # 3, 5, 10, 15, 20, 25, 30, 40, 175, 200, 250, 300

ML_algo = KNeighborsClassifier()
# Keys carry the 'kneighborsclassifier__' prefix because the estimator is a make_pipeline step.
param_grid = {'kneighborsclassifier__n_neighbors': n_neighbors, 'kneighborsclassifier__weights': ['distance', 'uniform']}

models, scores = ML_pipeline_KFold_log_loss(X, y, preprocessor, ML_algo, param_grid)
# Per-run test log-loss followed by its mean and standard deviation.
print('KNN Log Loss:', scores, np.mean(scores), np.std(scores))
print('Best Models:')
for m in models:
    print(m)

best params {'kneighborsclassifier__n_neighbors': 125, 'kneighborsclassifier__weights': 'uniform'} score 1.4342941542763021
best params {'kneighborsclassifier__n_neighbors': 100, 'kneighborsclassifier__weights': 'uniform'} score 1.3782162273949388
best params {'kneighborsclassifier__n_neighbors': 100, 'kneighborsclassifier__weights': 'uniform'} score 1.4541581118060336
best params {'kneighborsclassifier__n_neighbors': 200, 'kneighborsclassifier__weights': 'uniform'} score 1.3793975948933825
best params {'kneighborsclassifier__n_neighbors': 100, 'kneighborsclassifier__weights': 'uniform'} score 1.3362562089637537
best params {'kneighborsclassifier__n_neighbors': 100, 'kneighborsclassifier__weights': 'uniform'} score 1.4615059530057792
best params {'kneighborsclassifier__n_neighbors': 150, 'kneighborsclassifier__weights': 'uniform'} score 1.3946394910956332
best params {'kneighborsclassifier__n_neighbors': 100, 'kneighborsclassifier__weights': 'uniform'} score 1.3134506950491107
best par

TypeError: cannot unpack non-iterable NoneType object

In [39]:
# Tune KNN on accuracy over the neighbourhood sizes below and both weighting schemes.
# The trailing comment lists further candidate values (presumably from other runs -- TODO confirm).
n_neighbors = [10, 15, 20, 25, 30, 40] # 3, 5, 10, 15, 20, 25, 30, 40, 50, 60, 75, 100, 125, 150
# Keys carry the 'kneighborsclassifier__' prefix because the estimator is a make_pipeline step.
param_grid = {'kneighborsclassifier__n_neighbors': n_neighbors, 'kneighborsclassifier__weights': ['distance', 'uniform']}
ML_algo = KNeighborsClassifier()

models2, scores2 = ML_pipeline_KFold_accuracy(X, y, preprocessor, ML_algo, param_grid)
# Per-run test accuracy followed by its mean and standard deviation.
print('KNN Accuracy:', scores2, np.mean(scores2), np.std(scores2)) 
for m2 in models2:
    print(m2)

best params {'kneighborsclassifier__n_neighbors': 15, 'kneighborsclassifier__weights': 'distance'} score 0.5898692810457516
best params {'kneighborsclassifier__n_neighbors': 20, 'kneighborsclassifier__weights': 'distance'} score 0.5939542483660131
best params {'kneighborsclassifier__n_neighbors': 20, 'kneighborsclassifier__weights': 'distance'} score 0.5849673202614379
best params {'kneighborsclassifier__n_neighbors': 20, 'kneighborsclassifier__weights': 'distance'} score 0.5939542483660131
best params {'kneighborsclassifier__n_neighbors': 10, 'kneighborsclassifier__weights': 'distance'} score 0.6053921568627451
best params {'kneighborsclassifier__n_neighbors': 10, 'kneighborsclassifier__weights': 'distance'} score 0.6217320261437909
best params {'kneighborsclassifier__n_neighbors': 15, 'kneighborsclassifier__weights': 'distance'} score 0.5980392156862745
best params {'kneighborsclassifier__n_neighbors': 15, 'kneighborsclassifier__weights': 'distance'} score 0.6062091503267973
best par

In [42]:
# Collect per-run classification reports for KNN around n_neighbors=100.
# The constructor arguments are only starting values; the grid below sets both parameters.
ML_algo = KNeighborsClassifier(n_neighbors=100, weights='uniform')
param_grid = {'kneighborsclassifier__n_neighbors': [100, 110, 120], 'kneighborsclassifier__weights': ['distance', 'uniform']}
# NOTE(review): ML_pipeline_KFold_classif_report_precision is not defined anywhere in the
# visible notebook (only ..._classif_report_accuracy is), so this call fails with NameError
# on a fresh kernel -- restore its definition or switch to the defined helper.
models, reports = ML_pipeline_KFold_classif_report_precision(X, y, preprocessor, ML_algo, param_grid)
for m in models:
    print(m)
    
for r in reports:
    print(r)
    

ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].

### Support Vector Classifier

In [50]:
# Exploratory lookup of the SVC signature and documentation; nothing below reads its result.
help(SVC)

Help on class SVC in module sklearn.svm._classes:

class SVC(sklearn.svm._base.BaseSVC)
 |  SVC(*, C=1.0, kernel='rbf', degree=3, gamma='scale', coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, decision_function_shape='ovr', break_ties=False, random_state=None)
 |  
 |  C-Support Vector Classification.
 |  
 |  The implementation is based on libsvm. The fit time scales at least
 |  quadratically with the number of samples and may be impractical
 |  beyond tens of thousands of samples. For large datasets
 |  consider using :class:`sklearn.svm.LinearSVC` or
 |  :class:`sklearn.linear_model.SGDClassifier` instead, possibly after a
 |  :class:`sklearn.kernel_approximation.Nystroem` transformer.
 |  
 |  The multiclass support is handled according to a one-vs-one scheme.
 |  
 |  For details on the precise mathematical formulation of the provided
 |  kernel functions and how `gamma`, `coef0` and `degree` affect each
 |  

In [51]:
# Tune an SVC on log-loss; probability=True is required because the helper calls predict_proba.
ML_algo = SVC(probability=True)
# 8 values of C x 8 values of gamma = 64 combinations, each cross-validated with 4 folds in
# every one of the 10 runs. NOTE(review): the saved run of this cell ended in KeyboardInterrupt.
param_grid = {'svc__C': np.logspace(-3,4,num=8),'svc__gamma': np.logspace(-3,4,num=8)}

models, scores = ML_pipeline_KFold_log_loss(X, y, preprocessor, ML_algo, param_grid)

print('SVC Log Loss:', scores)

print('mean and std:', np.mean(scores), np.std(scores))

for m in models:
    print(m)


KeyboardInterrupt: 

### Gaussian Process Classifier

In [None]:
# Baseline Gaussian Process Classifier (GPC): default kernel, one-vs-rest multiclass handling.
ML_algo = GaussianProcessClassifier()

# Single-point grid, so the grid search only evaluates this one configuration.
param_grid = { 'gaussianprocessclassifier__multi_class': ['one_vs_rest'] }

models, scores = ML_pipeline_KFold_log_loss(X, y, preprocessor, ML_algo, param_grid)

# Label corrected from 'GCP' to 'GPC' (Gaussian Process Classifier).
print('GPC Log Loss:', scores)

print('mean and std:', np.mean(scores), np.std(scores))


# output of first run, no tuning : param_grid = { 'gaussianprocessclassifier__multi_class': ['one_vs_rest'] }
### best params {'gaussianprocessclassifier__multi_class': 'one_vs_rest'} score 1.3492029741757106

In [None]:
# Uses whatever ML_algo and param_grid the most recently executed cell left in the kernel.
# NOTE(review): the second return value is the list of classification-report strings,
# not numeric scores, despite the name scores2.
models2, scores2 = ML_pipeline_KFold_classif_report_accuracy(X, y, preprocessor, ML_algo, param_grid)



In [None]:
# Top-level copy of the loop inside ML_pipeline_KFold_log_loss, using the ML_algo and
# param_grid currently in the kernel. Unlike the function, it leaves the last run's
# grid, kf, clf, X_other/X_test and y_other/y_test in the notebook namespace afterwards.
test_scores = []
best_models = []
for i in range(1,11):
    # Same seed for the train/test split and the KFold shuffle of this run.
    random_state = 431 * i
    X_other, X_test, y_other, y_test = train_test_split(X, y, test_size = 0.2, random_state=random_state)
    kf = KFold(n_splits=4, shuffle=True, random_state=random_state)
    clf = make_pipeline(preprocessor, ML_algo)
    grid = GridSearchCV(clf, param_grid=param_grid, scoring='neg_log_loss', cv=kf, return_train_score=True)   
    grid.fit(X_other, y_other)
    best_models.append(grid.best_params_)
    # Test-set log-loss of the selected model, with the six class labels given explicitly.
    y_pred = grid.predict_proba(X_test)
    logloss_score = log_loss(y_test, y_pred, labels=[0,1,2,3,4,5])
    print("best params", grid.best_params_, "score", logloss_score)
    test_scores.append(logloss_score)

### Logistic Regression

In [None]:
# Logistic regression with default settings; no tuning run follows in this notebook yet.
ML_algo = LogisticRegression()

In [None]:
# TODO: no hyperparameters listed yet for logistic regression.
# Note that this overwrites the param_grid left by earlier cells.
param_grid = {}

### Random Forest Classifier

## Feature Engineering

In [None]:
        
        
# Disabled code kept as a string (it is assigned to x, never executed): a manual loop over
# the KFold splits that prints the class counts of each training fold and applies clf to
# the train, validation and test sets. It refers to kf, clf, X_other, y_other and X_test
# from the kernel namespace.
x='''for train_index, val_index in kf.split(X_other,y_other):
    X_train = X_other.iloc[train_index]
    y_train = y_other.iloc[train_index]
    X_valid = X_other.iloc[val_index]
    y_valid = y_other.iloc[val_index]
    print(pd.value_counts(y_train))
    X_train_prep = clf.fit_transform(X_train)
    X_valid_prep = clf.transform(X_valid)
    X_test_prep = clf.transform(X_test)
'''

