In [24]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_curve, auc
from sklearn import svm
from sklearn.model_selection import PredefinedSplit, GridSearchCV
import warnings

In [29]:
#Reads in training dataset (80% of given observations) and scales using StandardScaler
df = pd.read_csv('train_X.csv')
numeric_columns = ['min', 'max', 'mean', 'sd', 'range',
       'sum','duplicates', 'mra_D1_min', 'mra_D1_max', 'mra_D1_mean', 'mra_D1_sd',
       'mra_D1_range', 'mra_D1_zero', 'mra_D2_min', 'mra_D2_max',
       'mra_D2_mean', 'mra_D2_sd', 'mra_D2_range', 'mra_D2_zero', 'mra_D3_min',
       'mra_D3_max', 'mra_D3_mean', 'mra_D3_sd', 'mra_D3_range', 'mra_D3_zero',
       'tri_min', 'tri_max', 'tri_mean', 'tri_sd', 'tri_range', 'var_sill',
       'var_range', 'var_kappa', 'g', 'zeros', 'num_peaks', 'gradient_max', 'gradient_min', 'gradient_mean', 'gradient_stdev']
df = pd.concat([
    df[['image_id', 'classification']],
    pd.DataFrame(StandardScaler().fit_transform(df[numeric_columns]), columns=numeric_columns)
], axis=1)

#Reads in validation dataset (15% of given observations) and scales
df_val = pd.read_csv('validate_X.csv')
df_val = pd.concat([
    df_val[['image_id', 'classification']],
    pd.DataFrame(StandardScaler().fit_transform(df_val[numeric_columns]), columns=numeric_columns)
], axis=1)

#Reads in test dataset (5% of given observations) and scales
df_test = pd.read_csv('test_X.csv')
df_test = pd.concat([
    df_test[['image_id', 'classification']],
    pd.DataFrame(StandardScaler().fit_transform(df_test[numeric_columns]), columns=numeric_columns)
], axis=1)
df

Unnamed: 0,image_id,classification,min,max,mean,sd,range,sum,duplicates,mra_D1_min,...,var_sill,var_range,var_kappa,g,zeros,num_peaks,gradient_max,gradient_min,gradient_mean,gradient_stdev
0,1,high,-0.235309,-0.273585,-0.249757,-0.244137,-0.242619,-0.249757,-0.092816,0.231935,...,-0.039307,-0.038403,-0.512350,1.518153,-0.410836,0.239040,-0.261946,0.0,-0.272151,-0.261337
1,2,high,-0.235309,-0.273585,-0.249757,-0.244137,-0.242619,-0.249757,-0.092816,0.231935,...,-0.039307,-0.038851,-0.426757,1.486238,-0.410836,0.239040,-0.261946,0.0,-0.272151,-0.261337
2,3,high,-0.235309,-0.273585,-0.249757,-0.244137,-0.242619,-0.249757,-0.092816,0.231935,...,-0.039307,-0.038993,-0.426757,1.488180,-0.410836,0.239040,-0.261946,0.0,-0.272151,-0.261337
3,4,high,-0.235309,-0.273585,-0.249757,-0.244137,-0.242619,-0.249757,-0.092816,0.231935,...,-0.039307,-0.039168,-0.769128,1.435602,-0.410836,0.239040,-0.261946,0.0,-0.272151,-0.261337
4,5,high,-0.235309,-0.273585,-0.249757,-0.244137,-0.242619,-0.249757,-0.092816,0.231935,...,-0.039307,-0.039173,-0.769128,1.480840,-0.410836,0.239040,-0.261946,0.0,-0.272151,-0.261337
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
705,706,low,-0.223042,-0.260971,-0.237608,-0.232459,-0.233882,-0.237608,-0.092816,0.226484,...,-0.039306,-0.038192,0.086798,-1.432578,-0.410836,0.213364,-0.251005,0.0,-0.264181,-0.254032
706,707,low,-0.223121,-0.260998,-0.237614,-0.232451,-0.233808,-0.237614,-0.092816,0.225263,...,-0.039306,-0.038233,0.086798,-1.432506,-0.410836,0.213364,-0.251275,0.0,-0.264205,-0.254255
707,708,low,-0.223179,-0.261061,-0.237629,-0.232542,-0.233857,-0.237629,-0.092816,0.225791,...,-0.039306,-0.038395,0.172391,-1.433405,-0.410836,0.212670,-0.251209,0.0,-0.264192,-0.254200
708,709,low,-0.223129,-0.260764,-0.237616,-0.232654,-0.233239,-0.237616,-0.092816,0.225878,...,-0.039306,-0.038355,0.172391,-1.434569,-0.410836,0.211282,-0.251398,0.0,-0.264431,-0.254352


In [30]:
#Performs manual feature selection by dropping some variables from each set
dropped_columns = ['image_id','classification','min','max','range','sd','mean', 'mra_D1_zero', 'mra_D2_mean', 'mra_D2_sd', 'mra_D2_range', 'mra_D3_max', 'mra_D3_mean', 'mra_D3_sd',  'mra_D3_range', 'mra_D3_zero',   'gradient_min']

#Creates X and y for training, testing, and validation sets. 
X_train = df.drop(dropped_columns,axis=1)
y_train = df['classification']
X_val = df_val.drop(dropped_columns,axis=1)
y_val = df_val['classification']
X_test = df_test.drop(dropped_columns,axis=1)
y_test = df_test['classification']

In [31]:
#Creates an index list to be used so that gridsearchcv differentiates between existing training and validation sets
split_index = [-1]*len(X_train) + [0]*len(X_val)
ps = PredefinedSplit(test_fold = split_index)
X = np.concatenate((X_train, X_val), axis=0)
y = np.concatenate((y_train, y_val), axis=0)

############################ Random Forest Hyperparameter Search ############################################
#Initializes a random forest model
rf = RandomForestClassifier()
#Creates lists of possible parameters to try for the random forest
rf_params = {
 'max_depth': [10, 50, 100],
 'random_state':[0],
 'n_estimators': [200, 1000, 2000]
}

#Performs a grid search over the given parameters using F1 score as a metric.
clf_rf = GridSearchCV(estimator = rf, cv=ps,scoring = "f1_weighted", param_grid=rf_params)
clf_rf.fit(X, y)
print("Best Random Forest Score:", clf_rf.best_score_)
print("Best Random Forest Params:", clf_rf.best_params_)

#Displays the features in order of importance for the Random Forest model
print("\nRandom Forest Feature importances:")
#Creates a new RF model that is not a gridsearchcv object so that feature importances can be calculated
clf_rf_features = RandomForestClassifier(max_depth = clf_rf.best_params_["max_depth"], n_estimators = clf_rf.best_params_["n_estimators"], random_state = 0)
clf_rf_features.fit(X,y)
#Calculates and displays feature importances
zipped_lists = zip(clf_rf_features.feature_importances_, X_train.columns)
sorted_pairs = sorted(zipped_lists, reverse=True)
for pair in sorted_pairs:
    print(f"{round(pair[0], 2)} {pair[1]}")

############################ Logistic Regression Hyperparamter Search #######################################
warnings.filterwarnings("ignore")
#Initializes a logistic regression model
lr = LogisticRegression()
#Creates lists of possible parameters to try for the logistic regression
lr_params = {
    "C": [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 1e2, 1e3, 1e4],
    "random_state": [0],
    "max_iter": [1000]
}
#Perfoms grid search over the given parameters using F1 score as a metric
clf_lr = GridSearchCV(estimator = lr, cv=ps,scoring = "f1_weighted", param_grid=lr_params)
clf_lr.fit(X, y)
print("Best Logistic Regression Score:", clf_lr.best_score_)
print("Best Logistic Regression Params:", clf_lr.best_params_)

############################ K Nearest Neighbors Hyperparamter Search #######################################
#Initializes a KNN model
knn = KNeighborsClassifier()
#Creates a list of n_neighbors parameter values to try
knn_params = {
    "n_neighbors": [i for i in range(1,20,1)]
}
#Performs grid search over given parameters using F1 score as a metric
clf_knn = GridSearchCV(estimator = knn, cv=ps, scoring = "f1_weighted", param_grid=knn_params)
clf_knn.fit(X, y)
print("Best KNN Classifier Score:", clf_knn.best_score_)
print("Best KNN Classifier Params:", clf_knn.best_params_)

############################ Support Vector Machine Hyperparamter Search #######################################
#Initializes a SVM classifier model
svm_mod = svm.SVC()
#Creates a list of C parameters to try
svm_params = {
    "C": [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 1e2, 1e3, 1e4],
    "random_state": [0]
}
#Performs grid search over given list of C parameters
clf_svm = GridSearchCV(estimator = svm_mod, cv=ps,scoring = "f1_weighted", param_grid=svm_params)
clf_svm.fit(X, y)
print("Best SVM Score:", clf_svm.best_score_)
print("Best SVM Params:", clf_svm.best_params_)

Best Random Forest Score: 0.5991921864044831
Best Random Forest Params: {'max_depth': 10, 'n_estimators': 2000, 'random_state': 0}

Random Forest Feature importances:
0.24 g
0.15 var_range
0.07 sum
0.07 gradient_stdev
0.07 gradient_max
0.06 gradient_mean
0.03 mra_D3_min
0.03 zeros
0.02 mra_D1_max
0.02 num_peaks
0.02 var_kappa
0.02 mra_D2_max
0.02 tri_mean
0.02 var_sill
0.02 tri_range
0.02 tri_max
0.02 tri_min
0.02 mra_D2_min
0.02 tri_sd
0.02 mra_D1_range
0.02 mra_D1_sd
0.01 mra_D1_min
0.01 mra_D2_zero
0.01 mra_D1_mean
0.0 duplicates
Best Logistic Regression Score: 0.6883000178734707
Best Logistic Regression Params: {'C': 1, 'max_iter': 1000, 'random_state': 0}
Best KNN Classifier Score: 0.6117043450376785
Best KNN Classifier Params: {'n_neighbors': 14}
Best SVM Score: 0.6827126861706063
Best SVM Params: {'C': 100.0, 'random_state': 0}


In [32]:
#Uses test set to assess final test error of tuned model
print(f"RF Test Accuracy: {round(clf_rf.score(X_test, y_test), 3)}")
print(f"LR Test Accuracy: {round(clf_lr.score(X_test, y_test), 3)}")
print(f"KNN Test Accuracy: {round(clf_knn.score(X_test, y_test), 3)}")
print(f"SVM Test Accuracy: {round(clf_svm.score(X_test, y_test), 3)}")

RF Test Accuracy: 0.868
LR Test Accuracy: 0.683
KNN Test Accuracy: 0.735
SVM Test Accuracy: 0.877


In [19]:
#Prints confusion a confusion matrix for each model
print(confusion_matrix(y_test, clf_rf.predict(X_test)))
print(confusion_matrix(y_test, clf_lr.predict(X_test)))
print(confusion_matrix(y_test, clf_knn.predict(X_test)))
print(confusion_matrix(y_test, clf_svm.predict(X_test)))

[[68  0  1]
 [ 0 20  0]
 [ 8  8 30]]
[[69  0  0]
 [ 0  3 17]
 [14  5 27]]
[[55  0 14]
 [ 0  7 13]
 [19  9 18]]
[[69  0  0]
 [ 0 17  3]
 [10 12 24]]


In [20]:
#Prints percent of correct predictions for each model based on confusion matrices

print("RF % Correct:",(24+1+8)/(24+2+11+8+ 1+ 1+ 2))
print("LR % Correct:",(4+3+4)/(4+3+4+21+1+2+14))
print("KNN % Correct:",(17+2+16)/(17+2+16+2+7+1+3+1))
print("SVM % Correct:",(19+1+3)/(19+1+3+5+2+2+8+9))

RF % Correct: 0.673469387755102
LR % Correct: 0.22448979591836735
KNN % Correct: 0.7142857142857143
SVM % Correct: 0.46938775510204084


In [None]:
def plot_confusion_matrix(cm,
                          target_names,
                          title='Confusion matrix',
                          cmap=None,
                          normalize=True):
    """
    given a sklearn confusion matrix (cm), make a nice plot

    Arguments
    ---------
    cm:           confusion matrix from sklearn.metrics.confusion_matrix

    target_names: given classification classes such as [0, 1, 2]
                  the class names, for example: ['high', 'medium', 'low']

    title:        the text to display at the top of the matrix

    cmap:         the gradient of the values displayed from matplotlib.pyplot.cm
                  see http://matplotlib.org/examples/color/colormaps_reference.html
                  plt.get_cmap('jet') or plt.cm.Blues

    normalize:    If False, plot the raw numbers
                  If True, plot the proportions

    Usage
    -----
    plot_confusion_matrix(cm           = cm,                  # confusion matrix created by
                                                              # sklearn.metrics.confusion_matrix
                          normalize    = True,                # show proportions
                          target_names = y_labels_vals,       # list of names of the classes
                          title        = best_estimator_name) # title of graph
  Citiation
    ---------
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

    """
    import matplotlib.pyplot as plt
    import numpy as np
    import itertools

    accuracy = np.trace(cm) / np.sum(cm).astype('float')
    misclass = 1 - accuracy

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    plt.figure(figsize=(8, 6))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
     plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]


    thresh = cm.max() / 1.5 if normalize else cm.max() / 2
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        if normalize:
            plt.text(j, i, "{:0.4f}".format(cm[i, j]),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")
        else:
            plt.text(j, i, "{:,}".format(cm[i, j]),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")


    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))
    plt.show()
    
plot_confusion_matrix(confusion_matrix(y_test, clf_rf.predict(X_test))