In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_curve, auc
from sklearn import svm
from sklearn.model_selection import PredefinedSplit, GridSearchCV
import warnings

In [2]:
#Reads in training dataset (80% of given observations) and scales using StandardScaler
df = pd.read_csv('train_X.csv')
numeric_columns = ['min', 'max', 'mean', 'sd', 'range',
       'sum','duplicates', 'mra_D1_min', 'mra_D1_max', 'mra_D1_mean', 'mra_D1_sd',
       'mra_D1_range', 'mra_D1_zero', 'mra_D2_min', 'mra_D2_max',
       'mra_D2_mean', 'mra_D2_sd', 'mra_D2_range', 'mra_D2_zero', 'mra_D3_min',
       'mra_D3_max', 'mra_D3_mean', 'mra_D3_sd', 'mra_D3_range', 'mra_D3_zero',
       'tri_min', 'tri_max', 'tri_mean', 'tri_sd', 'tri_range', 'var_sill',
       'var_range', 'var_kappa', 'g', 'zeros', 'num_peaks', 'gradient_max', 'gradient_min', 'gradient_mean', 'gradient_stdev']
df = pd.concat([
    df[['image_id', 'classification']],
    pd.DataFrame(StandardScaler().fit_transform(df[numeric_columns]), columns=numeric_columns)
], axis=1)

#Reads in validation dataset (15% of given observations) and scales
df_val = pd.read_csv('validate_X.csv')
df_val = pd.concat([
    df_val[['image_id', 'classification']],
    pd.DataFrame(StandardScaler().fit_transform(df_val[numeric_columns]), columns=numeric_columns)
], axis=1)

#Reads in test dataset (5% of given observations) and scales
df_test = pd.read_csv('test_X.csv')
df_test = pd.concat([
    df_test[['image_id', 'classification']],
    pd.DataFrame(StandardScaler().fit_transform(df_test[numeric_columns]), columns=numeric_columns)
], axis=1)
df

Unnamed: 0,image_id,classification,min,max,mean,sd,range,sum,duplicates,mra_D1_min,...,var_sill,var_range,var_kappa,g,zeros,num_peaks,gradient_max,gradient_min,gradient_mean,gradient_stdev
0,371,med,-0.197098,-0.230893,-0.210207,-0.214613,-0.206882,-0.210207,-1.198409,0.188957,...,-0.037780,-0.047950,-0.761404,-0.830924,-0.384974,0.228128,-0.231914,0.0,-0.253649,-0.230840
1,693,low,3.324597,4.796719,4.874356,5.627270,5.579711,4.874356,0.419718,-4.168532,...,-0.004684,-0.048187,-0.155197,-1.134601,-0.384974,0.214048,-0.218007,0.0,-0.242751,-0.221400
2,404,med,-0.198599,-0.239107,-0.214293,-0.226611,-0.223416,-0.214293,0.859210,0.209883,...,-0.037781,-0.048127,0.018005,0.277871,-0.384974,0.199968,-0.229447,0.0,-0.238214,-0.230084
3,679,low,-0.183238,-0.226061,-0.201066,-0.218583,-0.218760,-0.201066,-0.199565,0.208781,...,-0.037780,-0.047253,0.537611,-1.169235,-0.384974,0.196448,-0.231728,0.0,-0.253378,-0.230842
4,967,low,-0.183756,-0.224326,-0.199780,-0.213349,-0.213875,-0.199780,-0.359380,0.205124,...,-0.037780,-0.047284,0.104606,-1.125806,-0.384974,0.211936,-0.220567,0.0,-0.249421,-0.223404
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
779,451,med,-0.198599,-0.218270,-0.203776,-0.144040,-0.175138,-0.203776,-0.449276,0.172585,...,-0.037731,-0.044290,0.018005,-0.082908,0.240790,0.216864,-0.206611,0.0,-0.232781,-0.211051
780,138,high,-0.198634,-0.239096,-0.214293,-0.226580,-0.223333,-0.214293,0.589522,0.209836,...,-0.037781,-0.048310,-0.241798,,-0.384974,0.207008,-0.231914,0.0,-0.253649,-0.230840
781,537,med,-0.185277,-0.228114,-0.203077,-0.222786,-0.220120,-0.203077,-1.058571,0.209486,...,-0.037781,-0.047428,-0.241798,-1.209707,-0.384974,0.228128,-0.231914,0.0,-0.253649,-0.230840
782,697,low,3.320537,4.804306,4.876090,5.633489,5.604048,4.876090,0.469661,-4.071456,...,-0.004643,-0.048187,-0.155197,-1.134963,-0.384974,0.228832,-0.218940,0.0,-0.241302,-0.222090


In [6]:
#Performs manual feature selection by dropping some variables from each set
dropped_columns = ['image_id','classification','min','max','range','sd','mean', 'num_peaks', 'gradient_min']

#Creates X and y for training, testing, and validation sets. 
X_train = df.drop(dropped_columns,axis=1)
y_train = df['classification']
X_val = df_val.drop(dropped_columns,axis=1)
y_val = df_val['classification']
X_test = df_test.drop(dropped_columns,axis=1)
y_test = df_test['classification']
print(X_train)

          sum  duplicates  mra_D1_min  mra_D1_max  mra_D1_mean  mra_D1_sd  \
0   -0.210207   -1.198409    0.188957   -0.192009     0.131686  -0.195083   
1    4.874356    0.419718   -4.168532    4.605365     3.051550   5.112639   
2   -0.214293    0.859210    0.209883   -0.206535     0.064914  -0.217686   
3   -0.201066   -0.199565    0.208781   -0.205953     0.066189  -0.216723   
4   -0.199780   -0.359380    0.205124   -0.204086     0.066553  -0.211674   
..        ...         ...         ...         ...          ...        ...   
779 -0.203776   -0.449276    0.172585   -0.183689     0.004913  -0.166652   
780 -0.214293    0.589522    0.209836   -0.206490     0.064945  -0.217636   
781 -0.203077   -1.058571    0.209486   -0.206224     0.066098  -0.217042   
782  4.876090    0.469661   -4.071456    4.656394     0.891173   5.108709   
783 -0.214277   -1.328259    0.209740   -0.206379     0.064736  -0.217553   

     mra_D1_range  mra_D1_zero  mra_D2_min  mra_D2_max  ...    tri_sd  \
0 

In [5]:
#Creates an index list to be used so that gridsearchcv differentiates between existing training and validation sets
split_index = [-1]*len(X_train) + [0]*len(X_val)
ps = PredefinedSplit(test_fold = split_index)
X = np.concatenate((X_train, X_val), axis=0)
y = np.concatenate((y_train, y_val), axis=0)

print(X)

############################ Random Forest Hyperparameter Search ############################################
#Initializes a random forest model
rf = RandomForestClassifier()
#Creates lists of possible parameters to try for the random forest
rf_params = {
 'max_depth': [10, 50, 100],
 'random_state':[0],
 'n_estimators': [200, 1000, 2000]
}

#Performs a grid search over the given parameters using F1 score as a metric.
clf_rf = GridSearchCV(estimator = rf, cv=ps,scoring = "f1_weighted", param_grid=rf_params)
clf_rf.fit(X, y)
print("Best Random Forest Score:", clf_rf.best_score_)
print("Best Random Forest Params:", clf_rf.best_params_)

#Displays the features in order of importance for the Random Forest model
print("\nRandom Forest Feature importances:")
zipped_lists = zip(clf_rf.feature_importances_, X_train.columns)
sorted_pairs = sorted(zipped_lists, reverse=True)
for pair in sorted_pairs:
    print(f"{round(pair[0], 2)} {pair[1]}")

############################ Logistic Regression Hyperparamter Search #######################################
warnings.filterwarnings("ignore")
#Initializes a logistic regression model
lr = LogisticRegression()
#Creates lists of possible parameters to try for the logistic regression
lr_params = {
    "C": [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 1e2, 1e3, 1e4],
    "random_state": [0],
    "max_iter": [1000]
}
#Perfoms grid search over the given parameters using F1 score as a metric
clf_lr = GridSearchCV(estimator = lr, cv=ps,scoring = "f1_weighted", param_grid=lr_params)
clf_lr.fit(X, y)
print("Best Logistic Regression Score:", clf_lr.best_score_)
print("Best Logistic Regression Params:", clf_lr.best_params_)

############################ K Nearest Neighbors Hyperparamter Search #######################################
#Initializes a KNN model
knn = KNeighborsClassifier()
#Creates a list of n_neighbors parameter values to try
knn_params = {
    "n_neighbors": [i for i in range(1,20,1)]
}
#Performs grid search over given parameters using F1 score as a metric
clf_knn = GridSearchCV(estimator = knn, cv=ps, scoring = "f1_weighted", param_grid=knn_params)
clf_knn.fit(X, y)
print("Best KNN Classifier Score:", clf_knn.best_score_)
print("Best KNN Classifier Params:", clf_knn.best_params_)

############################ Support Vector Machine Hyperparamter Search #######################################
#Initializes a SVM classifier model
svm_mod = svm.SVC()
#Creates a list of C parameters to try
svm_params = {
    "C": [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 1e2, 1e3, 1e4],
    "random_state": [0]
}
#Performs grid search over given list of C parameters
clf_svm = GridSearchCV(estimator = svm_mod, cv=ps,scoring = "f1_weighted", param_grid=svm_params)
clf_svm.fit(X, y)
print("Best SVM Score:", clf_svm.best_score_)
print("Best SVM Params:", clf_svm.best_params_)

[[-0.21020726 -1.19840925  0.18895698 ... -0.23191411 -0.25364939
  -0.23083959]
 [ 4.87435644  0.41971843 -4.16853218 ... -0.21800687 -0.24275114
  -0.22140042]
 [-0.21429262  0.8592099   0.20988337 ... -0.22944744 -0.23821381
  -0.23008442]
 ...
 [-0.27265531  0.54652516  0.29378731 ...  2.95897954  4.06658327
   2.96917273]
 [ 4.00320067  0.60835625  0.08492225 ... -0.27978756 -0.29846159
  -0.28281103]
 [-0.27292817  1.48429671  0.29399811 ... -0.29143839 -0.30427139
  -0.29083625]]




ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [None]:
#Uses test set to assess final test error of tuned model
print(f"RF Test Accuracy: {round(clf_rf.score(X_test, y_test), 3)}")
print(f"LR Test Accuracy: {round(clf_lr.score(X_test, y_test), 3)}")
print(f"KNN Test Accuracy: {round(clf_knn.score(X_test, y_test), 3)}")
print(f"SVM Test Accuracy: {round(clf_svm.score(X_test, y_test), 3)}")

In [None]:
#Prints confusion a confusion matrix for each model
print(confusion_matrix(y_test, clf_rf.predict(X_test)))
print(confusion_matrix(y_test, clf_lr.predict(X_test)))
print(confusion_matrix(y_test, clf_knn.predict(X_test)))
print(confusion_matrix(y_test, clf_svm.predict(X_test)))

In [None]:
#Prints percent of correct predictions for each model based on confusion matrices

print("RF % Correct:",(24+1+8)/(24+2+11+8+ 1+ 1+ 2))
print("LR % Correct:",(4+3+4)/(4+3+4+21+1+2+14))
print("KNN % Correct:",(17+2+16)/(17+2+16+2+7+1+3+1))
print("SVM % Correct:",(19+1+3)/(19+1+3+5+2+2+8+9))

In [None]:
def plot_confusion_matrix(cm,
                          target_names,
                          title='Confusion matrix',
                          cmap=None,
                          normalize=True):
    """
    given a sklearn confusion matrix (cm), make a nice plot

    Arguments
    ---------
    cm:           confusion matrix from sklearn.metrics.confusion_matrix

    target_names: given classification classes such as [0, 1, 2]
                  the class names, for example: ['high', 'medium', 'low']

    title:        the text to display at the top of the matrix

    cmap:         the gradient of the values displayed from matplotlib.pyplot.cm
                  see http://matplotlib.org/examples/color/colormaps_reference.html
                  plt.get_cmap('jet') or plt.cm.Blues

    normalize:    If False, plot the raw numbers
                  If True, plot the proportions

    Usage
    -----
    plot_confusion_matrix(cm           = cm,                  # confusion matrix created by
                                                              # sklearn.metrics.confusion_matrix
                          normalize    = True,                # show proportions
                          target_names = y_labels_vals,       # list of names of the classes
                          title        = best_estimator_name) # title of graph
  Citiation
    ---------
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

    """
    import matplotlib.pyplot as plt
    import numpy as np
    import itertools

    accuracy = np.trace(cm) / np.sum(cm).astype('float')
    misclass = 1 - accuracy

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    plt.figure(figsize=(8, 6))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
     plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]


    thresh = cm.max() / 1.5 if normalize else cm.max() / 2
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        if normalize:
            plt.text(j, i, "{:0.4f}".format(cm[i, j]),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")
        else:
            plt.text(j, i, "{:,}".format(cm[i, j]),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")


    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))
    plt.show()
    
plot_confusion_matrix(confusion_matrix(y_test, clf_rf.predict(X_test))