In [270]:
#imports

#pre processing and helper libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from numpy import asarray
from numpy import savetxt
from numpy import loadtxt
from sklearn.preprocessing import MinMaxScaler

#mount google drive for data
#from google.colab import drive
#drive.mount('/content/drive')

#imputation
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.neighbors import KNeighborsRegressor

#feature selection
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

#data augmentation
#!pip install imblearn
import imblearn

#model selection 
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.model_selection import validation_curve, learning_curve, GridSearchCV, cross_val_score, RandomizedSearchCV
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, make_scorer



## Data Fetching and Missing Value Imputation

In [271]:
#the following code assumes a local file of data from kaggle, not included
#fetch data
training = pd.read_csv('data/training.csv', dtype = np.float64, keep_default_na= True)
testing = pd.read_csv('data/testing.csv', dtype = np.float64, keep_default_na= True)


In [272]:
#fetch incomplete training data
incomplete = pd.read_csv('data/additional_training.csv', dtype = np.float64, keep_default_na= True, delimiter= ',')


In [273]:
#append incomplete and complete
frames = [training, incomplete]
combined_training = pd.concat(frames)
print(combined_training)

          ID      CNNs   CNNs.1  CNNs.2   CNNs.3   CNNs.4   CNNs.5    CNNs.6  \
0        1.0  0.000000  0.00000     0.0  0.00000  0.44033  0.00000  0.277470   
1        2.0  0.000000  0.00000     0.0  0.00000  0.00000  1.84800  0.000000   
2        3.0  0.000000  0.00000     0.0  0.30395  0.00000  0.00000  0.119300   
3        4.0  0.211680  0.53031     0.0  0.00000  0.17560  0.00000  0.000000   
4        5.0  0.000000  0.00000     0.0  0.60434  0.00000  0.00000  0.008473   
...      ...       ...      ...     ...      ...      ...      ...       ...   
2214  2462.0  0.000000      NaN     0.0  0.00000  0.00000  0.15108       NaN   
2215  2463.0  0.000000  0.00000     0.0  0.29111  0.00000  0.00000  0.000000   
2216  2464.0  0.043786      NaN     0.0  0.37372  0.00000  0.00000  0.000000   
2217  2465.0  0.000000  1.05120     0.0  0.00000  0.00000  0.00000  0.000000   
2218  2466.0  0.000000  0.00000     0.0  0.00000      NaN      NaN  1.373200   

       CNNs.7   CNNs.8  ...  GIST.503  

In [274]:
#fetch label confidence
confidence = pd.read_csv('data/annotation_confidence.csv', delimiter= ',')
print(confidence)

        ID  confidence
0        1        0.66
1        2        0.66
2        3        1.00
3        4        1.00
4        5        1.00
...    ...         ...
2461  2462        0.66
2462  2463        0.66
2463  2464        1.00
2464  2465        0.66
2465  2466        0.66

[2466 rows x 2 columns]


In [275]:
#append confidence to training data
confidence_temp = confidence['confidence'].to_numpy()
combined_training['confidence'] = confidence_temp 
print(combined_training)

          ID      CNNs   CNNs.1  CNNs.2   CNNs.3   CNNs.4   CNNs.5    CNNs.6  \
0        1.0  0.000000  0.00000     0.0  0.00000  0.44033  0.00000  0.277470   
1        2.0  0.000000  0.00000     0.0  0.00000  0.00000  1.84800  0.000000   
2        3.0  0.000000  0.00000     0.0  0.30395  0.00000  0.00000  0.119300   
3        4.0  0.211680  0.53031     0.0  0.00000  0.17560  0.00000  0.000000   
4        5.0  0.000000  0.00000     0.0  0.60434  0.00000  0.00000  0.008473   
...      ...       ...      ...     ...      ...      ...      ...       ...   
2214  2462.0  0.000000      NaN     0.0  0.00000  0.00000  0.15108       NaN   
2215  2463.0  0.000000  0.00000     0.0  0.29111  0.00000  0.00000  0.000000   
2216  2464.0  0.043786      NaN     0.0  0.37372  0.00000  0.00000  0.000000   
2217  2465.0  0.000000  1.05120     0.0  0.00000  0.00000  0.00000  0.000000   
2218  2466.0  0.000000  0.00000     0.0  0.00000      NaN      NaN  1.373200   

       CNNs.7   CNNs.8  ...  GIST.504  

In [276]:
#drop samples with a low confidence and a prediction of 1.0 in the incomplete data
#preserve all samples with either a 0 prediction or from original (feature-complete) samples
combined_training_dropped = combined_training[(combined_training['prediction'] == 0.0) | 
                                          (combined_training['confidence'] == 1.00)]
                                             

In [277]:
"""
prints the label counts for each prediction
"""
def print_predictions(data):
    zeros = 0
    ones = 0
    for val in data:
        if val == 1.0:
            ones += 1
        else:
            zeros += 1
    print("zeros:" + str(zeros), "ones:" + str(ones))


In [278]:
#print predictions to confirm new bias after dropping
print_predictions(combined_training_dropped['prediction'])

zeros:326 ones:1142


In [279]:
#split samples into data and labels, drop ID and Confidence columns
training_labels = combined_training_dropped['prediction']
training_labels = training_labels.to_numpy(dtype = np.int)
#extract CNN columns
training_CNN_temp = combined_training_dropped.loc[:, combined_training_dropped.columns.str.startswith('CN')]
#extract GIST columns
training_GIST_temp = combined_training_dropped.loc[:, combined_training_dropped.columns.str.startswith('G')]

#convert from pandas to numpy
training_CNN = training_CNN_temp.to_numpy(dtype = np.float64)
training_GIST = training_GIST_temp.to_numpy(dtype = np.float64)

#repeat above data extraction with test data
testing_CNN_temp = testing.loc[:, testing.columns.str.startswith('C')]
testing_GIST_temp = testing.loc[:, testing.columns.str.startswith('G')]
testing_CNN = testing_CNN_temp.to_numpy()
testing_GIST = testing_GIST_temp.to_numpy()

In [280]:

"""
Imputes NaN values in the data given using K Nearest neighbours imputation
"""
def impute_KNN( data, neighbours ):
  imputer = KNNImputer(n_neighbors=neighbours, missing_values= np.nan, weights="distance")
  imputer.fit(data)
  new_data = imputer.transform(data)
  return new_data



In [281]:
#impute missing values
training_CNN = impute_KNN(training_CNN, 5)

## Feature Selection

In [282]:
#feature select after imputation

#fit extra trees classifier
clf = ExtraTreesClassifier(n_estimators=50)
clf = clf.fit(training_CNN, training_labels)

#select most important from extra trees classifier
model = SelectFromModel(clf, prefit=True)

#training
training_CNN_reduced = model.transform(training_CNN)
print(training_CNN_reduced.shape)
#testing
testing_CNN_reduced = model.transform(testing_CNN)
print(testing_CNN_reduced.shape)

(1468, 1733)
(11874, 1733)


## Data Augmentation

In [283]:

"""
Apply SMOTE oversampling to supplied data and labels using supplied sampling strategy float
"""
def oversample(data, labels, strategy):
    oversample = imblearn.over_sampling.SMOTE(sampling_strategy = strategy)
    data, labels = oversample.fit_resample(training_CNN_reduced, training_labels)
    return data, labels
    
#use SMOTE to oversample from zero label samples
training_CNN_reduced_augmented, training_labels_augmented = oversample(training_CNN_reduced, training_labels, 0.44)

In [284]:
#pritn augmented data details
print(training_labels_augmented.shape)
print_predictions(training_labels_augmented)

(1644,)
zeros:502 ones:1142


## Pre Processing


In [285]:
#perform standard scaling
#bring values between 0 and 1

#--CNN--
scaler = MinMaxScaler()
training_CNN = scaler.fit_transform(training_CNN)
training_CNN_reduced = scaler.fit_transform(training_CNN_reduced)
testing_CNN = scaler.fit_transform(testing_CNN)
training_CNN_reduced_augmented = scaler.fit_transform(training_CNN_reduced_augmented)
#--GIST--
training_GIST = scaler.fit_transform(training_GIST)
testing_GIST = scaler.fit_transform(testing_GIST)

## Model Baseline and Pre Processing Based Experiments

In [286]:
#create basic model
model =  MLPClassifier(solver = 'lbfgs', hidden_layer_sizes=(1468,1468), activation='relu',alpha = 0.001, max_iter=400)

In [287]:
#evaluate model with cross validation

"""
Evaluate model using 5-Fold cross validation and print accuracy
"""
def k_fold_cross(model,data,labels):
    seed = 98978
    np.random.seed(seed)

    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    results = cross_val_score(model, data, labels, cv=kfold)

    print("Accuracy: %.2f%%" % (results.mean()*100.0))
    return results.mean()
    


In [264]:
#feature selected data
k_fold_cross(model, training_CNN_reduced, training_labels)

Accuracy: 69.96%


0.699568154907014

In [265]:
#non-feature selected data
k_fold_cross(model, training_CNN, training_labels)

Accuracy: 68.19%


0.6818601843467762

In [266]:
#augmented data, moving forward with reduced data
k_fold_cross(model, training_CNN_reduced_augmented, training_labels_augmented)

Accuracy: 75.55%


0.7554544443620728

## Model Selection


In [189]:
#model selection for augmented data

#parameter space
parameters = {
    'hidden_layer_sizes':[500,1000,1644, (1644,1000,500),(1644,1000), (1644,1644)],
    'alpha':list(np.linspace(0.0001, 0.1, num = 10)),
    'batch_size':list(np.linspace(100,1644, num = 10, dtype = int))
 }


In [190]:
#run randomized search with default cross validation
model = MLPClassifier(solver = 'lbfgs', activation='relu', max_iter = 1000)
clf = RandomizedSearchCV(model, parameters)
clf.fit(training_CNN_reduced_augmented, training_labels_augmented)

RandomizedSearchCV(cv=None, error_score=nan,
                   estimator=MLPClassifier(activation='relu', alpha=0.0,
                                           batch_size='auto', beta_1=0.9,
                                           beta_2=0.999, early_stopping=False,
                                           epsilon=1e-08,
                                           hidden_layer_sizes=(100,),
                                           learning_rate='constant',
                                           learning_rate_init=0.001,
                                           max_fun=15000, max_iter=1000,
                                           momentum=0.9, n_iter_no_change=10,
                                           nesterovs_momentum=True, power_t=0.5,
                                           rando...
                                                  0.033400000000000006,
                                                  0.044500000000000005,
                                   

In [191]:
clf.best_params_

{'hidden_layer_sizes': (1644, 1000), 'batch_size': 1472, 'alpha': 0.0001}

In [192]:
clf.best_score_

0.7731688783453184

In [193]:
clf.cv_results_

{'mean_fit_time': array([ 9.06735377, 41.79442697, 31.00786786, 19.43682055, 39.16632762,
        21.34947157, 38.34925013, 12.58994303, 95.48936315, 12.35708094]),
 'std_fit_time': array([ 1.43220022,  4.68634484,  8.55232379,  1.29683947,  5.96427403,
         5.21952943,  8.05112854,  0.21344576, 11.71797745,  1.50639327]),
 'mean_score_time': array([0.01257095, 0.01994672, 0.01954794, 0.02792525, 0.02034583,
        0.02174411, 0.01994715, 0.02035112, 0.0375001 , 0.02134314]),
 'std_score_time': array([0.00102331, 0.00154483, 0.00119686, 0.00087936, 0.00135264,
        0.00074715, 0.00089164, 0.00161252, 0.00439699, 0.00079803]),
 'param_hidden_layer_sizes': masked_array(data=[1000, 1644, 1644, (1644, 1000), 1644, 1644, 1644, 1644,
                    (1644, 1644), 1644],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',
             dtype=object),
 'param_batch_size': masked_array(data=[1472, 1472

In [289]:
#model selection for non-augmented data

#parameter space
parameters = {
    'hidden_layer_sizes':[500,1000,1468, (1468,1000,500),(1468,1000), (1468,1468)],
    'alpha':list(np.linspace(0.0001, 0.1, num = 10)),
    'batch_size':list(np.linspace(100,1468, num = 10, dtype = int))
 }

#random grid search
model = MLPClassifier(solver = 'lbfgs', activation='relu', max_iter = 1000)
clf2 = RandomizedSearchCV(model, parameters)
clf2.fit(training_CNN_reduced, training_labels)

#print results
print(clf2.best_params_)
print(clf2.best_score_)
print(clf2.cv_results_)


{'hidden_layer_sizes': (1468, 1000, 500), 'batch_size': 1316, 'alpha': 0.0112}
0.7173086299366163
{'mean_fit_time': array([ 20.55122066,  82.41858273,   9.73875523,   3.28082604,
        40.3083961 , 110.09476628,   4.83347363, 128.92180781,
         6.52854004,  20.02563472]), 'std_fit_time': array([ 1.59737315, 42.78722911,  0.39224424,  0.18497637, 34.76904325,
       56.74823488,  0.41617294, 85.71681867,  0.60469809,  0.88422968]), 'mean_score_time': array([0.03050671, 0.02952065, 0.01575737, 0.00658231, 0.02753038,
       0.02413559, 0.00757961, 0.02793269, 0.0075798 , 0.02832437]), 'std_score_time': array([0.00284001, 0.0047037 , 0.00116324, 0.00048885, 0.00049174,
       0.00182782, 0.00135287, 0.00274931, 0.00101698, 0.00101717]), 'param_hidden_layer_sizes': masked_array(data=[(1468, 1000, 500), (1468, 1000, 500), 1468, 500,
                   (1468, 1000, 500), (1468, 1000), 500, (1468, 1468),
                   500, (1468, 1000, 500)],
             mask=[False, False, False,

AttributeError: 'ExtraTreesClassifier' object has no attribute 'params_'

In [295]:
clf2.cv_results_['mean_test_score']

array([0.71730863, 0.68325555, 0.71391423, 0.70778946, 0.70642195,
       0.6751039 , 0.70710223, 0.66216248, 0.7139212 , 0.71526317])

## Generate Submission

In [296]:
#kaggle submission generator
#augmented data
model =  MLPClassifier(solver = 'lbfgs', hidden_layer_sizes=(1644,1000), activation='relu',alpha = 0.0001, validation_fraction=0, max_iter = 500, batch_size = 1472)
model.fit(training_CNN_reduced_augmented, training_labels_augmented)
predictions = model.predict(testing_CNN_reduced)
#non-augmented
model2 =  MLPClassifier(solver = 'lbfgs', hidden_layer_sizes=(1468,1000,500), activation='relu',alpha = 0.0112, validation_fraction=0, max_iter = 500, batch_size = 1316)
model2.fit(training_CNN_reduced, training_labels)
predictions2 = model2.predict(testing_CNN_reduced)

In [210]:
"""
Creates a submission in the correct format and saves to csv
"""
def kaggle_submission(predictions, sub_num):
  submission = pd.DataFrame(predictions, columns = ['prediction'])
  submission.insert(0,"ID",range(1,len(submission)+1))
  file_name = "submission_" + str(sub_num) + ".csv"
  submission.to_csv(file_name ,index=False)

In [297]:
#create kaggle submissions
kaggle_submission(predictions, 35)
kaggle_submission(predictions2, 37)

In [269]:
#print predictions to confirm new bias after dropping
#target is roughly to match test proportions of 60:40
print_predictions(predictions)
print_predictions(predictions2)

zeros:6245 ones:5629
zeros:5783 ones:6091
