# Machine Learning Assignment

### Necessary Imports

In [1]:
#Necessary Imports
from imblearn.under_sampling import RandomUnderSampler
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import chi2_kernel
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import KernelPCA
from sklearn.model_selection import KFold
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.impute import KNNImputer #More in-depth: Use avg of K nearest neighbours
from sklearn import preprocessing
from collections import Counter
from sklearn import svm
import pandas as pd
import numpy as np
import statistics

In [2]:
#Read in all available data
trainingData = pd.read_csv("training.csv")
testData = pd.read_csv("testing.csv")
extraData = pd.read_csv('additional_training.csv')
annotationData = pd.read_csv("annotation_confidence.csv")
annotationConfidence = annotationData.iloc[:,1]

### Reformatting the Data

In [3]:
#Separate the CNN and GIST features
CNNFeatures = trainingData.iloc[:,1:4097]
GISTFeatures = trainingData.iloc[:,4097:4609]
expectedOutput1 = trainingData.iloc[:,4609]
test_CNN = testData.iloc[:,1:4097]
test_GIST = testData.iloc[:,4097:4609]

additionalCNN = extraData.iloc[:,1:4097]
additionalGIST = extraData.iloc[:,4097:4609]

#Join the two CNN training data sets
np1 = np.asarray(CNNFeatures)
np2 = np.asarray(additionalCNN)
CNN_raw = np.concatenate((np1, np2), axis = 0)
#Join the two GIST training data sets
np1 = np.asarray(GISTFeatures)
np2 = np.asarray(additionalGIST)
GIST_raw = np.concatenate((np1, np2), axis = 0)

In [15]:
#Add extra training data results
additionalExpectedOutputs = extraData.iloc[:,4609]
np3 = np.asarray(expectedOutput1)
np4 = np.asarray(additionalExpectedOutputs)
expectedOutput = np.concatenate((np3, np4), axis = 0)

### Pre-Processing

In [14]:
#K-Nearest Neighbours Impute

#Impute extra training missing values CNN
imp = KNNImputer(n_neighbors=2)
train_CNN = imp.fit_transform(CNN_raw)
#Impute extra training missing values GIST
train_GIST = imp.fit_transform(GIST_raw)

In [16]:
#Standardize the data CNN
scaler = StandardScaler()
#scaler.fit(train_CNN)
scaledTrain_CNN = scaler.fit_transform(train_CNN)#train_CNN
scaledTest_CNN = scaler.fit_transform(test_CNN)
#Standardize the data GIST
scaler.fit(train_GIST)
scaledTrain_GIST = scaler.transform(train_GIST)
scaledTest_GIST = scaler.transform(testGISTFeatures)

## Domain Adaptation Problem - Attempt to address class imbalace

In [20]:
#Check training proportions, highlights class imbalance - part of domain adaptation problem
memorable = 0
nonMemorable = 0
for x in range(len(expectedOutput)):#expectedOutput
    if expectedOutput[x] == 1: #expectedOutput
        memorable += 1
    else:
         nonMemorable = nonMemorable + 1        
            
#Display amount + ratio of each class in training data
print("1 ratio: ", memorable/len(expectedOutput), "|1 number: ", memorable)     #Testing 1 proportion = 0.3848
print("0 ratio: ", nonMemorable/len(expectedOutput),"|0 number: ", nonMemorable)#Testing 0 proportion = 0.6152

print("")
print("Therefore the following amount of +ve cases with low confidence must be flipped in training data to match testing proportions: ")
ratioToFlip = ((memorable/len(expectedOutput))-0.3848)#Difference between memoroable train + test proportions 
numberToFlip = round(ratioToFlip * 2466)#2466 = total no of training entries
print(numberToFlip)

1 ratio:  0.8678021086780211 |1 number:  2140
0 ratio:  0.1321978913219789 |0 number:  326

Therefore the following amount of +ve cases with low confidence must be flipped in training data to match testing proportions: 
1191


In [21]:
#Flip training 1's with low confidence to bridge train/test class imbalance
flipped = 0
index = 0
while flipped < numberToFlip and index < len(expectedOutput):
    if expectedOutput[index] == 1 and annotationConfidence[index] != 1:
        expectedOutput[index] = 0
        flipped += 1
    index += 1

#Only exist 998 +ve entries with confidence < 1
print(flipped)

998


In [26]:
#Random Undersampling to exactly match training/testing proportions
trainingDataToResample = scaledTrain_CNN #Adjust to control which pre-processing methods are used
sampler = RandomUnderSampler(sampling_strategy = {1: 800, 0:1279}) 

print('Dataset length before resampling: %s' % len(expectedOutput))
CNN, expectedOutput_res = sampler.fit_resample(trainingDataToResample, expectedOutput.ravel()) #was train_CNN for best score
print('Adjusted dataset length: %s' % len(expectedOutput_res))

print("1 ratio: ",Counter(expectedOutput.ravel())[1]/len(expectedOutput_res), "| 1 number: ", Counter(expectedOutput_res.ravel())[1])
print("0 ratio: ",Counter(expectedOutput.ravel())[0]/len(expectedOutput_res), "| 0 number: ", Counter(expectedOutput_res.ravel())[0])

Dataset length before resampling: 2466
Adjusted dataset length: 2079
1 ratio:  0.5493025493025493 | 1 number:  800
0 ratio:  0.6368446368446369 | 0 number:  1279


## K-Fold Cross Validation

In [27]:
#PCA
kf = KFold(n_splits=5)

score = []
count = 0

for train_index, test_index in kf.split(CNN): 
    X_train, X_test = CNN[train_index], CNN[test_index]
    y_train, y_test = expectedOutput_res[train_index], expectedOutput_res[test_index]
    
    classifier = MLPClassifier(max_iter = 500) #WithoutFlips - Impute+Scaled, 84.67% local, 58.83% Kaggle, domain adaptation problem
                                 #WithFlips - Impute+Scaled, 51.17% local,
    classifier = classifier.fit(X_train, y_train)
    
    score.append(classifier.score(X_test, y_test))
    
    count+=1

print(score)
print(statistics.mean(score))

[0.5, 0.4831730769230769, 0.5721153846153846, 0.1971153846153846, 0.13253012048192772]
0.3769867933271548


In [28]:
#Retrain using full dataset ready for Kaggle

#Initialize and train classifier
classifier = MLPClassifier(max_iter = 500)
classifier.fit(CNN, expectedOutput_res)

#Predict
predictions = classifier.predict(scaledTest_CNN)
print(predictions)

[1 0 1 ... 0 0 0]


## Export to CSV

In [29]:
evalPredictions = predictions #adjust which predictions to evaluate
#Convert to dataframe
predictionsNumpy = np.asarray(evalPredictions)
predictionsArray = predictionsNumpy.tolist()
predictionsDf = pd.DataFrame(data = predictionsArray, index=list(range(1, len(evalPredictions)+1)))

#Index from 1, label colums
toExportDf = predictionsDf
toExportDf.columns = ['prediction']
toExportDf.index = np.arange(1, len(toExportDf)+1)
toExportDf.index.name = 'id'

#Export to csv
print(toExportDf)
toExportDf.to_csv("Attempt_35.csv")

       prediction
id               
1               1
2               0
3               1
4               0
5               0
...           ...
11870           1
11871           1
11872           0
11873           0
11874           0

[11874 rows x 1 columns]


## Pre-Processing methods not currently in use

In [None]:
#Binarizer CNN   -------- NOT IN USE-------------
binarizer = preprocessing.Binarizer(threshold = 0.293)
binarised_scaled_trainCNN = binarizer.transform(scaledTrain_CNN)
binarised_scaled_testCNN = binarizer.transform(scaledTest_CNN)

In [None]:
#KernelPCA   -------- NOT IN USE------------
kpca = KernelPCA(n_components = 2440, kernel='precomputed')
KPCAtest_CNN = kpca.fit_transform(scaledTest_CNN)
KPCAtrain_CNN = kpca.fit_transform(scaledTrain_CNN)

In [None]:
#Normalization -------- NOT IN USE------------
norm = Normalizer()
normTrain_CNN = norm.fit_transform(train_CNN)
normTest_CNN = norm.fit_transform(test_CNN)
normTrain_GIST = norm.fit_transform(train_GIST)
normTest_GIST = norm.fit_transform(test_GIST)

In [None]:
#PCA  -------- NOT IN USE------------
pca = PCA(n_components = 100)#n_components = 100
PCAtest_CNN = pca.fit_transform(test_CNN)
PCAtrain_CNN = pca.fit_transform(train_CNN)

In [None]:
#Simple Impute  -------- NOT IN USE------------

#Impute extra training missing values CNN
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp = imp.fit(CNN_raw)
train_CNN = imp.transform(CNN_raw)