This notebook is taught to show a KNN algorithm, coupled with a KNN for benchmarking purposes to the other models deployed.

In [1]:
#data sources:
txpath = 'toxicity_labels.csv'
data = 'data.csv'
un = 'unknown_data.csv'

In [2]:
#General imports
import numpy as np
from sklearn.preprocessing import StandardScaler
#method specific imports
from sklearn.decomposition import KernelPCA
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
import sklearn.metrics as metrics

In [3]:
#parsing

#labels
y_id = []
y = []
file = open(txpath, "r")
for line in file:
    ys = line.strip().split(",")
    y_id.append(ys[0])
    y.append(ys[1].replace('NA', '2'))
file.close()
#data
X_id = []
X = []
file = open(data, "r")
for line in file:
    ys = line.strip().split(",")
    X_id.append(ys[0])
    X.append(ys[1:])
file.close() 
#unknown data
Xu_id = []
Xu = []
file = open(un, "r")
for line in file:
    ys = line.strip().split(",")
    Xu_id.append(ys[0])
    Xu.append(ys[1:])    
file.close()

In [4]:
#conversion of lists to numpy arrays and re-definition of data types
y_id = np.array(y_id[1:])
X_id = np.array(X_id[1:])
Xu_id = np.array(Xu_id[1:])
y_head = np.array(y[0])
X_head = np.array(X[0])

#re-definition of data types
X = np.array(X[1:], dtype='float64')
Xu = np.array(Xu[1:], dtype='float64')
y = np.array(y[1:], dtype='i1')  # int8

In [5]:
#preprocessing

#cleaning of data with missing labels
#(Number of values with missing label NA: 3619)
rm = np.where(y == 2)
X = np.delete(X, rm[0], axis=0)
X_id = np.delete(X_id, rm[0])
y = np.delete(y, rm[0])
y_id = np.delete(y_id, rm[0])

In [6]:
#preprocessing

#normalization (zero-score method)
scaler = StandardScaler()
X = scaler.fit_transform(X)
Xu = scaler.transform(Xu)

Model: KPCA with KNN, optimized on the amount of neighbors

steps:
KPCA with a linear kernel
Cross validation (CV) wrapped around a train test split of 80/20
KNN: finding best amount of neighbors
Retrain KNN on with the best amount of neighbors
Calculation of metrices for model selection
data export to .csv file format

In [7]:
#KPCA

pca = KernelPCA(n_components=80, kernel='linear',fit_inverse_transform=True, alpha=1)
Xr=pca.fit_transform(X)
Xur=pca.transform(Xu)

In [8]:
#KNN

#lists to store metrices
best_acc_val = []
best_acc_test = []
best_neighbors = []
best_pre_test = []
best_mcc_test = []
best_auc_test = []
recall_values = []

#Perform cross validation and train test split
kf = StratifiedKFold(n_splits=5)
for train_index, test_index in kf.split(Xr,y):
    X_train = Xr[train_index]
    X_test = Xr[test_index]
    y_train = y[train_index]
    y_test = y[test_index]
    acc_train = [] 
    acc_val = []
    
    X_subtrain, X_val, y_subtrain, y_val = train_test_split(X_train, y_train, test_size = 0.2)
    neighbors = np.arange(1, 25)
    
    #Perform line search to find best k
    for k in neighbors:
        knn = KNeighborsClassifier(n_neighbors=k,algorithm='ball_tree', weights='distance', metric="minkowski",p=2, n_jobs=-1)
        knn.fit(X_subtrain, y_subtrain) 
        predictions_training = knn.predict(X_subtrain) 
        predictions_testing = knn.predict(X_val) 
    
        #Store Accuracy
        acc_train.append(metrics.matthews_corrcoef(y_subtrain,predictions_training))
        acc_val.append(metrics.matthews_corrcoef(y_val,predictions_testing))
    
    #Store best neighbor information
    best_k = neighbors[np.argmax(acc_val)]
    best_neighbors.append(best_k)
    best_acc_val.append(acc_val[np.argmax(acc_val)])
    
    #retrain Model with best k and predict it on test data
    knn = KNeighborsClassifier(n_neighbors=best_k, algorithm='ball_tree', weights='distance',metric="minkowski",p=2, n_jobs=-1)
    knn.fit(X_train, y_train) 
    y_prediction = knn.predict(X_test)
    
    best_acc_test.append(metrics.accuracy_score(y_test,y_prediction))
    best_pre_test.append(metrics.precision_score(y_test,y_prediction))
    best_mcc_test.append(metrics.matthews_corrcoef(y_test,y_prediction))
    best_auc_test.append(metrics.roc_auc_score(y_test,y_prediction))
    recall_values.append(metrics.recall_score(y_test,y_prediction))

In [9]:
#data processing and metrics calculation

#transform python list into numpy array
best_neighbors = np.array(best_neighbors)
best_acc_val = np.array(best_acc_val)
best_acc_test = np.array(best_acc_test)
best_pre_test = np.array(best_pre_test)
best_mcc_test = np.array(best_mcc_test)
best_auc_test = np.array(best_auc_test)
recall_values = np.array(recall_values)

#print metrics
print("Average k: %.2f (+- %.2f)" % (best_neighbors.mean(),best_neighbors.std()))
print("Average Acc (Val): %.2f (+- %.2f)" % (best_acc_val.mean(),best_acc_val.std()))
print("Average Acc (Test): %.2f (+- %.2f)" % (best_acc_test.mean(),best_acc_test.std()))
print("Average pre (Test): %.2f (+- %.2f)" % (best_pre_test.mean(),best_pre_test.std()))
print("Average mcc (Test): %.2f (+- %.2f)" % (best_mcc_test.mean(),best_mcc_test.std()))
print("AUC:\t%.2f (+-%.2f)" % (np.mean(best_auc_test), np.std(best_auc_test)))
print("Recall:\t\t%.2f (+-%.2f)" % (np.mean(recall_values),np.std(recall_values)))

Average k: 5.00 (+- 4.00)
Average Acc (Val): 0.50 (+- 0.04)
Average Acc (Test): 0.89 (+- 0.01)
Average pre (Test): 0.54 (+- 0.04)
Average mcc (Test): 0.40 (+- 0.03)
AUC:	0.67 (+-0.01)
Recall:		0.39 (+-0.01)


In [10]:
#export format for leaderboard submission

un_pred = knn.predict(Xur)
un_pred = np.array(un_pred)
exp = np.column_stack((Xu_id, un_pred))
np.savetxt('exp_kpca_knn.csv',exp, delimiter=',', fmt="%s")

print('Number of toxic labels of unknown data set %d' % np.sum(un_pred == 1))
print('Number of non-toxic labels of unknown data set %d' %np.sum(un_pred == 0))

Number of toxic labels of unknown data set 64
Number of non-toxic labels of unknown data set 546
