### Analyte Classification using k-nn
#### Edgar Acuna 
#### July 2020

In [1]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score, precision_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
%matplotlib inline

#### Loading the data

In [2]:
df1=pd.read_csv("c://onr2020/NRLset1_part1.csv",header=None)
df2=pd.read_csv("c://onr2020/NRLset1_part2.csv",header=None)
df3=pd.read_csv("c://onr2020/NRLset1_part3.csv",header=None)
df4=pd.read_csv("c://onr2020/NRLset1_part4.csv",header=None)
df5=pd.read_csv("c://onr2020/NRLset1_part5.csv",header=None)
df6=pd.read_csv("c://onr2020/NRLset1_part6.csv",header=None)
df7=pd.read_csv("c://onr2020/NRLset1_part7.csv",header=None)
df8=pd.read_csv("c://onr2020/NRLset1_part8.csv",header=None)
y=pd.read_csv("c://onr2020/labels.csv",header=None)
ys=pd.read_csv("c://onr2020/substrateIDs.csv",header=None)

In [3]:
dfset1=pd.concat([df1,df2,df3,df4,df5,df6,df7,df8],ignore_index=True)
dfset1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1691,1692,1693,1694,1695,1696,1697,1698,1699,1700
0,0.041418,0.041621,0.042198,0.042688,0.042924,0.042274,0.042542,0.042663,0.042715,0.042664,...,0.053219,0.053131,0.053091,0.05314,0.053249,0.053325,0.053364,0.053356,0.0535,0.055986
1,0.69471,0.69584,0.69718,0.70653,0.70397,0.70913,0.71148,0.71133,0.71237,0.71298,...,0.75426,0.75104,0.75363,0.75449,0.75298,0.75576,0.75291,0.75532,0.75303,0.75323
2,0.048978,0.048432,0.047685,0.047086,0.046811,0.046752,0.046624,0.046443,0.046241,0.045999,...,0.26032,0.25959,0.25891,0.25821,0.25744,0.25658,0.2557,0.2548,0.25416,0.25711
3,0.039762,0.039495,0.038982,0.038339,0.037769,0.037301,0.036799,0.036316,0.035921,0.035612,...,0.2576,0.25689,0.25616,0.25537,0.25454,0.25372,0.25288,0.25197,0.25107,0.25019
4,0.022387,0.022508,0.022091,0.023054,0.02301,0.02274,0.023889,0.023936,0.023464,0.02481,...,0.025036,0.024307,0.024636,0.022298,0.023536,0.025714,0.025306,0.025062,0.023609,0.023901


In [4]:
dfset2=dfset1.copy()
dfset2['Analyte']=y
dfset2['substrate']=ys
dfsub1=dfset2[dfset2['substrate']==1]
dfsub2=dfset2[dfset2['substrate']==2]
dfsub3=dfset2[dfset2['substrate']==3]
dfsub4=dfset2[dfset2['substrate']==4]
dfsub5=dfset2[dfset2['substrate']==5]
dfsub6=dfset2[dfset2['substrate']==6]
dfsub7=dfset2[dfset2['substrate']==7]
dfsub8=dfset2[dfset2['substrate']==8]
dfsub9=dfset2[dfset2['substrate']==9]
dfsubset0=pd.concat([dfsub3,dfsub4,dfsub9],ignore_index=True)
dfsub6.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1693,1694,1695,1696,1697,1698,1699,1700,Analyte,substrate
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,...,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,0.069546,0.069586,0.069735,0.069112,0.06876,0.068217,0.067672,0.066901,0.066063,0.065052,...,0.280335,0.279713,0.279134,0.278207,0.277731,0.276881,0.276449,0.27616,20.5,6.0
std,0.006162,0.006149,0.006214,0.006387,0.006215,0.0061,0.006069,0.006331,0.006086,0.006031,...,0.005679,0.005851,0.005607,0.005724,0.005911,0.005659,0.005555,0.005942,11.546283,0.0
min,0.036693,0.035003,0.032058,0.037037,0.032889,0.040848,0.038419,0.035071,0.03285,0.032244,...,0.25223,0.24642,0.25064,0.24987,0.23752,0.24883,0.25087,0.2453,1.0,6.0
25%,0.066411,0.0665,0.066563,0.065999,0.065412,0.064884,0.064378,0.06375,0.062728,0.061728,...,0.27781,0.27705,0.27655,0.27556,0.275087,0.27431,0.27392,0.27324,10.75,6.0
50%,0.069325,0.069335,0.069558,0.068904,0.068357,0.067882,0.067306,0.066754,0.065763,0.064586,...,0.28015,0.27955,0.27896,0.278085,0.277605,0.27677,0.27634,0.27602,20.5,6.0
75%,0.072539,0.072752,0.072908,0.07245,0.072001,0.071438,0.070937,0.06994,0.06919,0.067894,...,0.28272,0.28223,0.281502,0.28076,0.280253,0.2794,0.278852,0.27888,30.25,6.0
max,0.098757,0.098897,0.097614,0.096563,0.096449,0.10172,0.1064,0.099545,0.095092,0.095753,...,0.30998,0.3106,0.30632,0.30954,0.30527,0.30605,0.30766,0.30489,40.0,6.0


In [25]:
dfsub31 = dfsub3.drop(dfsub3[(dfsub3['Analyte']==13) | (dfsub3['Analyte']==24) | (dfsub3['Analyte']==34)].index)
dfsub41 = dfsub4.drop(dfsub4[(dfsub4['Analyte']==13) | (dfsub4['Analyte']==24) | (dfsub4['Analyte']==34)].index)
dfsub91.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1693,1694,1695,1696,1697,1698,1699,1700,Analyte,substrate
6,0.23532,0.23367,0.23625,0.24053,0.23726,0.23638,0.23661,0.23843,0.23874,0.23832,...,0.22309,0.22392,0.22221,0.223,0.22181,0.22201,0.22098,0.22148,5,9
11,0.62481,0.62296,0.61857,0.6131,0.6045,0.60094,0.59381,0.58911,0.59014,0.5875,...,0.47459,0.4742,0.47397,0.47378,0.47421,0.47082,0.47286,0.47047,36,9
16,0.26241,0.26114,0.26299,0.26278,0.26165,0.25819,0.25645,0.25475,0.25381,0.24845,...,0.38369,0.3839,0.38142,0.38227,0.38204,0.38286,0.38272,0.38236,10,9
24,0.30203,0.30009,0.31935,0.3552,0.35301,0.34536,0.34466,0.33122,0.31698,0.32534,...,0.40309,0.39981,0.40789,0.41994,0.40553,0.42968,0.40836,0.44263,20,9
26,0.21222,0.15344,0.17677,0.17939,0.19517,0.17923,0.16505,0.1672,0.17508,0.15312,...,0.37571,0.37403,0.39967,0.38927,0.38026,0.39495,0.37657,0.3956,3,9


In [27]:
neigh = KNeighborsClassifier(n_neighbors=3)
y=dfsub8["Analyte"]
X=dfsub8.iloc[:,0:1701]
neigh.fit(X, y) 
print("Accuracy estimated by resubstitution", neigh.score(X, y))
# We will find by inspection the best k according to the classifier accuracy on the test set
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2,random_state=0)
neigh.fit(X_train, y_train) 
#Calculating  metrics of prediction
predictions = neigh.predict(X_test)
print("F1-score is=",f1_score(y_test,predictions, average="weighted"))
print("precision=",precision_score(y_test,predictions,average="weighted"))
print("The accuracy is=",neigh.score(X_test,y_test))

Accuracy estimated by resubstitution 0.9595
F1-score is= 0.8713410722551497
precision= 0.9012923257298258
The accuracy is= 0.87


### 6. Applying knn with k=3  and two classes: Analyte j (j=1,,..40) versus other analytes (class=0)). 

In [24]:
#Only the first twenty analytes are analyzed
labels=list(range(1,41))

In [56]:
for j in labels:
    yclass=dfsub1['Analyte'].copy()
    yclass[yclass!=j]=0 
    yclass[yclass==j]=1
    X_train, X_test, yclass_train, yclass_test = train_test_split(dfsub1.iloc[:,0:1071],yclass,test_size=0.1,random_state=0)
    neigh = KNeighborsClassifier(n_neighbors=3)
    neigh.fit(X_train, yclass_train) 
    predictions = neigh.predict(X_test)
    print("Metrics for predicting analyte %d : accuracy=%.3f, F1-score=%.3f, Precision=%.3f" %(j, neigh.score(X_test,yclass_test), f1_score(yclass_test,predictions),precision_score(yclass_test,predictions)))

Metrics for predicting analyte 1 : accuracy=1.000, F1-score=1.000, Precision=1.000
Metrics for predicting analyte 2 : accuracy=0.995, F1-score=0.909, Precision=1.000
Metrics for predicting analyte 3 : accuracy=1.000, F1-score=1.000, Precision=1.000
Metrics for predicting analyte 4 : accuracy=1.000, F1-score=1.000, Precision=1.000
Metrics for predicting analyte 5 : accuracy=0.980, F1-score=0.714, Precision=1.000
Metrics for predicting analyte 6 : accuracy=0.995, F1-score=0.909, Precision=0.833
Metrics for predicting analyte 7 : accuracy=0.995, F1-score=0.857, Precision=1.000
Metrics for predicting analyte 8 : accuracy=0.995, F1-score=0.933, Precision=1.000
Metrics for predicting analyte 9 : accuracy=0.990, F1-score=0.857, Precision=0.750
Metrics for predicting analyte 10 : accuracy=1.000, F1-score=1.000, Precision=1.000
Metrics for predicting analyte 11 : accuracy=1.000, F1-score=1.000, Precision=1.000
Metrics for predicting analyte 12 : accuracy=0.985, F1-score=0.769, Precision=1.000
M

In [87]:
for j in labels:
    yclass=dfsub6['Analyte'].copy()
    yclass[yclass!=j]=0 
    yclass[yclass==j]=1
    X_train, X_test, yclass_train, yclass_test = train_test_split(dfsub6.iloc[:,0:1071],yclass,test_size=0.1,random_state=0)
    neigh = KNeighborsClassifier(n_neighbors=3)
    neigh.fit(X_train, yclass_train) 
    predictions = neigh.predict(X_test)
    print("Metrics for predicting analyte %d : accuracy=%.3f, F1-score=%.3f, Precision=%.3f" %(j, neigh.score(X_test,yclass_test), f1_score(yclass_test,predictions),precision_score(yclass_test,predictions)))

Metrics for predicting analyte 1 : accuracy=0.995, F1-score=0.909, Precision=1.000
Metrics for predicting analyte 2 : accuracy=0.995, F1-score=0.889, Precision=1.000
Metrics for predicting analyte 3 : accuracy=0.995, F1-score=0.800, Precision=1.000
Metrics for predicting analyte 4 : accuracy=0.995, F1-score=0.667, Precision=1.000
Metrics for predicting analyte 5 : accuracy=0.995, F1-score=0.857, Precision=1.000
Metrics for predicting analyte 6 : accuracy=0.985, F1-score=0.571, Precision=0.500
Metrics for predicting analyte 7 : accuracy=0.995, F1-score=0.800, Precision=1.000
Metrics for predicting analyte 8 : accuracy=0.995, F1-score=0.923, Precision=1.000
Metrics for predicting analyte 9 : accuracy=1.000, F1-score=1.000, Precision=1.000
Metrics for predicting analyte 10 : accuracy=1.000, F1-score=1.000, Precision=1.000
Metrics for predicting analyte 11 : accuracy=1.000, F1-score=1.000, Precision=1.000
Metrics for predicting analyte 12 : accuracy=1.000, F1-score=1.000, Precision=1.000
M

In [94]:
for j in labels:
    yclass=dfsub9['Analyte'].copy()
    yclass[yclass!=j]=0 
    yclass[yclass==j]=1
    X_train, X_test, yclass_train, yclass_test = train_test_split(dfsub9.iloc[:,0:1071],yclass,test_size=0.1,random_state=0)
    neigh = KNeighborsClassifier(n_neighbors=3)
    neigh.fit(X_train, yclass_train) 
    predictions = neigh.predict(X_test)
    print("Metrics for predicting analyte %d : accuracy=%.3f, F1-score=%.3f, Precision=%.3f" %(j, neigh.score(X_test,yclass_test), f1_score(yclass_test,predictions),precision_score(yclass_test,predictions)))

Metrics for predicting analyte 1 : accuracy=1.000, F1-score=1.000, Precision=1.000
Metrics for predicting analyte 2 : accuracy=1.000, F1-score=1.000, Precision=1.000
Metrics for predicting analyte 3 : accuracy=1.000, F1-score=1.000, Precision=1.000
Metrics for predicting analyte 4 : accuracy=1.000, F1-score=1.000, Precision=1.000
Metrics for predicting analyte 5 : accuracy=0.995, F1-score=0.857, Precision=1.000
Metrics for predicting analyte 6 : accuracy=0.995, F1-score=0.923, Precision=1.000
Metrics for predicting analyte 7 : accuracy=1.000, F1-score=1.000, Precision=1.000
Metrics for predicting analyte 8 : accuracy=1.000, F1-score=1.000, Precision=1.000
Metrics for predicting analyte 9 : accuracy=1.000, F1-score=1.000, Precision=1.000
Metrics for predicting analyte 10 : accuracy=1.000, F1-score=1.000, Precision=1.000
Metrics for predicting analyte 11 : accuracy=1.000, F1-score=1.000, Precision=1.000
Metrics for predicting analyte 12 : accuracy=1.000, F1-score=1.000, Precision=1.000
M

In [95]:
labels1=[1,2,3,4,5,6,7,8,9,10,11,12,14,15,16,17,18,19,20,21,22,23,25,26,27,28,29,30,31,32,33,35,36,37,38,39,40]
for j in labels1:
    yclass=dfsub91['Analyte'].copy()
    yclass[yclass!=j]=0 
    yclass[yclass==j]=1
    X_train, X_test, yclass_train, yclass_test = train_test_split(dfsub91.iloc[:,0:1071],yclass,test_size=0.1,random_state=0)
    neigh = KNeighborsClassifier(n_neighbors=3)
    neigh.fit(X_train, yclass_train) 
    predictions = neigh.predict(X_test)
    print("Metrics for predicting analyte %d : accuracy=%.3f, F1-score=%.3f, Precision=%.3f" %(j, neigh.score(X_test,yclass_test), f1_score(yclass_test,predictions),precision_score(yclass_test,predictions)))

Metrics for predicting analyte 1 : accuracy=1.000, F1-score=1.000, Precision=1.000
Metrics for predicting analyte 2 : accuracy=1.000, F1-score=1.000, Precision=1.000
Metrics for predicting analyte 3 : accuracy=1.000, F1-score=1.000, Precision=1.000
Metrics for predicting analyte 4 : accuracy=1.000, F1-score=1.000, Precision=1.000
Metrics for predicting analyte 5 : accuracy=0.995, F1-score=0.889, Precision=1.000
Metrics for predicting analyte 6 : accuracy=1.000, F1-score=1.000, Precision=1.000
Metrics for predicting analyte 7 : accuracy=1.000, F1-score=1.000, Precision=1.000
Metrics for predicting analyte 8 : accuracy=1.000, F1-score=1.000, Precision=1.000
Metrics for predicting analyte 9 : accuracy=0.995, F1-score=0.933, Precision=1.000
Metrics for predicting analyte 10 : accuracy=1.000, F1-score=1.000, Precision=1.000
Metrics for predicting analyte 11 : accuracy=1.000, F1-score=1.000, Precision=1.000
Metrics for predicting analyte 12 : accuracy=1.000, F1-score=1.000, Precision=1.000
M

#### The F1-score is above .90 for six  analytes and,  between .80-.90 for 22 analytes .  The analytes 13(0.652),18(0.618),24(0.602) and 34(0.601) have the lowest F1-score. Also, they have the lowest auc.

#### The prediction results improve. Now, the F1-score is above .90 for twenty-eight analytes and, between .80-.90 for nine analytes . The analytes 13(0.757), 24(0.602) and 34(0.760) have the lowest F1-score. Also, they have the lowest auc.