### Data augmentation using autoencoders
#### Edgar Acuna
#### July 2021

In [1]:
import warnings
warnings.filterwarnings('ignore')
import tensorflow as tf
import keras
from keras.models import Model, load_model
from keras.layers import Input, Dense
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras import regularizers
from sklearn.preprocessing import  StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.io
%matplotlib inline
mat = scipy.io.loadmat('C:/Users/eacun/Downloads/dataset55_release2.mat')
%matplotlib inline

In [2]:
y=mat['labels']

In [3]:
df=mat['spectra']
df=pd.DataFrame(df)
print(df.shape)

(49500, 1701)


In [4]:
ys=mat['substrateIDs']

In [5]:
subs=mat['substrateSpectra']
subs=pd.DataFrame(subs)
subs

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1691,1692,1693,1694,1695,1696,1697,1698,1699,1700
0,0.008192,0.008116,0.008042,0.00797,0.007899,0.00783,0.007763,0.007697,0.007633,0.00757,...,0.010966,0.010943,0.010919,0.010895,0.01087,0.010845,0.010818,0.010792,0.010765,0.010737
1,0.035105,0.035617,0.036458,0.037003,0.037084,0.036102,0.035552,0.035033,0.034687,0.034424,...,0.040834,0.040778,0.04076,0.040794,0.040865,0.040946,0.041008,0.041036,0.041051,0.04107
2,0.585617,0.585672,0.585179,0.584104,0.585759,0.587581,0.588336,0.589407,0.590642,0.591676,...,0.55962,0.559569,0.55952,0.559472,0.559426,0.559382,0.559341,0.559301,0.559264,0.55923
3,0.026414,0.026214,0.026014,0.02577,0.025449,0.02525,0.025171,0.025119,0.025165,0.025351,...,0.546925,0.547225,0.547537,0.547909,0.548301,0.548652,0.548971,0.549233,0.549387,0.549484
4,0.01106,0.011381,0.011618,0.011406,0.010922,0.010713,0.010692,0.010628,0.010722,0.010908,...,0.023413,0.023356,0.023243,0.023115,0.023055,0.023158,0.023328,0.023359,0.023251,0.02318
5,0.0644,0.064528,0.064613,0.064131,0.063398,0.062792,0.062214,0.061542,0.060586,0.059307,...,0.277668,0.27701,0.276353,0.275671,0.274983,0.274299,0.273613,0.272943,0.272329,0.271673
6,0.035515,0.035251,0.034737,0.034093,0.033518,0.033043,0.032536,0.032043,0.031658,0.031341,...,0.253511,0.252787,0.252041,0.251235,0.250392,0.24956,0.248698,0.247776,0.246856,0.245963
7,0.031312,0.031438,0.031556,0.031777,0.03178,0.031608,0.031588,0.031441,0.031011,0.030551,...,0.569969,0.569996,0.570024,0.57003,0.570001,0.569943,0.569884,0.569884,0.56993,0.569966
8,0.00088,0.001132,0.004165,0.006112,0.005633,0.004667,0.003701,0.002964,0.002877,0.002152,...,0.010077,0.010496,0.010808,0.010552,0.010001,0.009687,0.009309,0.008746,0.008464,0.008364


In [6]:
scaler = MinMaxScaler()
dfset2= scaler.fit_transform(df)
print(type(dfset2))
train_x =dfset2
train_x

<class 'numpy.ndarray'>


array([[0.04834693, 0.04609835, 0.04628576, ..., 0.05535224, 0.0400251 ,
        0.04239265],
       [0.05058388, 0.04161474, 0.05306406, ..., 0.0539126 , 0.0487035 ,
        0.0391527 ],
       [0.04771922, 0.04675994, 0.04713682, ..., 0.0560196 , 0.04215041,
        0.04527197],
       ...,
       [0.04951336, 0.0573455 , 0.0775761 , ..., 0.73555518, 0.74822352,
        0.75516237],
       [0.06631596, 0.06394713, 0.06823109, ..., 0.75254761, 0.75643096,
        0.75719126],
       [0.06788073, 0.06707189, 0.06746488, ..., 0.75518449, 0.76244708,
        0.75868926]])

In [7]:
# No of Neurons in each Layer 
nb_epoch = 20
batch_size = 50
input_dim = train_x.shape[1] #num of columns, 1071
encoding_dim = 64
hidden_dim = int(encoding_dim / 2) #i.e. 32
#decoding_dim = 64
learning_rate = 1e-7

input_layer = Input(shape=(input_dim, ))
encoder = Dense(encoding_dim, activation="tanh", activity_regularizer=regularizers.l1(learning_rate))(input_layer)
encoder = Dense(hidden_dim, activation="relu")(encoder)
decoder = Dense(hidden_dim, activation='tanh')(encoder)
#decoder = Dense(decoding_dim, activation='relu')(decoder)
decoder = Dense(input_dim, activation='relu')(decoder)
autoencoder = Model(inputs=input_layer, outputs=decoder)
autoencoder.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 1701)]            0         
_________________________________________________________________
dense (Dense)                (None, 64)                108928    
_________________________________________________________________
dense_1 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_2 (Dense)              (None, 32)                1056      
_________________________________________________________________
dense_3 (Dense)              (None, 1701)              56133     
Total params: 168,197
Trainable params: 168,197
Non-trainable params: 0
_________________________________________________________________


In [8]:
import time
autoencoder.compile(optimizer='adam', loss='mse' )

start_time=time.time()
history = autoencoder.fit(train_x, train_x,
                        epochs=nb_epoch,
                        batch_size=batch_size,
                        shuffle=True,
                        validation_split=0.1,
                        verbose=0)
end_time=time.time()
df_history = pd.DataFrame(history.history) 

In [9]:
predictions = autoencoder.predict(train_x)
print(predictions)

[[0.03888115 0.03843936 0.0398795  ... 0.05292194 0.03889616 0.041952  ]
 [0.03888115 0.03843936 0.0398795  ... 0.05292194 0.03889616 0.041952  ]
 [0.03888115 0.03843936 0.0398795  ... 0.05292194 0.03889616 0.041952  ]
 ...
 [0.0728396  0.0722605  0.07375481 ... 0.7849377  0.79378587 0.789905  ]
 [0.07340327 0.07277191 0.07424609 ... 0.7870168  0.79592663 0.7920126 ]
 [0.07324378 0.07262775 0.07411077 ... 0.7864631  0.7953558  0.7914509 ]]


In [10]:
dfset2=df.copy()
dfset2['Analyte']=y
dfset2['substrate']=ys
dfset2.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1693,1694,1695,1696,1697,1698,1699,1700,Analyte,substrate
49495,0.037553,0.028473,0.036529,0.036093,0.016089,0.032855,0.029435,0.028151,0.030395,0.024624,...,0.531353,0.54342,0.535867,0.538912,0.536256,0.547188,0.539499,0.55374,55,8
49496,0.02987,0.032666,0.032152,0.033202,0.033766,0.031824,0.029444,0.035251,0.034372,0.031036,...,0.544204,0.544876,0.545065,0.546364,0.542205,0.539576,0.539069,0.543899,55,8
49497,0.017774,0.024707,0.040428,0.023651,0.02157,0.033614,0.025827,0.023418,0.016353,0.044331,...,0.534933,0.549713,0.55077,0.54689,0.538551,0.528992,0.533424,0.54126,55,8
49498,0.031123,0.029965,0.032992,0.033308,0.031816,0.031922,0.033099,0.032652,0.032133,0.030951,...,0.542406,0.542487,0.540469,0.54191,0.542449,0.5418,0.53943,0.542759,55,8
49499,0.032366,0.032454,0.032383,0.032554,0.032657,0.032594,0.03264,0.032542,0.032122,0.031636,...,0.543924,0.543932,0.543901,0.543841,0.543787,0.543787,0.543832,0.543866,55,8


In [11]:
dfsub1=dfset2[dfset2['substrate']==1]
dfsub2=dfset2[dfset2['substrate']==2]
dfsub3=dfset2[dfset2['substrate']==3]
dfsub4=dfset2[dfset2['substrate']==4]
dfsub5=dfset2[dfset2['substrate']==5]
dfsub6=dfset2[dfset2['substrate']==6]
dfsub7=dfset2[dfset2['substrate']==7]
dfsub8=dfset2[dfset2['substrate']==8]
dfsub9=dfset2[dfset2['substrate']==9]
#dfset1=pd.DataFrame(dfset1)
df['Analyte']=y
df['substrate']=ys
df.iloc[20000:20006,:]
#yt=dfset5["Analyte"]
dfsub1.head()
#dfset5.shape}

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1693,1694,1695,1696,1697,1698,1699,1700,Analyte,substrate
0,0.016847,0.01575,0.01553,0.014852,0.015202,0.01542,0.016572,0.01422,0.014954,0.014995,...,0.016104,0.017741,0.017533,0.016542,0.016122,0.016313,0.015238,0.014641,1,1
1,0.018624,0.012179,0.020924,0.003329,0.014498,0.019329,0.011184,0.010919,0.007558,0.017733,...,0.010701,0.018193,0.017089,0.018461,0.020192,0.015228,0.021588,0.012247,1,1
2,0.016348,0.016277,0.016207,0.016139,0.016066,0.015951,0.01578,0.015613,0.015557,0.015594,...,0.016926,0.016905,0.016884,0.016862,0.016839,0.016816,0.016793,0.016769,1,1
3,0.015725,0.018253,0.018749,0.017823,0.01685,0.01523,0.017195,0.011854,0.013581,0.011075,...,0.0202,0.018257,0.012629,0.01409,0.016704,0.016195,0.017715,0.011852,1,1
4,0.018868,0.024683,0.016563,0.020112,0.018959,0.017602,0.016099,0.009405,0.020006,0.019105,...,0.011581,0.019433,0.015924,0.00678,0.015658,0.017236,0.019725,0.015868,1,1


In [12]:
mdfsub1=dfsub1.iloc[:,0:1701]
mdfsub2=dfsub2.iloc[:,0:1701]
mdfsub3=dfsub3.iloc[:,0:1701]
mdfsub4=dfsub4.iloc[:,0:1701]
mdfsub5=dfsub5.iloc[:,0:1701]
mdfsub6=dfsub6.iloc[:,0:1701]
mdfsub7=dfsub7.iloc[:,0:1701]
mdfsub8=dfsub8.iloc[:,0:1701]
mdfsub9=dfsub9.iloc[:,0:1701]

In [13]:
s1=subs.loc[0,:]
s2=subs.loc[1,:]
s3=subs.loc[2,:]
s4=subs.loc[3,:]
s5=subs.loc[4,:]
s6=subs.loc[5,:]
s7=subs.loc[6,:]
s8=subs.loc[7,:]
s9=subs.loc[8,:]

In [14]:
modsub1=mdfsub1.apply(lambda x : x -(np.sum(np.array(x)*np.array(s1))/np.sum(np.array(s1)*np.array(s1)))*s1,axis=1)
modsub2=mdfsub2.apply(lambda x : x -(np.sum(np.array(x)*np.array(s2))/np.sum(np.array(s2)*np.array(s2)))*s2,axis=1)
modsub3=mdfsub3.apply(lambda x : x -(np.sum(np.array(x)*np.array(s3))/np.sum(np.array(s3)*np.array(s3)))*s3,axis=1)
modsub4=mdfsub4.apply(lambda x : x -(np.sum(np.array(x)*np.array(s4))/np.sum(np.array(s4)*np.array(s4)))*s4,axis=1)
modsub5=mdfsub5.apply(lambda x : x -(np.sum(np.array(x)*np.array(s5))/np.sum(np.array(s5)*np.array(s5)))*s5,axis=1)
modsub6=mdfsub6.apply(lambda x : x -(np.sum(np.array(x)*np.array(s6))/np.sum(np.array(s6)*np.array(s6)))*s6,axis=1)
modsub7=mdfsub7.apply(lambda x : x -(np.sum(np.array(x)*np.array(s7))/np.sum(np.array(s7)*np.array(s7)))*s7,axis=1)
modsub8=mdfsub8.apply(lambda x : x -(np.sum(np.array(x)*np.array(s8))/np.sum(np.array(s8)*np.array(s8)))*s8,axis=1)
modsub9=mdfsub9.apply(lambda x : x -(np.sum(np.array(x)*np.array(s9))/np.sum(np.array(s9)*np.array(s9)))*s9,axis=1)

In [15]:
subdf=[modsub1,modsub2,modsub3,modsub4,modsub5,modsub6,modsub7,modsub8,modsub9]
cent_subs=pd.concat(subdf)
cent_subs.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1691,1692,1693,1694,1695,1696,1697,1698,1699,1700
0,0.001835,0.000878,0.000793,0.000247,0.000728,0.001072,0.002347,0.000115,0.000966,0.001123,...,-0.003352,-0.004829,-0.003905,-0.002224,-0.002386,-0.003332,-0.003701,-0.003463,-0.004489,-0.005034
1,0.003758,-0.002548,0.00633,-0.011134,0.000164,0.00512,-0.002903,-0.003049,-0.006294,0.003996,...,-0.010454,0.004719,-0.009114,-0.001578,-0.002636,-0.001219,0.000561,-0.004356,0.002053,-0.007237
2,0.001106,0.001177,0.001245,0.00131,0.001369,0.001383,0.001337,0.001292,0.001355,0.001509,...,-0.003437,-0.003414,-0.00339,-0.003366,-0.003341,-0.003316,-0.003289,-0.003263,-0.003236,-0.003209
3,0.000363,0.003034,0.003668,0.002877,0.002038,0.000546,0.002638,-0.00258,-0.000733,-0.003121,...,-0.002536,-0.007373,-0.000276,-0.002173,-0.007754,-0.006247,-0.003583,-0.004043,-0.002472,-0.008282
4,0.003795,0.00975,0.001765,0.005447,0.004424,0.003195,0.001814,-0.004758,0.005961,0.005176,...,-0.003264,0.001135,-0.00851,-0.000614,-0.004077,-0.013175,-0.004247,-0.002622,-8.3e-05,-0.003888


In [16]:
cent_subs['substrate']=ys
cent_subs.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1692,1693,1694,1695,1696,1697,1698,1699,1700,substrate
0,0.001835,0.000878,0.000793,0.000247,0.000728,0.001072,0.002347,0.000115,0.000966,0.001123,...,-0.004829,-0.003905,-0.002224,-0.002386,-0.003332,-0.003701,-0.003463,-0.004489,-0.005034,1
1,0.003758,-0.002548,0.00633,-0.011134,0.000164,0.00512,-0.002903,-0.003049,-0.006294,0.003996,...,0.004719,-0.009114,-0.001578,-0.002636,-0.001219,0.000561,-0.004356,0.002053,-0.007237,1
2,0.001106,0.001177,0.001245,0.00131,0.001369,0.001383,0.001337,0.001292,0.001355,0.001509,...,-0.003414,-0.00339,-0.003366,-0.003341,-0.003316,-0.003289,-0.003263,-0.003236,-0.003209,1
3,0.000363,0.003034,0.003668,0.002877,0.002038,0.000546,0.002638,-0.00258,-0.000733,-0.003121,...,-0.007373,-0.000276,-0.002173,-0.007754,-0.006247,-0.003583,-0.004043,-0.002472,-0.008282,1
4,0.003795,0.00975,0.001765,0.005447,0.004424,0.003195,0.001814,-0.004758,0.005961,0.005176,...,0.001135,-0.00851,-0.000614,-0.004077,-0.013175,-0.004247,-0.002622,-8.3e-05,-0.003888,1


In [17]:
# Using sklearn
from sklearn.preprocessing import normalize
b=cent_subs.iloc[:,0:1701]
b.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1691,1692,1693,1694,1695,1696,1697,1698,1699,1700
0,0.001835,0.000878,0.000793,0.000247,0.000728,0.001072,0.002347,0.000115,0.000966,0.001123,...,-0.003352,-0.004829,-0.003905,-0.002224,-0.002386,-0.003332,-0.003701,-0.003463,-0.004489,-0.005034
1,0.003758,-0.002548,0.00633,-0.011134,0.000164,0.00512,-0.002903,-0.003049,-0.006294,0.003996,...,-0.010454,0.004719,-0.009114,-0.001578,-0.002636,-0.001219,0.000561,-0.004356,0.002053,-0.007237
2,0.001106,0.001177,0.001245,0.00131,0.001369,0.001383,0.001337,0.001292,0.001355,0.001509,...,-0.003437,-0.003414,-0.00339,-0.003366,-0.003341,-0.003316,-0.003289,-0.003263,-0.003236,-0.003209
3,0.000363,0.003034,0.003668,0.002877,0.002038,0.000546,0.002638,-0.00258,-0.000733,-0.003121,...,-0.002536,-0.007373,-0.000276,-0.002173,-0.007754,-0.006247,-0.003583,-0.004043,-0.002472,-0.008282
4,0.003795,0.00975,0.001765,0.005447,0.004424,0.003195,0.001814,-0.004758,0.005961,0.005176,...,-0.003264,0.001135,-0.00851,-0.000614,-0.004077,-0.013175,-0.004247,-0.002622,-8.3e-05,-0.003888


In [18]:
#normalize(b)
b1=b.apply(lambda x: x/(x**2).sum()**.5, axis=1)
b1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1691,1692,1693,1694,1695,1696,1697,1698,1699,1700
0,0.009607,0.004595,0.004153,0.001293,0.003809,0.005611,0.012284,0.000603,0.005059,0.005881,...,-0.017547,-0.025276,-0.020442,-0.011642,-0.012491,-0.01744,-0.019376,-0.018127,-0.023498,-0.026352
1,0.013683,-0.009277,0.023046,-0.040534,0.000598,0.01864,-0.010568,-0.011098,-0.022913,0.014546,...,-0.03806,0.017181,-0.033179,-0.005745,-0.009596,-0.004439,0.002042,-0.015857,0.007474,-0.026345
2,0.00592,0.006296,0.006659,0.007008,0.007325,0.007399,0.007152,0.006912,0.00725,0.008074,...,-0.018391,-0.018267,-0.018138,-0.018009,-0.017876,-0.017742,-0.017598,-0.01746,-0.017316,-0.017168
3,0.00172,0.014365,0.017369,0.013624,0.009649,0.002587,0.01249,-0.012215,-0.00347,-0.014777,...,-0.012008,-0.034912,-0.001307,-0.010292,-0.036719,-0.029583,-0.016964,-0.019143,-0.011705,-0.039219
4,0.014017,0.036012,0.006521,0.02012,0.016342,0.011801,0.006702,-0.017573,0.022017,0.019119,...,-0.012055,0.004192,-0.031434,-0.002269,-0.01506,-0.048664,-0.015689,-0.009685,-0.000306,-0.014363


In [19]:
train_x =b1.iloc[:,0:1701]
train_x=train_x.to_numpy()
train_x

array([[ 0.00960739,  0.00459505,  0.00415293, ..., -0.01812665,
        -0.02349759, -0.0263521 ],
       [ 0.01368298, -0.0092775 ,  0.0230458 , ..., -0.0158569 ,
         0.00747383, -0.0263455 ],
       [ 0.00592006,  0.00629565,  0.00665916, ..., -0.01745972,
        -0.01731626, -0.01716784],
       ...,
       [ 0.01227093,  0.0086006 ,  0.01291199, ...,  0.00176972,
        -0.01044183, -0.03953934],
       [-0.01367231, -0.00789796, -0.0107693 , ..., -0.00036861,
        -0.0053494 , -0.00715859],
       [ 0.02376845,  0.04457955, -0.0066374 , ..., -0.05305244,
         0.00288916, -0.01349044]])

In [20]:
# No of Neurons in each Layer 
nb_epoch = 20
batch_size = 50
input_dim = train_x.shape[1] #num of columns, 1701
encoding_dim = 64
hidden_dim = int(encoding_dim / 2) #i.e. 32
#decoding_dim = 64
learning_rate = 1e-7

input_layer = Input(shape=(input_dim, ))
encoder = Dense(encoding_dim, activation="tanh", activity_regularizer=regularizers.l1(learning_rate))(input_layer)
encoder = Dense(hidden_dim, activation="relu")(encoder)
decoder = Dense(hidden_dim, activation='tanh')(encoder)
#decoder = Dense(decoding_dim, activation='relu')(decoder)
decoder = Dense(input_dim, activation='relu')(decoder)
autoencoder = Model(inputs=input_layer, outputs=decoder)
autoencoder.summary()

Model: "functional_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 1701)]            0         
_________________________________________________________________
dense_4 (Dense)              (None, 64)                108928    
_________________________________________________________________
dense_5 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_6 (Dense)              (None, 32)                1056      
_________________________________________________________________
dense_7 (Dense)              (None, 1701)              56133     
Total params: 168,197
Trainable params: 168,197
Non-trainable params: 0
_________________________________________________________________


In [21]:
import datetime
autoencoder.compile(optimizer='adam', loss='mse' )

t_ini = datetime.datetime.now()
history = autoencoder.fit(train_x, train_x, epochs=nb_epoch,batch_size=batch_size, shuffle=True,validation_split=0.2,verbose=0)

t_fin = datetime.datetime.now()
print('Time to run the model: {} Sec.'.format((t_fin - 
t_ini).total_seconds()))



df_history = pd.DataFrame(history.history) 

Time to run the model: 37.047621 Sec.


In [22]:
predictions = autoencoder.predict(train_x)
print(predictions)

[[0.         0.00047023 0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.00068953 0.00153927 0.00021678 ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.00159233 0.         ... 0.         0.         0.        ]]


In [23]:
predictions=pd.DataFrame(predictions)

In [24]:
predictions.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1691,1692,1693,1694,1695,1696,1697,1698,1699,1700
0,0.0,0.00047,0.0,0.001634,0.005972,0.006406,0.007733,0.009191,0.011177,0.008308,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001145,0.002872,0.001093,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.00069,0.001539,0.000217,0.002144,0.00628,0.006626,0.008068,0.009545,0.011654,0.008964,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.00025,0.00121,0.0,0.001175,0.004588,0.00478,0.006111,0.007259,0.009627,0.007441,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.008206,0.008899,0.00681,0.007146,0.010219,0.010488,0.011667,0.012746,0.013932,0.01224,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
train_x=pd.DataFrame(train_x)

In [26]:
augdata=pd.concat([train_x,predictions],ignore_index=True)

In [27]:
print(augdata.shape)

(99000, 1701)


In [28]:
a1=np.array(dfsub1['Analyte'].values.tolist())
a2=np.array(dfsub2['Analyte'].values.tolist())
a3=np.array(dfsub3['Analyte'].values.tolist())
a4=np.array(dfsub4['Analyte'].values.tolist())
a5=np.array(dfsub5['Analyte'].values.tolist())
a6=np.array(dfsub6['Analyte'].values.tolist())
a7=np.array(dfsub7['Analyte'].values.tolist())
a8=np.array(dfsub8['Analyte'].values.tolist())
a9=np.array(dfsub9['Analyte'].values.tolist())
y=np.concatenate((a1,a2,a3,a4,a5,a6,a7,a8,a9),axis=None)

In [29]:
augy=np.concatenate((y,y),axis=None)
augy=pd.DataFrame(augy)

In [30]:
augdata['Analyte']=augy
augdata.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1692,1693,1694,1695,1696,1697,1698,1699,1700,Analyte
98995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,55
98996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,55
98997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,55
98998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,55
98999,0.0,0.001592,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,55


In [31]:
y=augy.to_numpy()

In [69]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
# We will find by inspection the best k according to the classifier accuracy on the test set
accuracies = []
X=augdata.iloc[:,0:1701]
# We will find by inspection the best k according to the classifier accuracy on the test set
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2,random_state=0)
for k in range(1, 40, 2):
    # Entrenar el clasificador  con el valor actual de  `k`
    neigh = KNeighborsClassifier(n_neighbors=k)
    neigh.fit(X_train, y_train)

    # Evaluar los modelos e imprimiendo sus predicciones
    score = neigh.score(X_test, y_test)
    print("k=%d, accuracy=%.2f%%" % (k, score * 100))
    accuracies.append(score)

k=1, accuracy=84.08%
k=3, accuracy=78.28%
k=5, accuracy=78.17%
k=7, accuracy=78.14%
k=9, accuracy=78.39%
k=11, accuracy=78.18%
k=13, accuracy=78.10%
k=15, accuracy=78.00%
k=17, accuracy=78.03%
k=19, accuracy=77.75%
k=21, accuracy=77.44%
k=23, accuracy=77.22%
k=25, accuracy=76.96%
k=27, accuracy=76.69%
k=29, accuracy=76.49%
k=31, accuracy=76.09%
k=33, accuracy=75.52%
k=35, accuracy=75.55%
k=37, accuracy=75.19%
k=39, accuracy=74.91%


In [70]:
from keras.models import Sequential
from keras.layers import Dense, Conv1D, Flatten, MaxPooling1D, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.datasets import load_iris
from numpy import unique
import pandas as pd
import numpy as np
import time
import keras

In [72]:
x=augdata.iloc[:,0:1701].to_numpy()
x = x.reshape(x.shape[0], x.shape[1], 1)
print(x.shape)
y=y-1
#print(unique(y))
#print(unique(y).sum())

xtrain, xtest, ytrain, ytest=train_test_split(x, y, test_size=0.2)


(99000, 1701, 1)


In [73]:
model = Sequential()
model.add(Conv1D(32, 3, activation="relu", input_shape=(1701,1)))
model.add(Dense(64, activation="relu"))
model.add(MaxPooling1D(pool_size=3))
model.add(Dropout(0.5))
model.add(Conv1D(32, 3, activation="relu", input_shape=(1701,1)))
model.add(Dense(64, activation="relu"))
model.add(MaxPooling1D(pool_size=3))
model.add(Dropout(0.5))
model.add(Flatten())
model.add(Dense(55, activation = 'softmax'))
start_time = time.time()
model.compile(loss = 'sparse_categorical_crossentropy', optimizer = "adam", metrics = ['accuracy'])
model.summary()
baseline_history=model.fit(xtrain, ytrain, epochs=25, batch_size=256, verbose=1, validation_split=0.2)
#model.fit(xtrain, ytrain, batch_size=256,epochs=25,  validation_split=.2, verbose=1)
acc = model.evaluate(xtrain, ytrain)
print("Loss:", acc[0], " Accuracy:", acc[1])
print("--- %s seconds ---" % (time.time() - start_time))

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d (Conv1D)              (None, 1699, 32)          128       
_________________________________________________________________
dense_8 (Dense)              (None, 1699, 64)          2112      
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 566, 64)           0         
_________________________________________________________________
dropout (Dropout)            (None, 566, 64)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 564, 32)           6176      
_________________________________________________________________
dense_9 (Dense)              (None, 564, 64)           2112      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 188, 64)           0

In [74]:
start_time = time.time()
#Test the model after training
start_time=time.time()
test_results = model.evaluate(xtest, ytest, verbose=1)
print(test_results)
print(f'Test results - Loss: {test_results[0]} - Accuracy: {test_results[1]}%')
print("--- %s seconds ---" % (time.time() - start_time))

[0.9371163845062256, 0.7896464467048645]
Test results - Loss: 0.9371163845062256 - Accuracy: 0.7896464467048645%
--- 14.994240760803223 seconds ---
