In [1]:
import numpy as nmpy
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.metrics import confusion_matrix

In [2]:
train_input = nmpy.load('train_input.npy')
train_label = nmpy.load('train_label.npy')

In [3]:
def split_data(train_input):
    X1 = []
    X2 = []
    for index in range(len(train_input)):
        X1.append([train_input[index][0], train_input[index][1]])
        X2.append([train_input[index][2], train_input[index][3]])

    return nmpy.array(X1).astype('float64'), nmpy.array(X2).astype('float64')


In [4]:
def get_data(train_input,train_label):

    X_train, X_test, y_train, y_test = train_test_split(train_input, train_label, test_size=0.20, random_state=42)
    X_train1, X_train2 = split_data(X_train)
    X_test1,  X_test2   = split_data(X_test)

    X_train1 = nmpy.transpose(X_train1, (0, 2, 1))
    X_train2 = nmpy.reshape(X_train2, (X_train2.shape[0], X_train2.shape[1], 1))
    X_test1  = nmpy.transpose(X_test1, (0, 2, 1))
    X_test2  = nmpy.reshape(X_test2, (X_test2.shape[0], X_test2.shape[1], 1))

    return X_train1, X_train2, y_train, X_test1, X_test2, y_test

Checking the shape of Data

In [5]:
X_train1, X_train2, y_train, X_test1, X_test2, y_test = get_data(train_input,train_label)

In [6]:
X_train1.shape

(12345, 240, 2)

In [7]:
X_train2.shape

(12345, 2, 1)

In [8]:
y_test.shape

(4116,)

In [9]:
X_test2.shape

(4116, 2, 1)

In [10]:
X_train1[1,:]

array([[-1.05093706e+01,  1.08651085e-01],
       [-1.21729348e+01,  1.82867376e-02],
       [-1.18151890e+01, -7.03477055e-02],
       [-6.64668334e+00, -1.43534885e-01],
       [ 3.58714814e+00, -1.92063482e-01],
       [ 1.61760048e+01, -2.10999125e-01],
       [ 2.82412969e+01, -1.95579088e-01],
       [ 3.72488312e+01, -1.47640312e-01],
       [ 4.32097188e+01, -8.98165190e-02],
       [ 4.72758837e+01, -4.88276416e-02],
       [ 5.06043875e+01, -5.05704869e-02],
       [ 5.37530220e+01, -9.53810494e-02],
       [ 5.53911693e+01, -1.53698722e-01],
       [ 5.38171711e+01, -1.94265936e-01],
       [ 4.73672287e+01, -1.89697095e-01],
       [ 3.62672984e+01, -1.49904904e-01],
       [ 2.34717650e+01, -1.05646465e-01],
       [ 1.21477328e+01, -8.79066011e-02],
       [ 5.04418874e+00, -1.17780515e-01],
       [ 1.81997115e+00, -1.71756361e-01],
       [ 7.48904715e-01, -2.13751370e-01],
       [ 1.08472385e-01, -2.08414532e-01],
       [-8.90991380e-01, -1.48869908e-01],
       [-1.

From above we can see values present are on different scale hence we need to standardize them so that our gradient descent can reach the minima at a faster pace

In [11]:
def scale(train_input):
    scalers = {}
    for i in range(train_input.shape[2]):
        scalers[i] = StandardScaler()
        train_input[:, i, :] = scalers[i].fit_transform(train_input[:, i, :]) 
    return train_input

In [12]:
X_train1 = scale(X_train1)
X_train2 = scale(X_train2)


In [13]:
X_test1 = scale(X_test1) 
X_test2 = scale(X_test2)

In [14]:
X_train1[1,:]

array([[-9.28799841e-02,  3.44765828e-01],
       [-1.09582693e-01,  5.66334112e-02],
       [-1.18151890e+01, -7.03477055e-02],
       [-6.64668334e+00, -1.43534885e-01],
       [ 3.58714814e+00, -1.92063482e-01],
       [ 1.61760048e+01, -2.10999125e-01],
       [ 2.82412969e+01, -1.95579088e-01],
       [ 3.72488312e+01, -1.47640312e-01],
       [ 4.32097188e+01, -8.98165190e-02],
       [ 4.72758837e+01, -4.88276416e-02],
       [ 5.06043875e+01, -5.05704869e-02],
       [ 5.37530220e+01, -9.53810494e-02],
       [ 5.53911693e+01, -1.53698722e-01],
       [ 5.38171711e+01, -1.94265936e-01],
       [ 4.73672287e+01, -1.89697095e-01],
       [ 3.62672984e+01, -1.49904904e-01],
       [ 2.34717650e+01, -1.05646465e-01],
       [ 1.21477328e+01, -8.79066011e-02],
       [ 5.04418874e+00, -1.17780515e-01],
       [ 1.81997115e+00, -1.71756361e-01],
       [ 7.48904715e-01, -2.13751370e-01],
       [ 1.08472385e-01, -2.08414532e-01],
       [-8.90991380e-01, -1.48869908e-01],
       [-1.

Values are scaled and now they can be in Support Vector Classifier

In [15]:
nsamples, nx, ny = X_train1.shape
d2_train_dataset = X_train1.reshape((nsamples,nx*ny))
nsamples, nx, ny = X_test1.shape
d2_test_dataset = X_test1.reshape((nsamples, nx*ny))


In [16]:
from sklearn.svm import SVC

svc = SVC(C=1.0, kernel='rbf', degree=3, gamma='scale',random_state=0)

In [17]:
svc_fit = svc.fit(d2_train_dataset,y_train)
y_pred_svc = svc_fit.predict(d2_test_dataset)
confusionMatrixSVC = pd.crosstab(y_test,y_pred_svc, rownames=['Actual'], colnames=['Predicted'], margins=True)
print(confusionMatrixSVC)

from sklearn import metrics
AccuracySVM = metrics.accuracy_score(y_test, y_pred_svc)
print ('Accuracy',AccuracySVM)

Predicted   0.0  1.0   All
Actual                    
0.0        2528    2  2530
1.0        1584    2  1586
All        4112    4  4116
Accuracy 0.6146744412050534


In [18]:
from sklearn.ensemble import  RandomForestClassifier
rfc = RandomForestClassifier()

In [37]:
d2_train_dataset[0:1000].shape
y_test.shape

(4116,)

In [38]:
rfc_fit=rfc.fit(d2_train_dataset[0:1000,:],train_label[0:1000])     #just used 1000 samples for training the model remove the [] for training on 
                                                                    #on complete dataset
y_pred_RFC = rfc_fit.predict(d2_test_dataset[0:1000,:])
print(y_pred_RFC.shape)

from sklearn import metrics
AccuracyRFC = metrics.accuracy_score(y_test[0:1000], y_pred_RFC)
print ('Accuracy',AccuracyRFC)

(1000,)
Accuracy 0.378


In [39]:
#decesion trees
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
dtc_fit = dtc.fit(d2_train_dataset[0:1000,:],train_label[0:1000])
dtc.score(d2_train_dataset,y_train)
y_pred_dt = svc_fit.predict(d2_test_dataset)
print(dtc.score(d2_test_dataset,y_pred_dt))
confusionMatrixDTC = pd.crosstab(y_test,y_pred_dt, rownames=['Actual'], colnames=['Predicted'], margins=True)
print(confusionMatrixDTC)

from sklearn import metrics
AccuracyDTC = metrics.accuracy_score(y_test, y_pred_dt)
print ('Accuracy',AccuracyDTC)

0.21671525753158405
Predicted   0.0  1.0   All
Actual                    
0.0        2528    2  2530
1.0        1584    2  1586
All        4112    4  4116
Accuracy 0.6146744412050534


In [26]:
#KNN
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn_fit = knn.fit(d2_train_dataset[0:1000,:],train_label[0:1000])
knn.score(d2_train_dataset,y_train)
y_pred_KNN = knn_fit.predict(d2_test_dataset)

confusionMatrixKNN = pd.crosstab(y_test,y_pred_KNN, rownames=['Actual'], colnames=['Predicted'], margins=True)
print(confusionMatrixKNN)
from sklearn import metrics
AccuracyKNN = metrics.accuracy_score(y_test, y_pred_KNN)
print ('Accuracy',AccuracyKNN)

Predicted  0.0   1.0   All
Actual                    
0.0        130  2400  2530
1.0         43  1543  1586
All        173  3943  4116
Accuracy 0.4064625850340136


In [28]:
#AdaBoost
from sklearn.ensemble import AdaBoostClassifier
ADA_fit = AdaBoostClassifier(random_state=0).fit(d2_train_dataset[0:1000,:],train_label[0:1000])
y_pred_ADA = ADA_fit.predict(d2_test_dataset)
confusionMatrixADA = pd.crosstab(y_test,y_pred_ADA, rownames=['Actual'], colnames=['Predicted'], margins=True)
print(confusionMatrixADA)

from sklearn import metrics
AccuracyADA = metrics.accuracy_score(y_test, y_pred_KNN)
print ('Accuracy',AccuracyKNN)


Predicted  0.0   1.0   All
Actual                    
0.0        143  2387  2530
1.0        107  1479  1586
All        250  3866  4116
Accuracy 0.4064625850340136
