In [7]:
### Import Libraries
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import pprint
from tabulate import tabulate
import warnings

%matplotlib inline
pd.set_option("display.max_columns",40)
warnings.filterwarnings("ignore")

from IPython.display import HTML


import keras
from keras.layers import Dense,Dropout
from keras.models import Sequential
from keras.wrappers.scikit_learn import KerasClassifier

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier,ExtraTreeClassifier
from sklearn.decomposition import PCA
from sklearn.ensemble import VotingClassifier


from sklearn.metrics import confusion_matrix,accuracy_score,auc,roc_curve,f1_score
from sklearn.model_selection import GridSearchCV,StratifiedKFold,train_test_split,cross_val_score
from sklearn.preprocessing import StandardScaler,binarize

from mlxtend.classifier import StackingClassifier
from xgboost import XGBClassifier
from vecstack import stacking
from pandas_ml import ConfusionMatrix

In [8]:
def get_roc_curve(model,x_test,y_test):
    prob=model.predict_proba(x_test)
    preds=prob[:,1]
    fpr, tpr, threshold = roc_curve(y_test, preds)
    roc_auc =auc(fpr, tpr)
    plt.figure(figsize=(10,8))
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
    plt.legend(loc = 'lower right')
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()
    i = np.arange(len(tpr)) 
    roc = pd.DataFrame({'fpr' : pd.Series(fpr, index=i),'tpr' : pd.Series(tpr, index = i), '1-fpr' : pd.Series(1-fpr, index = i), 'tf' : pd.Series(tpr - (1-fpr), index = i), 'thresholds' : pd.Series(threshold, index = i)})
    threshold=roc.ix[(roc.tf-0).abs().argsort()[:1]]['thresholds'].values[0]
    print("Optimal Threshold is",threshold)
    y_pred_class=binarize(preds.reshape(1,-1),threshold)[0]
    accuracy=accuracy_score(y_test,y_pred_class)
    print("Accuracy of Logistic Model on training data is:",accuracy)
    cm=ConfusionMatrix(y_test,y_pred_class)
    display(cm)
    return cm,accuracy,threshold,roc_auc

In [9]:
cric=['eee73a52c55dbef6c5b4ec7ec0cd3d200a90f528b7344b7337a546f7734774db',
       '1510909c55f8994397a5aced5728fca609aff1b6bab31080e919795aeb2db97d',
       'cff22bd1423ddbbbf00818b3d866cb4a5f4a8603ee607a824ad96224072f4076',
       '1ed642700929c99329b1ec5e6ddaa5b55db8ec04f776a690e4e22b86e89136db',
       'c96be1bc82e9c8d37ba23c7077d47f003a88fe0b5e0ef79ffa830d9552eeed1e',
       'db663ee7c30f6e366215d111f745205c73789819f5781028b573a197b0388e23',
       'cca0d130b4966f6bcc4e7093295e2637b48f421c120f4d6bfece4938e1c61ae7',
       'cf4dffb6fb12b9a95d1cc43717950e389db7e3c7c9e6c4a3d56d8e0c7a98891b',
       '4121c37e4f4081f7e533940466b0962159f00840b14dca4479eb17e0d5807b15',
       '996116b7d5b47888082dd8caf8911ef352ebe6311210d813fbc18710e12a0091',
       '5672b291263e608577995d50cd51efd4662af2b9a3d327db7d6c264112bd7d57',
       '5b2109fa7d9154dc8b15a1ecee9f520330519e9e4d8ea51a61e01cbc44600a86']

In [10]:


med=['07a805db92f77828144af02c110b4107381c8608ff69cbe8322b1619550dca26',
       'eee73a52c55dbef6c5b4ec7ec0cd3d200a90f528b7344b7337a546f7734774db',
       'cef0d1cb38af8ad44a4c03c5718984ae948de73fefefc7f16e8e87a27bd719ff',
       'b1d81f7de972e44a6845b9556ed3485a9b9a4a27c499d707ca91d3f5aa2334cc',
       '5c95fd730ce2612f44cb7a9c5802d9c5c9e90d7efdeb81026d8a89ab5a3c0a8a',
       '9861528d3962a11deb48775c63026bc8cbdf39aaf3f90aa0b87fef5445d5926c',
       'ce91776ab66403168967f46d8170b6949035822ba671e2ae1a4e5b012f71fc73',
       '26490d34b73ba281b2a586fdbfc9d13b48deaa204944733b859bf412f1fa9b0e',
       '2b9d80a899a0e34177f2b0ad451f18509d8b73540ce2b51fc2dc652495c99417',
       '0caa319489c018ddc7862bf2dafc054889ec8066de3f5b166d1a26e9f12920a4',
       '9bf55dee2eb78fca489c15f4c05644d7e259c413f4f97ab1d622e9999703ff7a',
       '95fef8a0d3377bb3fb41d589e5091eece852026be1b1224383265748863b6671',
       '8b7d9bb36cbb3b6d3defe024d4b980f5bedb71d8875456642e2b173dede2a3c7',
       'f71994f88f63bd783f50648de55e879dc8580e20c771703b852c0e3036667f46',
       'f52846069af3ef50f230f86ac40cdfe41e461a5e749951f112cd0f7c8e605a9e',
       '4677f4c9612858664db2eac9f788886ba53d3b48d79320c1fb15aaf04e00d970',
       '989678db8131e49fd29adc46c2e5e1a6e9ab6823203b37954a24b2b68a98083e',
       '2587a851f09c2bb4d22b7daf8ec30af164ad94a7f5d1620d60d271b85fff6f64',
       '682eb13643f153128c9c47afe6be08554cd042d0b38815b6745a2ec45b472a70',
       '27b0bf1efe11e967f3ddb1b1c2760eaff1659bc221ae60ba772c972de59bf4c1',
       'bb445f8ffb0724b780364707b75ade0df70a8ee22fd1a9f89af3f9d469928b3c',
       'bcca3c58857d6029838ff9d34f3c576d76ac8dbccbe85c39961f0a9a8abe99e9',
       'c9c33a60aaec935cd9d3d1420c46c0e4bcec43463aac55dd454705b04f6a9f8c',
       'a14cbfea9a24e5720c98ad25ccdde36970b5ede89f9ac783a4bbebb8b61a1568',
       '7a4da56250e41de51576ffbfbc5db2aa6b38cc35867a20b4d28318a6e2c1c813',
       '705d59380dfcee0454a0d3d9063900bf5e6706ab9f642b68a00f53099af0eafd',
       '6f6d4c27d6351b6dd1209847771f8aeda37ec301b11b442620e32bbda346bcb7',
       'f55a241cd8a27c7392b8ab9049495758f6895fefa5538e8aca2c27a204ecb0c3',
       '201b65b57be8702dfa4593b39f0d5a9d79662627dc885199c1373b188d4a4a47',
       '58d466231ab231ae49c7b315b995b8e1b4916c0c3e79fe3c057b06ab8c3d6ec8',
       '005030a1caa49429f288b79917630d9a991690b95dde910c8f00a6b926e68c31',
       'eed1b477ab451eb20e847c90b3752b5a7d6c83d3b0c36285450f6731add2063e',
       'adbaa75ca91a86b069ec2810a47b226803cfcc90fb537c09a6945d9a13f19845',
       'c2b4a8f51a1f1eddc7a459bbc3911da76708a878fcd0a16cb5bb6d2ab3dfb5d0',
       'ed91f04fbd495e2d5a00f327756aa4882f8b9d4f5d58b444150f5d89b2befebc',
       'b85725fc8c972f70e3e8a290fc32b75df44bff1fbf31555ff512074003b6387b',
       'a905175974a71113bb8794ec07b0e03add816a329c9e971c833710d4bdd52abf',
       '376ffc521ffbf5d3cb640e186b9db7a98795f7656a82b73207f6884ca1194589',
       '421548d50c73674295d827ff088f1d353441fdd64a0fa7ff74863cd8113bf991',
       '54b356aa258f805ac028589e97883ab66d011a00dd9e2a409eb5579f4e81abd4',
       '22a6f0f1cec1769ea1d0898f041c36ebbfa5b1bf6c6b14d9d11b9119d02ae6ea',
       '204fa067ef1d8b8b034acfa1693f8c53e86c84305ee7503a199f40d3722ee57e',
       '8eba4debb95ef877faed7041269483166a3145a7a590cba966da0a5aa394f2f3',
       '84a578c10ef1a342ad03312393548815acba50632dd302b60f96e273abab64c8',
       '802f787a8d4ebe3d4830943c5f4d50572512c76002ffdc4f9acaf238da85fb36',
       'a6d0045c1c7ffb7db8fdcbe34d7ea0f46ce12606114b63b3702d312be308f9de',
       '94e77fdd5917ebf29fc743bf64798a5dd7c60068fcd029559e4e03afae2cda39',
       '5f78d8ea5d220fd4371a9d5b67f9704172b7e95aa85756242ea58a09ad6a2189',
       '645ee32ae1c84817505808b1a84e53565542cebd89d00d56207c3a8d111c60e5',
       '571fdff9a66b4534942db0655d35eef2dde903259323d6a36128a00a48b94efa',
       '68e9f31a207a753cc3460b17afd291ef4972d57cb149dbdac093fc4c986ad994',
       '775b40dfdcf5074b07b4c6639073b185e1133e0aaf468ede9ffd5fab9dd9943f']

In [11]:
def get_stats(a):
    x=0
    y=0
    for i in med:
        if i in m:
            x=x+a[a['Source_User']==i][0].values
            y=y+a[a['Source_User']==i]['Target'].values
    print("For high Severity:")
    print("identified:",x)
    print("Actual",y)
    print("For critical Severity")
    x=0
    y=0
    for i in cric:
        if i in m:
            x=x+a[a['Source_User']==i][0].values
            y=y+a[a['Source_User']==i]['Target'].values
    print("identified:",x)
    print("Actual",y)

## 80:20 Split

### random State 0

In [30]:
## Read the csv file which contains the data aggregated on user level 
z=pd.read_csv('Feature_Vector_50_50_3_17_random_state_0.csv',index_col='Source_User')

z.columns=['NREC','PAR','PAS','LEN','BYS','BYR','SP','DP','NAPP','SIP','DIP','Target','TDIF',
    '0000','0100','0200','0300','0400','0500','0600','0700','0800','0900','1000','1100','1200',
    '1300','1400','1500','1600','1700','1800','1900','2000','2100','2200','2300'
]


columns=['NREC','PAR','PAS','LEN','BYS','BYR','SP','DP','NAPP','SIP','DIP','TDIF',
    '0000','0100','0200','0300','0400','0500','0600','0700','0800','0900','1000','1100','1200',
    '1300','1400','1500','1600','1700','1800','1900','2000','2100','2200','2300'
]

print("The feature vector for infected users and sample of uninfected users")
print('Infected/Unifected Users')
display(z.Target.value_counts())
## Create a feature set
X=z.iloc[:,z.columns!='Target']
display(X.head(2))
X=X.values

### Build our target variable
print("1 -> Infected\n0 -> Uninfected")
Y=z.Target
Y=Y.apply(lambda x: 1 if x=='infected' else 0)
display(Y.head(10))
# Y=Y.values

### Scale feature set  using Standard Scaler
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
scaled_X=scaler.fit_transform(X)
print("The train/test split ratio is 80:20")
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(scaled_X,Y,random_state=0,test_size=0.2)

The feature vector for infected users and sample of uninfected users
Infected/Unifected Users


uninfected    1923
infected      1923
Name: Target, dtype: int64

Unnamed: 0_level_0,NREC,PAR,PAS,LEN,BYS,BYR,SP,DP,NAPP,SIP,DIP,TDIF,0000,0100,0200,0300,0400,0500,0600,0700,0800,0900,1000,1100,1200,1300,1400,1500,1600,1700,1800,1900,2000,2100,2200,2300
Source_User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1
8d80d7e04a232fa309845a28eeb50b2e9902b83fcc2ec60f2b14eb15d59dd611,342,10.783626,9.897661,83.982456,2232.368421,7639.97076,337.0,14.0,22,30,123,13642.195906,12,5,0,0,0,0,0,0,3,6,16,5,4,14,9,25,46,27,98,16,38,12,6,0
e9e209be7e582708e1551ced368c8bd13ada27cf2857f35cc79ee014e30c7f27,42,9.47619,9.595238,119.285714,1357.809524,5211.809524,42.0,3.0,7,3,20,2319.0,0,0,0,0,0,0,0,0,0,0,0,0,8,1,9,24,0,0,0,0,0,0,0,0


1 -> Infected
0 -> Uninfected


Source_User
8d80d7e04a232fa309845a28eeb50b2e9902b83fcc2ec60f2b14eb15d59dd611    1
e9e209be7e582708e1551ced368c8bd13ada27cf2857f35cc79ee014e30c7f27    0
73e7f427173ebaafa2447cafcfa05e11aac3dfda1d0a63d542395a7c9f3e41dd    0
e931f318e6d489b8c1fefcdbcc9f1678c3f5fc95b51fa9675b0130c2003885ef    1
bae671359bbc14c02f9f8ebc82ce508a719fc67b20861aeac0eae50765512ad3    1
97435f766a7f2b4ddea9ef289247eb80b7949d19813e662b41df10ee82d45dff    1
e83f41f7465c7b228abaffb01c879bfa306d44170328d408d6da9a1e0bfb9d30    1
d0e61d1a66521a1545404096ea9271bb53d3e19eefa1331754ca58c1cfbaff31    1
aaca5cdcccf23e9d5f1debe1ec0528fc5550ffe99f2f4a55a71e89ee95af52c9    1
5c5d234bdd7acfc32b04705b11fd0a5f4557d8d386e1a39a22cf728208b003e9    0
Name: Target, dtype: int64

The train/test split ratio is 80:20


### Adaboost

In [31]:
ada=AdaBoostClassifier(algorithm='SAMME',learning_rate=1,n_estimators=400,random_state=0)
ada.fit(X_train,Y_train)

train=pd.DataFrame(Y_test)
test=pd.DataFrame(ada.predict(X_test))
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values
get_stats(a)

For high Severity:
identified: [10]
Actual [12]
For critical Severity
identified: [3]
Actual [3]


### Gradient Boosting Classifier

In [32]:
gbc=GradientBoostingClassifier(criterion='mse', learning_rate=0.03,loss='deviance', min_samples_leaf=5, min_samples_split=0.1,
                               n_estimators = 100,random_state = 0,subsample=1 )

gbc.fit(X_train,Y_train)

train=pd.DataFrame(Y_test)
test=pd.DataFrame(gbc.predict(X_test))
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values
get_stats(a)

For high Severity:
identified: [9]
Actual [12]
For critical Severity
identified: [2]
Actual [3]


### Bagging Classifier

In [33]:
bag=BaggingClassifier(n_estimators=300, max_samples=0.6,max_features=0.6,
                      bootstrap=True,bootstrap_features=True,oob_score=True,random_state=0,n_jobs=-1)

bag.fit(X_train,Y_train)

train=pd.DataFrame(Y_test)
test=pd.DataFrame(bag.predict(X_test))
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values


get_stats(a)

For high Severity:
identified: [10]
Actual [12]
For critical Severity
identified: [3]
Actual [3]


### Neural Network - 2 

In [35]:
def build_keras_classifier(optimizer):
    deep_learning=Sequential()
    deep_learning.add(Dense(units=64,activation='relu',use_bias=True,kernel_initializer='uniform',input_dim=36))
    deep_learning.add(Dense(units=128,activation='relu',use_bias=True,kernel_initializer='uniform'))
    deep_learning.add(Dense(units=256,activation='relu',use_bias=True,kernel_initializer='uniform'))
    deep_learning.add(Dense(units=512,activation='relu',use_bias=True,kernel_initializer='uniform'))
    deep_learning.add(Dense(units=1,activation='sigmoid',use_bias=True,kernel_initializer='uniform'))
    deep_learning.compile(optimizer=optimizer,loss='binary_crossentropy',metrics=['accuracy'])
    return deep_learning
classifier=KerasClassifier(build_fn=build_keras_classifier,batch_size=20,nb_epoch=100,verbose=0)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
parmas= {'batch_size':[20],'nb_epoch':[120],'optimizer':['adam','rmsprop']}
grid_search=GridSearchCV(estimator=classifier,param_grid=parmas,scoring='accuracy',cv=kfold,verbose=0)
grid_search.fit(X_train,Y_train)
train=pd.DataFrame(Y_test)
test=pd.DataFrame(grid_search.predict(X_test).reshape(1,len(X_test))[0])
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values
get_stats(a)

For high Severity:
identified: [11]
Actual [12]
For critical Severity
identified: [3]
Actual [3]


### Random State 10

In [36]:
## Read the csv file which contains the data aggregated on user level 
z=pd.read_csv('Feature_Vector_50_50_3_17_random_state_10.csv',index_col='Source_User')

z.columns=['NREC','PAR','PAS','LEN','BYS','BYR','SP','DP','NAPP','SIP','DIP','Target','TDIF',
    '0000','0100','0200','0300','0400','0500','0600','0700','0800','0900','1000','1100','1200',
    '1300','1400','1500','1600','1700','1800','1900','2000','2100','2200','2300'
]


columns=['NREC','PAR','PAS','LEN','BYS','BYR','SP','DP','NAPP','SIP','DIP','TDIF',
    '0000','0100','0200','0300','0400','0500','0600','0700','0800','0900','1000','1100','1200',
    '1300','1400','1500','1600','1700','1800','1900','2000','2100','2200','2300'
]

print("The feature vector for infected users and sample of uninfected users")
print('Infected/Unifected Users')
display(z.Target.value_counts())
## Create a feature set
X=z.iloc[:,z.columns!='Target']
display(X.head(2))
X=X.values

The feature vector for infected users and sample of uninfected users
Infected/Unifected Users


uninfected    1923
infected      1923
Name: Target, dtype: int64

Unnamed: 0_level_0,NREC,PAR,PAS,LEN,BYS,BYR,SP,DP,NAPP,SIP,DIP,TDIF,0000,0100,0200,0300,0400,0500,0600,0700,0800,0900,1000,1100,1200,1300,1400,1500,1600,1700,1800,1900,2000,2100,2200,2300
Source_User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1
8c48feddaa03131d5816111a9aef28e9ecbe168bb9b32a5bed0d545a1327be88,2984,216.771448,74.177949,82.084115,9038.178284,286763.437332,2689.0,13.0,45,19,784,1642.074397,265,188,248,39,82,43,112,20,50,46,105,89,81,80,105,70,200,116,195,158,176,167,116,233
7d72b802c8934804b499a7b417a31857b112353b42885b2290ff09aeef55bebf,264,25.242424,35.80303,69.193182,30610.265152,11302.988636,263.0,7.0,19,36,97,17986.772727,0,5,0,16,0,0,0,0,6,51,45,11,8,9,20,2,6,13,4,42,18,8,0,0


In [37]:
### Build our target variable
print("1 -> Infected\n0 -> Uninfected")
Y=z.Target
Y=Y.apply(lambda x: 1 if x=='infected' else 0)
display(Y.head(10))
# Y=Y.values

1 -> Infected
0 -> Uninfected


Source_User
8c48feddaa03131d5816111a9aef28e9ecbe168bb9b32a5bed0d545a1327be88    1
7d72b802c8934804b499a7b417a31857b112353b42885b2290ff09aeef55bebf    0
c59c78cbcb78f0f7140cef977c9016a08b9d4d91bc5b2e537327b2ab2b791f95    1
8b4f22fd3269f43630ef23cc553c80f72740dcc05fec14b0caad367768ce21a4    1
caea0e5257613ba78f187ac0616758755573aa9b58006fb8a5b671496fc3e392    1
f7cbcf0d21066d971d1948c618f66de0188cb7326b8c198ddd66d18d7e4d91bd    1
d487fb3eaf4e620ba1acd93bfce4387f10e162062f03f15f813c62481bf0a518    1
60430a2cacf66446dae118e6f4b204221901dcd2f3a3faffee16848f2ae9d0ec    1
becf4f677fab6f341e04c1c6ef46ae9d5c90175660426fb33dbd7ed850aec06d    0
f0ae081c92bfcbd5f316ae0d19db53b6a9edfdfc687166f2a5b4e6eb60d9c9ed    1
Name: Target, dtype: int64

In [38]:
### Scale feature set  using Standard Scaler
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
scaled_X=scaler.fit_transform(X)
print("The train/test split ratio is 80:20")
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(scaled_X,Y,random_state=0,test_size=0.2)

The train/test split ratio is 80:20


### Adaboost

In [39]:
ada=AdaBoostClassifier(algorithm='SAMME',learning_rate=1,n_estimators=400,random_state=0)
ada.fit(X_train,Y_train)

train=pd.DataFrame(Y_test)
test=pd.DataFrame(ada.predict(X_test))
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values
get_stats(a)

For high Severity:
identified: [8]
Actual [11]
For critical Severity
identified: [1]
Actual [1]


### Gradient Boosting Classifier

In [40]:
gbc=GradientBoostingClassifier(criterion='mse', learning_rate=0.05,loss='exponential', min_samples_leaf=10, min_samples_split=0.3,
                               n_estimators = 150,random_state = 0,subsample=0.7 )

gbc.fit(X_train,Y_train)

train=pd.DataFrame(Y_test)
test=pd.DataFrame(gbc.predict(X_test))
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values
get_stats(a)

For high Severity:
identified: [9]
Actual [11]
For critical Severity
identified: [1]
Actual [1]


### Bagging Classifier

In [41]:
bag=BaggingClassifier(n_estimators=200, max_samples=0.6,max_features=0.6,
                      bootstrap=True,bootstrap_features=True,oob_score=True,random_state=0,n_jobs=-1)

bag.fit(X_train,Y_train)

train=pd.DataFrame(Y_test)
test=pd.DataFrame(bag.predict(X_test))
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values


get_stats(a)

For high Severity:
identified: [8]
Actual [11]
For critical Severity
identified: [1]
Actual [1]


### Neural Network - 2 

In [42]:
def build_keras_classifier(optimizer):
    deep_learning=Sequential()
    deep_learning.add(Dense(units=64,activation='relu',use_bias=True,kernel_initializer='uniform',input_dim=36))
    deep_learning.add(Dense(units=128,activation='relu',use_bias=True,kernel_initializer='uniform'))
    deep_learning.add(Dense(units=256,activation='relu',use_bias=True,kernel_initializer='uniform'))
    deep_learning.add(Dense(units=512,activation='relu',use_bias=True,kernel_initializer='uniform'))
    deep_learning.add(Dense(units=1,activation='sigmoid',use_bias=True,kernel_initializer='uniform'))
    deep_learning.compile(optimizer=optimizer,loss='binary_crossentropy',metrics=['accuracy'])
    return deep_learning
classifier=KerasClassifier(build_fn=build_keras_classifier,batch_size=20,nb_epoch=100,verbose=0)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
parmas= {'batch_size':[20],'nb_epoch':[120],'optimizer':['adam','rmsprop']}
grid_search=GridSearchCV(estimator=classifier,param_grid=parmas,scoring='accuracy',cv=kfold,verbose=0)
grid_search.fit(X_train,Y_train)
train=pd.DataFrame(Y_test)
test=pd.DataFrame(grid_search.predict(X_test).reshape(1,len(X_test))[0])
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values
get_stats(a)

For high Severity:
identified: [7]
Actual [11]
For critical Severity
identified: [1]
Actual [1]


### Random State 20

In [43]:
## Read the csv file which contains the data aggregated on user level 
z=pd.read_csv('Feature_Vector_50_50_3_17_random_state_20.csv',index_col='Source_User')

z.columns=['NREC','PAR','PAS','LEN','BYS','BYR','SP','DP','NAPP','SIP','DIP','Target','TDIF',
    '0000','0100','0200','0300','0400','0500','0600','0700','0800','0900','1000','1100','1200',
    '1300','1400','1500','1600','1700','1800','1900','2000','2100','2200','2300'
]


columns=['NREC','PAR','PAS','LEN','BYS','BYR','SP','DP','NAPP','SIP','DIP','TDIF',
    '0000','0100','0200','0300','0400','0500','0600','0700','0800','0900','1000','1100','1200',
    '1300','1400','1500','1600','1700','1800','1900','2000','2100','2200','2300'
]

print("The feature vector for infected users and sample of uninfected users")
print('Infected/Unifected Users')
display(z.Target.value_counts())
## Create a feature set
X=z.iloc[:,z.columns!='Target']
display(X.head(2))
X=X.values

The feature vector for infected users and sample of uninfected users
Infected/Unifected Users


uninfected    1923
infected      1923
Name: Target, dtype: int64

Unnamed: 0_level_0,NREC,PAR,PAS,LEN,BYS,BYR,SP,DP,NAPP,SIP,DIP,TDIF,0000,0100,0200,0300,0400,0500,0600,0700,0800,0900,1000,1100,1200,1300,1400,1500,1600,1700,1800,1900,2000,2100,2200,2300
Source_User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1
16414beb68334398f2f4d732aafedc2bf231b27b8507879ba41e0c18baec3847,3475,119.307626,60.322302,94.924029,6983.720863,151221.195108,3079.0,106.0,54,74,911,1409.110216,419,383,267,141,95,48,31,11,30,33,39,68,187,89,79,110,56,187,161,174,303,126,303,135
afbe75a93f02da27be2a836bef717ffeef19714de5a1f2121d5a93fe043e70f1,558,31.100358,18.324373,120.193548,2986.801075,31418.557348,555.0,5.0,25,53,227,8542.105735,13,44,2,6,5,3,4,1,0,8,25,35,44,67,94,27,18,19,30,32,40,32,6,3


In [44]:
### Build our target variable
print("1 -> Infected\n0 -> Uninfected")
Y=z.Target
Y=Y.apply(lambda x: 1 if x=='infected' else 0)
display(Y.head(10))
# Y=Y.values

1 -> Infected
0 -> Uninfected


Source_User
16414beb68334398f2f4d732aafedc2bf231b27b8507879ba41e0c18baec3847    1
afbe75a93f02da27be2a836bef717ffeef19714de5a1f2121d5a93fe043e70f1    1
28f9fffb669c004731f11c34913ce913981fd3534ea37ce2bb0e9b942091ff57    0
c966f4446da082203a9bfc84af2a2db29baa017722ac03471cf412932a6c0e90    1
b938ef4078ace510c6696a63e7ee215b9443b59f0d5c7a5f3d79bc1a672c0ca8    1
c1d72e5d7b7f24020163240ecbb1e648745a058b5f18c44f1cf8ceea6be1dc8b    1
8b76ca850d1587f46c2740a3dc662f9730d183fc173a25c6c7fcae215adde149    1
1b93b31e3d3282a98216531e95ddd1400a7728383f6f348ce19b05d873a6122a    1
f0629dde75324f00bdd60ba5ff96dd8f198c45117c089691e44c6c0e871d5947    1
bb235609630753fadf401ecbf9b9c323c35273de648930ce0b76cbd128c882fe    0
Name: Target, dtype: int64

In [45]:
### Scale feature set  using Standard Scaler
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
scaled_X=scaler.fit_transform(X)
print("The train/test split ratio is 80:20")
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(scaled_X,Y,random_state=0,test_size=0.2)

The train/test split ratio is 80:20


### Adaboost

In [46]:
ada=AdaBoostClassifier(algorithm='SAMME',learning_rate=1,n_estimators=400,random_state=0)
ada.fit(X_train,Y_train)

train=pd.DataFrame(Y_test)
test=pd.DataFrame(ada.predict(X_test))
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values
get_stats(a)

For high Severity:
identified: [10]
Actual [14]
For critical Severity
identified: [1]
Actual [3]


### Gradient Boosting Classifier

In [47]:
gbc=GradientBoostingClassifier(criterion='mse', learning_rate=0.02,loss='exponential', min_samples_leaf=10, min_samples_split=0.2,
                               n_estimators = 50,random_state = 0,subsample=0.7 )

gbc.fit(X_train,Y_train)

train=pd.DataFrame(Y_test)
test=pd.DataFrame(gbc.predict(X_test))
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values
get_stats(a)

For high Severity:
identified: [11]
Actual [14]
For critical Severity
identified: [1]
Actual [3]


### Bagging Classifier

In [48]:
bag=BaggingClassifier(n_estimators=200, max_samples=0.4,max_features=0.6,
                      bootstrap=True,bootstrap_features=True,oob_score=True,random_state=0,n_jobs=-1)

bag.fit(X_train,Y_train)

train=pd.DataFrame(Y_test)
test=pd.DataFrame(bag.predict(X_test))
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values


get_stats(a)

For high Severity:
identified: [10]
Actual [14]
For critical Severity
identified: [1]
Actual [3]


### Neural Network - 2 

In [49]:
def build_keras_classifier(optimizer):
    deep_learning=Sequential()
    deep_learning.add(Dense(units=64,activation='relu',use_bias=True,kernel_initializer='uniform',input_dim=36))
    deep_learning.add(Dense(units=128,activation='relu',use_bias=True,kernel_initializer='uniform'))
    deep_learning.add(Dense(units=256,activation='relu',use_bias=True,kernel_initializer='uniform'))
    deep_learning.add(Dense(units=512,activation='relu',use_bias=True,kernel_initializer='uniform'))
    deep_learning.add(Dense(units=1,activation='sigmoid',use_bias=True,kernel_initializer='uniform'))
    deep_learning.compile(optimizer=optimizer,loss='binary_crossentropy',metrics=['accuracy'])
    return deep_learning
classifier=KerasClassifier(build_fn=build_keras_classifier,batch_size=20,nb_epoch=100,verbose=0)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
parmas= {'batch_size':[20],'nb_epoch':[120],'optimizer':['adam','rmsprop']}
grid_search=GridSearchCV(estimator=classifier,param_grid=parmas,scoring='accuracy',cv=kfold,verbose=0)
grid_search.fit(X_train,Y_train)
train=pd.DataFrame(Y_test)
test=pd.DataFrame(grid_search.predict(X_test).reshape(1,len(X_test))[0])
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values
get_stats(a)

For high Severity:
identified: [10]
Actual [14]
For critical Severity
identified: [1]
Actual [3]


## 70:30 Split

### Random state 0

In [50]:
## Read the csv file which contains the data aggregated on user level 
z=pd.read_csv('Feature_Vector_50_50_3_17_random_state_0.csv',index_col='Source_User')

z.columns=['NREC','PAR','PAS','LEN','BYS','BYR','SP','DP','NAPP','SIP','DIP','Target','TDIF',
    '0000','0100','0200','0300','0400','0500','0600','0700','0800','0900','1000','1100','1200',
    '1300','1400','1500','1600','1700','1800','1900','2000','2100','2200','2300'
]


columns=['NREC','PAR','PAS','LEN','BYS','BYR','SP','DP','NAPP','SIP','DIP','TDIF',
    '0000','0100','0200','0300','0400','0500','0600','0700','0800','0900','1000','1100','1200',
    '1300','1400','1500','1600','1700','1800','1900','2000','2100','2200','2300'
]

print("The feature vector for infected users and sample of uninfected users")
print('Infected/Unifected Users')
display(z.Target.value_counts())
## Create a feature set
X=z.iloc[:,z.columns!='Target']
display(X.head(2))
X=X.values

The feature vector for infected users and sample of uninfected users
Infected/Unifected Users


uninfected    1923
infected      1923
Name: Target, dtype: int64

Unnamed: 0_level_0,NREC,PAR,PAS,LEN,BYS,BYR,SP,DP,NAPP,SIP,DIP,TDIF,0000,0100,0200,0300,0400,0500,0600,0700,0800,0900,1000,1100,1200,1300,1400,1500,1600,1700,1800,1900,2000,2100,2200,2300
Source_User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1
8d80d7e04a232fa309845a28eeb50b2e9902b83fcc2ec60f2b14eb15d59dd611,342,10.783626,9.897661,83.982456,2232.368421,7639.97076,337.0,14.0,22,30,123,13642.195906,12,5,0,0,0,0,0,0,3,6,16,5,4,14,9,25,46,27,98,16,38,12,6,0
e9e209be7e582708e1551ced368c8bd13ada27cf2857f35cc79ee014e30c7f27,42,9.47619,9.595238,119.285714,1357.809524,5211.809524,42.0,3.0,7,3,20,2319.0,0,0,0,0,0,0,0,0,0,0,0,0,8,1,9,24,0,0,0,0,0,0,0,0


In [51]:
### Build our target variable
print("1 -> Infected\n0 -> Uninfected")
Y=z.Target
Y=Y.apply(lambda x: 1 if x=='infected' else 0)
display(Y.head(10))
# Y=Y.values

1 -> Infected
0 -> Uninfected


Source_User
8d80d7e04a232fa309845a28eeb50b2e9902b83fcc2ec60f2b14eb15d59dd611    1
e9e209be7e582708e1551ced368c8bd13ada27cf2857f35cc79ee014e30c7f27    0
73e7f427173ebaafa2447cafcfa05e11aac3dfda1d0a63d542395a7c9f3e41dd    0
e931f318e6d489b8c1fefcdbcc9f1678c3f5fc95b51fa9675b0130c2003885ef    1
bae671359bbc14c02f9f8ebc82ce508a719fc67b20861aeac0eae50765512ad3    1
97435f766a7f2b4ddea9ef289247eb80b7949d19813e662b41df10ee82d45dff    1
e83f41f7465c7b228abaffb01c879bfa306d44170328d408d6da9a1e0bfb9d30    1
d0e61d1a66521a1545404096ea9271bb53d3e19eefa1331754ca58c1cfbaff31    1
aaca5cdcccf23e9d5f1debe1ec0528fc5550ffe99f2f4a55a71e89ee95af52c9    1
5c5d234bdd7acfc32b04705b11fd0a5f4557d8d386e1a39a22cf728208b003e9    0
Name: Target, dtype: int64

In [52]:
### Scale feature set  using Standard Scaler
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
scaled_X=scaler.fit_transform(X)
print("The train/test split ratio is 70:30")
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(scaled_X,Y,random_state=0,test_size=0.3)

The train/test split ratio is 70:30


### Adaboost

In [53]:
ada=AdaBoostClassifier(n_estimators=150,learning_rate=0.5,algorithm='SAMME.R',random_state=0)
ada.fit(X_train,Y_train)

train=pd.DataFrame(Y_test)
test=pd.DataFrame(ada.predict(X_test))
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values

get_stats(a)

For high Severity:
identified: [9]
Actual [16]
For critical Severity
identified: [4]
Actual [4]


### GBC

In [54]:
gbc=GradientBoostingClassifier(loss='exponential', learning_rate=0.05, criterion='mse', min_samples_split=0.3,
                               min_samples_leaf=10,subsample=0.5,random_state = 0, n_estimators = 150)

gbc.fit(X_train,Y_train)

train=pd.DataFrame(Y_test)
test=pd.DataFrame(gbc.predict(X_test))
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values
get_stats(a)

For high Severity:
identified: [12]
Actual [16]
For critical Severity
identified: [4]
Actual [4]


### Random State 10

In [55]:
## Read the csv file which contains the data aggregated on user level 
z=pd.read_csv('Feature_Vector_50_50_3_17_random_state_10.csv',index_col='Source_User')

z.columns=['NREC','PAR','PAS','LEN','BYS','BYR','SP','DP','NAPP','SIP','DIP','Target','TDIF',
    '0000','0100','0200','0300','0400','0500','0600','0700','0800','0900','1000','1100','1200',
    '1300','1400','1500','1600','1700','1800','1900','2000','2100','2200','2300'
]


columns=['NREC','PAR','PAS','LEN','BYS','BYR','SP','DP','NAPP','SIP','DIP','TDIF',
    '0000','0100','0200','0300','0400','0500','0600','0700','0800','0900','1000','1100','1200',
    '1300','1400','1500','1600','1700','1800','1900','2000','2100','2200','2300'
]

print("The feature vector for infected users and sample of uninfected users")
print('Infected/Unifected Users')
display(z.Target.value_counts())
## Create a feature set
X=z.iloc[:,z.columns!='Target']
display(X.head(2))
X=X.values

The feature vector for infected users and sample of uninfected users
Infected/Unifected Users


uninfected    1923
infected      1923
Name: Target, dtype: int64

Unnamed: 0_level_0,NREC,PAR,PAS,LEN,BYS,BYR,SP,DP,NAPP,SIP,DIP,TDIF,0000,0100,0200,0300,0400,0500,0600,0700,0800,0900,1000,1100,1200,1300,1400,1500,1600,1700,1800,1900,2000,2100,2200,2300
Source_User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1
8c48feddaa03131d5816111a9aef28e9ecbe168bb9b32a5bed0d545a1327be88,2984,216.771448,74.177949,82.084115,9038.178284,286763.437332,2689.0,13.0,45,19,784,1642.074397,265,188,248,39,82,43,112,20,50,46,105,89,81,80,105,70,200,116,195,158,176,167,116,233
7d72b802c8934804b499a7b417a31857b112353b42885b2290ff09aeef55bebf,264,25.242424,35.80303,69.193182,30610.265152,11302.988636,263.0,7.0,19,36,97,17986.772727,0,5,0,16,0,0,0,0,6,51,45,11,8,9,20,2,6,13,4,42,18,8,0,0


In [56]:
### Build our target variable
print("1 -> Infected\n0 -> Uninfected")
Y=z.Target
Y=Y.apply(lambda x: 1 if x=='infected' else 0)
display(Y.head(10))
# Y=Y.values
### Scale feature set  using Standard Scaler
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
scaled_X=scaler.fit_transform(X)
print("The train/test split ratio is 70:30")
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(scaled_X,Y,random_state=0,test_size=0.3)

1 -> Infected
0 -> Uninfected


Source_User
8c48feddaa03131d5816111a9aef28e9ecbe168bb9b32a5bed0d545a1327be88    1
7d72b802c8934804b499a7b417a31857b112353b42885b2290ff09aeef55bebf    0
c59c78cbcb78f0f7140cef977c9016a08b9d4d91bc5b2e537327b2ab2b791f95    1
8b4f22fd3269f43630ef23cc553c80f72740dcc05fec14b0caad367768ce21a4    1
caea0e5257613ba78f187ac0616758755573aa9b58006fb8a5b671496fc3e392    1
f7cbcf0d21066d971d1948c618f66de0188cb7326b8c198ddd66d18d7e4d91bd    1
d487fb3eaf4e620ba1acd93bfce4387f10e162062f03f15f813c62481bf0a518    1
60430a2cacf66446dae118e6f4b204221901dcd2f3a3faffee16848f2ae9d0ec    1
becf4f677fab6f341e04c1c6ef46ae9d5c90175660426fb33dbd7ed850aec06d    0
f0ae081c92bfcbd5f316ae0d19db53b6a9edfdfc687166f2a5b4e6eb60d9c9ed    1
Name: Target, dtype: int64

The train/test split ratio is 70:30


### Adaboost

In [57]:
ada=AdaBoostClassifier(n_estimators=400,learning_rate=1,algorithm='SAMME',random_state=0)
ada.fit(X_train,Y_train)

train=pd.DataFrame(Y_test)
test=pd.DataFrame(ada.predict(X_test))
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values

get_stats(a)

For high Severity:
identified: [14]
Actual [18]
For critical Severity
identified: [3]
Actual [3]


### GBC

In [58]:
gbc=GradientBoostingClassifier(loss='exponential', learning_rate=0.03, criterion='mse', min_samples_split=0.3,
                               min_samples_leaf=15,subsample=0.5,random_state = 0, n_estimators = 150)

gbc.fit(X_train,Y_train)

train=pd.DataFrame(Y_test)
test=pd.DataFrame(gbc.predict(X_test))
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values
get_stats(a)

For high Severity:
identified: [15]
Actual [18]
For critical Severity
identified: [3]
Actual [3]


### Random State 20

In [59]:
## Read the csv file which contains the data aggregated on user level 
z=pd.read_csv('Feature_Vector_50_50_3_17_random_state_20.csv',index_col='Source_User')

z.columns=['NREC','PAR','PAS','LEN','BYS','BYR','SP','DP','NAPP','SIP','DIP','Target','TDIF',
    '0000','0100','0200','0300','0400','0500','0600','0700','0800','0900','1000','1100','1200',
    '1300','1400','1500','1600','1700','1800','1900','2000','2100','2200','2300'
]


columns=['NREC','PAR','PAS','LEN','BYS','BYR','SP','DP','NAPP','SIP','DIP','TDIF',
    '0000','0100','0200','0300','0400','0500','0600','0700','0800','0900','1000','1100','1200',
    '1300','1400','1500','1600','1700','1800','1900','2000','2100','2200','2300'
]

print("The feature vector for infected users and sample of uninfected users")
print('Infected/Unifected Users')
display(z.Target.value_counts())
## Create a feature set
X=z.iloc[:,z.columns!='Target']
display(X.head(2))
X=X.values

The feature vector for infected users and sample of uninfected users
Infected/Unifected Users


uninfected    1923
infected      1923
Name: Target, dtype: int64

Unnamed: 0_level_0,NREC,PAR,PAS,LEN,BYS,BYR,SP,DP,NAPP,SIP,DIP,TDIF,0000,0100,0200,0300,0400,0500,0600,0700,0800,0900,1000,1100,1200,1300,1400,1500,1600,1700,1800,1900,2000,2100,2200,2300
Source_User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1
16414beb68334398f2f4d732aafedc2bf231b27b8507879ba41e0c18baec3847,3475,119.307626,60.322302,94.924029,6983.720863,151221.195108,3079.0,106.0,54,74,911,1409.110216,419,383,267,141,95,48,31,11,30,33,39,68,187,89,79,110,56,187,161,174,303,126,303,135
afbe75a93f02da27be2a836bef717ffeef19714de5a1f2121d5a93fe043e70f1,558,31.100358,18.324373,120.193548,2986.801075,31418.557348,555.0,5.0,25,53,227,8542.105735,13,44,2,6,5,3,4,1,0,8,25,35,44,67,94,27,18,19,30,32,40,32,6,3


In [60]:
### Build our target variable
print("1 -> Infected\n0 -> Uninfected")
Y=z.Target
Y=Y.apply(lambda x: 1 if x=='infected' else 0)
display(Y.head(10))
# Y=Y.values

1 -> Infected
0 -> Uninfected


Source_User
16414beb68334398f2f4d732aafedc2bf231b27b8507879ba41e0c18baec3847    1
afbe75a93f02da27be2a836bef717ffeef19714de5a1f2121d5a93fe043e70f1    1
28f9fffb669c004731f11c34913ce913981fd3534ea37ce2bb0e9b942091ff57    0
c966f4446da082203a9bfc84af2a2db29baa017722ac03471cf412932a6c0e90    1
b938ef4078ace510c6696a63e7ee215b9443b59f0d5c7a5f3d79bc1a672c0ca8    1
c1d72e5d7b7f24020163240ecbb1e648745a058b5f18c44f1cf8ceea6be1dc8b    1
8b76ca850d1587f46c2740a3dc662f9730d183fc173a25c6c7fcae215adde149    1
1b93b31e3d3282a98216531e95ddd1400a7728383f6f348ce19b05d873a6122a    1
f0629dde75324f00bdd60ba5ff96dd8f198c45117c089691e44c6c0e871d5947    1
bb235609630753fadf401ecbf9b9c323c35273de648930ce0b76cbd128c882fe    0
Name: Target, dtype: int64

In [61]:
### Scale feature set  using Standard Scaler
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
scaled_X=scaler.fit_transform(X)
print("The train/test split ratio is 70:30")
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(scaled_X,Y,random_state=0,test_size=0.3)

The train/test split ratio is 70:30


### Adaboost

In [62]:
ada=AdaBoostClassifier(n_estimators=50,learning_rate=0.5,algorithm='SAMME',random_state=0)
ada.fit(X_train,Y_train)

train=pd.DataFrame(Y_test)
test=pd.DataFrame(ada.predict(X_test))
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values

get_stats(a)

For high Severity:
identified: [12]
Actual [17]
For critical Severity
identified: [2]
Actual [4]


### GBC

In [63]:
gbc=GradientBoostingClassifier(loss='deviance', learning_rate=0.05, criterion='mse', min_samples_split=0.1,
                               min_samples_leaf=10,subsample=0.7,random_state = 0, n_estimators = 50)

gbc.fit(X_train,Y_train)

train=pd.DataFrame(Y_test)
test=pd.DataFrame(gbc.predict(X_test))
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values
get_stats(a)

For high Severity:
identified: [12]
Actual [17]
For critical Severity
identified: [2]
Actual [4]


### 80:20 with PCA

#### Random State 0

In [64]:
## Read the csv file which contains the data aggregated on user level 
z=pd.read_csv('Feature_Vector_50_50_3_17_random_state_0.csv',index_col='Source_User')

z.columns=['NREC','PAR','PAS','LEN','BYS','BYR','SP','DP','NAPP','SIP','DIP','Target','TDIF',
    '0000','0100','0200','0300','0400','0500','0600','0700','0800','0900','1000','1100','1200',
    '1300','1400','1500','1600','1700','1800','1900','2000','2100','2200','2300'
]


columns=['NREC','PAR','PAS','LEN','BYS','BYR','SP','DP','NAPP','SIP','DIP','TDIF',
    '0000','0100','0200','0300','0400','0500','0600','0700','0800','0900','1000','1100','1200',
    '1300','1400','1500','1600','1700','1800','1900','2000','2100','2200','2300'
]

print("The feature vector for infected users and sample of uninfected users")
print('Infected/Unifected Users')
display(z.Target.value_counts())
## Create a feature set
X=z.iloc[:,z.columns!='Target']
display(X.head(2))
X=X.values

The feature vector for infected users and sample of uninfected users
Infected/Unifected Users


uninfected    1923
infected      1923
Name: Target, dtype: int64

Unnamed: 0_level_0,NREC,PAR,PAS,LEN,BYS,BYR,SP,DP,NAPP,SIP,DIP,TDIF,0000,0100,0200,0300,0400,0500,0600,0700,0800,0900,1000,1100,1200,1300,1400,1500,1600,1700,1800,1900,2000,2100,2200,2300
Source_User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1
8d80d7e04a232fa309845a28eeb50b2e9902b83fcc2ec60f2b14eb15d59dd611,342,10.783626,9.897661,83.982456,2232.368421,7639.97076,337.0,14.0,22,30,123,13642.195906,12,5,0,0,0,0,0,0,3,6,16,5,4,14,9,25,46,27,98,16,38,12,6,0
e9e209be7e582708e1551ced368c8bd13ada27cf2857f35cc79ee014e30c7f27,42,9.47619,9.595238,119.285714,1357.809524,5211.809524,42.0,3.0,7,3,20,2319.0,0,0,0,0,0,0,0,0,0,0,0,0,8,1,9,24,0,0,0,0,0,0,0,0


In [65]:
print("1 -> Infected\n0 -> Uninfected")
Y=z.Target
Y=Y.apply(lambda x: 1 if x=='infected' else 0)
display(Y.head(10))
# Y=Y.values

1 -> Infected
0 -> Uninfected


Source_User
8d80d7e04a232fa309845a28eeb50b2e9902b83fcc2ec60f2b14eb15d59dd611    1
e9e209be7e582708e1551ced368c8bd13ada27cf2857f35cc79ee014e30c7f27    0
73e7f427173ebaafa2447cafcfa05e11aac3dfda1d0a63d542395a7c9f3e41dd    0
e931f318e6d489b8c1fefcdbcc9f1678c3f5fc95b51fa9675b0130c2003885ef    1
bae671359bbc14c02f9f8ebc82ce508a719fc67b20861aeac0eae50765512ad3    1
97435f766a7f2b4ddea9ef289247eb80b7949d19813e662b41df10ee82d45dff    1
e83f41f7465c7b228abaffb01c879bfa306d44170328d408d6da9a1e0bfb9d30    1
d0e61d1a66521a1545404096ea9271bb53d3e19eefa1331754ca58c1cfbaff31    1
aaca5cdcccf23e9d5f1debe1ec0528fc5550ffe99f2f4a55a71e89ee95af52c9    1
5c5d234bdd7acfc32b04705b11fd0a5f4557d8d386e1a39a22cf728208b003e9    0
Name: Target, dtype: int64

In [66]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
scaled_X=scaler.fit_transform(X)

pca=PCA(n_components=13,whiten=True,random_state=0)
X_pca_13=pca.fit_transform(scaled_X)
print("Variance explained using 13 components is:",sum(pca.explained_variance_ratio_))

print("The train/test split ratio is 80:20")
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X_pca_13,Y,random_state=0,test_size=0.2)

Variance explained using 13 components is: 0.900113362114
The train/test split ratio is 80:20


### GBC

In [67]:
gbc=GradientBoostingClassifier(loss='deviance', learning_rate=0.05, criterion='mse', min_samples_split=0.2,
                               min_samples_leaf=10,subsample=0.7,random_state = 0, n_estimators = 150)

gbc.fit(X_train,Y_train)

train=pd.DataFrame(Y_test)
test=pd.DataFrame(gbc.predict(X_test))
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values
get_stats(a)

For high Severity:
identified: [9]
Actual [12]
For critical Severity
identified: [2]
Actual [3]


### ETC

In [68]:
ext_tree=ExtraTreeClassifier(criterion='gini',max_depth=10,max_features='auto',min_impurity_decrease=1e-3,
                             min_samples_leaf=2,min_samples_split=5,splitter='best',random_state=0)

ext_tree.fit(X_train,Y_train)
train=pd.DataFrame(Y_test)
test=pd.DataFrame(ext_tree.predict(X_test))
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values
get_stats(a)

For high Severity:
identified: [9]
Actual [12]
For critical Severity
identified: [3]
Actual [3]


### NN2

In [69]:
def build_keras_classifier(optimizer):
    deep_learning=Sequential()
    deep_learning.add(Dense(units=64,activation='relu',use_bias=True,kernel_initializer='uniform',input_dim=13))
    deep_learning.add(Dense(units=128,activation='relu',use_bias=True,kernel_initializer='uniform'))
    deep_learning.add(Dense(units=256,activation='relu',use_bias=True,kernel_initializer='uniform'))
    deep_learning.add(Dense(units=512,activation='relu',use_bias=True,kernel_initializer='uniform'))
    deep_learning.add(Dense(units=1,activation='sigmoid',use_bias=True,kernel_initializer='uniform'))
    deep_learning.compile(optimizer=optimizer,loss='binary_crossentropy',metrics=['accuracy'])
    return deep_learning
classifier=KerasClassifier(build_fn=build_keras_classifier,batch_size=20,nb_epoch=100,verbose=0)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
parmas= {'batch_size':[20],'nb_epoch':[120],'optimizer':['adam','rmsprop']}
grid_search=GridSearchCV(estimator=classifier,param_grid=parmas,scoring='accuracy',cv=kfold,verbose=0)
grid_search.fit(X_train,Y_train)
train=pd.DataFrame(Y_test)
test=pd.DataFrame(grid_search.predict(X_test).reshape(1,len(X_test))[0])
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values
get_stats(a)

For high Severity:
identified: [9]
Actual [12]
For critical Severity
identified: [3]
Actual [3]


### Random State 10

In [70]:
## Read the csv file which contains the data aggregated on user level 
z=pd.read_csv('Feature_Vector_50_50_3_17_random_state_10.csv',index_col='Source_User')

z.columns=['NREC','PAR','PAS','LEN','BYS','BYR','SP','DP','NAPP','SIP','DIP','Target','TDIF',
    '0000','0100','0200','0300','0400','0500','0600','0700','0800','0900','1000','1100','1200',
    '1300','1400','1500','1600','1700','1800','1900','2000','2100','2200','2300'
]


columns=['NREC','PAR','PAS','LEN','BYS','BYR','SP','DP','NAPP','SIP','DIP','TDIF',
    '0000','0100','0200','0300','0400','0500','0600','0700','0800','0900','1000','1100','1200',
    '1300','1400','1500','1600','1700','1800','1900','2000','2100','2200','2300'
]

print("The feature vector for infected users and sample of uninfected users")
print('Infected/Unifected Users')
display(z.Target.value_counts())
## Create a feature set
X=z.iloc[:,z.columns!='Target']
display(X.head(2))
X=X.values

The feature vector for infected users and sample of uninfected users
Infected/Unifected Users


uninfected    1923
infected      1923
Name: Target, dtype: int64

Unnamed: 0_level_0,NREC,PAR,PAS,LEN,BYS,BYR,SP,DP,NAPP,SIP,DIP,TDIF,0000,0100,0200,0300,0400,0500,0600,0700,0800,0900,1000,1100,1200,1300,1400,1500,1600,1700,1800,1900,2000,2100,2200,2300
Source_User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1
8c48feddaa03131d5816111a9aef28e9ecbe168bb9b32a5bed0d545a1327be88,2984,216.771448,74.177949,82.084115,9038.178284,286763.437332,2689.0,13.0,45,19,784,1642.074397,265,188,248,39,82,43,112,20,50,46,105,89,81,80,105,70,200,116,195,158,176,167,116,233
7d72b802c8934804b499a7b417a31857b112353b42885b2290ff09aeef55bebf,264,25.242424,35.80303,69.193182,30610.265152,11302.988636,263.0,7.0,19,36,97,17986.772727,0,5,0,16,0,0,0,0,6,51,45,11,8,9,20,2,6,13,4,42,18,8,0,0


In [71]:
print("1 -> Infected\n0 -> Uninfected")
Y=z.Target
Y=Y.apply(lambda x: 1 if x=='infected' else 0)
display(Y.head(10))
# Y=Y.values

1 -> Infected
0 -> Uninfected


Source_User
8c48feddaa03131d5816111a9aef28e9ecbe168bb9b32a5bed0d545a1327be88    1
7d72b802c8934804b499a7b417a31857b112353b42885b2290ff09aeef55bebf    0
c59c78cbcb78f0f7140cef977c9016a08b9d4d91bc5b2e537327b2ab2b791f95    1
8b4f22fd3269f43630ef23cc553c80f72740dcc05fec14b0caad367768ce21a4    1
caea0e5257613ba78f187ac0616758755573aa9b58006fb8a5b671496fc3e392    1
f7cbcf0d21066d971d1948c618f66de0188cb7326b8c198ddd66d18d7e4d91bd    1
d487fb3eaf4e620ba1acd93bfce4387f10e162062f03f15f813c62481bf0a518    1
60430a2cacf66446dae118e6f4b204221901dcd2f3a3faffee16848f2ae9d0ec    1
becf4f677fab6f341e04c1c6ef46ae9d5c90175660426fb33dbd7ed850aec06d    0
f0ae081c92bfcbd5f316ae0d19db53b6a9edfdfc687166f2a5b4e6eb60d9c9ed    1
Name: Target, dtype: int64

In [72]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
scaled_X=scaler.fit_transform(X)

pca=PCA(n_components=13,whiten=True,random_state=0)
X_pca_13=pca.fit_transform(scaled_X)
print("Variance explained using 13 components is:",sum(pca.explained_variance_ratio_))

print("The train/test split ratio is 80:20")
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X_pca_13,Y,random_state=0,test_size=0.2)

Variance explained using 13 components is: 0.902209458443
The train/test split ratio is 80:20


### GBC

In [73]:
gbc=GradientBoostingClassifier(loss='deviance', learning_rate=0.1, criterion='mse', min_samples_split=0.3,
                               min_samples_leaf=15,subsample=0.5,random_state = 0, n_estimators = 100)

gbc.fit(X_train,Y_train)

train=pd.DataFrame(Y_test)
test=pd.DataFrame(gbc.predict(X_test))
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values
get_stats(a)

For high Severity:
identified: [9]
Actual [11]
For critical Severity
identified: [1]
Actual [1]


### ETC

In [74]:
ext_tree=ExtraTreeClassifier(criterion='gini',max_depth=20,max_features='auto',min_impurity_decrease=1e-3,
                             min_samples_leaf=3,min_samples_split=2,splitter='best',random_state=0)

ext_tree.fit(X_train,Y_train)
train=pd.DataFrame(Y_test)
test=pd.DataFrame(ext_tree.predict(X_test))
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values
get_stats(a)

For high Severity:
identified: [9]
Actual [11]
For critical Severity
identified: [1]
Actual [1]


### NN2

In [75]:
def build_keras_classifier(optimizer):
    deep_learning=Sequential()
    deep_learning.add(Dense(units=64,activation='relu',use_bias=True,kernel_initializer='uniform',input_dim=13))
    deep_learning.add(Dense(units=128,activation='relu',use_bias=True,kernel_initializer='uniform'))
    deep_learning.add(Dense(units=256,activation='relu',use_bias=True,kernel_initializer='uniform'))
    deep_learning.add(Dense(units=512,activation='relu',use_bias=True,kernel_initializer='uniform'))
    deep_learning.add(Dense(units=1,activation='sigmoid',use_bias=True,kernel_initializer='uniform'))
    deep_learning.compile(optimizer=optimizer,loss='binary_crossentropy',metrics=['accuracy'])
    return deep_learning
classifier=KerasClassifier(build_fn=build_keras_classifier,batch_size=20,nb_epoch=100,verbose=0)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
parmas= {'batch_size':[20],'nb_epoch':[120],'optimizer':['adam','rmsprop']}
grid_search=GridSearchCV(estimator=classifier,param_grid=parmas,scoring='accuracy',cv=kfold,verbose=0)
grid_search.fit(X_train,Y_train)
train=pd.DataFrame(Y_test)
test=pd.DataFrame(grid_search.predict(X_test).reshape(1,len(X_test))[0])
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values
get_stats(a)

For high Severity:
identified: [9]
Actual [11]
For critical Severity
identified: [1]
Actual [1]


### Random State 20

In [76]:
## Read the csv file which contains the data aggregated on user level 
z=pd.read_csv('Feature_Vector_50_50_3_17_random_state_20.csv',index_col='Source_User')

z.columns=['NREC','PAR','PAS','LEN','BYS','BYR','SP','DP','NAPP','SIP','DIP','Target','TDIF',
    '0000','0100','0200','0300','0400','0500','0600','0700','0800','0900','1000','1100','1200',
    '1300','1400','1500','1600','1700','1800','1900','2000','2100','2200','2300'
]


columns=['NREC','PAR','PAS','LEN','BYS','BYR','SP','DP','NAPP','SIP','DIP','TDIF',
    '0000','0100','0200','0300','0400','0500','0600','0700','0800','0900','1000','1100','1200',
    '1300','1400','1500','1600','1700','1800','1900','2000','2100','2200','2300'
]

print("The feature vector for infected users and sample of uninfected users")
print('Infected/Unifected Users')
display(z.Target.value_counts())
## Create a feature set
X=z.iloc[:,z.columns!='Target']
display(X.head(2))
X=X.values

The feature vector for infected users and sample of uninfected users
Infected/Unifected Users


uninfected    1923
infected      1923
Name: Target, dtype: int64

Unnamed: 0_level_0,NREC,PAR,PAS,LEN,BYS,BYR,SP,DP,NAPP,SIP,DIP,TDIF,0000,0100,0200,0300,0400,0500,0600,0700,0800,0900,1000,1100,1200,1300,1400,1500,1600,1700,1800,1900,2000,2100,2200,2300
Source_User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1
16414beb68334398f2f4d732aafedc2bf231b27b8507879ba41e0c18baec3847,3475,119.307626,60.322302,94.924029,6983.720863,151221.195108,3079.0,106.0,54,74,911,1409.110216,419,383,267,141,95,48,31,11,30,33,39,68,187,89,79,110,56,187,161,174,303,126,303,135
afbe75a93f02da27be2a836bef717ffeef19714de5a1f2121d5a93fe043e70f1,558,31.100358,18.324373,120.193548,2986.801075,31418.557348,555.0,5.0,25,53,227,8542.105735,13,44,2,6,5,3,4,1,0,8,25,35,44,67,94,27,18,19,30,32,40,32,6,3


In [77]:
print("1 -> Infected\n0 -> Uninfected")
Y=z.Target
Y=Y.apply(lambda x: 1 if x=='infected' else 0)
display(Y.head(10))
# Y=Y.values

1 -> Infected
0 -> Uninfected


Source_User
16414beb68334398f2f4d732aafedc2bf231b27b8507879ba41e0c18baec3847    1
afbe75a93f02da27be2a836bef717ffeef19714de5a1f2121d5a93fe043e70f1    1
28f9fffb669c004731f11c34913ce913981fd3534ea37ce2bb0e9b942091ff57    0
c966f4446da082203a9bfc84af2a2db29baa017722ac03471cf412932a6c0e90    1
b938ef4078ace510c6696a63e7ee215b9443b59f0d5c7a5f3d79bc1a672c0ca8    1
c1d72e5d7b7f24020163240ecbb1e648745a058b5f18c44f1cf8ceea6be1dc8b    1
8b76ca850d1587f46c2740a3dc662f9730d183fc173a25c6c7fcae215adde149    1
1b93b31e3d3282a98216531e95ddd1400a7728383f6f348ce19b05d873a6122a    1
f0629dde75324f00bdd60ba5ff96dd8f198c45117c089691e44c6c0e871d5947    1
bb235609630753fadf401ecbf9b9c323c35273de648930ce0b76cbd128c882fe    0
Name: Target, dtype: int64

In [78]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
scaled_X=scaler.fit_transform(X)

pca=PCA(n_components=13,whiten=True,random_state=0)
X_pca_13=pca.fit_transform(scaled_X)
print("Variance explained using 13 components is:",sum(pca.explained_variance_ratio_))

print("The train/test split ratio is 80:20")
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X_pca_13,Y,random_state=0,test_size=0.2)

Variance explained using 13 components is: 0.980948185529
The train/test split ratio is 80:20


### GBC

In [79]:
gbc=GradientBoostingClassifier(loss='deviance', learning_rate=0.05, criterion='mse', min_samples_split=0.3,
                               min_samples_leaf=5,subsample=0.7,random_state = 0, n_estimators = 100)

gbc.fit(X_train,Y_train)

train=pd.DataFrame(Y_test)
test=pd.DataFrame(gbc.predict(X_test))
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values
get_stats(a)

For high Severity:
identified: [10]
Actual [14]
For critical Severity
identified: [1]
Actual [3]


### ETC

In [80]:
ext_tree=ExtraTreeClassifier(criterion='gini',max_depth=20,max_features='auto',min_impurity_decrease=1e-3,
                             min_samples_leaf=2,min_samples_split=5,splitter='best',random_state=0)

ext_tree.fit(X_train,Y_train)
train=pd.DataFrame(Y_test)
test=pd.DataFrame(ext_tree.predict(X_test))
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values
get_stats(a)

For high Severity:
identified: [11]
Actual [14]
For critical Severity
identified: [1]
Actual [3]


### NN2

In [81]:
def build_keras_classifier(optimizer):
    deep_learning=Sequential()
    deep_learning.add(Dense(units=64,activation='relu',use_bias=True,kernel_initializer='uniform',input_dim=13))
    deep_learning.add(Dense(units=128,activation='relu',use_bias=True,kernel_initializer='uniform'))
    deep_learning.add(Dense(units=256,activation='relu',use_bias=True,kernel_initializer='uniform'))
    deep_learning.add(Dense(units=512,activation='relu',use_bias=True,kernel_initializer='uniform'))
    deep_learning.add(Dense(units=1,activation='sigmoid',use_bias=True,kernel_initializer='uniform'))
    deep_learning.compile(optimizer=optimizer,loss='binary_crossentropy',metrics=['accuracy'])
    return deep_learning
classifier=KerasClassifier(build_fn=build_keras_classifier,batch_size=20,nb_epoch=100,verbose=0)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
parmas= {'batch_size':[20],'nb_epoch':[120],'optimizer':['adam','rmsprop']}
grid_search=GridSearchCV(estimator=classifier,param_grid=parmas,scoring='accuracy',cv=kfold,verbose=0)
grid_search.fit(X_train,Y_train)
train=pd.DataFrame(Y_test)
test=pd.DataFrame(grid_search.predict(X_test).reshape(1,len(X_test))[0])
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values
get_stats(a)

For high Severity:
identified: [11]
Actual [14]
For critical Severity
identified: [1]
Actual [3]


### 70:30 Split with PCA

#### Random State 0

In [82]:
## Read the csv file which contains the data aggregated on user level 
z=pd.read_csv('Feature_Vector_50_50_3_17_random_state_0.csv',index_col='Source_User')

z.columns=['NREC','PAR','PAS','LEN','BYS','BYR','SP','DP','NAPP','SIP','DIP','Target','TDIF',
    '0000','0100','0200','0300','0400','0500','0600','0700','0800','0900','1000','1100','1200',
    '1300','1400','1500','1600','1700','1800','1900','2000','2100','2200','2300'
]


columns=['NREC','PAR','PAS','LEN','BYS','BYR','SP','DP','NAPP','SIP','DIP','TDIF',
    '0000','0100','0200','0300','0400','0500','0600','0700','0800','0900','1000','1100','1200',
    '1300','1400','1500','1600','1700','1800','1900','2000','2100','2200','2300'
]

print("The feature vector for infected users and sample of uninfected users")
print('Infected/Unifected Users')
display(z.Target.value_counts())
## Create a feature set
X=z.iloc[:,z.columns!='Target']
display(X.head(2))
X=X.values

The feature vector for infected users and sample of uninfected users
Infected/Unifected Users


uninfected    1923
infected      1923
Name: Target, dtype: int64

Unnamed: 0_level_0,NREC,PAR,PAS,LEN,BYS,BYR,SP,DP,NAPP,SIP,DIP,TDIF,0000,0100,0200,0300,0400,0500,0600,0700,0800,0900,1000,1100,1200,1300,1400,1500,1600,1700,1800,1900,2000,2100,2200,2300
Source_User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1
8d80d7e04a232fa309845a28eeb50b2e9902b83fcc2ec60f2b14eb15d59dd611,342,10.783626,9.897661,83.982456,2232.368421,7639.97076,337.0,14.0,22,30,123,13642.195906,12,5,0,0,0,0,0,0,3,6,16,5,4,14,9,25,46,27,98,16,38,12,6,0
e9e209be7e582708e1551ced368c8bd13ada27cf2857f35cc79ee014e30c7f27,42,9.47619,9.595238,119.285714,1357.809524,5211.809524,42.0,3.0,7,3,20,2319.0,0,0,0,0,0,0,0,0,0,0,0,0,8,1,9,24,0,0,0,0,0,0,0,0


In [83]:
print("1 -> Infected\n0 -> Uninfected")
Y=z.Target
Y=Y.apply(lambda x: 1 if x=='infected' else 0)
display(Y.head(10))
# Y=Y.values

1 -> Infected
0 -> Uninfected


Source_User
8d80d7e04a232fa309845a28eeb50b2e9902b83fcc2ec60f2b14eb15d59dd611    1
e9e209be7e582708e1551ced368c8bd13ada27cf2857f35cc79ee014e30c7f27    0
73e7f427173ebaafa2447cafcfa05e11aac3dfda1d0a63d542395a7c9f3e41dd    0
e931f318e6d489b8c1fefcdbcc9f1678c3f5fc95b51fa9675b0130c2003885ef    1
bae671359bbc14c02f9f8ebc82ce508a719fc67b20861aeac0eae50765512ad3    1
97435f766a7f2b4ddea9ef289247eb80b7949d19813e662b41df10ee82d45dff    1
e83f41f7465c7b228abaffb01c879bfa306d44170328d408d6da9a1e0bfb9d30    1
d0e61d1a66521a1545404096ea9271bb53d3e19eefa1331754ca58c1cfbaff31    1
aaca5cdcccf23e9d5f1debe1ec0528fc5550ffe99f2f4a55a71e89ee95af52c9    1
5c5d234bdd7acfc32b04705b11fd0a5f4557d8d386e1a39a22cf728208b003e9    0
Name: Target, dtype: int64

In [84]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
scaled_X=scaler.fit_transform(X)

pca=PCA(n_components=13,whiten=True,random_state=0)
X_pca_13=pca.fit_transform(scaled_X)
print("Variance explained using 13 components is:",sum(pca.explained_variance_ratio_))

print("The train/test split ratio is 70:30")
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X_pca_13,Y,random_state=0,test_size=0.3)

Variance explained using 13 components is: 0.900113362114
The train/test split ratio is 70:30


### Adaboost

In [85]:
ada=AdaBoostClassifier(n_estimators=50,learning_rate=0.5,algorithm='SAMME',random_state=0)
ada.fit(X_train,Y_train)

train=pd.DataFrame(Y_test)
test=pd.DataFrame(ada.predict(X_test))
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values

get_stats(a)

For high Severity:
identified: [14]
Actual [16]
For critical Severity
identified: [3]
Actual [4]


### GBC

In [86]:
gbc=GradientBoostingClassifier(loss='exponential', learning_rate=0.1, criterion='mse', min_samples_split=0.1,
                               min_samples_leaf=20,subsample=0.7,random_state = 0, n_estimators = 50)

gbc.fit(X_train,Y_train)

train=pd.DataFrame(Y_test)
test=pd.DataFrame(gbc.predict(X_test))
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values
get_stats(a)

For high Severity:
identified: [13]
Actual [16]
For critical Severity
identified: [4]
Actual [4]


### Extra Tree

In [87]:
ext_tree=ExtraTreeClassifier(criterion='gini',max_depth=20,max_features='auto',min_impurity_decrease=1e-3,
                             min_samples_leaf=3,min_samples_split=2,splitter='best',random_state=0)

ext_tree.fit(X_train,Y_train)
train=pd.DataFrame(Y_test)
test=pd.DataFrame(ext_tree.predict(X_test))
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values
get_stats(a)

For high Severity:
identified: [15]
Actual [16]
For critical Severity
identified: [4]
Actual [4]


### Random State 10

In [88]:
## Read the csv file which contains the data aggregated on user level 
z=pd.read_csv('Feature_Vector_50_50_3_17_random_state_10.csv',index_col='Source_User')

z.columns=['NREC','PAR','PAS','LEN','BYS','BYR','SP','DP','NAPP','SIP','DIP','Target','TDIF',
    '0000','0100','0200','0300','0400','0500','0600','0700','0800','0900','1000','1100','1200',
    '1300','1400','1500','1600','1700','1800','1900','2000','2100','2200','2300'
]


columns=['NREC','PAR','PAS','LEN','BYS','BYR','SP','DP','NAPP','SIP','DIP','TDIF',
    '0000','0100','0200','0300','0400','0500','0600','0700','0800','0900','1000','1100','1200',
    '1300','1400','1500','1600','1700','1800','1900','2000','2100','2200','2300'
]

print("The feature vector for infected users and sample of uninfected users")
print('Infected/Unifected Users')
display(z.Target.value_counts())
## Create a feature set
X=z.iloc[:,z.columns!='Target']
display(X.head(2))
X=X.values

The feature vector for infected users and sample of uninfected users
Infected/Unifected Users


uninfected    1923
infected      1923
Name: Target, dtype: int64

Unnamed: 0_level_0,NREC,PAR,PAS,LEN,BYS,BYR,SP,DP,NAPP,SIP,DIP,TDIF,0000,0100,0200,0300,0400,0500,0600,0700,0800,0900,1000,1100,1200,1300,1400,1500,1600,1700,1800,1900,2000,2100,2200,2300
Source_User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1
8c48feddaa03131d5816111a9aef28e9ecbe168bb9b32a5bed0d545a1327be88,2984,216.771448,74.177949,82.084115,9038.178284,286763.437332,2689.0,13.0,45,19,784,1642.074397,265,188,248,39,82,43,112,20,50,46,105,89,81,80,105,70,200,116,195,158,176,167,116,233
7d72b802c8934804b499a7b417a31857b112353b42885b2290ff09aeef55bebf,264,25.242424,35.80303,69.193182,30610.265152,11302.988636,263.0,7.0,19,36,97,17986.772727,0,5,0,16,0,0,0,0,6,51,45,11,8,9,20,2,6,13,4,42,18,8,0,0


In [89]:
print("1 -> Infected\n0 -> Uninfected")
Y=z.Target
Y=Y.apply(lambda x: 1 if x=='infected' else 0)
display(Y.head(10))
# Y=Y.values

1 -> Infected
0 -> Uninfected


Source_User
8c48feddaa03131d5816111a9aef28e9ecbe168bb9b32a5bed0d545a1327be88    1
7d72b802c8934804b499a7b417a31857b112353b42885b2290ff09aeef55bebf    0
c59c78cbcb78f0f7140cef977c9016a08b9d4d91bc5b2e537327b2ab2b791f95    1
8b4f22fd3269f43630ef23cc553c80f72740dcc05fec14b0caad367768ce21a4    1
caea0e5257613ba78f187ac0616758755573aa9b58006fb8a5b671496fc3e392    1
f7cbcf0d21066d971d1948c618f66de0188cb7326b8c198ddd66d18d7e4d91bd    1
d487fb3eaf4e620ba1acd93bfce4387f10e162062f03f15f813c62481bf0a518    1
60430a2cacf66446dae118e6f4b204221901dcd2f3a3faffee16848f2ae9d0ec    1
becf4f677fab6f341e04c1c6ef46ae9d5c90175660426fb33dbd7ed850aec06d    0
f0ae081c92bfcbd5f316ae0d19db53b6a9edfdfc687166f2a5b4e6eb60d9c9ed    1
Name: Target, dtype: int64

In [90]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
scaled_X=scaler.fit_transform(X)

pca=PCA(n_components=13,whiten=True,random_state=0)
X_pca_13=pca.fit_transform(scaled_X)
print("Variance explained using 13 components is:",sum(pca.explained_variance_ratio_))

print("The train/test split ratio is 70:30")
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X_pca_13,Y,random_state=0,test_size=0.3)

Variance explained using 13 components is: 0.902209458443
The train/test split ratio is 70:30


### Adaboost

In [91]:
ada=AdaBoostClassifier(n_estimators=400,learning_rate=1,algorithm='SAMME',random_state=0)
ada.fit(X_train,Y_train)

train=pd.DataFrame(Y_test)
test=pd.DataFrame(ada.predict(X_test))
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values

get_stats(a)

For high Severity:
identified: [14]
Actual [18]
For critical Severity
identified: [3]
Actual [3]


### GBC

In [92]:
gbc=GradientBoostingClassifier(loss='deviance', learning_rate=0.2, criterion='mse', min_samples_split=0.3,
                               min_samples_leaf=10,subsample=0.7,random_state = 0, n_estimators = 50)

gbc.fit(X_train,Y_train)

train=pd.DataFrame(Y_test)
test=pd.DataFrame(gbc.predict(X_test))
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values
get_stats(a)

For high Severity:
identified: [15]
Actual [18]
For critical Severity
identified: [3]
Actual [3]


### Extra Tree

In [93]:
ext_tree=ExtraTreeClassifier(criterion='gini',max_depth=10,max_features='auto',min_impurity_decrease=1e-3,
                             min_samples_leaf=5,min_samples_split=2,splitter='best',random_state=0)

ext_tree.fit(X_train,Y_train)
train=pd.DataFrame(Y_test)
test=pd.DataFrame(ext_tree.predict(X_test))
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values
get_stats(a)

For high Severity:
identified: [17]
Actual [18]
For critical Severity
identified: [3]
Actual [3]


### Random State 20

In [94]:
## Read the csv file which contains the data aggregated on user level 
z=pd.read_csv('Feature_Vector_50_50_3_17_random_state_20.csv',index_col='Source_User')

z.columns=['NREC','PAR','PAS','LEN','BYS','BYR','SP','DP','NAPP','SIP','DIP','Target','TDIF',
    '0000','0100','0200','0300','0400','0500','0600','0700','0800','0900','1000','1100','1200',
    '1300','1400','1500','1600','1700','1800','1900','2000','2100','2200','2300'
]


columns=['NREC','PAR','PAS','LEN','BYS','BYR','SP','DP','NAPP','SIP','DIP','TDIF',
    '0000','0100','0200','0300','0400','0500','0600','0700','0800','0900','1000','1100','1200',
    '1300','1400','1500','1600','1700','1800','1900','2000','2100','2200','2300'
]

print("The feature vector for infected users and sample of uninfected users")
print('Infected/Unifected Users')
display(z.Target.value_counts())
## Create a feature set
X=z.iloc[:,z.columns!='Target']
display(X.head(2))
X=X.values

The feature vector for infected users and sample of uninfected users
Infected/Unifected Users


uninfected    1923
infected      1923
Name: Target, dtype: int64

Unnamed: 0_level_0,NREC,PAR,PAS,LEN,BYS,BYR,SP,DP,NAPP,SIP,DIP,TDIF,0000,0100,0200,0300,0400,0500,0600,0700,0800,0900,1000,1100,1200,1300,1400,1500,1600,1700,1800,1900,2000,2100,2200,2300
Source_User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1
16414beb68334398f2f4d732aafedc2bf231b27b8507879ba41e0c18baec3847,3475,119.307626,60.322302,94.924029,6983.720863,151221.195108,3079.0,106.0,54,74,911,1409.110216,419,383,267,141,95,48,31,11,30,33,39,68,187,89,79,110,56,187,161,174,303,126,303,135
afbe75a93f02da27be2a836bef717ffeef19714de5a1f2121d5a93fe043e70f1,558,31.100358,18.324373,120.193548,2986.801075,31418.557348,555.0,5.0,25,53,227,8542.105735,13,44,2,6,5,3,4,1,0,8,25,35,44,67,94,27,18,19,30,32,40,32,6,3


In [95]:
print("1 -> Infected\n0 -> Uninfected")
Y=z.Target
Y=Y.apply(lambda x: 1 if x=='infected' else 0)
display(Y.head(10))
# Y=Y.values

1 -> Infected
0 -> Uninfected


Source_User
16414beb68334398f2f4d732aafedc2bf231b27b8507879ba41e0c18baec3847    1
afbe75a93f02da27be2a836bef717ffeef19714de5a1f2121d5a93fe043e70f1    1
28f9fffb669c004731f11c34913ce913981fd3534ea37ce2bb0e9b942091ff57    0
c966f4446da082203a9bfc84af2a2db29baa017722ac03471cf412932a6c0e90    1
b938ef4078ace510c6696a63e7ee215b9443b59f0d5c7a5f3d79bc1a672c0ca8    1
c1d72e5d7b7f24020163240ecbb1e648745a058b5f18c44f1cf8ceea6be1dc8b    1
8b76ca850d1587f46c2740a3dc662f9730d183fc173a25c6c7fcae215adde149    1
1b93b31e3d3282a98216531e95ddd1400a7728383f6f348ce19b05d873a6122a    1
f0629dde75324f00bdd60ba5ff96dd8f198c45117c089691e44c6c0e871d5947    1
bb235609630753fadf401ecbf9b9c323c35273de648930ce0b76cbd128c882fe    0
Name: Target, dtype: int64

In [96]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
scaled_X=scaler.fit_transform(X)

pca=PCA(n_components=13,whiten=True,random_state=0)
X_pca_13=pca.fit_transform(scaled_X)
print("Variance explained using 13 components is:",sum(pca.explained_variance_ratio_))

print("The train/test split ratio is 70:30")
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X_pca_13,Y,random_state=0,test_size=0.3)

Variance explained using 13 components is: 0.980948185529
The train/test split ratio is 70:30


### Adaboost

In [97]:
ada=AdaBoostClassifier(n_estimators=50,learning_rate=0.5,algorithm='SAMME.R',random_state=0)
ada.fit(X_train,Y_train)

train=pd.DataFrame(Y_test)
test=pd.DataFrame(ada.predict(X_test))
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values

get_stats(a)

For high Severity:
identified: [12]
Actual [17]
For critical Severity
identified: [2]
Actual [4]


### GBC

In [98]:
gbc=GradientBoostingClassifier(loss='deviance', learning_rate=0.03, criterion='mse', min_samples_split=0.2,
                               min_samples_leaf=10,subsample=0.7,random_state = 0, n_estimators = 150)

gbc.fit(X_train,Y_train)

train=pd.DataFrame(Y_test)
test=pd.DataFrame(gbc.predict(X_test))
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values
get_stats(a)

For high Severity:
identified: [12]
Actual [17]
For critical Severity
identified: [2]
Actual [4]


### Extra Tree

In [99]:
ext_tree=ExtraTreeClassifier(criterion='gini',max_depth=20,max_features='auto',min_impurity_decrease=1e-3,
                             min_samples_leaf=3,min_samples_split=2,splitter='best',random_state=0)

ext_tree.fit(X_train,Y_train)
train=pd.DataFrame(Y_test)
test=pd.DataFrame(ext_tree.predict(X_test))
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values
get_stats(a)

For high Severity:
identified: [13]
Actual [17]
For critical Severity
identified: [2]
Actual [4]


In [78]:
### Original Creator : Darshan Bhansali
### HTML code to hide the input cells 
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
To toggle code, click <a href="javascript:code_toggle()">here</a>.''')