In [62]:
### Import Libraries
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import pprint
from tabulate import tabulate
import warnings

%matplotlib inline
pd.set_option("display.max_columns",40)
warnings.filterwarnings("ignore")

from IPython.display import HTML


import keras
from keras.layers import Dense,Dropout
from keras.models import Sequential
from keras.wrappers.scikit_learn import KerasClassifier

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier,ExtraTreeClassifier
from sklearn.decomposition import PCA
from sklearn.ensemble import VotingClassifier


from sklearn.metrics import confusion_matrix,accuracy_score,auc,roc_curve,f1_score
from sklearn.model_selection import GridSearchCV,StratifiedKFold,train_test_split,cross_val_score
from sklearn.preprocessing import StandardScaler,binarize

from mlxtend.classifier import StackingClassifier
from xgboost import XGBClassifier
from vecstack import stacking
from pandas_ml import ConfusionMatrix

In [63]:
def get_roc_curve(model,x_test,y_test):
    prob=model.predict_proba(x_test)
    preds=prob[:,1]
    fpr, tpr, threshold = roc_curve(y_test, preds)
    roc_auc =auc(fpr, tpr)
    plt.figure(figsize=(10,8))
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
    plt.legend(loc = 'lower right')
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()
    i = np.arange(len(tpr)) 
    roc = pd.DataFrame({'fpr' : pd.Series(fpr, index=i),'tpr' : pd.Series(tpr, index = i), '1-fpr' : pd.Series(1-fpr, index = i), 'tf' : pd.Series(tpr - (1-fpr), index = i), 'thresholds' : pd.Series(threshold, index = i)})
    threshold=roc.ix[(roc.tf-0).abs().argsort()[:1]]['thresholds'].values[0]
    print("Optimal Threshold is",threshold)
    y_pred_class=binarize(preds.reshape(1,-1),threshold)[0]
    accuracy=accuracy_score(y_test,y_pred_class)
    print("Accuracy of Logistic Model on training data is:",accuracy)
    cm=ConfusionMatrix(y_test,y_pred_class)
    display(cm)
    return cm,accuracy,threshold,roc_auc

In [64]:
cric=['eee73a52c55dbef6c5b4ec7ec0cd3d200a90f528b7344b7337a546f7734774db',
       '1510909c55f8994397a5aced5728fca609aff1b6bab31080e919795aeb2db97d',
       'cff22bd1423ddbbbf00818b3d866cb4a5f4a8603ee607a824ad96224072f4076',
       '1ed642700929c99329b1ec5e6ddaa5b55db8ec04f776a690e4e22b86e89136db',
       'c96be1bc82e9c8d37ba23c7077d47f003a88fe0b5e0ef79ffa830d9552eeed1e',
       'db663ee7c30f6e366215d111f745205c73789819f5781028b573a197b0388e23',
       'cca0d130b4966f6bcc4e7093295e2637b48f421c120f4d6bfece4938e1c61ae7',
       'cf4dffb6fb12b9a95d1cc43717950e389db7e3c7c9e6c4a3d56d8e0c7a98891b',
       '4121c37e4f4081f7e533940466b0962159f00840b14dca4479eb17e0d5807b15',
       '996116b7d5b47888082dd8caf8911ef352ebe6311210d813fbc18710e12a0091',
       '5672b291263e608577995d50cd51efd4662af2b9a3d327db7d6c264112bd7d57',
       '5b2109fa7d9154dc8b15a1ecee9f520330519e9e4d8ea51a61e01cbc44600a86']

In [65]:


med=['07a805db92f77828144af02c110b4107381c8608ff69cbe8322b1619550dca26',
       'eee73a52c55dbef6c5b4ec7ec0cd3d200a90f528b7344b7337a546f7734774db',
       'cef0d1cb38af8ad44a4c03c5718984ae948de73fefefc7f16e8e87a27bd719ff',
       'b1d81f7de972e44a6845b9556ed3485a9b9a4a27c499d707ca91d3f5aa2334cc',
       '5c95fd730ce2612f44cb7a9c5802d9c5c9e90d7efdeb81026d8a89ab5a3c0a8a',
       '9861528d3962a11deb48775c63026bc8cbdf39aaf3f90aa0b87fef5445d5926c',
       'ce91776ab66403168967f46d8170b6949035822ba671e2ae1a4e5b012f71fc73',
       '26490d34b73ba281b2a586fdbfc9d13b48deaa204944733b859bf412f1fa9b0e',
       '2b9d80a899a0e34177f2b0ad451f18509d8b73540ce2b51fc2dc652495c99417',
       '0caa319489c018ddc7862bf2dafc054889ec8066de3f5b166d1a26e9f12920a4',
       '9bf55dee2eb78fca489c15f4c05644d7e259c413f4f97ab1d622e9999703ff7a',
       '95fef8a0d3377bb3fb41d589e5091eece852026be1b1224383265748863b6671',
       '8b7d9bb36cbb3b6d3defe024d4b980f5bedb71d8875456642e2b173dede2a3c7',
       'f71994f88f63bd783f50648de55e879dc8580e20c771703b852c0e3036667f46',
       'f52846069af3ef50f230f86ac40cdfe41e461a5e749951f112cd0f7c8e605a9e',
       '4677f4c9612858664db2eac9f788886ba53d3b48d79320c1fb15aaf04e00d970',
       '989678db8131e49fd29adc46c2e5e1a6e9ab6823203b37954a24b2b68a98083e',
       '2587a851f09c2bb4d22b7daf8ec30af164ad94a7f5d1620d60d271b85fff6f64',
       '682eb13643f153128c9c47afe6be08554cd042d0b38815b6745a2ec45b472a70',
       '27b0bf1efe11e967f3ddb1b1c2760eaff1659bc221ae60ba772c972de59bf4c1',
       'bb445f8ffb0724b780364707b75ade0df70a8ee22fd1a9f89af3f9d469928b3c',
       'bcca3c58857d6029838ff9d34f3c576d76ac8dbccbe85c39961f0a9a8abe99e9',
       'c9c33a60aaec935cd9d3d1420c46c0e4bcec43463aac55dd454705b04f6a9f8c',
       'a14cbfea9a24e5720c98ad25ccdde36970b5ede89f9ac783a4bbebb8b61a1568',
       '7a4da56250e41de51576ffbfbc5db2aa6b38cc35867a20b4d28318a6e2c1c813',
       '705d59380dfcee0454a0d3d9063900bf5e6706ab9f642b68a00f53099af0eafd',
       '6f6d4c27d6351b6dd1209847771f8aeda37ec301b11b442620e32bbda346bcb7',
       'f55a241cd8a27c7392b8ab9049495758f6895fefa5538e8aca2c27a204ecb0c3',
       '201b65b57be8702dfa4593b39f0d5a9d79662627dc885199c1373b188d4a4a47',
       '58d466231ab231ae49c7b315b995b8e1b4916c0c3e79fe3c057b06ab8c3d6ec8',
       '005030a1caa49429f288b79917630d9a991690b95dde910c8f00a6b926e68c31',
       'eed1b477ab451eb20e847c90b3752b5a7d6c83d3b0c36285450f6731add2063e',
       'adbaa75ca91a86b069ec2810a47b226803cfcc90fb537c09a6945d9a13f19845',
       'c2b4a8f51a1f1eddc7a459bbc3911da76708a878fcd0a16cb5bb6d2ab3dfb5d0',
       'ed91f04fbd495e2d5a00f327756aa4882f8b9d4f5d58b444150f5d89b2befebc',
       'b85725fc8c972f70e3e8a290fc32b75df44bff1fbf31555ff512074003b6387b',
       'a905175974a71113bb8794ec07b0e03add816a329c9e971c833710d4bdd52abf',
       '376ffc521ffbf5d3cb640e186b9db7a98795f7656a82b73207f6884ca1194589',
       '421548d50c73674295d827ff088f1d353441fdd64a0fa7ff74863cd8113bf991',
       '54b356aa258f805ac028589e97883ab66d011a00dd9e2a409eb5579f4e81abd4',
       '22a6f0f1cec1769ea1d0898f041c36ebbfa5b1bf6c6b14d9d11b9119d02ae6ea',
       '204fa067ef1d8b8b034acfa1693f8c53e86c84305ee7503a199f40d3722ee57e',
       '8eba4debb95ef877faed7041269483166a3145a7a590cba966da0a5aa394f2f3',
       '84a578c10ef1a342ad03312393548815acba50632dd302b60f96e273abab64c8',
       '802f787a8d4ebe3d4830943c5f4d50572512c76002ffdc4f9acaf238da85fb36',
       'a6d0045c1c7ffb7db8fdcbe34d7ea0f46ce12606114b63b3702d312be308f9de',
       '94e77fdd5917ebf29fc743bf64798a5dd7c60068fcd029559e4e03afae2cda39',
       '5f78d8ea5d220fd4371a9d5b67f9704172b7e95aa85756242ea58a09ad6a2189',
       '645ee32ae1c84817505808b1a84e53565542cebd89d00d56207c3a8d111c60e5',
       '571fdff9a66b4534942db0655d35eef2dde903259323d6a36128a00a48b94efa',
       '68e9f31a207a753cc3460b17afd291ef4972d57cb149dbdac093fc4c986ad994',
       '775b40dfdcf5074b07b4c6639073b185e1133e0aaf468ede9ffd5fab9dd9943f']

In [66]:
def get_stats(a):
    x=0
    y=0
    for i in med:
        if i in m:
            x=x+a[a['Source_User']==i][0].values
            y=y+a[a['Source_User']==i]['Target'].values
    print("For high Severity:")
    print("identified:",x)
    print("Actual",y)
    print("For critical Severity")
    x=0
    y=0
    for i in cric:
        if i in m:
            x=x+a[a['Source_User']==i][0].values
            y=y+a[a['Source_User']==i]['Target'].values
    print("identified:",x)
    print("Actual",y)

## 80:20 Split

### random State 0

In [67]:
## Read the csv file which contains the data aggregated on user level 
z=pd.read_csv('Feature_Vector_70_30_3_17_random_state_0.csv',index_col='Source_User')

z.columns=['NREC','PAR','PAS','LEN','BYS','BYR','SP','DP','NAPP','SIP','DIP','Target','TDIF',
    '0000','0100','0200','0300','0400','0500','0600','0700','0800','0900','1000','1100','1200',
    '1300','1400','1500','1600','1700','1800','1900','2000','2100','2200','2300'
]


columns=['NREC','PAR','PAS','LEN','BYS','BYR','SP','DP','NAPP','SIP','DIP','TDIF',
    '0000','0100','0200','0300','0400','0500','0600','0700','0800','0900','1000','1100','1200',
    '1300','1400','1500','1600','1700','1800','1900','2000','2100','2200','2300'
]

print("The feature vector for infected users and sample of uninfected users")
print('Infected/Unifected Users')
display(z.Target.value_counts())
## Create a feature set
X=z.iloc[:,z.columns!='Target']
display(X.head(2))
X=X.values

The feature vector for infected users and sample of uninfected users
Infected/Unifected Users


uninfected    4487
infected      1923
Name: Target, dtype: int64

Unnamed: 0_level_0,NREC,PAR,PAS,LEN,BYS,BYR,SP,DP,NAPP,SIP,DIP,TDIF,0000,0100,0200,0300,0400,0500,0600,0700,0800,0900,1000,1100,1200,1300,1400,1500,1600,1700,1800,1900,2000,2100,2200,2300
Source_User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1
faa73b3f65d10c31ffdaddc8709cc96679970d7bca5f9ce66a9a4968c19e7923,441,46.138322,29.460317,66.585034,4755.390023,54162.571429,438.0,6.0,21,25,132,10734.321995,14,15,2,0,1,1,6,19,15,9,12,18,18,21,33,49,86,72,13,4,14,11,2,6
0f12269fba4fbd7f6a9229708650d1511638053b127dc8df6d8ed628067c1c17,254,24.370079,21.968504,33.976378,3180.610236,25776.055118,252.0,4.0,12,2,108,1006.488189,0,0,0,0,0,0,0,6,231,1,3,1,2,2,1,3,1,3,0,0,0,0,0,0


In [68]:
### Build our target variable
print("1 -> Infected\n0 -> Uninfected")
Y=z.Target
Y=Y.apply(lambda x: 1 if x=='infected' else 0)
display(Y.head(10))
# Y=Y.values

1 -> Infected
0 -> Uninfected


Source_User
faa73b3f65d10c31ffdaddc8709cc96679970d7bca5f9ce66a9a4968c19e7923    0
0f12269fba4fbd7f6a9229708650d1511638053b127dc8df6d8ed628067c1c17    0
b343b9f652a841f55180dec62614b6f3f4ef123aa6c7bd4e11ba330f1f20ab1e    0
d4522f4975663af6ea4d8c261c8ff4ccca09ed099f1b4bf18c22a92219746d40    0
fd3ce5c94c3c37ad6e64bcd3fea156bce5f73d66ff8d5e38c668fd65d2fbad35    1
9bd426dc9dd76d55870bfa1d277228863be08643c1c251bfa01fbeea6befa548    0
64c7c7ab1002f853fec50dcb65b8d4ce104aa67df24a116fe47ab67d9816fda6    1
b232f1a6b8db7b92cce166503c5faafe30609a93860360536391ef9f827958ab    1
89085fa53e0136faacd2e660766c2af3e4f45890a9fe3954c72b9abd9706be63    0
13e3cf1c083c78d4313f1f000c24f9d1051d78d19a69725479c0c59cf105df0c    0
Name: Target, dtype: int64

In [69]:
### Scale feature set  using Standard Scaler
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
scaled_X=scaler.fit_transform(X)
print("The train/test split ratio is 80:20")
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(scaled_X,Y,random_state=0,test_size=0.2)

The train/test split ratio is 80:20


### GBC

In [70]:
gbc=GradientBoostingClassifier(loss='exponential', learning_rate=0.1, criterion='mse', min_samples_split=0.3,
                               min_samples_leaf=10,subsample=0.5,random_state = 0, n_estimators = 50)

gbc.fit(X_train,Y_train)

train=pd.DataFrame(Y_test)
test=pd.DataFrame(gbc.predict(X_test))
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values
get_stats(a)

For high Severity:
identified: [3]
Actual [5]
For critical Severity
identified: [0]
Actual [1]


### Extra Tree

In [71]:
ext_tree=ExtraTreeClassifier(criterion='gini',max_depth=10,max_features='auto',min_impurity_decrease=1e-3,
                             min_samples_leaf=2,min_samples_split=2,splitter='best',random_state=0)

ext_tree.fit(X_train,Y_train)
train=pd.DataFrame(Y_test)
test=pd.DataFrame(ext_tree.predict(X_test))
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values
get_stats(a)

For high Severity:
identified: [3]
Actual [5]
For critical Severity
identified: [0]
Actual [1]


### Random State 10

In [72]:
## Read the csv file which contains the data aggregated on user level 
z=pd.read_csv('Feature_Vector_70_30_3_17_random_state_10.csv',index_col='Source_User')

z.columns=['NREC','PAR','PAS','LEN','BYS','BYR','SP','DP','NAPP','SIP','DIP','Target','TDIF',
    '0000','0100','0200','0300','0400','0500','0600','0700','0800','0900','1000','1100','1200',
    '1300','1400','1500','1600','1700','1800','1900','2000','2100','2200','2300'
]


columns=['NREC','PAR','PAS','LEN','BYS','BYR','SP','DP','NAPP','SIP','DIP','TDIF',
    '0000','0100','0200','0300','0400','0500','0600','0700','0800','0900','1000','1100','1200',
    '1300','1400','1500','1600','1700','1800','1900','2000','2100','2200','2300'
]

print("The feature vector for infected users and sample of uninfected users")
print('Infected/Unifected Users')
display(z.Target.value_counts())
## Create a feature set
X=z.iloc[:,z.columns!='Target']
display(X.head(2))
X=X.values

The feature vector for infected users and sample of uninfected users
Infected/Unifected Users


uninfected    4487
infected      1923
Name: Target, dtype: int64

Unnamed: 0_level_0,NREC,PAR,PAS,LEN,BYS,BYR,SP,DP,NAPP,SIP,DIP,TDIF,0000,0100,0200,0300,0400,0500,0600,0700,0800,0900,1000,1100,1200,1300,1400,1500,1600,1700,1800,1900,2000,2100,2200,2300
Source_User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1
23a784294c34545e19b709e6b22286bd5cfda913b32d55224d18437016ca9135,5164,438.195198,231.486445,116.998644,25566.860767,578623.737026,4106.0,42.0,54,69,1296,952.22134,521,405,398,211,281,225,163,170,68,59,36,74,99,142,201,218,307,284,219,257,223,225,173,205
69881d3627335af7ed5613256f13b575318085c62d0a4eb30ef45663cdf794ef,1347,462.67706,125.536748,90.790646,13083.769859,633355.270973,1269.0,14.0,38,39,396,3335.414996,63,65,145,123,147,56,61,73,32,19,19,6,26,52,33,135,75,21,24,23,15,28,56,50


In [73]:
### Build our target variable
print("1 -> Infected\n0 -> Uninfected")
Y=z.Target
Y=Y.apply(lambda x: 1 if x=='infected' else 0)
display(Y.head(10))
# Y=Y.values

1 -> Infected
0 -> Uninfected


Source_User
23a784294c34545e19b709e6b22286bd5cfda913b32d55224d18437016ca9135    1
69881d3627335af7ed5613256f13b575318085c62d0a4eb30ef45663cdf794ef    0
4336978528c6a752e441b177d8222c6477449660ce0c772e567e6d7f32c7e896    0
6507f3f2e0c22751060868de62ee74aed8c0c37d331ab7ab39d0f9b17e8ef426    0
6944129302f4af2ade449b755a7cb0eace943209459b735222b74dbd369ed94e    0
60a7e83d06013e0ecddb5488d101b31bdc44a0738f64667b46945011e3061980    0
9695f657a7d9b2c3202a6a25c401a660dd4fff43b396060141f23b9eefe1bed1    1
da1de679d352a2e7977a412053149f36d9e70e9bf3a059d44111774d3d8dd73f    0
4f663930f46b6dbf3e88f4a24101abb6ec2d202593d5042e26aebb10b705acf4    0
ded957eff469c307b079c16459b5caffca56d2ec1b3e36a05326c99daff1888e    0
Name: Target, dtype: int64

In [74]:
### Scale feature set  using Standard Scaler
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
scaled_X=scaler.fit_transform(X)
print("The train/test split ratio is 80:20")
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(scaled_X,Y,random_state=0,test_size=0.2)

The train/test split ratio is 80:20


### GBC

In [75]:
gbc=GradientBoostingClassifier(loss='deviance', learning_rate=0.05, criterion='mse', min_samples_split=0.2,
                               min_samples_leaf=10,subsample=1,random_state = 0, n_estimators = 50)

gbc.fit(X_train,Y_train)

train=pd.DataFrame(Y_test)
test=pd.DataFrame(gbc.predict(X_test))
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values
get_stats(a)

For high Severity:
identified: [6]
Actual [10]
For critical Severity
identified: [2]
Actual [3]


### Extra Tree

In [76]:
ext_tree=ExtraTreeClassifier(criterion='gini',max_depth=10,max_features='auto',min_impurity_decrease=1e-3,
                             min_samples_leaf=2,min_samples_split=2,splitter='best',random_state=0)

ext_tree.fit(X_train,Y_train)
train=pd.DataFrame(Y_test)
test=pd.DataFrame(ext_tree.predict(X_test))
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values
get_stats(a)

For high Severity:
identified: [7]
Actual [10]
For critical Severity
identified: [2]
Actual [3]


### Random State 20

In [77]:
## Read the csv file which contains the data aggregated on user level 
z=pd.read_csv('Feature_Vector_70_30_3_17_random_state_20.csv',index_col='Source_User')

z.columns=['NREC','PAR','PAS','LEN','BYS','BYR','SP','DP','NAPP','SIP','DIP','Target','TDIF',
    '0000','0100','0200','0300','0400','0500','0600','0700','0800','0900','1000','1100','1200',
    '1300','1400','1500','1600','1700','1800','1900','2000','2100','2200','2300'
]


columns=['NREC','PAR','PAS','LEN','BYS','BYR','SP','DP','NAPP','SIP','DIP','TDIF',
    '0000','0100','0200','0300','0400','0500','0600','0700','0800','0900','1000','1100','1200',
    '1300','1400','1500','1600','1700','1800','1900','2000','2100','2200','2300'
]

print("The feature vector for infected users and sample of uninfected users")
print('Infected/Unifected Users')
display(z.Target.value_counts())
## Create a feature set
X=z.iloc[:,z.columns!='Target']
display(X.head(2))
X=X.values

The feature vector for infected users and sample of uninfected users
Infected/Unifected Users


uninfected    4487
infected      1923
Name: Target, dtype: int64

Unnamed: 0_level_0,NREC,PAR,PAS,LEN,BYS,BYR,SP,DP,NAPP,SIP,DIP,TDIF,0000,0100,0200,0300,0400,0500,0600,0700,0800,0900,1000,1100,1200,1300,1400,1500,1600,1700,1800,1900,2000,2100,2200,2300
Source_User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1
73592328e1aba504b315fc06ce72a444a3afa3ca071ce135c6519b653b782900,1580,297.743038,121.938608,75.572152,10785.806962,373515.271519,1513.0,12.0,30,34,415,2816.703797,18,34,54,15,18,46,81,30,43,44,47,92,79,47,56,60,40,119,88,127,191,167,45,39
5393c0df6a44651953827b1afab0c1c821dbdc352b1b88263f9d325b00bf1b7d,2071,92.849831,68.0845,85.015451,9264.509416,113797.491067,1908.0,18.0,44,58,538,2378.489136,200,135,60,81,56,117,50,80,174,78,145,77,57,41,41,49,29,66,37,72,45,122,68,191


In [78]:
### Build our target variable
print("1 -> Infected\n0 -> Uninfected")
Y=z.Target
Y=Y.apply(lambda x: 1 if x=='infected' else 0)
display(Y.head(10))
# Y=Y.values

1 -> Infected
0 -> Uninfected


Source_User
73592328e1aba504b315fc06ce72a444a3afa3ca071ce135c6519b653b782900    0
5393c0df6a44651953827b1afab0c1c821dbdc352b1b88263f9d325b00bf1b7d    0
20e2144674cba74b4658c48eb02e290912cdf7d9815e9f0a8913e8f253d40b3c    0
0a8a122a1d0a74f656a8c288853357fe1e8ee2681f2308f75b9217f8dd73489d    0
e39c9093096a6ba89ad4d2e3987b430f8f6f8a86a6e73fc80bf4b5e0408e0edd    0
c82e979deebbc9c33760931128b2552d94d73128806ee7f3ebe1bdd3f7ca5c21    0
de1af22a9df53353a6f2c79696966a215a429a47150cd06de7b4f39b2f8e9976    0
a4a89c63ce9691738b6b014ae723928e73b2bc57ddb4bce98beb47cdce1ab45e    0
d8ccad69e30b55cf1048301d2fb7cd696a0ecab46a80e73f992ac427cd241e77    0
b7826c03e71919e65e0236c1bb7fb63e1de351fbc74c65b9ddb5cfe7c19881b0    0
Name: Target, dtype: int64

In [79]:
### Scale feature set  using Standard Scaler
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
scaled_X=scaler.fit_transform(X)
print("The train/test split ratio is 80:20")
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(scaled_X,Y,random_state=0,test_size=0.2)

The train/test split ratio is 80:20


### GBC

In [80]:
gbc=GradientBoostingClassifier(loss='exponential', learning_rate=0.1, criterion='mse', min_samples_split=0.1,
                               min_samples_leaf=20,subsample=0.7,random_state = 0, n_estimators = 50)

gbc.fit(X_train,Y_train)

train=pd.DataFrame(Y_test)
test=pd.DataFrame(gbc.predict(X_test))
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values
get_stats(a)

For high Severity:
identified: [9]
Actual [17]
For critical Severity
identified: [1]
Actual [3]


### Extra Tree

In [81]:
ext_tree=ExtraTreeClassifier(criterion='gini',max_depth=10,max_features='auto',min_impurity_decrease=1e-3,
                             min_samples_leaf=5,min_samples_split=2,splitter='best',random_state=0)

ext_tree.fit(X_train,Y_train)
train=pd.DataFrame(Y_test)
test=pd.DataFrame(ext_tree.predict(X_test))
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values
get_stats(a)

For high Severity:
identified: [11]
Actual [17]
For critical Severity
identified: [1]
Actual [3]


## 70:30 Split

### Random state 0

In [82]:
## Read the csv file which contains the data aggregated on user level 
z=pd.read_csv('Feature_Vector_70_30_3_17_random_state_0.csv',index_col='Source_User')

z.columns=['NREC','PAR','PAS','LEN','BYS','BYR','SP','DP','NAPP','SIP','DIP','Target','TDIF',
    '0000','0100','0200','0300','0400','0500','0600','0700','0800','0900','1000','1100','1200',
    '1300','1400','1500','1600','1700','1800','1900','2000','2100','2200','2300'
]


columns=['NREC','PAR','PAS','LEN','BYS','BYR','SP','DP','NAPP','SIP','DIP','TDIF',
    '0000','0100','0200','0300','0400','0500','0600','0700','0800','0900','1000','1100','1200',
    '1300','1400','1500','1600','1700','1800','1900','2000','2100','2200','2300'
]

print("The feature vector for infected users and sample of uninfected users")
print('Infected/Unifected Users')
display(z.Target.value_counts())
## Create a feature set
X=z.iloc[:,z.columns!='Target']
display(X.head(2))
X=X.values

The feature vector for infected users and sample of uninfected users
Infected/Unifected Users


uninfected    4487
infected      1923
Name: Target, dtype: int64

Unnamed: 0_level_0,NREC,PAR,PAS,LEN,BYS,BYR,SP,DP,NAPP,SIP,DIP,TDIF,0000,0100,0200,0300,0400,0500,0600,0700,0800,0900,1000,1100,1200,1300,1400,1500,1600,1700,1800,1900,2000,2100,2200,2300
Source_User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1
faa73b3f65d10c31ffdaddc8709cc96679970d7bca5f9ce66a9a4968c19e7923,441,46.138322,29.460317,66.585034,4755.390023,54162.571429,438.0,6.0,21,25,132,10734.321995,14,15,2,0,1,1,6,19,15,9,12,18,18,21,33,49,86,72,13,4,14,11,2,6
0f12269fba4fbd7f6a9229708650d1511638053b127dc8df6d8ed628067c1c17,254,24.370079,21.968504,33.976378,3180.610236,25776.055118,252.0,4.0,12,2,108,1006.488189,0,0,0,0,0,0,0,6,231,1,3,1,2,2,1,3,1,3,0,0,0,0,0,0


In [83]:
### Build our target variable
print("1 -> Infected\n0 -> Uninfected")
Y=z.Target
Y=Y.apply(lambda x: 1 if x=='infected' else 0)
display(Y.head(10))
# Y=Y.values

1 -> Infected
0 -> Uninfected


Source_User
faa73b3f65d10c31ffdaddc8709cc96679970d7bca5f9ce66a9a4968c19e7923    0
0f12269fba4fbd7f6a9229708650d1511638053b127dc8df6d8ed628067c1c17    0
b343b9f652a841f55180dec62614b6f3f4ef123aa6c7bd4e11ba330f1f20ab1e    0
d4522f4975663af6ea4d8c261c8ff4ccca09ed099f1b4bf18c22a92219746d40    0
fd3ce5c94c3c37ad6e64bcd3fea156bce5f73d66ff8d5e38c668fd65d2fbad35    1
9bd426dc9dd76d55870bfa1d277228863be08643c1c251bfa01fbeea6befa548    0
64c7c7ab1002f853fec50dcb65b8d4ce104aa67df24a116fe47ab67d9816fda6    1
b232f1a6b8db7b92cce166503c5faafe30609a93860360536391ef9f827958ab    1
89085fa53e0136faacd2e660766c2af3e4f45890a9fe3954c72b9abd9706be63    0
13e3cf1c083c78d4313f1f000c24f9d1051d78d19a69725479c0c59cf105df0c    0
Name: Target, dtype: int64

In [84]:
### Scale feature set  using Standard Scaler
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
scaled_X=scaler.fit_transform(X)
print("The train/test split ratio is 70:30")
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(scaled_X,Y,random_state=0,test_size=0.3)

The train/test split ratio is 70:30


### KNN

In [85]:
knn=KNeighborsClassifier(algorithm='auto',leaf_size=2,metric='manhattan',n_neighbors=50,weights='distance')

knn.fit(X_train,Y_train)
train=pd.DataFrame(Y_test)
test=pd.DataFrame(knn.predict(X_test))
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values
get_stats(a)

For high Severity:
identified: [9]
Actual [13]
For critical Severity
identified: [0]
Actual [3]


### Extra Tree Classifier

In [87]:
ext_tree=ExtraTreeClassifier(criterion='gini',max_depth=10,max_features='auto',min_impurity_decrease=1e-3,
                             min_samples_leaf=2,min_samples_split=2,splitter='best',random_state=0)

ext_tree.fit(X_train,Y_train)
train=pd.DataFrame(Y_test)
test=pd.DataFrame(ext_tree.predict(X_test))
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values
get_stats(a)

For high Severity:
identified: [8]
Actual [13]
For critical Severity
identified: [1]
Actual [3]


### GBC

In [88]:
gbc=GradientBoostingClassifier(loss='deviance', learning_rate=0.1, criterion='mse', min_samples_split=0.1,
                               min_samples_leaf=5,subsample=1,random_state = 0, n_estimators = 50)

gbc.fit(X_train,Y_train)

train=pd.DataFrame(Y_test)
test=pd.DataFrame(gbc.predict(X_test))
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values
get_stats(a)

For high Severity:
identified: [9]
Actual [13]
For critical Severity
identified: [1]
Actual [3]


### Random State 10

In [89]:
## Read the csv file which contains the data aggregated on user level 
z=pd.read_csv('Feature_Vector_70_30_3_17_random_state_10.csv',index_col='Source_User')

z.columns=['NREC','PAR','PAS','LEN','BYS','BYR','SP','DP','NAPP','SIP','DIP','Target','TDIF',
    '0000','0100','0200','0300','0400','0500','0600','0700','0800','0900','1000','1100','1200',
    '1300','1400','1500','1600','1700','1800','1900','2000','2100','2200','2300'
]


columns=['NREC','PAR','PAS','LEN','BYS','BYR','SP','DP','NAPP','SIP','DIP','TDIF',
    '0000','0100','0200','0300','0400','0500','0600','0700','0800','0900','1000','1100','1200',
    '1300','1400','1500','1600','1700','1800','1900','2000','2100','2200','2300'
]

print("The feature vector for infected users and sample of uninfected users")
print('Infected/Unifected Users')
display(z.Target.value_counts())
## Create a feature set
X=z.iloc[:,z.columns!='Target']
display(X.head(2))
X=X.values

The feature vector for infected users and sample of uninfected users
Infected/Unifected Users


uninfected    4487
infected      1923
Name: Target, dtype: int64

Unnamed: 0_level_0,NREC,PAR,PAS,LEN,BYS,BYR,SP,DP,NAPP,SIP,DIP,TDIF,0000,0100,0200,0300,0400,0500,0600,0700,0800,0900,1000,1100,1200,1300,1400,1500,1600,1700,1800,1900,2000,2100,2200,2300
Source_User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1
23a784294c34545e19b709e6b22286bd5cfda913b32d55224d18437016ca9135,5164,438.195198,231.486445,116.998644,25566.860767,578623.737026,4106.0,42.0,54,69,1296,952.22134,521,405,398,211,281,225,163,170,68,59,36,74,99,142,201,218,307,284,219,257,223,225,173,205
69881d3627335af7ed5613256f13b575318085c62d0a4eb30ef45663cdf794ef,1347,462.67706,125.536748,90.790646,13083.769859,633355.270973,1269.0,14.0,38,39,396,3335.414996,63,65,145,123,147,56,61,73,32,19,19,6,26,52,33,135,75,21,24,23,15,28,56,50


In [90]:
### Build our target variable
print("1 -> Infected\n0 -> Uninfected")
Y=z.Target
Y=Y.apply(lambda x: 1 if x=='infected' else 0)
display(Y.head(10))
# Y=Y.values

### Scale feature set  using Standard Scaler
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
scaled_X=scaler.fit_transform(X)
print("The train/test split ratio is 70:30")
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(scaled_X,Y,random_state=0,test_size=0.3)

1 -> Infected
0 -> Uninfected


Source_User
23a784294c34545e19b709e6b22286bd5cfda913b32d55224d18437016ca9135    1
69881d3627335af7ed5613256f13b575318085c62d0a4eb30ef45663cdf794ef    0
4336978528c6a752e441b177d8222c6477449660ce0c772e567e6d7f32c7e896    0
6507f3f2e0c22751060868de62ee74aed8c0c37d331ab7ab39d0f9b17e8ef426    0
6944129302f4af2ade449b755a7cb0eace943209459b735222b74dbd369ed94e    0
60a7e83d06013e0ecddb5488d101b31bdc44a0738f64667b46945011e3061980    0
9695f657a7d9b2c3202a6a25c401a660dd4fff43b396060141f23b9eefe1bed1    1
da1de679d352a2e7977a412053149f36d9e70e9bf3a059d44111774d3d8dd73f    0
4f663930f46b6dbf3e88f4a24101abb6ec2d202593d5042e26aebb10b705acf4    0
ded957eff469c307b079c16459b5caffca56d2ec1b3e36a05326c99daff1888e    0
Name: Target, dtype: int64

The train/test split ratio is 70:30


### KNN

In [91]:
knn=KNeighborsClassifier(algorithm='auto',leaf_size=2,metric='manhattan',n_neighbors=50,weights='uniform')

knn.fit(X_train,Y_train)
train=pd.DataFrame(Y_test)
test=pd.DataFrame(knn.predict(X_test))
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values
get_stats(a)

For high Severity:
identified: [6]
Actual [11]
For critical Severity
identified: [3]
Actual [3]


### Extra Tree Classifier

In [92]:
ext_tree=ExtraTreeClassifier(criterion='gini',max_depth=10,max_features='auto',min_impurity_decrease=1e-3,
                             min_samples_leaf=5,min_samples_split=2,splitter='best',random_state=0)

ext_tree.fit(X_train,Y_train)
train=pd.DataFrame(Y_test)
test=pd.DataFrame(ext_tree.predict(X_test))
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values
get_stats(a)

For high Severity:
identified: [7]
Actual [11]
For critical Severity
identified: [1]
Actual [3]


### GBC

In [93]:
gbc=GradientBoostingClassifier(loss='exponential', learning_rate=0.1, criterion='mse', min_samples_split=0.1,
                               min_samples_leaf=5,subsample=0.7,random_state = 0, n_estimators = 100)

gbc.fit(X_train,Y_train)

train=pd.DataFrame(Y_test)
test=pd.DataFrame(gbc.predict(X_test))
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values
get_stats(a)

For high Severity:
identified: [6]
Actual [11]
For critical Severity
identified: [2]
Actual [3]


### Random State 20

In [94]:
## Read the csv file which contains the data aggregated on user level 
z=pd.read_csv('Feature_Vector_70_30_3_17_random_state_20.csv',index_col='Source_User')

z.columns=['NREC','PAR','PAS','LEN','BYS','BYR','SP','DP','NAPP','SIP','DIP','Target','TDIF',
    '0000','0100','0200','0300','0400','0500','0600','0700','0800','0900','1000','1100','1200',
    '1300','1400','1500','1600','1700','1800','1900','2000','2100','2200','2300'
]


columns=['NREC','PAR','PAS','LEN','BYS','BYR','SP','DP','NAPP','SIP','DIP','TDIF',
    '0000','0100','0200','0300','0400','0500','0600','0700','0800','0900','1000','1100','1200',
    '1300','1400','1500','1600','1700','1800','1900','2000','2100','2200','2300'
]

print("The feature vector for infected users and sample of uninfected users")
print('Infected/Unifected Users')
display(z.Target.value_counts())
## Create a feature set
X=z.iloc[:,z.columns!='Target']
display(X.head(2))
X=X.values

The feature vector for infected users and sample of uninfected users
Infected/Unifected Users


uninfected    4487
infected      1923
Name: Target, dtype: int64

Unnamed: 0_level_0,NREC,PAR,PAS,LEN,BYS,BYR,SP,DP,NAPP,SIP,DIP,TDIF,0000,0100,0200,0300,0400,0500,0600,0700,0800,0900,1000,1100,1200,1300,1400,1500,1600,1700,1800,1900,2000,2100,2200,2300
Source_User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1
73592328e1aba504b315fc06ce72a444a3afa3ca071ce135c6519b653b782900,1580,297.743038,121.938608,75.572152,10785.806962,373515.271519,1513.0,12.0,30,34,415,2816.703797,18,34,54,15,18,46,81,30,43,44,47,92,79,47,56,60,40,119,88,127,191,167,45,39
5393c0df6a44651953827b1afab0c1c821dbdc352b1b88263f9d325b00bf1b7d,2071,92.849831,68.0845,85.015451,9264.509416,113797.491067,1908.0,18.0,44,58,538,2378.489136,200,135,60,81,56,117,50,80,174,78,145,77,57,41,41,49,29,66,37,72,45,122,68,191


In [95]:
### Build our target variable
print("1 -> Infected\n0 -> Uninfected")
Y=z.Target
Y=Y.apply(lambda x: 1 if x=='infected' else 0)
display(Y.head(10))
# Y=Y.values

### Scale feature set  using Standard Scaler
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
scaled_X=scaler.fit_transform(X)
print("The train/test split ratio is 70:30")
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(scaled_X,Y,random_state=0,test_size=0.3)

1 -> Infected
0 -> Uninfected


Source_User
73592328e1aba504b315fc06ce72a444a3afa3ca071ce135c6519b653b782900    0
5393c0df6a44651953827b1afab0c1c821dbdc352b1b88263f9d325b00bf1b7d    0
20e2144674cba74b4658c48eb02e290912cdf7d9815e9f0a8913e8f253d40b3c    0
0a8a122a1d0a74f656a8c288853357fe1e8ee2681f2308f75b9217f8dd73489d    0
e39c9093096a6ba89ad4d2e3987b430f8f6f8a86a6e73fc80bf4b5e0408e0edd    0
c82e979deebbc9c33760931128b2552d94d73128806ee7f3ebe1bdd3f7ca5c21    0
de1af22a9df53353a6f2c79696966a215a429a47150cd06de7b4f39b2f8e9976    0
a4a89c63ce9691738b6b014ae723928e73b2bc57ddb4bce98beb47cdce1ab45e    0
d8ccad69e30b55cf1048301d2fb7cd696a0ecab46a80e73f992ac427cd241e77    0
b7826c03e71919e65e0236c1bb7fb63e1de351fbc74c65b9ddb5cfe7c19881b0    0
Name: Target, dtype: int64

The train/test split ratio is 70:30


### KNN

In [96]:
knn=KNeighborsClassifier(algorithm='auto',leaf_size=2,metric='manhattan',n_neighbors=50,weights='uniform')

knn.fit(X_train,Y_train)
train=pd.DataFrame(Y_test)
test=pd.DataFrame(knn.predict(X_test))
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values
get_stats(a)

For high Severity:
identified: [12]
Actual [21]
For critical Severity
identified: [3]
Actual [4]


### Extra Tree Classifier

In [97]:
ext_tree=ExtraTreeClassifier(criterion='gini',max_depth=10,max_features='auto',min_impurity_decrease=1e-3,
                             min_samples_leaf=2,min_samples_split=2,splitter='best',random_state=0)

ext_tree.fit(X_train,Y_train)
train=pd.DataFrame(Y_test)
test=pd.DataFrame(ext_tree.predict(X_test))
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values
get_stats(a)

For high Severity:
identified: [13]
Actual [21]
For critical Severity
identified: [3]
Actual [4]


### GBC

In [98]:
gbc=GradientBoostingClassifier(loss='deviance', learning_rate=0.05, criterion='mse', min_samples_split=0.3,
                               min_samples_leaf=10,subsample=0.7,random_state = 0, n_estimators = 100)

gbc.fit(X_train,Y_train)

train=pd.DataFrame(Y_test)
test=pd.DataFrame(gbc.predict(X_test))
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values
get_stats(a)

For high Severity:
identified: [12]
Actual [21]
For critical Severity
identified: [2]
Actual [4]


### 80:20 with PCA

#### Random State 0

In [99]:
## Read the csv file which contains the data aggregated on user level 
z=pd.read_csv('Feature_Vector_70_30_3_17_random_state_0.csv',index_col='Source_User')

z.columns=['NREC','PAR','PAS','LEN','BYS','BYR','SP','DP','NAPP','SIP','DIP','Target','TDIF',
    '0000','0100','0200','0300','0400','0500','0600','0700','0800','0900','1000','1100','1200',
    '1300','1400','1500','1600','1700','1800','1900','2000','2100','2200','2300'
]


columns=['NREC','PAR','PAS','LEN','BYS','BYR','SP','DP','NAPP','SIP','DIP','TDIF',
    '0000','0100','0200','0300','0400','0500','0600','0700','0800','0900','1000','1100','1200',
    '1300','1400','1500','1600','1700','1800','1900','2000','2100','2200','2300'
]

print("The feature vector for infected users and sample of uninfected users")
print('Infected/Unifected Users')
display(z.Target.value_counts())
## Create a feature set
X=z.iloc[:,z.columns!='Target']
display(X.head(2))
X=X.values

The feature vector for infected users and sample of uninfected users
Infected/Unifected Users


uninfected    4487
infected      1923
Name: Target, dtype: int64

Unnamed: 0_level_0,NREC,PAR,PAS,LEN,BYS,BYR,SP,DP,NAPP,SIP,DIP,TDIF,0000,0100,0200,0300,0400,0500,0600,0700,0800,0900,1000,1100,1200,1300,1400,1500,1600,1700,1800,1900,2000,2100,2200,2300
Source_User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1
faa73b3f65d10c31ffdaddc8709cc96679970d7bca5f9ce66a9a4968c19e7923,441,46.138322,29.460317,66.585034,4755.390023,54162.571429,438.0,6.0,21,25,132,10734.321995,14,15,2,0,1,1,6,19,15,9,12,18,18,21,33,49,86,72,13,4,14,11,2,6
0f12269fba4fbd7f6a9229708650d1511638053b127dc8df6d8ed628067c1c17,254,24.370079,21.968504,33.976378,3180.610236,25776.055118,252.0,4.0,12,2,108,1006.488189,0,0,0,0,0,0,0,6,231,1,3,1,2,2,1,3,1,3,0,0,0,0,0,0


In [100]:
print("1 -> Infected\n0 -> Uninfected")
Y=z.Target
Y=Y.apply(lambda x: 1 if x=='infected' else 0)
display(Y.head(10))
# Y=Y.values

1 -> Infected
0 -> Uninfected


Source_User
faa73b3f65d10c31ffdaddc8709cc96679970d7bca5f9ce66a9a4968c19e7923    0
0f12269fba4fbd7f6a9229708650d1511638053b127dc8df6d8ed628067c1c17    0
b343b9f652a841f55180dec62614b6f3f4ef123aa6c7bd4e11ba330f1f20ab1e    0
d4522f4975663af6ea4d8c261c8ff4ccca09ed099f1b4bf18c22a92219746d40    0
fd3ce5c94c3c37ad6e64bcd3fea156bce5f73d66ff8d5e38c668fd65d2fbad35    1
9bd426dc9dd76d55870bfa1d277228863be08643c1c251bfa01fbeea6befa548    0
64c7c7ab1002f853fec50dcb65b8d4ce104aa67df24a116fe47ab67d9816fda6    1
b232f1a6b8db7b92cce166503c5faafe30609a93860360536391ef9f827958ab    1
89085fa53e0136faacd2e660766c2af3e4f45890a9fe3954c72b9abd9706be63    0
13e3cf1c083c78d4313f1f000c24f9d1051d78d19a69725479c0c59cf105df0c    0
Name: Target, dtype: int64

In [101]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
scaled_X=scaler.fit_transform(X)

pca=PCA(n_components=13,whiten=True,random_state=0)
X_pca_13=pca.fit_transform(scaled_X)
print("Variance explained using 13 components is:",sum(pca.explained_variance_ratio_))

print("The train/test split ratio is 80:20")
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X_pca_13,Y,random_state=0,test_size=0.2)

Variance explained using 13 components is: 0.90269079243
The train/test split ratio is 80:20


### ETC

In [102]:
ext_tree=ExtraTreeClassifier(criterion='gini',max_depth=10,max_features='auto',min_impurity_decrease=1e-3,
                             min_samples_leaf=2,min_samples_split=2,splitter='best',random_state=0)

ext_tree.fit(X_train,Y_train)
train=pd.DataFrame(Y_test)
test=pd.DataFrame(ext_tree.predict(X_test))
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values
get_stats(a)

For high Severity:
identified: [3]
Actual [5]
For critical Severity
identified: [0]
Actual [1]


### NN2

In [103]:
def build_keras_classifier(optimizer):
    deep_learning=Sequential()
    deep_learning.add(Dense(units=64,activation='relu',use_bias=True,kernel_initializer='uniform',input_dim=13))
    deep_learning.add(Dense(units=128,activation='relu',use_bias=True,kernel_initializer='uniform'))
    deep_learning.add(Dense(units=256,activation='relu',use_bias=True,kernel_initializer='uniform'))
    deep_learning.add(Dense(units=512,activation='relu',use_bias=True,kernel_initializer='uniform'))
    deep_learning.add(Dense(units=1,activation='sigmoid',use_bias=True,kernel_initializer='uniform'))
    deep_learning.compile(optimizer=optimizer,loss='binary_crossentropy',metrics=['accuracy'])
    return deep_learning
classifier=KerasClassifier(build_fn=build_keras_classifier,batch_size=20,nb_epoch=100,verbose=0)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
parmas= {'batch_size':[20],'nb_epoch':[120],'optimizer':['adam','rmsprop']}
grid_search=GridSearchCV(estimator=classifier,param_grid=parmas,scoring='accuracy',cv=kfold,verbose=0)
grid_search.fit(X_train,Y_train)
train=pd.DataFrame(Y_test)
test=pd.DataFrame(grid_search.predict(X_test).reshape(1,len(X_test))[0])
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values
get_stats(a)

For high Severity:
identified: [3]
Actual [5]
For critical Severity
identified: [0]
Actual [1]


### Random State 10

In [105]:
## Read the csv file which contains the data aggregated on user level 
z=pd.read_csv('Feature_Vector_70_30_3_17_random_state_10.csv',index_col='Source_User')

z.columns=['NREC','PAR','PAS','LEN','BYS','BYR','SP','DP','NAPP','SIP','DIP','Target','TDIF',
    '0000','0100','0200','0300','0400','0500','0600','0700','0800','0900','1000','1100','1200',
    '1300','1400','1500','1600','1700','1800','1900','2000','2100','2200','2300'
]


columns=['NREC','PAR','PAS','LEN','BYS','BYR','SP','DP','NAPP','SIP','DIP','TDIF',
    '0000','0100','0200','0300','0400','0500','0600','0700','0800','0900','1000','1100','1200',
    '1300','1400','1500','1600','1700','1800','1900','2000','2100','2200','2300'
]

print("The feature vector for infected users and sample of uninfected users")
print('Infected/Unifected Users')
display(z.Target.value_counts())
## Create a feature set
X=z.iloc[:,z.columns!='Target']
display(X.head(2))
X=X.values

The feature vector for infected users and sample of uninfected users
Infected/Unifected Users


uninfected    4487
infected      1923
Name: Target, dtype: int64

Unnamed: 0_level_0,NREC,PAR,PAS,LEN,BYS,BYR,SP,DP,NAPP,SIP,DIP,TDIF,0000,0100,0200,0300,0400,0500,0600,0700,0800,0900,1000,1100,1200,1300,1400,1500,1600,1700,1800,1900,2000,2100,2200,2300
Source_User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1
23a784294c34545e19b709e6b22286bd5cfda913b32d55224d18437016ca9135,5164,438.195198,231.486445,116.998644,25566.860767,578623.737026,4106.0,42.0,54,69,1296,952.22134,521,405,398,211,281,225,163,170,68,59,36,74,99,142,201,218,307,284,219,257,223,225,173,205
69881d3627335af7ed5613256f13b575318085c62d0a4eb30ef45663cdf794ef,1347,462.67706,125.536748,90.790646,13083.769859,633355.270973,1269.0,14.0,38,39,396,3335.414996,63,65,145,123,147,56,61,73,32,19,19,6,26,52,33,135,75,21,24,23,15,28,56,50


In [106]:
print("1 -> Infected\n0 -> Uninfected")
Y=z.Target
Y=Y.apply(lambda x: 1 if x=='infected' else 0)
display(Y.head(10))
# Y=Y.values

1 -> Infected
0 -> Uninfected


Source_User
23a784294c34545e19b709e6b22286bd5cfda913b32d55224d18437016ca9135    1
69881d3627335af7ed5613256f13b575318085c62d0a4eb30ef45663cdf794ef    0
4336978528c6a752e441b177d8222c6477449660ce0c772e567e6d7f32c7e896    0
6507f3f2e0c22751060868de62ee74aed8c0c37d331ab7ab39d0f9b17e8ef426    0
6944129302f4af2ade449b755a7cb0eace943209459b735222b74dbd369ed94e    0
60a7e83d06013e0ecddb5488d101b31bdc44a0738f64667b46945011e3061980    0
9695f657a7d9b2c3202a6a25c401a660dd4fff43b396060141f23b9eefe1bed1    1
da1de679d352a2e7977a412053149f36d9e70e9bf3a059d44111774d3d8dd73f    0
4f663930f46b6dbf3e88f4a24101abb6ec2d202593d5042e26aebb10b705acf4    0
ded957eff469c307b079c16459b5caffca56d2ec1b3e36a05326c99daff1888e    0
Name: Target, dtype: int64

In [107]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
scaled_X=scaler.fit_transform(X)

pca=PCA(n_components=13,whiten=True,random_state=0)
X_pca_13=pca.fit_transform(scaled_X)
print("Variance explained using 13 components is:",sum(pca.explained_variance_ratio_))

print("The train/test split ratio is 80:20")
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X_pca_13,Y,random_state=0,test_size=0.2)

Variance explained using 13 components is: 0.903941813185
The train/test split ratio is 80:20


### ETC

In [108]:
ext_tree=ExtraTreeClassifier(criterion='entropy',max_depth=20,max_features='auto',min_impurity_decrease=1e-3,
                             min_samples_leaf=3,min_samples_split=2,splitter='best',random_state=0)

ext_tree.fit(X_train,Y_train)
train=pd.DataFrame(Y_test)
test=pd.DataFrame(ext_tree.predict(X_test))
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values
get_stats(a)

For high Severity:
identified: [4]
Actual [10]
For critical Severity
identified: [3]
Actual [3]


### NN2

In [109]:
def build_keras_classifier(optimizer):
    deep_learning=Sequential()
    deep_learning.add(Dense(units=64,activation='relu',use_bias=True,kernel_initializer='uniform',input_dim=13))
    deep_learning.add(Dense(units=128,activation='relu',use_bias=True,kernel_initializer='uniform'))
    deep_learning.add(Dense(units=256,activation='relu',use_bias=True,kernel_initializer='uniform'))
    deep_learning.add(Dense(units=512,activation='relu',use_bias=True,kernel_initializer='uniform'))
    deep_learning.add(Dense(units=1,activation='sigmoid',use_bias=True,kernel_initializer='uniform'))
    deep_learning.compile(optimizer=optimizer,loss='binary_crossentropy',metrics=['accuracy'])
    return deep_learning
classifier=KerasClassifier(build_fn=build_keras_classifier,batch_size=20,nb_epoch=100,verbose=0)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
parmas= {'batch_size':[20],'nb_epoch':[120],'optimizer':['adam','rmsprop']}
grid_search=GridSearchCV(estimator=classifier,param_grid=parmas,scoring='accuracy',cv=kfold,verbose=0)
grid_search.fit(X_train,Y_train)
train=pd.DataFrame(Y_test)
test=pd.DataFrame(grid_search.predict(X_test).reshape(1,len(X_test))[0])
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values
get_stats(a)

For high Severity:
identified: [6]
Actual [10]
For critical Severity
identified: [1]
Actual [3]


### Random State 20

In [110]:
## Read the csv file which contains the data aggregated on user level 
z=pd.read_csv('Feature_Vector_70_30_3_17_random_state_20.csv',index_col='Source_User')

z.columns=['NREC','PAR','PAS','LEN','BYS','BYR','SP','DP','NAPP','SIP','DIP','Target','TDIF',
    '0000','0100','0200','0300','0400','0500','0600','0700','0800','0900','1000','1100','1200',
    '1300','1400','1500','1600','1700','1800','1900','2000','2100','2200','2300'
]


columns=['NREC','PAR','PAS','LEN','BYS','BYR','SP','DP','NAPP','SIP','DIP','TDIF',
    '0000','0100','0200','0300','0400','0500','0600','0700','0800','0900','1000','1100','1200',
    '1300','1400','1500','1600','1700','1800','1900','2000','2100','2200','2300'
]

print("The feature vector for infected users and sample of uninfected users")
print('Infected/Unifected Users')
display(z.Target.value_counts())
## Create a feature set
X=z.iloc[:,z.columns!='Target']
display(X.head(2))
X=X.values

The feature vector for infected users and sample of uninfected users
Infected/Unifected Users


uninfected    4487
infected      1923
Name: Target, dtype: int64

Unnamed: 0_level_0,NREC,PAR,PAS,LEN,BYS,BYR,SP,DP,NAPP,SIP,DIP,TDIF,0000,0100,0200,0300,0400,0500,0600,0700,0800,0900,1000,1100,1200,1300,1400,1500,1600,1700,1800,1900,2000,2100,2200,2300
Source_User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1
73592328e1aba504b315fc06ce72a444a3afa3ca071ce135c6519b653b782900,1580,297.743038,121.938608,75.572152,10785.806962,373515.271519,1513.0,12.0,30,34,415,2816.703797,18,34,54,15,18,46,81,30,43,44,47,92,79,47,56,60,40,119,88,127,191,167,45,39
5393c0df6a44651953827b1afab0c1c821dbdc352b1b88263f9d325b00bf1b7d,2071,92.849831,68.0845,85.015451,9264.509416,113797.491067,1908.0,18.0,44,58,538,2378.489136,200,135,60,81,56,117,50,80,174,78,145,77,57,41,41,49,29,66,37,72,45,122,68,191


In [111]:
print("1 -> Infected\n0 -> Uninfected")
Y=z.Target
Y=Y.apply(lambda x: 1 if x=='infected' else 0)
display(Y.head(10))
# Y=Y.values

1 -> Infected
0 -> Uninfected


Source_User
73592328e1aba504b315fc06ce72a444a3afa3ca071ce135c6519b653b782900    0
5393c0df6a44651953827b1afab0c1c821dbdc352b1b88263f9d325b00bf1b7d    0
20e2144674cba74b4658c48eb02e290912cdf7d9815e9f0a8913e8f253d40b3c    0
0a8a122a1d0a74f656a8c288853357fe1e8ee2681f2308f75b9217f8dd73489d    0
e39c9093096a6ba89ad4d2e3987b430f8f6f8a86a6e73fc80bf4b5e0408e0edd    0
c82e979deebbc9c33760931128b2552d94d73128806ee7f3ebe1bdd3f7ca5c21    0
de1af22a9df53353a6f2c79696966a215a429a47150cd06de7b4f39b2f8e9976    0
a4a89c63ce9691738b6b014ae723928e73b2bc57ddb4bce98beb47cdce1ab45e    0
d8ccad69e30b55cf1048301d2fb7cd696a0ecab46a80e73f992ac427cd241e77    0
b7826c03e71919e65e0236c1bb7fb63e1de351fbc74c65b9ddb5cfe7c19881b0    0
Name: Target, dtype: int64

In [112]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
scaled_X=scaler.fit_transform(X)

pca=PCA(n_components=13,whiten=True,random_state=0)
X_pca_13=pca.fit_transform(scaled_X)
print("Variance explained using 13 components is:",sum(pca.explained_variance_ratio_))

print("The train/test split ratio is 80:20")
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X_pca_13,Y,random_state=0,test_size=0.2)

Variance explained using 13 components is: 0.97709676314
The train/test split ratio is 80:20


### ETC

In [113]:
ext_tree=ExtraTreeClassifier(criterion='gini',max_depth=10,max_features='auto',min_impurity_decrease=1e-3,
                             min_samples_leaf=2,min_samples_split=2,splitter='best',random_state=0)

ext_tree.fit(X_train,Y_train)
train=pd.DataFrame(Y_test)
test=pd.DataFrame(ext_tree.predict(X_test))
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values
get_stats(a)

For high Severity:
identified: [10]
Actual [17]
For critical Severity
identified: [1]
Actual [3]


### NN2

In [114]:
def build_keras_classifier(optimizer):
    deep_learning=Sequential()
    deep_learning.add(Dense(units=64,activation='relu',use_bias=True,kernel_initializer='uniform',input_dim=13))
    deep_learning.add(Dense(units=128,activation='relu',use_bias=True,kernel_initializer='uniform'))
    deep_learning.add(Dense(units=256,activation='relu',use_bias=True,kernel_initializer='uniform'))
    deep_learning.add(Dense(units=512,activation='relu',use_bias=True,kernel_initializer='uniform'))
    deep_learning.add(Dense(units=1,activation='sigmoid',use_bias=True,kernel_initializer='uniform'))
    deep_learning.compile(optimizer=optimizer,loss='binary_crossentropy',metrics=['accuracy'])
    return deep_learning
classifier=KerasClassifier(build_fn=build_keras_classifier,batch_size=20,nb_epoch=100,verbose=0)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
parmas= {'batch_size':[20],'nb_epoch':[120],'optimizer':['adam','rmsprop']}
grid_search=GridSearchCV(estimator=classifier,param_grid=parmas,scoring='accuracy',cv=kfold,verbose=0)
grid_search.fit(X_train,Y_train)
train=pd.DataFrame(Y_test)
test=pd.DataFrame(grid_search.predict(X_test).reshape(1,len(X_test))[0])
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values
get_stats(a)

For high Severity:
identified: [10]
Actual [17]
For critical Severity
identified: [1]
Actual [3]


### 70:30 Split with PCA

#### Random State 0

In [115]:
## Read the csv file which contains the data aggregated on user level 
z=pd.read_csv('Feature_Vector_70_30_3_17_random_state_0.csv',index_col='Source_User')

z.columns=['NREC','PAR','PAS','LEN','BYS','BYR','SP','DP','NAPP','SIP','DIP','Target','TDIF',
    '0000','0100','0200','0300','0400','0500','0600','0700','0800','0900','1000','1100','1200',
    '1300','1400','1500','1600','1700','1800','1900','2000','2100','2200','2300'
]


columns=['NREC','PAR','PAS','LEN','BYS','BYR','SP','DP','NAPP','SIP','DIP','TDIF',
    '0000','0100','0200','0300','0400','0500','0600','0700','0800','0900','1000','1100','1200',
    '1300','1400','1500','1600','1700','1800','1900','2000','2100','2200','2300'
]

print("The feature vector for infected users and sample of uninfected users")
print('Infected/Unifected Users')
display(z.Target.value_counts())
## Create a feature set
X=z.iloc[:,z.columns!='Target']
display(X.head(2))
X=X.values

The feature vector for infected users and sample of uninfected users
Infected/Unifected Users


uninfected    4487
infected      1923
Name: Target, dtype: int64

Unnamed: 0_level_0,NREC,PAR,PAS,LEN,BYS,BYR,SP,DP,NAPP,SIP,DIP,TDIF,0000,0100,0200,0300,0400,0500,0600,0700,0800,0900,1000,1100,1200,1300,1400,1500,1600,1700,1800,1900,2000,2100,2200,2300
Source_User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1
faa73b3f65d10c31ffdaddc8709cc96679970d7bca5f9ce66a9a4968c19e7923,441,46.138322,29.460317,66.585034,4755.390023,54162.571429,438.0,6.0,21,25,132,10734.321995,14,15,2,0,1,1,6,19,15,9,12,18,18,21,33,49,86,72,13,4,14,11,2,6
0f12269fba4fbd7f6a9229708650d1511638053b127dc8df6d8ed628067c1c17,254,24.370079,21.968504,33.976378,3180.610236,25776.055118,252.0,4.0,12,2,108,1006.488189,0,0,0,0,0,0,0,6,231,1,3,1,2,2,1,3,1,3,0,0,0,0,0,0


In [116]:
print("1 -> Infected\n0 -> Uninfected")
Y=z.Target
Y=Y.apply(lambda x: 1 if x=='infected' else 0)
display(Y.head(10))
# Y=Y.values

1 -> Infected
0 -> Uninfected


Source_User
faa73b3f65d10c31ffdaddc8709cc96679970d7bca5f9ce66a9a4968c19e7923    0
0f12269fba4fbd7f6a9229708650d1511638053b127dc8df6d8ed628067c1c17    0
b343b9f652a841f55180dec62614b6f3f4ef123aa6c7bd4e11ba330f1f20ab1e    0
d4522f4975663af6ea4d8c261c8ff4ccca09ed099f1b4bf18c22a92219746d40    0
fd3ce5c94c3c37ad6e64bcd3fea156bce5f73d66ff8d5e38c668fd65d2fbad35    1
9bd426dc9dd76d55870bfa1d277228863be08643c1c251bfa01fbeea6befa548    0
64c7c7ab1002f853fec50dcb65b8d4ce104aa67df24a116fe47ab67d9816fda6    1
b232f1a6b8db7b92cce166503c5faafe30609a93860360536391ef9f827958ab    1
89085fa53e0136faacd2e660766c2af3e4f45890a9fe3954c72b9abd9706be63    0
13e3cf1c083c78d4313f1f000c24f9d1051d78d19a69725479c0c59cf105df0c    0
Name: Target, dtype: int64

In [117]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
scaled_X=scaler.fit_transform(X)

pca=PCA(n_components=13,whiten=True,random_state=0)
X_pca_13=pca.fit_transform(scaled_X)
print("Variance explained using 13 components is:",sum(pca.explained_variance_ratio_))

print("The train/test split ratio is 70:30")
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X_pca_13,Y,random_state=0,test_size=0.3)

Variance explained using 13 components is: 0.90269079243
The train/test split ratio is 70:30


### Random Forest

In [118]:
rf=RandomForestClassifier(criterion='gini',max_depth=10,max_features='auto',max_leaf_nodes=10,min_impurity_decrease=0.1,
                          min_samples_leaf=10,min_samples_split=10,min_weight_fraction_leaf=0.1,n_estimators=100,
                          oob_score=False,random_state=0,verbose=0)
rf.fit(X_train,Y_train)
train=pd.DataFrame(Y_test)
test=pd.DataFrame(rf.predict(X_test))
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values
get_stats(a)

For high Severity:
identified: [0]
Actual [13]
For critical Severity
identified: [0]
Actual [3]


### AdaBoost 

In [119]:
ada=AdaBoostClassifier(algorithm='SAMME',learning_rate=0.5,n_estimators=400,random_state=0)
ada.fit(X_train,Y_train)

train=pd.DataFrame(Y_test)
test=pd.DataFrame(ada.predict(X_test))
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values
get_stats(a)

For high Severity:
identified: [9]
Actual [13]
For critical Severity
identified: [2]
Actual [3]


### GBC

In [120]:
gbc=GradientBoostingClassifier(loss='deviance', learning_rate=0.1, criterion='mse', min_samples_split=0.3,
                               min_samples_leaf=20,subsample=0.5,random_state = 0, n_estimators = 150)

gbc.fit(X_train,Y_train)

train=pd.DataFrame(Y_test)
test=pd.DataFrame(gbc.predict(X_test))
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values
get_stats(a)

For high Severity:
identified: [9]
Actual [13]
For critical Severity
identified: [2]
Actual [3]


### ETC

In [121]:
ext_tree=ExtraTreeClassifier(criterion='gini',max_depth=10,max_features='auto',min_impurity_decrease=1e-3,
                             min_samples_leaf=2,min_samples_split=2,splitter='best',random_state=0)

ext_tree.fit(X_train,Y_train)
train=pd.DataFrame(Y_test)
test=pd.DataFrame(ext_tree.predict(X_test))
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values
get_stats(a)

For high Severity:
identified: [8]
Actual [13]
For critical Severity
identified: [1]
Actual [3]


### Random State 10

In [122]:
## Read the csv file which contains the data aggregated on user level 
z=pd.read_csv('Feature_Vector_70_30_3_17_random_state_10.csv',index_col='Source_User')

z.columns=['NREC','PAR','PAS','LEN','BYS','BYR','SP','DP','NAPP','SIP','DIP','Target','TDIF',
    '0000','0100','0200','0300','0400','0500','0600','0700','0800','0900','1000','1100','1200',
    '1300','1400','1500','1600','1700','1800','1900','2000','2100','2200','2300'
]


columns=['NREC','PAR','PAS','LEN','BYS','BYR','SP','DP','NAPP','SIP','DIP','TDIF',
    '0000','0100','0200','0300','0400','0500','0600','0700','0800','0900','1000','1100','1200',
    '1300','1400','1500','1600','1700','1800','1900','2000','2100','2200','2300'
]

print("The feature vector for infected users and sample of uninfected users")
print('Infected/Unifected Users')
display(z.Target.value_counts())
## Create a feature set
X=z.iloc[:,z.columns!='Target']
display(X.head(2))
X=X.values

The feature vector for infected users and sample of uninfected users
Infected/Unifected Users


uninfected    4487
infected      1923
Name: Target, dtype: int64

Unnamed: 0_level_0,NREC,PAR,PAS,LEN,BYS,BYR,SP,DP,NAPP,SIP,DIP,TDIF,0000,0100,0200,0300,0400,0500,0600,0700,0800,0900,1000,1100,1200,1300,1400,1500,1600,1700,1800,1900,2000,2100,2200,2300
Source_User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1
23a784294c34545e19b709e6b22286bd5cfda913b32d55224d18437016ca9135,5164,438.195198,231.486445,116.998644,25566.860767,578623.737026,4106.0,42.0,54,69,1296,952.22134,521,405,398,211,281,225,163,170,68,59,36,74,99,142,201,218,307,284,219,257,223,225,173,205
69881d3627335af7ed5613256f13b575318085c62d0a4eb30ef45663cdf794ef,1347,462.67706,125.536748,90.790646,13083.769859,633355.270973,1269.0,14.0,38,39,396,3335.414996,63,65,145,123,147,56,61,73,32,19,19,6,26,52,33,135,75,21,24,23,15,28,56,50


In [123]:
print("1 -> Infected\n0 -> Uninfected")
Y=z.Target
Y=Y.apply(lambda x: 1 if x=='infected' else 0)
display(Y.head(10))
# Y=Y.values

1 -> Infected
0 -> Uninfected


Source_User
23a784294c34545e19b709e6b22286bd5cfda913b32d55224d18437016ca9135    1
69881d3627335af7ed5613256f13b575318085c62d0a4eb30ef45663cdf794ef    0
4336978528c6a752e441b177d8222c6477449660ce0c772e567e6d7f32c7e896    0
6507f3f2e0c22751060868de62ee74aed8c0c37d331ab7ab39d0f9b17e8ef426    0
6944129302f4af2ade449b755a7cb0eace943209459b735222b74dbd369ed94e    0
60a7e83d06013e0ecddb5488d101b31bdc44a0738f64667b46945011e3061980    0
9695f657a7d9b2c3202a6a25c401a660dd4fff43b396060141f23b9eefe1bed1    1
da1de679d352a2e7977a412053149f36d9e70e9bf3a059d44111774d3d8dd73f    0
4f663930f46b6dbf3e88f4a24101abb6ec2d202593d5042e26aebb10b705acf4    0
ded957eff469c307b079c16459b5caffca56d2ec1b3e36a05326c99daff1888e    0
Name: Target, dtype: int64

In [124]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
scaled_X=scaler.fit_transform(X)

pca=PCA(n_components=13,whiten=True,random_state=0)
X_pca_13=pca.fit_transform(scaled_X)
print("Variance explained using 13 components is:",sum(pca.explained_variance_ratio_))

print("The train/test split ratio is 70:30")
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X_pca_13,Y,random_state=0,test_size=0.3)

Variance explained using 13 components is: 0.903941813185
The train/test split ratio is 70:30


### Random Forest

In [125]:
rf=RandomForestClassifier(criterion='gini',max_depth=10,max_features='auto',max_leaf_nodes=10,min_impurity_decrease=0.1,
                          min_samples_leaf=10,min_samples_split=10,min_weight_fraction_leaf=0.1,n_estimators=100,
                          oob_score=False,random_state=0,verbose=0)
rf.fit(X_train,Y_train)
train=pd.DataFrame(Y_test)
test=pd.DataFrame(rf.predict(X_test))
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values
get_stats(a)

For high Severity:
identified: [0]
Actual [11]
For critical Severity
identified: [0]
Actual [3]


### Adaboost

In [126]:
ada=AdaBoostClassifier(algorithm='SAMME',learning_rate=0.5,n_estimators=150,random_state=0)
ada.fit(X_train,Y_train)

train=pd.DataFrame(Y_test)
test=pd.DataFrame(ada.predict(X_test))
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values
get_stats(a)

For high Severity:
identified: [6]
Actual [11]
For critical Severity
identified: [2]
Actual [3]


### GBC

In [127]:
gbc=GradientBoostingClassifier(loss='deviance', learning_rate=0.05, criterion='mse', min_samples_split=0.1,
                               min_samples_leaf=20,subsample=0.7,random_state = 0, n_estimators = 50)

gbc.fit(X_train,Y_train)

train=pd.DataFrame(Y_test)
test=pd.DataFrame(gbc.predict(X_test))
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values
get_stats(a)

For high Severity:
identified: [6]
Actual [11]
For critical Severity
identified: [2]
Actual [3]


### ETC

In [128]:
ext_tree=ExtraTreeClassifier(criterion='entropy',max_depth=10,max_features='auto',min_impurity_decrease=1e-3,
                             min_samples_leaf=2,min_samples_split=2,splitter='best',random_state=0)

ext_tree.fit(X_train,Y_train)
train=pd.DataFrame(Y_test)
test=pd.DataFrame(ext_tree.predict(X_test))
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values
get_stats(a)

For high Severity:
identified: [7]
Actual [11]
For critical Severity
identified: [1]
Actual [3]


### Random State 20

In [129]:
## Read the csv file which contains the data aggregated on user level 
z=pd.read_csv('Feature_Vector_70_30_3_17_random_state_20.csv',index_col='Source_User')

z.columns=['NREC','PAR','PAS','LEN','BYS','BYR','SP','DP','NAPP','SIP','DIP','Target','TDIF',
    '0000','0100','0200','0300','0400','0500','0600','0700','0800','0900','1000','1100','1200',
    '1300','1400','1500','1600','1700','1800','1900','2000','2100','2200','2300'
]


columns=['NREC','PAR','PAS','LEN','BYS','BYR','SP','DP','NAPP','SIP','DIP','TDIF',
    '0000','0100','0200','0300','0400','0500','0600','0700','0800','0900','1000','1100','1200',
    '1300','1400','1500','1600','1700','1800','1900','2000','2100','2200','2300'
]

print("The feature vector for infected users and sample of uninfected users")
print('Infected/Unifected Users')
display(z.Target.value_counts())
## Create a feature set
X=z.iloc[:,z.columns!='Target']
display(X.head(2))
X=X.values

The feature vector for infected users and sample of uninfected users
Infected/Unifected Users


uninfected    4487
infected      1923
Name: Target, dtype: int64

Unnamed: 0_level_0,NREC,PAR,PAS,LEN,BYS,BYR,SP,DP,NAPP,SIP,DIP,TDIF,0000,0100,0200,0300,0400,0500,0600,0700,0800,0900,1000,1100,1200,1300,1400,1500,1600,1700,1800,1900,2000,2100,2200,2300
Source_User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1
73592328e1aba504b315fc06ce72a444a3afa3ca071ce135c6519b653b782900,1580,297.743038,121.938608,75.572152,10785.806962,373515.271519,1513.0,12.0,30,34,415,2816.703797,18,34,54,15,18,46,81,30,43,44,47,92,79,47,56,60,40,119,88,127,191,167,45,39
5393c0df6a44651953827b1afab0c1c821dbdc352b1b88263f9d325b00bf1b7d,2071,92.849831,68.0845,85.015451,9264.509416,113797.491067,1908.0,18.0,44,58,538,2378.489136,200,135,60,81,56,117,50,80,174,78,145,77,57,41,41,49,29,66,37,72,45,122,68,191


In [130]:
print("1 -> Infected\n0 -> Uninfected")
Y=z.Target
Y=Y.apply(lambda x: 1 if x=='infected' else 0)
display(Y.head(10))
# Y=Y.values

1 -> Infected
0 -> Uninfected


Source_User
73592328e1aba504b315fc06ce72a444a3afa3ca071ce135c6519b653b782900    0
5393c0df6a44651953827b1afab0c1c821dbdc352b1b88263f9d325b00bf1b7d    0
20e2144674cba74b4658c48eb02e290912cdf7d9815e9f0a8913e8f253d40b3c    0
0a8a122a1d0a74f656a8c288853357fe1e8ee2681f2308f75b9217f8dd73489d    0
e39c9093096a6ba89ad4d2e3987b430f8f6f8a86a6e73fc80bf4b5e0408e0edd    0
c82e979deebbc9c33760931128b2552d94d73128806ee7f3ebe1bdd3f7ca5c21    0
de1af22a9df53353a6f2c79696966a215a429a47150cd06de7b4f39b2f8e9976    0
a4a89c63ce9691738b6b014ae723928e73b2bc57ddb4bce98beb47cdce1ab45e    0
d8ccad69e30b55cf1048301d2fb7cd696a0ecab46a80e73f992ac427cd241e77    0
b7826c03e71919e65e0236c1bb7fb63e1de351fbc74c65b9ddb5cfe7c19881b0    0
Name: Target, dtype: int64

In [131]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
scaled_X=scaler.fit_transform(X)

pca=PCA(n_components=13,whiten=True,random_state=0)
X_pca_13=pca.fit_transform(scaled_X)
print("Variance explained using 13 components is:",sum(pca.explained_variance_ratio_))

print("The train/test split ratio is 70:30")
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X_pca_13,Y,random_state=0,test_size=0.3)

Variance explained using 13 components is: 0.97709676314
The train/test split ratio is 70:30


### Random Forest

In [132]:
rf=RandomForestClassifier(criterion='gini',max_depth=10,max_features='auto',max_leaf_nodes=10,min_impurity_decrease=0.1,
                          min_samples_leaf=10,min_samples_split=10,min_weight_fraction_leaf=0.1,n_estimators=100,
                          oob_score=False,random_state=0,verbose=0)
rf.fit(X_train,Y_train)
train=pd.DataFrame(Y_test)
test=pd.DataFrame(rf.predict(X_test))
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values
get_stats(a)

For high Severity:
identified: [0]
Actual [21]
For critical Severity
identified: [0]
Actual [4]


### Adaboost

In [133]:
ada=AdaBoostClassifier(algorithm='SAMME',learning_rate=0.5,n_estimators=400,random_state=0)
ada.fit(X_train,Y_train)

train=pd.DataFrame(Y_test)
test=pd.DataFrame(ada.predict(X_test))
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values
get_stats(a)

For high Severity:
identified: [15]
Actual [21]
For critical Severity
identified: [2]
Actual [4]


### GBC

In [134]:
gbc=GradientBoostingClassifier(loss='deviance', learning_rate=0.03, criterion='mse', min_samples_split=0.1,
                               min_samples_leaf=5,subsample=0.7,random_state = 0, n_estimators = 100)

gbc.fit(X_train,Y_train)

train=pd.DataFrame(Y_test)
test=pd.DataFrame(gbc.predict(X_test))
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values
get_stats(a)

For high Severity:
identified: [12]
Actual [21]
For critical Severity
identified: [2]
Actual [4]


### ETC

In [135]:
ext_tree=ExtraTreeClassifier(criterion='entropy',max_depth=20,max_features='auto',min_impurity_decrease=1e-3,
                             min_samples_leaf=2,min_samples_split=2,splitter='best',random_state=0)

ext_tree.fit(X_train,Y_train)
train=pd.DataFrame(Y_test)
test=pd.DataFrame(ext_tree.predict(X_test))
test.set_index(train.index.get_values(),inplace=True)
a=train.join(test,how='inner').reset_index()
m=a.Source_User.values
get_stats(a)

For high Severity:
identified: [12]
Actual [21]
For critical Severity
identified: [3]
Actual [4]


In [136]:
### Original Creator : Darshan Bhansali
### HTML code to hide the input cells 
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
To toggle code, click <a href="javascript:code_toggle()">here</a>.''')