In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pymc3 as pm
import numpy as np
import  pandas as pd

from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score,log_loss
from sklearn.model_selection import train_test_split
from pandas_ml import ConfusionMatrix as cm
from sklearn.preprocessing import StandardScaler
from sklearn import svm

from sklearn.neighbors.kde import KernelDensity

from IPython.display import display_html

from scipy.stats import norm

import warnings
warnings.filterwarnings('ignore')
import scipy.stats as st
import statsmodels as sm
import matplotlib

from IPython.display import display,clear_output,HTML

In [2]:
#### Source= https://stackoverflow.com/questions/38783027/jupyter-notebook-display-two-pandas-tables-side-by-side

def mydisplay(dfs, names=None):
    """Render several DataFrames side by side in the notebook output.

    Parameters
    ----------
    dfs : iterable of pandas.DataFrame
        Frames to show, left to right. Each is rendered without its index.
    names : sequence of str, optional
        Captions placed in a header row above the frames, in the same order
        as ``dfs``. When omitted or empty, no header row is emitted.

    Returns
    -------
    None
        The HTML is sent to the notebook via ``display_html``.
    """
    header_row = ''
    if names:
        header_cells = ''.join(
            f'<td style="text-align:center">{name}</td>' for name in names
        )
        header_row = '<tr>' + header_cells + '</tr>'

    frame_cells = ''.join(
        f'<td style="vertical-align:top"> {df.to_html(index=False)}</td>'
        for df in dfs
    )
    html_str = f'<table>{header_row}<tr>{frame_cells}</tr></table>'

    # Only opening <table ...> tags get the inline style. Replacing the bare
    # word "table" would also rewrite the closing tags and any column name or
    # cell text that happens to contain "table".
    html_str = html_str.replace('<table', '<table style="display:inline"')
    display_html(html_str, raw=True)

In [3]:
# Training feature vectors, one row per user (the 'User' column becomes the index).
train = pd.read_csv(
    'Feature_Vector_50_50_12_24_random_state_0.csv',
    index_col='User',
)
# Keep the training user ids so later cells can detect users shared with other sets.
user_Dec = train.index.values

# Class balance of the training set (Target: 1 vs 0).
print('Infected/Unifected Users')
display(train['Target'].value_counts())

Infected/Unifected Users


1    2910
0    2910
Name: Target, dtype: int64

In [4]:
# January feature set, one row per user (the 'User' column becomes the index).
jan = pd.read_csv(
    'Feature-Set_Jan.csv',
    index_col='User',
)
# Snapshot of the January user ids taken before any rows are removed.
user_jan = jan.index.values

# Class balance of the January set (Target: 1 vs 0).
print('Infected/Unifected Users')
display(jan['Target'].value_counts())

Infected/Unifected Users


0    51123
1      764
Name: Target, dtype: int64

In [5]:
# Users that appear in both the January set and the training set are removed
# from January, so the January rows only contain users unseen in training.
# Index.isin does the membership test in one hashed pass instead of scanning
# the user_Dec array once per January user.
overlap_mask = pd.Index(user_jan).isin(user_Dec)

no = list(user_jan[~overlap_mask])  # January users not present in training
o = list(user_jan[overlap_mask])    # January users also present in training

# Filter on the current index rather than jan.drop(o, inplace=True): dropping
# labels that were already removed raises KeyError, which made this cell fail
# when run a second time.
jan = jan[~jan.index.isin(user_Dec)]

print("After eliminating overlapping users:")
print(jan.Target.value_counts())

After eliminating overlapping users:
0    46596
1      532
Name: Target, dtype: int64


#### There are 232 malicious and 4527 non-malicious users in January 2018 who overlap with users in Sept-Dec 2017
#### The number of users after balancing the data in January 2018

In [6]:
# Balance the January set: keep every Target==1 user and draw an equally
# sized random sample of Target==0 users (fixed seed for reproducibility).
x = jan[jan.Target == 1]
y = jan[jan.Target == 0].sample(n=len(x), random_state=10)

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# The combined frame is then shuffled with a fixed seed.
z = pd.concat([x, y]).sample(frac=1, random_state=10)
display(z.Target.value_counts())

1    532
0    532
Name: Target, dtype: int64

### Parametric approach

In [7]:
# Ground-truth labels of the balanced January set.
Y_test = z['Target'].values

# Natural-log transform every column except the last one, first for the
# training frame and then for the January frame. Infinite results (for
# example log(0) = -inf) are then replaced by 0 across the whole frame.
# NOTE(review): both frames are modified in place, so running this cell a
# second time applies the log again.
# NOTE(review): assumes the last column is Target - confirm against the CSVs.
for frame in (train, z):
    for col in frame.columns[:-1]:
        frame[col] = np.log(frame[col])
    frame.replace([np.inf, -np.inf], 0, inplace=True)

In [8]:
def _class_likelihood(train_values, test_values):
    """Return Gaussian densities of ``test_values`` under one class's training data.

    The training values are standardized with a ``StandardScaler`` fitted on
    that class alone. A single-class ``GaussianNB`` fitted on the scaled values
    supplies the mean and variance, and the test values (scaled with the same
    scaler) are scored with ``norm.pdf``.

    Parameters
    ----------
    train_values : array-like
        Values of one feature for the training users of one class.
    test_values : array-like
        Values of the same feature for the users to score.

    Returns
    -------
    numpy.ndarray
        One density per test value, in the order given.
    """
    scaler = StandardScaler()
    scaled_train = scaler.fit_transform(np.asarray(train_values).reshape(-1, 1))

    # Only one class is fitted at a time, so the label values are irrelevant;
    # the classifier is used purely to estimate a mean and a variance.
    clf = GaussianNB()
    clf.fit(scaled_train, np.zeros(len(scaled_train), dtype=int))
    mean = clf.theta_[0][0]
    # Newer scikit-learn exposes the variance as ``var_``; older releases only
    # have ``sigma_``.
    variance = clf.var_ if hasattr(clf, 'var_') else clf.sigma_
    std = np.sqrt(variance[0][0])

    scaled_test = scaler.transform(np.asarray(test_values).reshape(-1, 1))
    # One vectorized call instead of one norm.pdf call per user.
    return norm.pdf(scaled_test[:, 0], mean, std)


# Never filled in this cell; kept in case a later cell refers to it.
lst = []

# For every feature column (all but the last column of ``train``), score each
# January user under a Gaussian fitted to the non-malicious training users and
# under one fitted to the malicious training users, then predict the class
# with the larger density (ties go to 'Malicious').
# NOTE(review): each density is evaluated in that class's own standardized
# coordinates, so the 1/std factor of the original-scale density is not part
# of the comparison - confirm this is intended.
# The 'Parametric/' output directory must already exist.
for col in train.columns[:-1]:
    test_values = z[col].values

    non_mal_lst = _class_likelihood(train.loc[train.Target == 0, col].values, test_values)
    mal_lst = _class_likelihood(train.loc[train.Target == 1, col].values, test_values)

    df = pd.DataFrame({'Non_Mal': non_mal_lst,
                       'Mal': mal_lst,
                       'Ground_Truth': np.ravel(Y_test).tolist()},
                      index=z.index.values)

    df['Ground_Truth'] = np.where(df['Ground_Truth'] == 1, 'Malicious', 'Non_Malicious')
    df['Predicted'] = np.where(df['Non_Mal'] <= df['Mal'], 'Malicious', 'Non_Malicious')

    # Bold feature name, confusion table, separator.
    print('\033[1m' + str(col) + '\033[0m\n')
    display(pd.crosstab(df.Predicted, df.Ground_Truth))
    print("-----------------------------------------------------------------------------------------------------")

    # Persist the per-user scores and predictions for this feature.
    file_name = 'Parametric/Para_col_' + str(col) + '.csv'
    df.to_csv(file_name)

[1mSP[0m



Ground_Truth,Malicious,Non_Malicious
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Malicious,249,20
Non_Malicious,283,512


-----------------------------------------------------------------------------------------------------
[1mDP[0m



Ground_Truth,Malicious,Non_Malicious
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Malicious,177,32
Non_Malicious,355,500


-----------------------------------------------------------------------------------------------------
[1mNAPP[0m



Ground_Truth,Malicious,Non_Malicious
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Malicious,228,17
Non_Malicious,304,515


-----------------------------------------------------------------------------------------------------
[1mSIP[0m



Ground_Truth,Malicious,Non_Malicious
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Malicious,48,2
Non_Malicious,484,530


-----------------------------------------------------------------------------------------------------
[1mDIP[0m



Ground_Truth,Malicious,Non_Malicious
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Malicious,230,15
Non_Malicious,302,517


-----------------------------------------------------------------------------------------------------
[1mNREC[0m



Ground_Truth,Malicious,Non_Malicious
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Malicious,248,20
Non_Malicious,284,512


-----------------------------------------------------------------------------------------------------
[1mPAR[0m



Ground_Truth,Malicious,Non_Malicious
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Malicious,301,146
Non_Malicious,231,386


-----------------------------------------------------------------------------------------------------
[1mPAS[0m



Ground_Truth,Malicious,Non_Malicious
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Malicious,275,142
Non_Malicious,257,390


-----------------------------------------------------------------------------------------------------
[1mLEN[0m



Ground_Truth,Malicious,Non_Malicious
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Malicious,146,97
Non_Malicious,386,435


-----------------------------------------------------------------------------------------------------
[1mBYS[0m



Ground_Truth,Malicious,Non_Malicious
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Malicious,214,119
Non_Malicious,318,413


-----------------------------------------------------------------------------------------------------
[1mBYR[0m



Ground_Truth,Malicious,Non_Malicious
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Malicious,289,137
Non_Malicious,243,395


-----------------------------------------------------------------------------------------------------
[1m0000[0m



Ground_Truth,Malicious,Non_Malicious
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Malicious,313,52
Non_Malicious,219,480


-----------------------------------------------------------------------------------------------------
[1m0100[0m



Ground_Truth,Malicious,Non_Malicious
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Malicious,318,55
Non_Malicious,214,477


-----------------------------------------------------------------------------------------------------
[1m0200[0m



Ground_Truth,Malicious,Non_Malicious
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Malicious,314,60
Non_Malicious,218,472


-----------------------------------------------------------------------------------------------------
[1m0300[0m



Ground_Truth,Malicious,Non_Malicious
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Malicious,301,55
Non_Malicious,231,477


-----------------------------------------------------------------------------------------------------
[1m0400[0m



Ground_Truth,Malicious,Non_Malicious
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Malicious,294,50
Non_Malicious,238,482


-----------------------------------------------------------------------------------------------------
[1m0500[0m



Ground_Truth,Malicious,Non_Malicious
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Malicious,292,54
Non_Malicious,240,478


-----------------------------------------------------------------------------------------------------
[1m0600[0m



Ground_Truth,Malicious,Non_Malicious
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Malicious,288,46
Non_Malicious,244,486


-----------------------------------------------------------------------------------------------------
[1m0700[0m



Ground_Truth,Malicious,Non_Malicious
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Malicious,258,53
Non_Malicious,274,479


-----------------------------------------------------------------------------------------------------
[1m0800[0m



Ground_Truth,Malicious,Non_Malicious
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Malicious,262,61
Non_Malicious,270,471


-----------------------------------------------------------------------------------------------------
[1m0900[0m



Ground_Truth,Malicious,Non_Malicious
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Malicious,231,47
Non_Malicious,301,485


-----------------------------------------------------------------------------------------------------
[1m1000[0m



Ground_Truth,Malicious,Non_Malicious
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Malicious,241,52
Non_Malicious,291,480


-----------------------------------------------------------------------------------------------------
[1m1100[0m



Ground_Truth,Malicious,Non_Malicious
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Malicious,235,44
Non_Malicious,297,488


-----------------------------------------------------------------------------------------------------
[1m1200[0m



Ground_Truth,Malicious,Non_Malicious
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Malicious,219,42
Non_Malicious,313,490


-----------------------------------------------------------------------------------------------------
[1m1300[0m



Ground_Truth,Malicious,Non_Malicious
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Malicious,210,42
Non_Malicious,322,490


-----------------------------------------------------------------------------------------------------
[1m1400[0m



Ground_Truth,Malicious,Non_Malicious
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Malicious,208,43
Non_Malicious,324,489


-----------------------------------------------------------------------------------------------------
[1m1500[0m



Ground_Truth,Malicious,Non_Malicious
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Malicious,224,34
Non_Malicious,308,498


-----------------------------------------------------------------------------------------------------
[1m1600[0m



Ground_Truth,Malicious,Non_Malicious
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Malicious,238,42
Non_Malicious,294,490


-----------------------------------------------------------------------------------------------------
[1m1700[0m



Ground_Truth,Malicious,Non_Malicious
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Malicious,276,37
Non_Malicious,256,495


-----------------------------------------------------------------------------------------------------
[1m1800[0m



Ground_Truth,Malicious,Non_Malicious
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Malicious,271,33
Non_Malicious,261,499


-----------------------------------------------------------------------------------------------------
[1m1900[0m



Ground_Truth,Malicious,Non_Malicious
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Malicious,298,40
Non_Malicious,234,492


-----------------------------------------------------------------------------------------------------
[1m2000[0m



Ground_Truth,Malicious,Non_Malicious
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Malicious,314,35
Non_Malicious,218,497


-----------------------------------------------------------------------------------------------------
[1m2100[0m



Ground_Truth,Malicious,Non_Malicious
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Malicious,322,33
Non_Malicious,210,499


-----------------------------------------------------------------------------------------------------
[1m2200[0m



Ground_Truth,Malicious,Non_Malicious
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Malicious,339,62
Non_Malicious,193,470


-----------------------------------------------------------------------------------------------------
[1m2300[0m



Ground_Truth,Malicious,Non_Malicious
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Malicious,337,58
Non_Malicious,195,474


-----------------------------------------------------------------------------------------------------
