In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, roc_curve, f1_score, classification_report, auc
from sklearn.cluster import KMeans
from scipy.stats import zscore
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import ClusterCentroids
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

In [62]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
# fix random seed for reproducibility
np.random.seed(7)

In [3]:
nm_df = pd.read_csv("NetworkMeasuresFeatures.csv")

In [4]:
nm_df_orig = nm_df.copy()

In [5]:
# Renaming columns in readable format to understand what each column represents.
nm_df = nm_df_orig.copy()
col_names = ['Actor', 'A_Acc',
             'PartyPlay_InDegree', 'PartyPlay_OutDegree', 'PartyPlay_Degree',
             'PartyPlay_WeightedDegree', 'PartyPlay_WeightedInDegree', 'PartyPlay_WeightedOutDegree',
             'PartyPlay_Eccentricity', 'PartyPlay_ClosenessCentrality', 'PartyPlay_BetweennessCentrality',
             'PartyPlay_Authority', 'PartyPlay_Hub', 'PartyPlay_ModularityClass', 'PartyPlay_Pagerank',
             'PartyPlay_ComponentId', 'PartyPlay_StrConnId', 'PartyPlay_ClusteringCoefficient', 'PartyPlay_Eigenvector',
             'Friendship_InDegree', 'Friendship_OutDegree', 'Friendship_Degree',
             'Friendship_WeightedDegree', 'Friendship_WeightedInDegree', 'Friendship_WeightedOutDegree',
             'Friendship_Eccentricity', 'Friendship_ClosenessCentrality', 'Friendship_BetweennessCentrality',
             'Friendship_Authority', 'Friendship_Hub', 'Friendship_ModularityClass', 'Friendship_Pagerank',
             'Friendship_ComponentId', 'Friendship_StrConnId', 'Friendship_ClusteringCoefficient', 'Friendship_Eigenvector',
             'Trade_InDegree', 'Trade_OutDegree', 'Trade_Degree',
             'Trade_WeightedDegree', 'Trade_WeightedInDegree', 'Trade_WeightedOutDegree',
             'Trade_Eccentricity', 'Trade_ClosenessCentrality', 'Trade_BetweennessCentrality',
             'Trade_Authority', 'Trade_Hub', 'Trade_ModularityClass', 'Trade_Pagerank',
             'Trade_ComponentId', 'Trade_StrConnId', 'Trade_ClusteringCoefficient', 'Trade_Eigenvector',
             'Whisper_InDegree', 'Whisper_OutDegree', 'Whisper_Degree',
             'Whisper_WeightedDegree', 'Whisper_WeightedInDegree', 'Whisper_WeightedOutDegree',
             'Whisper_Eccentricity', 'Whisper_ClosenessCentrality', 'Whisper_BetweennessCentrality',
             'Whisper_Authority', 'Whisper_Hub', 'Whisper_ModularityClass', 'Whisper_Pagerank',
             'Whisper_ComponentId', 'Whisper_StrConnId', 'Whisper_ClusteringCoefficient', 'Whisper_Eigenvector',
             'Mail_InDegree', 'Mail_OutDegree', 'Mail_Degree',
             'Mail_WeightedDegree', 'Mail_WeightedInDegree', 'Mail_WeightedOutDegree',
             'Mail_Eccentricity', 'Mail_ClosenessCentrality', 'Mail_BetweennessCentrality',
             'Mail_Authority', 'Mail_Hub', 'Mail_ModularityClass', 'Mail_Pagerank',
             'Mail_ComponentId', 'Mail_StrConnId', 'Mail_ClusteringCoefficient', 'Mail_Eigenvector',
             'Shop_InDegree', 'Shop_OutDegree', 'Shop_Degree',
             'Shop_WeightedDegree', 'Shop_WeightedInDegree', 'Shop_WeightedOutDegree',
             'Shop_Eccentricity', 'Shop_ClosenessCentrality', 'Shop_BetweennessCentrality',
             'Shop_Authority', 'Shop_Hub', 'Shop_ModularityClass', 'Shop_Pagerank',
             'Shop_ComponentId', 'Shop_StrConnId', 'Shop_ClusteringCoefficient', 'Shop_Eigenvector',
             'Type'
            ]
nm_df.columns = col_names
nm_df_orig.columns = col_names

In [6]:
# Change Type as Category
nm_df.Type = nm_df.Type.astype('category')

In [7]:
# Creating a numerical field for Type
nm_df.Type.replace({'Human': 0, 'Bot': 1}, inplace=True)
type_dict = {0 : 'Human', 1: 'Bot'}

In [8]:
# Creating different datasets based on the Group activity
party_df = nm_df[[
                  'PartyPlay_InDegree', 'PartyPlay_OutDegree', 'PartyPlay_Degree',
                  'PartyPlay_WeightedDegree', 'PartyPlay_WeightedInDegree', 'PartyPlay_WeightedOutDegree',
                  'PartyPlay_Eccentricity', 'PartyPlay_ClosenessCentrality', 'PartyPlay_BetweennessCentrality',
                  'PartyPlay_Authority', 'PartyPlay_Hub', 'PartyPlay_ModularityClass', 'PartyPlay_Pagerank',
                  'PartyPlay_ComponentId', 'PartyPlay_StrConnId', 'PartyPlay_ClusteringCoefficient',
                  'PartyPlay_Eigenvector', 'Type']].copy()
friendship_df = nm_df[['Friendship_InDegree', 'Friendship_OutDegree', 'Friendship_Degree',
                       'Friendship_WeightedDegree', 'Friendship_WeightedInDegree', 'Friendship_WeightedOutDegree',
                       'Friendship_Eccentricity', 'Friendship_ClosenessCentrality', 'Friendship_BetweennessCentrality',
                       'Friendship_Authority', 'Friendship_Hub', 'Friendship_ModularityClass', 'Friendship_Pagerank',
                       'Friendship_ComponentId', 'Friendship_StrConnId', 'Friendship_ClusteringCoefficient', 'Friendship_Eigenvector',
                       'Type']].copy()
trade_df = nm_df[[
    'Trade_InDegree', 'Trade_OutDegree', 'Trade_Degree',
    'Trade_WeightedDegree', 'Trade_WeightedInDegree', 'Trade_WeightedOutDegree',
    'Trade_Eccentricity', 'Trade_ClosenessCentrality', 'Trade_BetweennessCentrality',
    'Trade_Authority', 'Trade_Hub', 'Trade_ModularityClass', 'Trade_Pagerank',
    'Trade_ComponentId', 'Trade_StrConnId', 'Trade_ClusteringCoefficient', 'Trade_Eigenvector',
    'Type']].copy()
shop_df = nm_df[[
             'Shop_InDegree', 'Shop_OutDegree', 'Shop_Degree',
             'Shop_WeightedDegree', 'Shop_WeightedInDegree', 'Shop_WeightedOutDegree',
             'Shop_Eccentricity', 'Shop_ClosenessCentrality', 'Shop_BetweennessCentrality',
             'Shop_Authority', 'Shop_Hub', 'Shop_ModularityClass', 'Shop_Pagerank',
             'Shop_ComponentId', 'Shop_StrConnId', 'Shop_ClusteringCoefficient', 'Shop_Eigenvector',
             'Type']].copy()
whisper_df = nm_df[[
    'Whisper_InDegree', 'Whisper_OutDegree', 'Whisper_Degree',
    'Whisper_WeightedDegree', 'Whisper_WeightedInDegree', 'Whisper_WeightedOutDegree',
    'Whisper_Eccentricity', 'Whisper_ClosenessCentrality', 'Whisper_BetweennessCentrality',
    'Whisper_Authority', 'Whisper_Hub', 'Whisper_ModularityClass', 'Whisper_Pagerank',
    'Whisper_ComponentId', 'Whisper_StrConnId', 'Whisper_ClusteringCoefficient', 'Whisper_Eigenvector',
    'Type' ]].copy()
mail_df = nm_df[[
    'Mail_InDegree', 'Mail_OutDegree', 'Mail_Degree',
    'Mail_WeightedDegree', 'Mail_WeightedInDegree', 'Mail_WeightedOutDegree',
    'Mail_Eccentricity', 'Mail_ClosenessCentrality', 'Mail_BetweennessCentrality',
    'Mail_Authority', 'Mail_Hub', 'Mail_ModularityClass', 'Mail_Pagerank',
    'Mail_ComponentId', 'Mail_StrConnId', 'Mail_ClusteringCoefficient', 'Mail_Eigenvector',
    'Type']].copy()


In [9]:
# Getting the list of cols that we want to visulaize in pairplot. This will exclude the Actor and A_Acc cols
cols = party_df.columns.tolist()
#cols.remove('Actor'); cols.remove('A_Acc')
print(cols)

['PartyPlay_InDegree', 'PartyPlay_OutDegree', 'PartyPlay_Degree', 'PartyPlay_WeightedDegree', 'PartyPlay_WeightedInDegree', 'PartyPlay_WeightedOutDegree', 'PartyPlay_Eccentricity', 'PartyPlay_ClosenessCentrality', 'PartyPlay_BetweennessCentrality', 'PartyPlay_Authority', 'PartyPlay_Hub', 'PartyPlay_ModularityClass', 'PartyPlay_Pagerank', 'PartyPlay_ComponentId', 'PartyPlay_StrConnId', 'PartyPlay_ClusteringCoefficient', 'PartyPlay_Eigenvector', 'Type']


In [10]:
cols_to_zscore = {
    'party_play': ['PartyPlay_BetweennessCentrality', 'PartyPlay_ClosenessCentrality', 
    'PartyPlay_Eigenvector', 'PartyPlay_Eccentricity', 'PartyPlay_Authority', 'PartyPlay_Hub', 'PartyPlay_Pagerank'],
    'friendship': ['Friendship_BetweennessCentrality', 'Friendship_ClosenessCentrality', 
    'Friendship_Eigenvector', 'Friendship_Eccentricity', 'Friendship_Authority', 'Friendship_Hub', 'Friendship_Pagerank'],
    'trade': ['Trade_BetweennessCentrality',  'Trade_ClosenessCentrality',
    'Trade_Eigenvector', 'Trade_Eccentricity', 'Trade_Authority', 'Trade_Hub', 'Trade_Pagerank'],
    'shop': ['Shop_BetweennessCentrality', 'Shop_ClosenessCentrality',
    'Shop_Eigenvector', 'Shop_Eccentricity', 'Shop_Authority', 'Shop_Hub', 'Shop_Pagerank'],
    'mail': ['Mail_BetweennessCentrality', 'Mail_ClosenessCentrality', 
    'Mail_Eigenvector', 'Mail_Eccentricity', 'Mail_Authority', 'Mail_Hub', 'Mail_Pagerank'],
    'whisper': ['Whisper_BetweennessCentrality', 'Whisper_ClosenessCentrality', 
    'Whisper_Eigenvector', 'Whisper_Eccentricity', 'Whisper_Authority', 'Whisper_Hub', 'Whisper_Pagerank']
}

In [11]:
grp_act_df = {
    'party_play': party_df,
    'friendship': friendship_df,
    'trade': trade_df,
    'shop': shop_df,
    'mail': mail_df,
    'whisper': whisper_df
}

In [12]:
# For each act_df, apply zscore to the cols listed in cols_to_zscore
for act_name, act_df in grp_act_df.items():
    act_cols_to_zscore = cols_to_zscore.get(act_name)
    act_df[act_cols_to_zscore] = act_df[act_cols_to_zscore].apply(zscore)

In [13]:
X_cols_to_log_transform = [
    'PartyPlay_InDegree', 'PartyPlay_OutDegree', 'PartyPlay_WeightedInDegree', 'PartyPlay_WeightedOutDegree',
    'Friendship_InDegree', 'Friendship_OutDegree', 'Friendship_WeightedInDegree', 'Friendship_WeightedOutDegree',
    'Trade_InDegree', 'Trade_OutDegree', 'Trade_WeightedInDegree', 'Trade_WeightedOutDegree',
    'Mail_InDegree', 'Mail_OutDegree', 'Mail_WeightedInDegree', 'Mail_WeightedOutDegree',
    'Shop_InDegree', 'Shop_OutDegree', 'Shop_WeightedInDegree', 'Shop_WeightedOutDegree',
    'Whisper_InDegree', 'Whisper_OutDegree', 'Whisper_WeightedInDegree', 'Whisper_WeightedOutDegree',
    'PartyPlay_Degree', 'PartyPlay_WeightedDegree', 'PartyPlay_BetweennessCentrality', 
    'Friendship_Degree', 'Friendship_WeightedDegree', 'Friendship_BetweennessCentrality',
    'Trade_Degree', 'Trade_WeightedDegree', 'Trade_BetweennessCentrality',
    'Shop_Degree', 'Shop_WeightedDegree', 'Shop_BetweennessCentrality',
    'Mail_Degree', 'Mail_WeightedDegree', 'Mail_BetweennessCentrality',
    'Whisper_Degree', 'Whisper_WeightedDegree', 'Whisper_BetweennessCentrality']

In [14]:
nm_df_zscored = nm_df.copy()

In [15]:
from scipy import stats as scipy_stats
from scipy.special import boxcox1p
from sklearn.preprocessing import PowerTransformer

nm_df_z_t_1 = nm_df_zscored.copy()
'''
transformer = PowerTransformer(method='yeo-johnson')
X = nm_df_z_t_1[X_cols_to_log_transform]
nm_df_z_t_1[X_cols_to_log_transform] = transformer.fit_transform(X)
'''
for col in X_cols_to_log_transform:
    #print("Transforming %s" % col)
    # The boxcox transformation with value of 0 does log transformation
    nm_df_z_t_1[col] = pd.Series(boxcox1p(nm_df_z_t_1[col].values, 0))
    nm_df_z_t_1[col] = pd.Series(boxcox1p(nm_df_z_t_1[col].values, 0))

In [16]:
nm_df = nm_df_z_t_1.copy()

In [41]:
nm_df.corr()

Unnamed: 0,Actor,A_Acc,PartyPlay_InDegree,PartyPlay_OutDegree,PartyPlay_Degree,PartyPlay_WeightedDegree,PartyPlay_WeightedInDegree,PartyPlay_WeightedOutDegree,PartyPlay_Eccentricity,PartyPlay_ClosenessCentrality,...,Shop_BetweennessCentrality,Shop_Authority,Shop_Hub,Shop_ModularityClass,Shop_Pagerank,Shop_ComponentId,Shop_StrConnId,Shop_ClusteringCoefficient,Shop_Eigenvector,Type
Actor,1.000000,0.488634,-0.175530,-0.225730,-0.209193,-0.202832,-0.174194,-0.221592,-0.185391,-0.117984,...,-0.322334,-0.229130,-0.231482,-0.137377,-0.233991,-0.009794,-0.316568,-0.109305,-0.252250,0.158702
A_Acc,0.488634,1.000000,-0.173875,-0.210655,-0.210881,-0.203080,-0.172943,-0.203328,-0.219031,-0.179034,...,-0.197066,-0.109484,-0.118582,-0.063174,-0.117826,-0.000522,-0.176368,-0.062670,-0.116192,0.293703
PartyPlay_InDegree,-0.175530,-0.173875,1.000000,0.827675,0.904883,0.887713,0.988740,0.815746,0.548015,0.436167,...,0.547692,0.454252,0.468527,0.231185,0.480043,0.036731,0.397865,0.121199,0.404342,-0.279415
PartyPlay_OutDegree,-0.225730,-0.210655,0.827675,1.000000,0.966654,0.947960,0.813277,0.982198,0.774752,0.668634,...,0.568104,0.405528,0.421710,0.246365,0.434036,0.033139,0.430818,0.138458,0.366379,-0.347976
PartyPlay_Degree,-0.209193,-0.210881,0.904883,0.966654,1.000000,0.985705,0.894365,0.953658,0.761789,0.661952,...,0.557433,0.407151,0.424034,0.241142,0.435981,0.036646,0.420516,0.134535,0.363995,-0.359481
PartyPlay_WeightedDegree,-0.202832,-0.203080,0.887713,0.947960,0.985705,1.000000,0.893352,0.962420,0.764124,0.672429,...,0.541407,0.392862,0.409086,0.236800,0.421950,0.040930,0.411256,0.136354,0.352198,-0.348657
PartyPlay_WeightedInDegree,-0.174194,-0.172943,0.988740,0.813277,0.894365,0.893352,1.000000,0.811090,0.547418,0.440195,...,0.537599,0.440999,0.454859,0.229757,0.467362,0.040855,0.393075,0.123392,0.393171,-0.271483
PartyPlay_WeightedOutDegree,-0.221592,-0.203328,0.815746,0.982198,0.953658,0.962420,0.811090,1.000000,0.781672,0.684023,...,0.554253,0.394536,0.410046,0.241670,0.423079,0.035919,0.423157,0.142674,0.357991,-0.338586
PartyPlay_Eccentricity,-0.185391,-0.219031,0.548015,0.774752,0.761789,0.764124,0.547418,0.781672,1.000000,0.975258,...,0.342767,0.191390,0.206438,0.150445,0.215497,0.017693,0.269658,0.106178,0.166641,-0.391885
PartyPlay_ClosenessCentrality,-0.117984,-0.179034,0.436167,0.668634,0.661952,0.672429,0.440195,0.684023,0.975258,1.000000,...,0.231123,0.105562,0.119328,0.097317,0.124753,0.016723,0.186490,0.078339,0.088429,-0.371373


In [33]:
X_cols = ['PartyPlay_InDegree', 'PartyPlay_OutDegree', 'PartyPlay_WeightedInDegree', 'PartyPlay_WeightedOutDegree',
    'PartyPlay_ClusteringCoefficient',
    'Friendship_InDegree', 'Friendship_OutDegree', 'Friendship_WeightedInDegree', 'Friendship_WeightedOutDegree',
    'Friendship_ClusteringCoefficient',
    'Trade_InDegree', 'Trade_OutDegree', 'Trade_WeightedInDegree', 'Trade_WeightedOutDegree',
    'Trade_ClusteringCoefficient',
    'Mail_InDegree', 'Mail_OutDegree', 'Mail_WeightedInDegree', 'Mail_WeightedOutDegree',
    'Mail_ClusteringCoefficient',
    'Shop_InDegree', 'Shop_OutDegree', 'Shop_WeightedInDegree', 'Shop_WeightedOutDegree',
    'Shop_ClusteringCoefficient', 
    'Whisper_InDegree', 'Whisper_OutDegree', 'Whisper_WeightedInDegree', 'Whisper_WeightedOutDegree',
    'Whisper_ClusteringCoefficient',
    'PartyPlay_Degree', 'PartyPlay_WeightedDegree', 'PartyPlay_BetweennessCentrality', 'PartyPlay_ClosenessCentrality', 
    'PartyPlay_Eigenvector', 'PartyPlay_Eccentricity', 'PartyPlay_Authority', 'PartyPlay_Hub', 'PartyPlay_Pagerank',
    'Friendship_Degree', 'Friendship_WeightedDegree', 'Friendship_BetweennessCentrality', 'Friendship_ClosenessCentrality', 
    'Friendship_Eigenvector', 'Friendship_Eccentricity', 'Friendship_Authority', 'Friendship_Hub', 'Friendship_Pagerank',
    'Trade_Degree', 'Trade_WeightedDegree', 'Trade_BetweennessCentrality',  'Trade_ClosenessCentrality',
    'Trade_Eigenvector', 'Trade_Eccentricity', 'Trade_Authority', 'Trade_Hub', 'Trade_Pagerank',
    'Shop_Degree', 'Shop_WeightedDegree', 'Shop_BetweennessCentrality', 'Shop_ClosenessCentrality',
    'Shop_Eigenvector', 'Shop_Eccentricity', 'Shop_Authority', 'Shop_Hub', 'Shop_Pagerank',
    'Mail_Degree', 'Mail_WeightedDegree', 'Mail_BetweennessCentrality', 'Mail_ClosenessCentrality', 
    'Mail_Eigenvector', 'Mail_Eccentricity', 'Mail_Authority', 'Mail_Hub', 'Mail_Pagerank']
#X_cols = list(col_names)
#X_cols.remove('A_Acc');
#X_cols.remove('Type')
#print(X_cols) 
#X_cols = ['PartyPlay_InDegree', 'PartyPlay_OutDegree', 'PartyPlay_WeightedInDegree', 'PartyPlay_WeightedOutDegree',
#    'PartyPlay_ClusteringCoefficient', 'PartyPlay_Degree', 'PartyPlay_WeightedDegree', 'PartyPlay_BetweennessCentrality', 'PartyPlay_ClosenessCentrality', 
#    'PartyPlay_Eigenvector', 'PartyPlay_Eccentricity', 'PartyPlay_Authority', 'PartyPlay_Hub', 'PartyPlay_Pagerank']
y_col = ['Type']
X = nm_df[X_cols]
y = nm_df[y_col]

In [42]:
# We will cluster each of the cols in cols_to_zscore and assign a cluster number to each, replacing its original value
# by its cluster number. This is to make the model better as it will then the model will not have to get build
# on scatter values, but on values which represents its cluster, which are very less in number. We do this for
# every such feature in cols_to_zscore on not just on the all the features taken as whole.
k_means_cluster_num = 3

In [43]:
# For each feature, get predicted cluster. Put the resultant cluster value of the feature against new nm_df dataframe
nm_df_z_t_c = nm_df_z_t_1.copy()
for act_name, act_df in grp_act_df.items():
    act_cols = cols_to_zscore.get(act_name)
    for act_col in act_cols:
        tmp_df = pd.DataFrame({act_col: act_df[act_col].copy()})
        clusters = KMeans(n_clusters=k_means_cluster_num, random_state=7, n_init=10)
        clusters.fit(tmp_df)
        predicted_clusters_array = clusters.predict(tmp_df)
        nm_df_z_t_c[act_col] = predicted_clusters_array

In [44]:
nm_df = nm_df_z_t_c.copy()

In [63]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=7)

In [74]:
model = Sequential()
model.add(Dense(75, input_dim=75, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(300, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
# Compile model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [75]:
model.fit(X_train, y_train, epochs=30, batch_size=50)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x1a435ab320>

In [76]:
# evaluate the model
scores = model.evaluate(X_test, y_test)
scores[1]*100



91.66331590779777

In [77]:
y_predict = model.predict_classes(X_test)
cr = metrics.classification_report(y_test ,y_predict)
print(cr)

cm = metrics.confusion_matrix(y_test,y_predict)
print(cm)

              precision    recall  f1-score   support

           0       0.93      0.98      0.95     12672
           1       0.83      0.56      0.67      2250

   micro avg       0.92      0.92      0.92     14922
   macro avg       0.88      0.77      0.81     14922
weighted avg       0.91      0.92      0.91     14922

[[12420   252]
 [  992  1258]]


In [78]:
fpr, tpr, threshold = roc_curve(y_test, y_predict)
roc_auc = auc(fpr, tpr)
print("AUC %s" % roc_auc)

AUC 0.7696123737373737


In [145]:
type(X), type(y)


(pandas.core.frame.DataFrame, pandas.core.frame.DataFrame)

In [79]:
from sklearn.model_selection import StratifiedKFold
# define 10-fold cross validation test harness
kfold = StratifiedKFold(n_splits=10, shuffle=True)
cvscores = []
for train, test in kfold.split(X, y):
    #print(train)
    #print(test)
    # create model
    model = Sequential()
    model.add(Dense(75, input_dim=75, activation='relu'))
    model.add(Dense(300, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # Fit the model
    model.fit(X.iloc[train],
              y.iloc[train], epochs=10, batch_size=100, verbose=0)
    # evaluate the model
    scores = model.evaluate(X.iloc[test], y.iloc[test], verbose=0)
    print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
    cvscores.append(scores[1] * 100)
    y_predict = model.predict_classes(X_test)
    cr = metrics.classification_report(y_test ,y_predict)
    print(cr)
    cm = metrics.confusion_matrix(y_test,y_predict)
    print(cm)
    fpr, tpr, threshold = roc_curve(y_test, y_predict)
    roc_auc = auc(fpr, tpr)
    print("AUC %s" % roc_auc)


acc: 90.91%
              precision    recall  f1-score   support

           0       0.93      0.98      0.95     12672
           1       0.82      0.60      0.70      2250

   micro avg       0.92      0.92      0.92     14922
   macro avg       0.88      0.79      0.83     14922
weighted avg       0.92      0.92      0.92     14922

[[12380   292]
 [  893  1357]]
AUC 0.7900340909090909
acc: 91.44%
              precision    recall  f1-score   support

           0       0.93      0.98      0.95     12672
           1       0.87      0.56      0.68      2250

   micro avg       0.92      0.92      0.92     14922
   macro avg       0.90      0.77      0.82     14922
weighted avg       0.92      0.92      0.91     14922

[[12475   197]
 [  981  1269]]
AUC 0.7742269570707071
acc: 91.62%
              precision    recall  f1-score   support

           0       0.93      0.99      0.95     12672
           1       0.87      0.56      0.68      2250

   micro avg       0.92      0.92     

In [80]:
print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

91.42% (+/- 0.30%)


In [57]:
rus = RandomUnderSampler()
print("Before DownSampling, counts of label 1: %d" % (sum(y_train.values==1)))
print("Before DownSampling, counts of label 0: %d" % (sum(y_train.values==0)))
X_train_resample, y_train_resample = rus.fit_sample(X_train, y_train)
print("After DownSampling, counts of label '1': %s" % (sum(y_train_resample==1)))
print("After DownSampling, counts of label '0': %s" % (sum(y_train_resample==0)))

Before DownSampling, counts of label 1: 5452
Before DownSampling, counts of label 0: 29365
After DownSampling, counts of label '1': [5452]
After DownSampling, counts of label '0': [5452]


In [61]:
# Undersampling with neural networks does not give better performance. The performance is actually slightly less, 
# when comparing the values for overall ROC, precision and recall of Bot.