In [1]:
from IPython.display import display,clear_output,HTML

import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
from scipy import stats
import tensorflow as tf
import seaborn as sns
from pylab import rcParams
from sklearn.model_selection import train_test_split
from keras.models import Model, load_model
from keras.layers import Input, Dense
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras import regularizers
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve, auc, confusion_matrix, accuracy_score, matthews_corrcoef,\
                            precision_score, recall_score, roc_auc_score

from imblearn.over_sampling import SMOTE
import featuretools as ft
from sklearn.decomposition import PCA

Using TensorFlow backend.


In [2]:
### Read HTML 
df_html=pd.read_csv('../../html/HTML.csv',index_col='domain')
df_html.drop(['Unnamed: 0'],axis=1,inplace=True)
cookie=[col for col in df_html.columns if 'number' in col]
df_html.drop(cookie,axis=1,inplace=True)
print("Shape of HTML dataset:",df_html.shape)
df_html.Target=df_html.Target.apply(lambda x: 1 if x=='Malicious' else 0)
count_columns=[col for col in df_html.columns if col.endswith('count')]
df_html['total_count']=df_html[count_columns].sum(axis=1)

href_columns=[col for col in df_html.columns if (('href_absolute' in col) | ('href_relative' in col) | ('href_page' in col))  ]
df_html['total_href']=df_html[href_columns].sum(axis=1)

total_img_src=[col for col in df_html.columns if 'img_src_' in col]
df_html['total_img_src']=df_html[total_img_src].sum(axis=1)


### Read JS 
df_js=pd.read_csv('../Javascript.csv',low_memory=False,index_col='domain')
cookie=[col for col in df_js.columns if 'number' in col]
df_js.drop(cookie,axis=1,inplace=True)
df_js.rename(columns={'Malicious':'Target'},inplace=True)
print("Shape of JS,DOM dataset:",df_js.shape)
df_js.Target=df_js.Target.apply(lambda x: 1 if x=='Malicious' else 0)


### Read HTTP
df_http=pd.read_csv('../HTTP.csv')
df_http.rename(columns={'Malicious':'Target'},inplace=True)
df_http.fillna(value=0,inplace=True)
cookies=[columns for columns in df_http.columns if 'number' in columns]
df_http.drop(cookies,axis=1,inplace=True)
df_http=df_http[df_http['domain']!=0]
df_http.sort_values(by='Target',inplace=True,ascending=False)
df_http.drop_duplicates(['domain'], keep='first',inplace=True)
df_http=df_http.sample(frac=1,random_state=0)
df_http.set_index(['domain'],drop=True,inplace=True)
print("Shape of HTTP dataset:",df_http.shape)



### Read URL
df_url=pd.read_csv('../URL.csv', low_memory = False)
df_url.fillna(value=0,inplace=True)
cookies=[columns for columns in df_url.columns if 'number' in columns]
df_url.drop(cookies,axis=1,inplace=True)
df_url.drop(['url_host','url_ip'],axis=1,inplace=True)
df_url=df_url[df_url['domain']!=0]
df_url.sort_values(by='Target',inplace=True,ascending=False)
df_url.drop_duplicates(['domain'], keep='first',inplace=True)
df_url=df_url.sample(frac=1,random_state=0)
print("Shape of URL dataset:",df_url.shape)
df_url.set_index('domain',inplace=True,drop=True)

df=pd.concat([df_js,df_html,df_http,df_url],axis=1,join='inner')
df.drop(['Unnamed: 0'],axis=1,inplace=True)
df['Target_z']=df[['Target','Target','Target','Target']].apply(max,axis=1)
df.drop(['Target','Target','Target','Target'],axis=1,inplace=True)
print("Shape of combined dataset:",df.shape)
df.rename(columns={'Target_z':'Target'},inplace=True)
df.Target.value_counts()

Shape of HTML dataset: (43491, 1461)
Shape of JS,DOM dataset: (43294, 401)
Shape of HTTP dataset: (45856, 672)
Shape of URL dataset: (46771, 4194)
Shape of combined dataset: (39183, 6726)


0    34742
1     4441
Name: Target, dtype: int64

In [3]:
columns=['dom_function_document.createElement(',
 'dom_function_document.write(',
 'dom_function_window.addEventListener(',
 'dom_function_window.setInterval(',
 'dom_function_window.setTimeout(',
 'js_function_.charCodeAt(',
 'js_function_.concat(',
 'js_function_.exec(',
 'js_function_.link(',
 'js_function_.search(',
 'js_function_.substring(',
 'js_function_escape(',
 'js_function_eval(',
 'js_function_parseInt(',
 'js_function_String.fromCharCode(',
 'js_function_unescape(',
 'js_function_.replace(',
 'a_count',
 'div_count',
 'embed_count',
 'frame_count',
 'iframe_count',
 'img_count',
 'input_count',
 'link_count',
 'meta_count',
 'object_count',
 'style_count',
 'title_count',
 'total_count',
 'total_href',
 'total_img_src',
 'iframe_hidden_true',
 'script_type_text/javascript',
 'a_href_out_of_domain',
 'area_href_out_of_domain',
 'audio_src_out_of_domain',
 'base_href_out_of_domain',
 'embed_src_out_of_domain',
 'form_action_out_of_domain',
 'frame_src_out_of_domain',
 'iframe_src_out_of_domain',
 'img_src_out_of_domain',
 'img_srcset_out_of_domain',
 'link_href_out_of_domain',
 'object_data_out_of_domain',
 'script_src_out_of_domain',
 'source_src_out_of_domain',
 'source_srcset_out_of_domain',
 'video_src_out_of_domain',
 'http_header_cache-control_set_max-age',
 'http_header_cache-control_set_must-revalidate',
 'http_header_cache-control_set_no-cache',
 'http_header_cache-control_set_no-store',
 'http_header_cache-control_set_private',
 'http_header_cache-control_set_public',
 'http_header_content-encoding_gzip',
 'http_header_content-language_text/html',
 'http_header_content-length',
 'http_header_server_apache',
 'http_header_server_nginx',
 'url_char_-',
 'url_char_.',
 'url_char_a',
 'url_char_b',
 'url_char_c',
 'url_char_d',
 'url_char_e',
 'url_char_f',
 'url_char_g',
 'url_char_h',
 'url_char_i',
 'url_char_j',
 'url_char_k',
 'url_char_l',
 'url_char_m',
 'url_char_n',
 'url_char_o',
 'url_char_p',
 'url_char_r',
 'url_char_s',
 'url_char_t',
 'url_char_u',
 'url_char_v',
 'url_char_w',
 'url_char_x',
 'url_char_y',
 'url_char_z',
 'url_extension_.com',
 'url_extension_.i',
 'url_extension_.net',
 'url_extensions',
 'url_length',
 'url_tlds',
 'url_words_with_length_4',
 'url_words_with_length_5',
 'url_words_with_length_6',
 'url_words_with_length_7',
 'url_words_with_length_8',
 'Target']

print("features being used")
display(columns)
df_sel=df[columns].copy(deep=True)

features being used


['dom_function_document.createElement(',
 'dom_function_document.write(',
 'dom_function_window.addEventListener(',
 'dom_function_window.setInterval(',
 'dom_function_window.setTimeout(',
 'js_function_.charCodeAt(',
 'js_function_.concat(',
 'js_function_.exec(',
 'js_function_.link(',
 'js_function_.search(',
 'js_function_.substring(',
 'js_function_escape(',
 'js_function_eval(',
 'js_function_parseInt(',
 'js_function_String.fromCharCode(',
 'js_function_unescape(',
 'js_function_.replace(',
 'a_count',
 'div_count',
 'embed_count',
 'frame_count',
 'iframe_count',
 'img_count',
 'input_count',
 'link_count',
 'meta_count',
 'object_count',
 'style_count',
 'title_count',
 'total_count',
 'total_href',
 'total_img_src',
 'iframe_hidden_true',
 'script_type_text/javascript',
 'a_href_out_of_domain',
 'area_href_out_of_domain',
 'audio_src_out_of_domain',
 'base_href_out_of_domain',
 'embed_src_out_of_domain',
 'form_action_out_of_domain',
 'frame_src_out_of_domain',
 'iframe_src_o

### 1. No Sampling
#### 70% for training, 30% for testing  
#### Split into Training and test set first. Split Training into training and validation. 
### We then remove malicious domains from training set and only include the non-malicious domains in the train-set. Thus, we cant perform over-sampling or undersampling
#### Standardize training and then scaled validation and testing

In [4]:
X_train, X_test = train_test_split(df_sel, test_size=0.3, random_state=0)

X_train, X_val = train_test_split(X_train, test_size=0.2, random_state=0)
X_train = X_train[X_train.Target == 0]
X_train = X_train.drop(['Target'], axis=1) 
sc = StandardScaler()
scaled_X_train = sc.fit_transform(X_train)


X_val = X_val.drop(['Target'],axis=1)
scaled_X_val = sc.transform(X_val)

y_test = X_test['Target']
X_test = X_test.drop(['Target'], axis=1)
scaled_X_test = sc.transform(X_test)

#### a. L1 regularization of 10e-5
#### b. 4 encoding layers with 60,30,15 and 99 neurons respectively

In [5]:
input_dim = scaled_X_train.shape[1]
encoding_dim = 60
input_layer = Input(shape=(input_dim, ))
encoder = Dense(encoding_dim, activation="relu",activity_regularizer=regularizers.l1(10e-5))(input_layer)
encoder = Dense(int(encoding_dim / 2), activation="relu")(encoder)
decoder = Dense(int(encoding_dim / 2), activation='relu')(encoder)
decoder = Dense(input_dim, activation='relu')(decoder)
autoencoder = Model(inputs=input_layer, outputs=decoder)

In [6]:
nb_epoch = 200
batch_size = 64
autoencoder.compile(optimizer='adam',loss='mean_squared_error',metrics=['binary_accuracy'])
checkpointer = ModelCheckpoint(filepath="3_16_ns_model_70_30_prev.h5",verbose=0,save_best_only=True)
tensorboard = TensorBoard(log_dir='./logs',histogram_freq=0,write_graph=True,write_images=True)
autoencoder.fit(scaled_X_train, scaled_X_train,
                    epochs=nb_epoch,
                    batch_size=batch_size,
                    shuffle=False,
                    validation_data=(scaled_X_val, scaled_X_val),
                    verbose=1)

Train on 19484 samples, validate on 5486 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/20

<keras.callbacks.History at 0x7f5898ce1208>

In [7]:
predictions = autoencoder.predict(scaled_X_test)
mse = np.mean(np.power(scaled_X_test - predictions, 2), axis=1)
error_df = pd.DataFrame({'reconstruction_error': mse,'true_class': y_test})

In [8]:
threshold=0.83
y_pred = [1 if e > threshold else 0 for e in error_df.reconstruction_error.values]
print("Accuracy on test set : ", accuracy_score( error_df.true_class.values , np.array(y_pred) ) )
print("MCC on test set : " , matthews_corrcoef( error_df.true_class.values , np.array(y_pred) ) )
print("Precision : " , precision_score( error_df.true_class.values , np.array(y_pred) ) )
print("Recall : " , recall_score( error_df.true_class.values , np.array(y_pred) ) )
print("AUC : " , roc_auc_score( error_df.true_class.values , np.array(y_pred) ) )
pd.crosstab(error_df.true_class.values,np.array(y_pred))

Accuracy on test set :  0.8702679710761378
MCC on test set :  0.3113456688667923
Precision :  0.4171833480956599
Recall :  0.35201793721973096
AUC :  0.6444259792655245


col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,9759,658
1,867,471


### 2. Feature Transformation

In [9]:
df.reset_index(inplace=True)
columns.remove('Target')
columns.append('domain')
df_min=df[columns]

es = ft.EntitySet(id = 'malicious')
es.entity_from_dataframe(entity_id = 'data', dataframe = df_min, index = 'domain')

feature_matrix_sessions, features_defs = ft.dfs(entityset=es,target_entity="data",
                                                trans_primitives = ['add','divide','multiply'],n_jobs=-1,
                                                verbose=1,max_depth=1)

Built 19503 features
EntitySet scattered to workers in 4.734 seconds
Elapsed: 01:46 | Remaining: 00:00 | Progress: 100%|██████████| Calculated: 11/11 chunks


In [30]:
# df.set_index(['domain'],drop=True,inplace=True)
feature_name= ['url_tlds / total_count',
 'url_length / total_count',
 'url_extensions / total_count',
 'url_char_z / total_count',
 'url_words_with_length_5 / total_count',
 'url_words_with_length_4 / total_count',
 'url_extensions + url_length',
 'url_char_w + url_length',
 'url_char_u / total_count',
 'url_char_t / total_count',
 'url_char_s / total_count',
 'url_char_l / total_count',
 'url_char_i / total_count',
 'url_char_e / total_count',
 'url_char_. / total_count',
 'url_char_. + url_char_z',
 'url_char_. + url_char_y',
 'url_char_. + url_char_x',
 'url_char_. + url_char_w',
 'url_char_.',
 'js_function_String.fromCharCode( + url_length',
 'http_header_content-language_text/html / url_char_.',
 'http_header_content-language_text/html / title_count',
 'http_header_content-language_text/html + url_length',
 'http_header_content-encoding_gzip + http_header_content-language_text/html',
 'frame_src_out_of_domain + url_char_.',
 'frame_count + url_char_.']
display(feature_name)
feature_matrix_sessions = feature_matrix_sessions[feature_name]

fs=pd.merge(feature_matrix_sessions,pd.DataFrame(df_sel.Target),left_index=True, right_index=True)

fs.replace([np.inf],0,inplace=True)
fs.fillna(value=0,inplace=True)

fs=fs.reindex(df_sel.index.values)
X=fs.loc[:,fs.columns!='Target']
y=fs.Target
feature_name = X.columns.tolist()
fs=fs.reindex(df_sel.index.values)

['url_tlds / total_count',
 'url_length / total_count',
 'url_extensions / total_count',
 'url_char_z / total_count',
 'url_words_with_length_5 / total_count',
 'url_words_with_length_4 / total_count',
 'url_extensions + url_length',
 'url_char_w + url_length',
 'url_char_u / total_count',
 'url_char_t / total_count',
 'url_char_s / total_count',
 'url_char_l / total_count',
 'url_char_i / total_count',
 'url_char_e / total_count',
 'url_char_. / total_count',
 'url_char_. + url_char_z',
 'url_char_. + url_char_y',
 'url_char_. + url_char_x',
 'url_char_. + url_char_w',
 'url_char_.',
 'js_function_String.fromCharCode( + url_length',
 'http_header_content-language_text/html / url_char_.',
 'http_header_content-language_text/html / title_count',
 'http_header_content-language_text/html + url_length',
 'http_header_content-encoding_gzip + http_header_content-language_text/html',
 'frame_src_out_of_domain + url_char_.',
 'frame_count + url_char_.']

#### 70% for training, 30% for testing  
#### Split into Training and test set first. Split Training into training and validation. 
### We then remove malicious domains from training set and only include the non-malicious domains in the train-set. Thus, we cant-perform over-sampling or undersampling
#### Standardize training and then scaled validation and testing

In [None]:
X_train, X_test = train_test_split(fs, test_size=0.3, random_state=0)

X_train, X_val = train_test_split(X_train, test_size=0.2, random_state=0)
X_train = X_train[X_train.Target == 0]
X_train = X_train.drop(['Target'], axis=1) 
sc = StandardScaler()
scaled_X_train = sc.fit_transform(X_train)


X_val = X_val.drop(['Target'],axis=1)
scaled_X_val = sc.transform(X_val)

y_test = X_test['Target']
X_test = X_test.drop(['Target'], axis=1)
scaled_X_test = sc.transform(X_test)

#### a. L1 regularization of 10e-5
#### b. 4 encoding layers with 20,10,5 and 27 neurons respectively

In [32]:
input_dim = X_train.shape[1]
encoding_dim = 20
input_layer = Input(shape=(input_dim, ))
encoder = Dense(encoding_dim, activation="relu",activity_regularizer=regularizers.l1(10e-5))(input_layer)
encoder = Dense(int(encoding_dim / 2), activation="relu")(encoder)
decoder = Dense(int(encoding_dim / 2), activation='relu')(encoder)
decoder = Dense(input_dim, activation='relu')(decoder)
autoencoder = Model(inputs=input_layer, outputs=decoder)

#### 200 epochs, batch_size=64, optimizer = SDG , loss = mean_squared_error

In [33]:
nb_epoch = 200
batch_size = 64
autoencoder.compile(optimizer='SGD',loss='mean_squared_error',metrics=['binary_accuracy'])
checkpointer = ModelCheckpoint(filepath="3_16_ft_model_70_30_prev.h5",verbose=0,save_best_only=True)
tensorboard = TensorBoard(log_dir='./logs',histogram_freq=0,write_graph=True,write_images=True)
autoencoder.fit(scaled_X_train, scaled_X_train,
                    epochs=nb_epoch,
                    batch_size=batch_size,
                    shuffle=False,
                    validation_data=(scaled_X_val, scaled_X_val),
                    verbose=1)

Train on 19484 samples, validate on 5486 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/20

<keras.callbacks.History at 0x7f55fc1ce9e8>

In [34]:
predictions = autoencoder.predict(scaled_X_test)
mse = np.mean(np.power(scaled_X_test - predictions, 2), axis=1)
error_df = pd.DataFrame({'reconstruction_error': mse,'true_class': y_test})

In [42]:
# for i in np.arange(0,3,0.01):
#     threshold=i
#     y_pred = [1 if e > threshold else 0 for e in error_df.reconstruction_error.values]
# #     pd.crosstab(error_df.true_class.values,np.array(y_pred))
#     print(i)
#     print("MCC on test set : " , matthews_corrcoef( error_df.true_class.values , np.array(y_pred) ) )

In [41]:
# threshold = error_df.reconstruction_error.describe()['25%']
threshold = 2.39
y_pred = [1 if e > threshold else 0 for e in error_df.reconstruction_error.values]
# conf_matrix = confusion_matrix(error_df.true_class, y_pred)

print("Accuracy on test set : ", accuracy_score( error_df.true_class.values , np.array(y_pred) ) )
print("MCC on test set : " , matthews_corrcoef( error_df.true_class.values , np.array(y_pred) ) )
print("Precision : " , precision_score( error_df.true_class.values , np.array(y_pred) ) )
print("Recall : " , recall_score( error_df.true_class.values , np.array(y_pred) ) )
print("AUC : " , roc_auc_score( error_df.true_class.values , np.array(y_pred) ) )
pd.crosstab(error_df.true_class.values,np.array(y_pred))

Accuracy on test set :  0.9444491705657168
MCC on test set :  0.7046876818886748
Precision :  0.815668202764977
Recall :  0.6614349775784754
AUC :  0.8211177959794076


col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,10217,200
1,453,885


### 3. PCA Feature Transformation

In [43]:
df.reset_index(inplace=True)
# columns.remove('Target')
# columns.append('domain')
df_min=df[columns]

es = ft.EntitySet(id = 'malicious')
es.entity_from_dataframe(entity_id = 'data', dataframe = df_min, index = 'domain')

feature_matrix_sessions, features_defs = ft.dfs(entityset=es,target_entity="data",
                                                trans_primitives = ['add','divide','multiply'],n_jobs=-1,
                                                verbose=1,max_depth=1)

df.set_index(['domain'],drop=True,inplace=True)
fs=pd.merge(feature_matrix_sessions,pd.DataFrame(df_sel.Target),left_index=True, right_index=True)
fs.replace([np.inf],0,inplace=True)
fs.fillna(value=0,inplace=True)
fs=fs.reindex(df.index.values)

Built 19503 features
EntitySet scattered to workers in 4.815 seconds
Elapsed: 01:46 | Remaining: 00:00 | Progress: 100%|██████████| Calculated: 11/11 chunks


In [48]:
X_train, X_test = train_test_split(fs, test_size=0.3, random_state=0)
X_train, X_val = train_test_split(X_train, test_size=0.2, random_state=0)
X_train = X_train[X_train.Target == 0]

In [49]:
X=X_train.drop(['Target'] , axis=1)
y=X_train.Target.values
feature_name = X.columns.tolist()
X=np.nan_to_num(X)
scaler=StandardScaler()
scaled_X=scaler.fit_transform(X)

pca=PCA(n_components=750,whiten=True,random_state=0)
X_pca_21=pca.fit_transform(scaled_X)
print("Variance explained using 750 components is:",sum(pca.explained_variance_ratio_))

X_val = X_val.drop(['Target'],axis=1)
X_val_pca = pca.transform(X_val) 

y_test = X_test['Target']
X_test = X_test.drop(['Target'], axis=1)
X_test_pca = pca.transform(X_test)

Variance explained using 750 components is: 0.8211123626764436


#### a. L1 regularization of 10e-5
#### b. 4 encoding layers with 500, 250, 125 and 750 neurons respectively

In [50]:
input_dim = X_pca_21.shape[1]
encoding_dim = 500
input_layer = Input(shape=(input_dim, ))
encoder = Dense(encoding_dim, activation="relu",activity_regularizer=regularizers.l1(10e-5))(input_layer)
encoder = Dense(int(encoding_dim / 2), activation="relu")(encoder)
decoder = Dense(int(encoding_dim / 2), activation='relu')(encoder)
decoder = Dense(input_dim, activation='relu')(decoder)
autoencoder = Model(inputs=input_layer, outputs=decoder)

In [51]:
nb_epoch = 200
batch_size = 64
autoencoder.compile(optimizer='SGD',loss='mean_squared_error',metrics=['binary_accuracy'])
checkpointer = ModelCheckpoint(filepath="3_16_pca_model_70_30_prev.h5",verbose=0,save_best_only=True)
tensorboard = TensorBoard(log_dir='./logs',histogram_freq=0,write_graph=True,write_images=True)
autoencoder.fit(X_pca_21, X_pca_21,
                    epochs=nb_epoch,
                    batch_size=batch_size,
                    shuffle=False,
                    validation_data=(X_val_pca, X_val_pca),
                    verbose=1)

Train on 19484 samples, validate on 5486 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/20

<keras.callbacks.History at 0x7f585cb43438>

In [52]:
predictions = autoencoder.predict(X_test_pca)
mse = np.mean(np.power(X_test_pca - predictions, 2), axis=1)
error_df = pd.DataFrame({'reconstruction_error': mse,'true_class': y_test})

In [54]:
# threshold = error_df.reconstruction_error.describe()['25%']
threshold = 40
y_pred = [0 if e > threshold else 1 for e in error_df.reconstruction_error.values]
# conf_matrix = confusion_matrix(error_df.true_class, y_pred)

print("Accuracy on test set : ", accuracy_score( error_df.true_class.values , np.array(y_pred) ) )
print("MCC on test set : " , matthews_corrcoef( error_df.true_class.values , np.array(y_pred) ) )
print("Precision : " , precision_score( error_df.true_class.values , np.array(y_pred) ) )
print("Recall : " , recall_score( error_df.true_class.values , np.array(y_pred) ) )
print("AUC : " , roc_auc_score( error_df.true_class.values , np.array(y_pred) ) )
pd.crosstab(error_df.true_class.values,np.array(y_pred))

Accuracy on test set :  0.882007656316461
MCC on test set :  0.5581376100419595
Precision :  0.48859934853420195
Recall :  0.7847533632286996
AUC :  0.8396263696243335


col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,9318,1099
1,288,1050


In [55]:
### Darshan Bhansali
### HTML code to hide the input cells 
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
To toggle code, click <a href="javascript:code_toggle()">here</a>.''')