In [1]:
import pandas as pd
import numpy as np
import os
import json
from pandas.io.json import json_normalize
from IPython.display import display,clear_output
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows',305)
from sklearn.feature_selection import SelectKBest,chi2,RFE,RFECV,f_regression,SelectFromModel
from sklearn.linear_model import LogisticRegression,RandomizedLasso,LinearRegression, Ridge,Lasso
from sklearn.ensemble import ExtraTreesClassifier,RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
import time

from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from minepy import MINE

import warnings
warnings.simplefilter("ignore", DeprecationWarning)
from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import PCA

### Eliminating cases where html_status=="Not success" as they have all null values

In [2]:
# Load both crawls and tag every row with its class label
# (0 for the alexa file, 1 for the malicious file).
df_alex=pd.read_csv('Document_alexa_http.csv',low_memory=False)
df_virus=pd.read_csv('Document_malicious_http.csv',low_memory=False)

df_alex['Malicious']=0
df_virus['Malicious']=1

df=pd.concat([df_alex,df_virus])

# presumably the row index that was written out with the CSV — TODO confirm
df=df.drop(columns=['Unnamed: 0'])

# Shuffle with a fixed seed so the two classes are interleaved reproducibly.
df = df.sample(frac=1,random_state=867).reset_index(drop=True)

df=df.fillna(value=0)

display(df.shape)

# Keep only successfully fetched pages. `.copy()` makes the result an independent
# frame: later cells drop and add columns on `df`, and doing that on a filtered
# slice triggers pandas' SettingWithCopyWarning.
df=df[df['html_status']=='Success'].copy()
df.shape

(45684, 119)

(36305, 119)

### Dropping columns that indicate the html status of domain data collection

In [3]:
# The html_status* columns describe the data-collection outcome rather than the page,
# so they are removed; the domain name becomes the row index.
status_columns = ['html_status','html_status_strerror','html_status_message']
df = df.drop(columns=status_columns).set_index(['domain'])

In [5]:
# Row-wise totals over every column whose name contains 'document' / 'window'.
# The derived total columns are excluded explicitly: 'tot_document' itself contains
# 'document', so without the guard re-running this cell would fold the previous
# total back into the new one.
cols_document=[col for col in df.columns if 'document' in col and col!='tot_document']
df['tot_document']=df[cols_document].sum(axis=1)

cols_window=[col for col in df.columns if 'window' in col and col!='tot_window']
df['tot_window']=df[cols_window].sum(axis=1)

#### Columns and the number of unique values they have

In [6]:
# For each column, count how many distinct values it takes, then list the
# least-varied columns first.
lst=[(col,len(df[col].value_counts())) for col in df.columns]

unique_val_cols=pd.DataFrame(lst,columns=['Column_Name','Num_unique_value'])
unique_val_cols=unique_val_cols.sort_values(by=['Num_unique_value'])
unique_val_cols.head()

Unnamed: 0,Column_Name,Num_unique_value
0,dom_function_document.adoptNode(,1
71,dom_function_window.getAttention(,1
36,dom_function_document.normalizeDocument(,1
98,dom_function_window.scrollByPages(,1
39,dom_function_document.queryCommandIndeterm(,1


In [7]:
# Show the value distribution of every column that holds a single distinct value.
print("Columns with all same values")
is_constant=unique_val_cols['Num_unique_value']==1
col_1=unique_val_cols.loc[is_constant,'Column_Name'].values

for constant_col in col_1:
    print(df[constant_col].value_counts(),"\n")

Columns with all same values
0.0    36305
Name: dom_function_document.adoptNode(, dtype: int64 

0.0    36305
Name: dom_function_window.getAttention(, dtype: int64 

0.0    36305
Name: dom_function_document.normalizeDocument(, dtype: int64 

0.0    36305
Name: dom_function_window.scrollByPages(, dtype: int64 

0.0    36305
Name: dom_function_document.queryCommandIndeterm(, dtype: int64 

0.0    36305
Name: dom_function_document.queryCommandValue(, dtype: int64 

0.0    36305
Name: dom_function_window.scrollByLines(, dtype: int64 

0.0    36305
Name: dom_function_window.routeEvent(, dtype: int64 

0.0    36305
Name: dom_function_document.releaseCapture(, dtype: int64 

0.0    36305
Name: dom_function_document.releaseEvent(, dtype: int64 

0.0    36305
Name: dom_function_document.routeEvent(, dtype: int64 

0.0    36305
Name: dom_function_window.restore(, dtype: int64 

0.0    36305
Name: dom_function_window.resizeBy(, dtype: int64 

0.0    36305
Name: dom_function_window.requestIdleCall

#### Independent (feature) and target variables

In [8]:
# Feature matrix = every column except the label; target = the 0/1 'Malicious' flag.
X=df.loc[:,df.columns!='Malicious']
y=df.Malicious.values
feature_name = X.columns.tolist()

# Baseline: logistic regression on all features.
model = LogisticRegression(random_state=0)
model.fit(X, y)

# The score is computed on the same rows the model was fitted on (training accuracy).
print("Accuracy score with all of the features")
display(model.score(X,y))

Accuracy score with all of the features


0.857209750723041

In [9]:
pca=PCA(n_components=2,random_state=0)

# fit() is sufficient here: the projection is computed once, below, via transform().
# PCA is unsupervised, so y is not passed.
pca.fit(X)

print("Explained variance with 2 features",sum(pca.explained_variance_ratio_))

X_pca=pca.transform(X)

# Logistic regression on the two principal components only.
model = LogisticRegression(random_state=0)
model.fit(X_pca, y)

# NOTE(review): nothing is printed under this header — PCA builds components,
# it does not select columns. Consider dropping the header.
print("Columns selected:\n")
# Training accuracy: scored on the same rows used for fitting.
print("Accuracy score")
display(model.score(X_pca,y))

Explained variance with 2 features 0.9654018781166578
Columns selected:

Accuracy score


0.8569893954000827

In [10]:
# pca.components_ has one row per principal component and one column per feature.
# For each feature, take the variance (std squared) of its loadings across the
# components and scale it by 100.
# NOTE(review): this is the spread of a feature's loadings across components, not a
# share of explained variance — confirm the column label below is what is intended.
component_loadings = pca.components_
loading_variance = np.square(component_loadings.std(axis=0))

pca_df=pd.DataFrame({
    '% Variance explained by each feature': loading_variance*100,
    'Features': list(X.columns),
})
pca_df=pca_df.sort_values(by='% Variance explained by each feature',ascending=False)
pca_df

Unnamed: 0,% Variance explained by each feature,Features
108,26.33851,number_functions_declared
121,12.59782,tot_document
46,11.01065,dom_function_document.write(
28,0.05240352,dom_function_document.getElementById(
82,0.0003340251,dom_function_window.open(
31,8.794608e-05,dom_function_document.getElementsByTagName(
104,3.760054e-05,dom_function_window.setTimeout(
11,3.086893e-05,dom_function_document.createElement(
111,2.341362e-05,number_of_non_http_only_cookies
48,2.099716e-05,dom_function_window.addEventListener(


### Correlation

In [11]:
def cor_selector(X, y, num_feats=50):
    """Select the features most strongly (Pearson-)correlated with the target.

    Args:
        X: DataFrame of features, one column per feature.
        y: 1-D array-like target, same length as ``X``.
        num_feats: how many features to keep (default 50). If ``X`` has fewer
            columns, all of them are kept; values <= 0 keep none.

    Returns:
        (cor_support, cor_feature):
            cor_support -- list of bools, one per column of ``X`` in column order,
                True where the column was selected.
            cor_feature -- names of the selected columns, ordered by increasing
                absolute correlation.
    """
    columns = X.columns.tolist()

    # calculate the correlation with y for each feature; a constant column has
    # zero variance, so its correlation is NaN -- treat that as 0 and silence the
    # divide/invalid RuntimeWarning numpy would otherwise emit for it
    cor_list = []
    with np.errstate(divide='ignore', invalid='ignore'):
        for name in columns:
            cor = np.corrcoef(X[name], y)[0, 1]
            cor_list.append(0 if np.isnan(cor) else cor)

    # indices of the num_feats largest |correlation| values
    order = np.argsort(np.abs(cor_list))
    keep = max(0, min(num_feats, len(columns)))
    top_idx = order[len(order) - keep:]
    cor_feature = [columns[i] for i in top_idx]

    # feature selection mask is built from X's own columns, so the function does
    # not depend on any notebook-level variable
    selected = set(cor_feature)
    cor_support = [name in selected for name in columns]
    return cor_support, cor_feature


# Pearson-correlation vote: boolean mask over the features plus the selected names.
cor_support, cor_feature = cor_selector(X, y)
print(len(cor_feature), 'selected features')

50 selected features


  c /= stddev[:, None]
  c /= stddev[None, :]


### CHI2

In [12]:
# chi2 needs non-negative inputs, so the features are min-max scaled first;
# X_norm is reused by the RFE cell.
X_norm = MinMaxScaler().fit_transform(X)
chi_selector = SelectKBest(chi2, k=50).fit(X_norm, y)
chi_support = chi_selector.get_support()
chi_feature = X.columns[chi_support].tolist()
print(len(chi_feature), 'selected features')

50 selected features


### RFE

In [13]:
# Recursive feature elimination around a logistic regression: 10 features are
# removed per iteration until 50 remain.
rfe_selector = RFE(estimator=LogisticRegression(), n_features_to_select=50, step=10, verbose=False).fit(X_norm, y)

rfe_support = rfe_selector.get_support()
rfe_feature = X.columns[rfe_support].tolist()
print(len(rfe_feature), 'selected features')

50 selected features


### SelectFromModel

In [21]:
# Random-forest vote: keep the features whose importance reaches 30x the median importance.
rf_estimator = RandomForestClassifier(n_estimators=100,random_state=0)
embeded_rf_selector = SelectFromModel(rf_estimator, threshold='30*median').fit(X, y)
embeded_rf_support = embeded_rf_selector.get_support()
embeded_rf_feature = X.columns[embeded_rf_support].tolist()
print(len(embeded_rf_feature), 'selected features')

53 selected features


In [22]:
# from lightgbm import LGBMClassifier

# lgbc=LGBMClassifier(n_estimators=500, learning_rate=0.05, num_leaves=32, colsample_bytree=0.2,
#             reg_alpha=3, reg_lambda=1, min_split_gain=0.01, min_child_weight=40)

# embeded_lgb_selector = SelectFromModel(lgbc, threshold='1e-20*mean')
# embeded_lgb_selector.fit(X, y)
# embeded_lgb_support = embeded_lgb_selector.get_support()

# embeded_lgb_feature = X.loc[:,embeded_lgb_support].columns.tolist()
# print(str(len(embeded_lgb_feature)), 'selected features')

In [23]:
# One row per feature, one boolean column per selection method.
vote_columns = ['Pearson', 'Chi-2', 'RFE', 'Random Forest']
feature_selection_df = pd.DataFrame({'Feature':feature_name, 'Pearson':cor_support, 'Chi-2':chi_support, 'RFE':rfe_support,
                                    'Random Forest':embeded_rf_support})
# count the selected times for each feature -- sum only the boolean vote columns;
# summing the whole frame would also try to add the string 'Feature' column,
# which fails on current pandas
feature_selection_df['Total'] = feature_selection_df[vote_columns].sum(axis=1)
# rank by number of votes (ties broken by feature name, descending) and display the top 100
feature_selection_df = feature_selection_df.sort_values(['Total','Feature'] , ascending=False)
feature_selection_df.index = range(1, len(feature_selection_df)+1)
feature_selection_df.head(100)

Unnamed: 0,Feature,Pearson,Chi-2,RFE,Random Forest,Total
1,tot_window,True,True,True,True,4
2,tot_document,True,True,True,True,4
3,number_of_unsecure_cookies,True,True,True,True,4
4,number_of_secure_cookies,True,True,True,True,4
5,number_of_non_http_only_cookies,True,True,True,True,4
6,number_of_http_only_cookies,True,True,True,True,4
7,number_of_cookies,True,True,True,True,4
8,number_functions_declared,True,True,True,True,4
9,dom_function_window.stop(,True,True,True,True,4
10,dom_function_window.setTimeout(,True,True,True,True,4


In [24]:
# Restrict the model to the highest-ranked features. feature_selection_df holds one
# row per feature (all of them), sorted by vote count, so it has to be truncated
# with head(); indexing df with the full 'Feature' column would select every
# feature again and merely reorder the columns.
N_TOP_FEATURES = 100
top_features = feature_selection_df.Feature.head(N_TOP_FEATURES)

X=df[top_features]
y=df.Malicious.values

# random_state fixed, matching the other LogisticRegression fits that set it
model = LogisticRegression(random_state=0)
model.fit(X, y)

# Training accuracy: scored on the same rows used for fitting.
print("Accuracy score with top 100 selected features:")
display(model.score(X,y))

Accuracy score with top 100 selected features:


0.857209750723041

In [25]:
pca=PCA(n_components=2,random_state=0)

# fit() is sufficient here: the projection is computed once, below, via transform().
# PCA is unsupervised, so y is not passed.
pca.fit(X)

print("Explained variance with 2 features",sum(pca.explained_variance_ratio_))

X_pca=pca.transform(X)

# Logistic regression on the two principal components only.
model = LogisticRegression(random_state=0)
model.fit(X_pca, y)

# NOTE(review): nothing is printed under this header — PCA builds components,
# it does not select columns. Consider dropping the header.
print("Columns selected:\n")
# Training accuracy: scored on the same rows used for fitting.
print("Accuracy score")
display(model.score(X_pca,y))

Explained variance with 2 features 0.9654018781166525
Columns selected:

Accuracy score


0.8569893954000827

In [26]:
# pca.components_ has one row per principal component and one column per feature.
# For each feature, take the variance (std squared) of its loadings across the
# components and scale it by 100.
# NOTE(review): this is the spread of a feature's loadings across components, not a
# share of explained variance — confirm the column label below is what is intended.
component_loadings = pca.components_
loading_variance = np.square(component_loadings.std(axis=0))

pca_df=pd.DataFrame({
    '% Variance explained by each feature': loading_variance*100,
    'Features': list(X.columns),
})
pca_df=pca_df.sort_values(by='% Variance explained by each feature',ascending=False)
pca_df

Unnamed: 0,% Variance explained by each feature,Features
7,26.33851,number_functions_declared
1,12.59782,tot_document
43,11.01065,dom_function_document.write(
33,0.05240352,dom_function_document.getElementById(
15,0.0003340251,dom_function_window.open(
31,8.794608e-05,dom_function_document.getElementsByTagName(
9,3.760054e-05,dom_function_window.setTimeout(
38,3.086893e-05,dom_function_document.createElement(
4,2.341362e-05,number_of_non_http_only_cookies
27,2.099716e-05,dom_function_window.addEventListener(
