In [1]:
import pandas as pd
import numpy as np
import os
import json
from pandas.io.json import json_normalize
from IPython.display import display,clear_output
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows',305)
from sklearn.feature_selection import SelectKBest,chi2,RFE,RFECV,f_regression,SelectFromModel
from sklearn.linear_model import LogisticRegression,RandomizedLasso,LinearRegression, Ridge,Lasso
from sklearn.ensemble import ExtraTreesClassifier,RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
import time

from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from minepy import MINE

import warnings
warnings.simplefilter("ignore", DeprecationWarning)
from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import PCA

### Eliminating cases where html_status=="Not success" as they have all null values

In [2]:
df_alex=pd.read_csv('Javascript_alexa_output.csv',low_memory=False)
df_virus=pd.read_csv('Javascript_malicious_output.csv',low_memory=False)

df_alex['Malicious']=0
df_virus['Malicious']=1

df=pd.concat([df_alex,df_virus])

df.drop(['Unnamed: 0'],axis=1,inplace=True)

df = df.sample(frac=1,random_state=867).reset_index(drop=True)

df.fillna(value=0,inplace=True)

display(df.shape)

df=df[df['html_status']=='Success']
df.shape

(45700, 303)

(42968, 303)

### Dropping columns that indicate the html status of domain data collection

In [3]:
df.drop(['html_status','html_status_strerror','html_status_message'],axis=1,inplace=True)
df.set_index(['domain'],inplace=True)

In [4]:
cols_WebAssembly=[col for col in df.columns if 'js_function_WebAssembly' in col]
cols_Reflect=[col for col in df.columns if 'Reflect' in col]
cols_Object=[col for col in df.columns if 'js_function_Object' in col]
cols_Number=[col for col in df.columns if 'js_function_Number' in col]
cols_Math=[col for col in df.columns if 'js_function_Math' in col]
cols_Atomics=[col for col in df.columns if 'js_function_Atomics' in col]
cols_Promise=[col for col in df.columns if 'Promise' in col]



df['tot_WebAssembly']=df[cols_WebAssembly].sum(axis=1)
df['tot_Reflect']=df[cols_Reflect].sum(axis=1)
df['tot_Object']=df[cols_Object].sum(axis=1)
df['tot_Number']=df[cols_Number].sum(axis=1)
df['tot_Math']=df[cols_Math].sum(axis=1)
df['tot_Atomics']=df[cols_Atomics].sum(axis=1)
df['tot_Promise']=df[cols_Atomics].sum(axis=1)

len(cols_WebAssembly)+len(cols_Reflect)+len(cols_Object)+len(cols_Number)+len(cols_Math)+len(cols_Atomics)+len(cols_Promise)

97

#### Columns and nummber of unique values they have

In [5]:
lst=[]
for col in df.columns:
    lst.append((col,len(df[col].value_counts())))

unique_val_cols=pd.DataFrame(lst)
unique_val_cols.columns=['Column_Name','Num_unique_value']

unique_val_cols.sort_values(by=['Num_unique_value'],inplace=True)
unique_val_cols.head()

Unnamed: 0,Column_Name,Num_unique_value
305,tot_Promise,1
177,js_function_Atomics.store(,1
243,js_function_Object.isSealed(,1
108,js_function_.setFloat32(,1
109,js_function_.setFloat64(,1


In [6]:
print("Columns with all same values")
col_1=unique_val_cols[unique_val_cols['Num_unique_value']==1].Column_Name.values

for col in col_1:
    print(df[col].value_counts(),"\n")

Columns with all same values
0.0    42968
Name: tot_Promise, dtype: int64 

0.0    42968
Name: js_function_Atomics.store(, dtype: int64 

0.0    42968
Name: js_function_Object.isSealed(, dtype: int64 

0.0    42968
Name: js_function_.setFloat32(, dtype: int64 

0.0    42968
Name: js_function_.setFloat64(, dtype: int64 

0.0    42968
Name: js_function_Atomics.or(, dtype: int64 

0.0    42968
Name: js_function_Atomics.load(, dtype: int64 

0.0    42968
Name: js_function_Atomics.sub(, dtype: int64 

0.0    42968
Name: js_function_Proxy.revocable(, dtype: int64 

0.0    42968
Name: js_function_Reflect.apply(, dtype: int64 

0.0    42968
Name: js_function_Atomics.isLockFree(, dtype: int64 

0.0    42968
Name: js_function_.getFloat64(, dtype: int64 

0.0    42968
Name: js_function_.getFloat32(, dtype: int64 

0.0    42968
Name: js_function_Atomics.exchange(, dtype: int64 

0.0    42968
Name: js_function_Atomics.compareExchange(, dtype: int64 

0.0    42968
Name: js_function_.setInt16(, dtype

#### Dependent and Target variables

In [7]:
X=df.loc[:,df.columns!='Malicious']
y=df.Malicious.values
feature_name = X.columns.tolist()

X=df.loc[:,df.columns!='Malicious']
y=df.Malicious.values
feature_name = X.columns.tolist()

model = LogisticRegression(random_state=0)
model.fit(X, y)

print("Accuracy score with all of the features")
display(model.score(X,y))

Accuracy score with all of the features


0.8698566374976727

In [8]:
pca=PCA(n_components=2,random_state=0)

pca.fit_transform(X,y)

print("Explained variance with 2 features",sum(pca.explained_variance_ratio_))

X_pca=pca.transform(X)

model = LogisticRegression(random_state=0)
model.fit(X_pca, y)

print("Columns selected:\n")
# display(b.columns[:-1][fit.support_])
print("Accuracy score")
display(model.score(X_pca,y))

Explained variance with 2 features 0.9417721915529939
Columns selected:

Accuracy score


0.8700195494321356

In [9]:
pca_inv_data = pca.components_
lst=[]
lst.append((np.square(pca_inv_data.std(axis=0)),X.columns))
pca_df=pd.DataFrame(np.square(pca_inv_data.std(axis=0)))
pca_df['features']=X.columns
pca_df.columns=['% Variance explained by each feature','Features']
pca_df['% Variance explained by each feature']=pca_df['% Variance explained by each feature']*100
pca_df.sort_values(by='% Variance explained by each feature',ascending=False,inplace=True)
pca_df

Unnamed: 0,% Variance explained by each feature,Features
292,31.81697,number_functions_declared
96,17.10937,js_function_.push(
9,0.2372004,js_function_.call(
83,0.1920854,js_function_.log(
29,0.1528904,js_function_.find(
302,0.08971389,tot_Math
100,0.03943011,js_function_.replace(
4,0.03783191,js_function_.apply(
300,0.03590346,tot_Object
73,0.03416923,js_function_.indexOf(


### Correlation

In [10]:
def cor_selector(X, y):
    cor_list = []
    # calculate the correlation with y for each feature
    for i in X.columns.tolist():
        cor = np.corrcoef(X[i], y)[0, 1]
        cor_list.append(cor)
    # replace NaN with 0
    cor_list = [0 if np.isnan(i) else i for i in cor_list]
    # feature name
    cor_feature = X.iloc[:,np.argsort(np.abs(cor_list))[-100:]].columns.tolist()
    # feature selection? 0 for not select, 1 for select
    cor_support = [True if i in cor_feature else False for i in feature_name]
    return cor_support, cor_feature


cor_support, cor_feature = cor_selector(X, y)
print(str(len(cor_feature)), 'selected features')

  c /= stddev[:, None]
  c /= stddev[None, :]


100 selected features


### CHI2

In [11]:
X_norm = MinMaxScaler().fit_transform(X)
chi_selector = SelectKBest(chi2, k=100)
chi_selector.fit(X_norm, y)
chi_support = chi_selector.get_support()
chi_feature = X.loc[:,chi_support].columns.tolist()
print(str(len(chi_feature)), 'selected features')

100 selected features


### RFE

In [12]:
rfe_selector = RFE(estimator=LogisticRegression(), n_features_to_select=100, step=10, verbose=False)
rfe_selector.fit(X_norm, y)

rfe_support = rfe_selector.get_support()
rfe_feature = X.loc[:,rfe_support].columns.tolist()
print(str(len(rfe_feature)), 'selected features')

100 selected features


### SelectFromModel

In [13]:
embeded_rf_selector = SelectFromModel(RandomForestClassifier(n_estimators=100), threshold='23*median')
embeded_rf_selector.fit(X, y)
embeded_rf_support = embeded_rf_selector.get_support()
embeded_rf_feature = X.loc[:,embeded_rf_support].columns.tolist()
print(str(len(embeded_rf_feature)), 'selected features')

124 selected features


In [14]:
# from lightgbm import LGBMClassifier

# lgbc=LGBMClassifier(n_estimators=500, learning_rate=0.05, num_leaves=32, colsample_bytree=0.2,
#             reg_alpha=3, reg_lambda=1, min_split_gain=0.01, min_child_weight=40)

# embeded_lgb_selector = SelectFromModel(lgbc, threshold='1e-20*mean')
# embeded_lgb_selector.fit(X, y)
# embeded_lgb_support = embeded_lgb_selector.get_support()

# embeded_lgb_feature = X.loc[:,embeded_lgb_support].columns.tolist()
# print(str(len(embeded_lgb_feature)), 'selected features')

In [15]:
feature_selection_df = pd.DataFrame({'Feature':feature_name, 'Pearson':cor_support, 'Chi-2':chi_support, 'RFE':rfe_support,
                                    'Random Forest':embeded_rf_support})
# count the selected times for each feature
feature_selection_df['Total'] = np.sum(feature_selection_df, axis=1)
# display the top 100
feature_selection_df = feature_selection_df.sort_values(['Total','Feature'] , ascending=False)
feature_selection_df.index = range(1, len(feature_selection_df)+1)
feature_selection_df.head(100)

Unnamed: 0,Feature,Pearson,Chi-2,RFE,Random Forest,Total
1,tot_Object,True,True,True,True,4
2,tot_Math,True,True,True,True,4
3,number_of_unsecure_cookies,True,True,True,True,4
4,number_of_secure_cookies,True,True,True,True,4
5,number_of_non_http_only_cookies,True,True,True,True,4
6,number_of_http_only_cookies,True,True,True,True,4
7,number_of_cookies,True,True,True,True,4
8,number_functions_declared,True,True,True,True,4
9,js_function_parseInt(,True,True,True,True,4
10,js_function_parseFloat(,True,True,True,True,4


In [16]:
X=df[feature_selection_df.Feature]
y=df.Malicious.values

model = LogisticRegression()
model.fit(X, y)

print("Accuracy score with top 100 selected features:")
display(model.score(X,y))

Accuracy score with top 100 selected features:


0.8698799106311674

In [17]:
pca=PCA(n_components=2,random_state=0)

pca.fit_transform(X,y)

print("Explained variance with 2 features",sum(pca.explained_variance_ratio_))

X_pca=pca.transform(X)

model = LogisticRegression(random_state=0)
model.fit(X_pca, y)

print("Columns selected:\n")
# display(b.columns[:-1][fit.support_])
print("Accuracy score")
display(model.score(X_pca,y))

Explained variance with 2 features 0.9417721915529831
Columns selected:

Accuracy score


0.8700195494321356

In [18]:
pca_inv_data = pca.components_
lst=[]
lst.append((np.square(pca_inv_data.std(axis=0)),X.columns))
pca_df=pd.DataFrame(np.square(pca_inv_data.std(axis=0)))
pca_df['features']=X.columns
pca_df.columns=['% Variance explained by each feature','Features']
pca_df['% Variance explained by each feature']=pca_df['% Variance explained by each feature']*100
pca_df.sort_values(by='% Variance explained by each feature',ascending=False,inplace=True)
pca_df

Unnamed: 0,% Variance explained by each feature,Features
7,31.81697,number_functions_declared
45,17.10937,js_function_.push(
74,0.2372004,js_function_.call(
106,0.1920854,js_function_.log(
109,0.1528904,js_function_.find(
1,0.08971389,tot_Math
44,0.03943011,js_function_.replace(
75,0.03783191,js_function_.apply(
0,0.03590346,tot_Object
54,0.03416923,js_function_.indexOf(
