In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import pickle
import warnings
import nltk 
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.snowball import SnowballStemmer
from sklearn.linear_model import LogisticRegression
#from sklearn.pipeline import Pipeline
#from sklearn.pipeline import make_pipeline
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report, precision_score,recall_score, accuracy_score, make_scorer, f1_score, roc_curve, roc_auc_score
from sklearn.model_selection import cross_val_score,train_test_split, cross_validate, cross_val_predict
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.decomposition import PCA
from pandarallel import pandarallel
from os import walk
from wordcloud import WordCloud, STOPWORDS
from time import time
from scipy.stats import wilcoxon
import os
import sys
import random
from sklearn.exceptions import ConvergenceWarning
from pandas.errors import SettingWithCopyWarning

# Initialization
pandarallel.initialize(progress_bar=True)

# disable unuseful warnings
if not sys.warnoptions:
    warnings.simplefilter("ignore", category=ConvergenceWarning)
    warnings.simplefilter("ignore", category=FutureWarning)
    warnings.simplefilter("ignore", category=SettingWithCopyWarning)
os.environ["PYTHONWARNINGS"] = ('ignore::UserWarning,ignore::RuntimeWarning')


ModuleNotFoundError: No module named 'plotly'

## Import Data

In [None]:
# Read the urls database
urls_df = pd.read_csv(r'phishing_site_urls.csv')

# Rename the column
urls_df.rename(columns = {'URL':'Content'}, inplace = True)

# Change label "bad" in 0 and "good" in 1
urls_df.loc[urls_df["Label"]=="bad", "Label"]= 0
urls_df.loc[urls_df["Label"]=="good", "Label"]= 1
urls_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 549346 entries, 0 to 549345
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   Content  549346 non-null  object
 1   Label    549346 non-null  object
dtypes: object(2)
memory usage: 8.4+ MB


In [None]:
urls_df.drop_duplicates(inplace=True)
urls_df.isnull().sum()

Content    0
Label      0
dtype: int64

In [None]:
# URL labels overview
label_counter = pd.DataFrame(urls_df.Label.value_counts())
print(label_counter.Label)
fig = px.bar(label_counter, x=label_counter.index, y=label_counter.Label)
fig.update_layout(
    xaxis_title="Label",
    yaxis_title="Instances",
)
fig.show()

1    392897
0    114299
Name: Label, dtype: int64


In [None]:
# Read the emails dataset
pathwalk = walk(r"enron-spam/")

allHamData, allSpamData = [], []
for root, dr, file in pathwalk:
    if 'ham' in str(file):
        for obj in file:
            with open(root + '/' + obj, encoding='latin1') as ip:
                allHamData.append(" ".join(ip.readlines()))

    elif 'spam' in str(file):
        for obj in file:
            with open(root + '/' + obj, encoding='latin1') as ip:
                allSpamData.append(" ".join(ip.readlines()))
                
# remove all redundant data
allHamData = list(set(allHamData))
allSpamData = list(set(allSpamData))

# merge it in a dataframe
hamPlusSpamData = allHamData + allSpamData

# Labels: "bad" = 0 and "good" = 1
labels = [1]*len(allHamData) + [0]*len(allSpamData)

emails_df = pd.DataFrame({"Content": hamPlusSpamData, "Label": labels})

In [None]:
# Email labels overview
label_counter = pd.DataFrame(emails_df.Label.value_counts())
print(label_counter.Label)
fig = px.bar(label_counter, x=label_counter.index, y=label_counter.Label)
fig.update_layout(
    xaxis_title="Label",
    yaxis_title="Instances",
)
fig.show()

1    15910
0    14584
Name: Label, dtype: int64


In [None]:
df = pd.concat([urls_df, emails_df], axis=0)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 537690 entries, 0 to 30493
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   Content  537690 non-null  object
 1   Label    537690 non-null  object
dtypes: object(2)
memory usage: 12.3+ MB


In [None]:
label_counter = pd.DataFrame(df.Label.value_counts())
print(label_counter.Label)
fig = px.bar(label_counter, x=label_counter.index, y=label_counter.Label)
fig.update_layout(
    xaxis_title="Label",
    yaxis_title="Instances",
)
fig.show()

1    408807
0    128883
Name: Label, dtype: int64


## Pipeline

In [None]:
stopwords = nltk.corpus.stopwords.words('english')
snow_stemmer = SnowballStemmer('english')

In [None]:
analyzer = CountVectorizer(stop_words = stopwords).build_analyzer()

In [None]:
def print_metrics(results):
    metrics = ['accuracy','precision_ham', 'recall_ham','precision_spam', 'recall_spam']
    for metric in metrics:
        print(f'{metric:15}', np.round(results["test_"+metric].mean(),3), [np.round(elem,3) for elem in results["test_"+metric]])
    print()
    print(f'{"vocabulary size":15}', np.mean([len(results['estimator'][i][0].vocabulary_) for i in range(n_folds)]))
    print(f'{"fit_time":15}', np.round(results["fit_time"].mean()))
    print(f'{"score_time":15}', np.round(results["score_time"].mean()))

In [None]:
# KFold cross_validate
n_folds = 10
# instead of Kfold()
kf = StratifiedKFold(n_folds)

y = df.Label
y=y.astype('int')
y

0        0
1        0
2        0
3        0
4        0
        ..
30489    0
30490    0
30491    0
30492    0
30493    0
Name: Label, Length: 537690, dtype: int64

In [None]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from imblearn.metrics import classification_report_imbalanced
from imblearn.under_sampling import NearMiss
from imblearn.pipeline import make_pipeline as make_pipeline_imb

# Split in training and test set but with stratify (balance split)
X_train, X_test, y_train, y_test = train_test_split(df.Content.values, y,
                                                    stratify=y, 
                                                    test_size=0.25, random_state = 42)


## MFNN

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score

# Initialize the MFNN classifier
mlp = MLPClassifier(hidden_layer_sizes=(5, 2), max_iter=200,random_state=42)

# Create the pipeline
pipe = Pipeline([('vect', TfidfVectorizer(stop_words = stopwords)), ('samp', RandomUnderSampler()), ('clf', mlp)])

# Fit the pipeline to the data
pipe.fit(X_train, y_train, verbose=2)

# Predict the class labels for the provided data
y_pred = pipe.predict(X_test)

# around 290 minutes
MFNN_tfidf = cross_validate(pipe,
                         df.Content.values,
                         y,
                         scoring = {'precision_ham': make_scorer(precision_score,pos_label = 1),                         
                                    'precision_spam': make_scorer(precision_score,pos_label = 0),
                                    'recall_ham': make_scorer(recall_score,pos_label = 1),
                                    'recall_spam': make_scorer(recall_score,pos_label = 0),
                                    'accuracy': make_scorer(accuracy_score),
                                    'fscore_spam': make_scorer(f1_score,pos_label = 0),
                                    'fscore_ham': make_scorer(f1_score,pos_label = 1)},                    
                         return_estimator = True,
                         cv = kf,
                         n_jobs = 12) # Number of jobs to run in parallel. 
                                      # Training the estimator and computing the score are parallelized over the cross-validation splits.

print_metrics(MFNN_tfidf)


"from sklearn.neural_network import MLPClassifier\nfrom sklearn.datasets import make_classification\nfrom sklearn.metrics import accuracy_score\n\n# Initialize the MFNN classifier\nmlp = MLPClassifier(hidden_layer_sizes=(5, 2), max_iter=200,random_state=42)\n\n# Create the pipeline\npipe = Pipeline([('vect', TfidfVectorizer(stop_words = stopwords)), ('samp', RandomUnderSampler()), ('clf', mlp)])\n\n# Fit the pipeline to the data\npipe.fit(X_train, y_train, verbose=2)\n\n# Predict the class labels for the provided data\ny_pred = pipe.predict(X_test)\n\nMFNN_tfidf = cross_validate(pipe,\n                         df.Content.values,\n                         y,\n                         scoring = {'precision_ham': make_scorer(precision_score,pos_label = 1),                         \n                                    'precision_spam': make_scorer(precision_score,pos_label = 0),\n                                    'recall_ham': make_scorer(recall_score,pos_label = 1),\n                   

In [None]:
import pickle
pickle.dump(pipe,open('phishing_MFNN1.pkl','wb'))
loaded_model = pickle.load(open('phishing_MFNN1.pkl', 'rb'))
result = loaded_model.score(X_test,y_test)
print(result)


0.760301436510121


KeyError: 'test_accuracy'