In [1]:
import pandas as pd
import numpy as np 
import wandb
import matplotlib.pyplot as plt
# Make your plots appear within the notebook
%matplotlib inline
from collections import Counter 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [2]:
# Seed NumPy's global random generator so repeated runs of the notebook match.
np.random.seed(5)

# Remove pandas' display limits so printed frames show every row and every column.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

# Read the phishing dataset from disk.
# NOTE(review): this is an absolute, machine-specific path — consider a location
# relative to the notebook so the analysis can be re-run elsewhere.
csv_path = 'C:\\Users\\fderango\\Downloads\\Phishingproject.csv'
fish_df = pd.read_csv(csv_path)

# Preview the first ten rows.
fish_df.head(n=10)

Unnamed: 0,having_IP_Address,URL_Length,Shortining_Service,having_At_Symbol,double_slash_redirecting,Prefix_Suffix,having_Sub_Domain,SSLfinal_State,Domain_registeration_length,Favicon,port,HTTPS_token,Request_URL,URL_of_Anchor,Links_in_tags,SFH,Submitting_to_email,Abnormal_URL,Redirect,on_mouseover,RightClick,popUpWidnow,Iframe,age_of_domain,DNSRecord,web_traffic,Page_Rank,Google_Index,Links_pointing_to_page,Statistical_report,Result
0,-1,1,1,1,-1,-1,-1,-1,-1,1,1,-1,1,-1,1,-1,-1,-1,0,1,1,1,1,-1,-1,-1,-1,1,1,-1,-1
1,1,1,1,1,1,-1,0,1,-1,1,1,-1,1,0,-1,-1,1,1,0,1,1,1,1,-1,-1,0,-1,1,1,1,-1
2,1,0,1,1,1,-1,-1,-1,-1,1,1,-1,1,0,-1,-1,-1,-1,0,1,1,1,1,1,-1,1,-1,1,0,-1,-1
3,1,0,1,1,1,-1,-1,-1,1,1,1,-1,-1,0,0,-1,1,1,0,1,1,1,1,-1,-1,1,-1,1,-1,1,-1
4,1,0,-1,1,1,-1,1,1,-1,1,1,1,1,0,0,-1,1,1,0,-1,1,-1,1,-1,-1,0,-1,1,1,1,1
5,-1,0,-1,1,-1,-1,1,1,-1,1,1,-1,1,0,0,-1,-1,-1,0,1,1,1,1,1,1,1,-1,1,-1,-1,1
6,1,0,-1,1,1,-1,-1,-1,1,1,1,1,-1,-1,0,-1,-1,-1,0,1,1,1,1,1,-1,-1,-1,1,0,-1,-1
7,1,0,1,1,1,-1,-1,-1,1,1,1,-1,-1,0,-1,-1,1,1,0,1,1,1,1,-1,-1,0,-1,1,0,1,-1
8,1,0,-1,1,1,-1,1,1,-1,1,1,-1,1,0,1,-1,1,1,0,1,1,1,1,1,-1,1,1,1,0,1,1
9,1,1,-1,1,1,-1,-1,1,-1,1,1,1,1,0,1,-1,1,1,0,1,1,1,1,1,-1,0,-1,1,0,1,-1


In [3]:
# Milestone #1: recode the negative class label (-1) as 0 in the 'Result' column.
result_recoded = fish_df['Result'].replace(to_replace=-1, value=0)
fish_df['Result'] = result_recoded

# Show the first few recoded labels.
fish_df['Result'].head()

0    0
1    0
2    0
3    0
4    1
Name: Result, dtype: int64

In [4]:
# Milestone #2: count the missing (NaN) entries in every column.
fish_df.isna().sum()

having_IP_Address              0
URL_Length                     0
Shortining_Service             0
having_At_Symbol               0
double_slash_redirecting       0
Prefix_Suffix                  0
having_Sub_Domain              0
SSLfinal_State                 0
Domain_registeration_length    0
Favicon                        0
port                           0
HTTPS_token                    0
Request_URL                    0
URL_of_Anchor                  0
Links_in_tags                  0
SFH                            0
Submitting_to_email            0
Abnormal_URL                   0
Redirect                       0
on_mouseover                   0
RightClick                     0
popUpWidnow                    0
Iframe                         0
age_of_domain                  0
DNSRecord                      0
web_traffic                    0
Page_Rank                      0
Google_Index                   0
Links_pointing_to_page         0
Statistical_report             0
Result    

In [5]:
# Count, column by column, how many entries are exactly zero.
fish_df.eq(0).sum()

having_IP_Address                 0
URL_Length                      135
Shortining_Service                0
having_At_Symbol                  0
double_slash_redirecting          0
Prefix_Suffix                     0
having_Sub_Domain              3622
SSLfinal_State                 1167
Domain_registeration_length       0
Favicon                           0
port                              0
HTTPS_token                       0
Request_URL                       0
URL_of_Anchor                  5337
Links_in_tags                  4449
SFH                             761
Submitting_to_email               0
Abnormal_URL                      0
Redirect                       9776
on_mouseover                      0
RightClick                        0
popUpWidnow                       0
Iframe                            0
age_of_domain                     0
DNSRecord                         0
web_traffic                    2569
Page_Rank                         0
Google_Index                

In [6]:
# Turn the zeros in the selected columns into NaN so that they can be filled
# with the column mean afterwards.
# NOTE(review): the previewed rows show these features taking the values -1/0/1,
# so 0 may be a genuine category rather than a missing-value marker — confirm
# before treating it as missing.
columns = [
    'URL_Length',
    'having_Sub_Domain',
    'SSLfinal_State',
    'URL_of_Anchor',
    'Links_in_tags',
    'SFH',
    'Redirect',
    'web_traffic',
    'Links_pointing_to_page',
]

# Write the result back into the same columns so the rest of the frame is kept.
fish_df[columns] = fish_df[columns].replace(to_replace=0, value=np.nan)


In [7]:
# Preview the first five rows of the frame.
fish_df.head(n=5)

Unnamed: 0,having_IP_Address,URL_Length,Shortining_Service,having_At_Symbol,double_slash_redirecting,Prefix_Suffix,having_Sub_Domain,SSLfinal_State,Domain_registeration_length,Favicon,port,HTTPS_token,Request_URL,URL_of_Anchor,Links_in_tags,SFH,Submitting_to_email,Abnormal_URL,Redirect,on_mouseover,RightClick,popUpWidnow,Iframe,age_of_domain,DNSRecord,web_traffic,Page_Rank,Google_Index,Links_pointing_to_page,Statistical_report,Result
0,-1,1.0,1,1,-1,-1,-1.0,-1.0,-1,1,1,-1,1,-1.0,1.0,-1.0,-1,-1,,1,1,1,1,-1,-1,-1.0,-1,1,1.0,-1,0
1,1,1.0,1,1,1,-1,,1.0,-1,1,1,-1,1,,-1.0,-1.0,1,1,,1,1,1,1,-1,-1,,-1,1,1.0,1,0
2,1,,1,1,1,-1,-1.0,-1.0,-1,1,1,-1,1,,-1.0,-1.0,-1,-1,,1,1,1,1,1,-1,1.0,-1,1,,-1,0
3,1,,1,1,1,-1,-1.0,-1.0,1,1,1,-1,-1,,,-1.0,1,1,,1,1,1,1,-1,-1,1.0,-1,1,-1.0,1,0
4,1,,-1,1,1,-1,1.0,1.0,-1,1,1,1,1,,,-1.0,1,1,,-1,1,-1,1,-1,-1,,-1,1,1.0,1,1


In [8]:
# Fill every NaN with the mean of its own column; means are taken only over the
# columns listed in `columns`.
# NOTE(review): the means are computed on the full frame, i.e. before the
# train/test split that happens later in the notebook.
column_means = fish_df[columns].mean()
fish_df = fish_df.fillna(value=column_means)

# Preview the imputed frame.
fish_df.head()

Unnamed: 0,having_IP_Address,URL_Length,Shortining_Service,having_At_Symbol,double_slash_redirecting,Prefix_Suffix,having_Sub_Domain,SSLfinal_State,Domain_registeration_length,Favicon,port,HTTPS_token,Request_URL,URL_of_Anchor,Links_in_tags,SFH,Submitting_to_email,Abnormal_URL,Redirect,on_mouseover,RightClick,popUpWidnow,Iframe,age_of_domain,DNSRecord,web_traffic,Page_Rank,Google_Index,Links_pointing_to_page,Statistical_report,Result
0,-1,1.0,1,1,-1,-1,-1.0,-1.0,-1,1,1,-1,1,-1.0,1.0,-1.0,-1,-1,1.0,1,1,1,1,-1,-1,-1.0,-1,1,1.0,-1,0
1,1,1.0,1,1,1,-1,0.095116,1.0,-1,1,1,-1,1,-0.147954,-1.0,-1.0,1,1,1.0,1,1,1,1,-1,-1,0.374263,-1,1,1.0,1,0
2,1,-0.641026,1,1,1,-1,-1.0,-1.0,-1,1,1,-1,1,-0.147954,-1.0,-1.0,-1,-1,1.0,1,1,1,1,1,-1,1.0,-1,1,0.776281,-1,0
3,1,-0.641026,1,1,1,-1,-1.0,-1.0,1,1,1,-1,-1,-0.147954,-0.197699,-1.0,1,1,1.0,1,1,1,1,-1,-1,1.0,-1,1,-1.0,1,0
4,1,-0.641026,-1,1,1,-1,1.0,1.0,-1,1,1,1,1,-0.147954,-0.197699,-1.0,1,1,1.0,-1,1,-1,1,-1,-1,0.374263,-1,1,1.0,1,1


In [9]:
# Milestone #3: separate the features from the class label and hold out a test set.

# Feature matrix: every column except the class label.
X = fish_df.drop(columns=['Result'])

# Class labels as a 1-D Series. Slicing the label out as a one-column DataFrame
# (iloc[:, 30:31]) hands scikit-learn a column vector, which makes every fit emit
# a warning asking for a 1-D array instead.
y = fish_df['Result']

# 80/20 split; the fixed random_state keeps the partition identical on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [10]:
# Preview the first five rows of the training features.
X_train.head(n=5)

Unnamed: 0,having_IP_Address,URL_Length,Shortining_Service,having_At_Symbol,double_slash_redirecting,Prefix_Suffix,having_Sub_Domain,SSLfinal_State,Domain_registeration_length,Favicon,port,HTTPS_token,Request_URL,URL_of_Anchor,Links_in_tags,SFH,Submitting_to_email,Abnormal_URL,Redirect,on_mouseover,RightClick,popUpWidnow,Iframe,age_of_domain,DNSRecord,web_traffic,Page_Rank,Google_Index,Links_pointing_to_page,Statistical_report
480,1,-1.0,1,1,1,-1,1.0,1.0,-1,1,1,1,1,1.0,1.0,-1.0,1,1,1.0,1,1,1,1,1,1,1.0,-1,1,0.776281,1
10812,-1,-1.0,1,1,1,-1,0.095116,1.0,1,1,1,1,-1,-0.147954,-1.0,-1.0,1,1,1.0,1,1,1,1,1,1,0.374263,-1,-1,0.776281,1
4064,1,1.0,1,1,1,1,0.095116,1.0,-1,1,1,1,1,-0.147954,-1.0,1.0,1,1,1.0,1,1,1,1,-1,1,0.374263,-1,-1,0.776281,1
8225,-1,-1.0,-1,-1,-1,-1,1.0,-1.0,-1,1,1,-1,1,-0.147954,-0.197699,-0.63979,1,-1,1.0,1,1,1,1,1,-1,0.374263,-1,-1,1.0,1
9432,-1,-1.0,1,1,1,-1,-1.0,-1.0,-1,1,1,-1,-1,-1.0,-0.197699,-0.63979,1,1,1.0,1,1,1,1,1,1,1.0,-1,-1,0.776281,1


In [11]:
# Milestone #4, step 1: fit a logistic regression with its default settings on the
# training split, then predict the class of every held-out sample.
log = LogisticRegression().fit(X_train, y_train)
predict = log.predict(X_test)

# Display the predicted labels.
predict


  return f(*args, **kwargs)


array([0, 0, 0, ..., 0, 1, 0], dtype=int64)

In [12]:
# Milestone #4, step 2: evaluate the baseline model on the test split.
#
# Metrics in the report (TP/FP = true/false positives, TN/FN = true/false negatives):
#   Precision = TP / (TP + FP)
#       share of the positive predictions that really are positive.
#   Recall    = TP / (TP + FN)
#       share of the actual positives that the model found.
#   F1 score  = 2 * (Recall * Precision) / (Recall + Precision)
#       a single number balancing precision and recall.
#   Accuracy  = (TP + TN) / (TP + FP + FN + TN)
#       share of all observations that were predicted correctly.
report = classification_report(y_true=y_test, y_pred=predict)
print(report)

              precision    recall  f1-score   support

           0       0.92      0.90      0.91       956
           1       0.92      0.94      0.93      1255

    accuracy                           0.92      2211
   macro avg       0.92      0.92      0.92      2211
weighted avg       0.92      0.92      0.92      2211



In [13]:
# Milestone #4, steps 3-4: randomized search over logistic-regression hyperparameters.

model = LogisticRegression()

# Candidate values for each hyperparameter.
penalty = ['l1', 'l2']
c = [0.8, 0.9, 1.0]
tol = [0.01, 0.001, 0.0001]
max_iter = [100, 150, 200, 250]
solvers = ['newton-cg', 'lbfgs', 'liblinear']

# Values that are valid for every solver.
shared_space = dict(C=c, tol=tol, max_iter=max_iter)

# Of these solvers only 'liblinear' supports the l1 penalty. Putting everything
# in one dict lets the search draw l1 + newton-cg/lbfgs candidates, whose fits
# fail and are silently scored 0 by error_score, wasting part of the sampling
# budget. A list of dicts restricts sampling to valid solver/penalty pairs.
grid = [
    dict(solver=['newton-cg', 'lbfgs'], penalty=['l2'], **shared_space),
    dict(solver=['liblinear'], penalty=penalty, **shared_space),
]

# random_state pins which candidates are sampled, so the result does not depend
# on the state of NumPy's global generator (i.e. on which cells ran before).
grid_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=grid,
    n_jobs=-1,
    scoring='accuracy',
    error_score=0,
    random_state=42,
)
grid_result = grid_search.fit(X_train, y_train)
print("Best score: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Best score: 0.919946 using {'tol': 0.0001, 'solver': 'liblinear', 'penalty': 'l2', 'max_iter': 200, 'C': 0.8}


  return f(*args, **kwargs)


In [15]:
# Milestone #4, step 5: predict on the test split with the fitted search object
# and print the classification report for those predictions.
pre = grid_result.predict(X_test)
report_2 = classification_report(y_true=y_test, y_pred=pre)
print(report_2)

              precision    recall  f1-score   support

           0       0.92      0.90      0.91       956
           1       0.92      0.94      0.93      1255

    accuracy                           0.92      2211
   macro avg       0.92      0.92      0.92      2211
weighted avg       0.92      0.92      0.92      2211



In [None]:
# 6 
# Import libraries
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from sklearn.linear_model import LogisticRegression
import wandb
import time



def train_eval_pipeline(model, train_data, test_data, name, config=None):
    """Train ``model``, evaluate it on the test split and log the metrics to Weights & Biases.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    train_data : tuple
        ``(X_train, y_train)`` used for fitting.
    test_data : tuple
        ``(X_test, y_test)`` used for evaluation.
    name : str
        Name of the W&B run; also used in the printed summary.
    config : dict, optional
        Hyperparameters to record with the run. Forwarded to ``wandb.init``;
        when omitted, no hyperparameters are attached.

    Returns
    -------
    None
        Metrics are logged to W&B and a summary is printed.
    """
    # Segregate the datasets
    (X_train, y_train) = train_data
    (X_test, y_test) = test_data

    # Start the run, attaching the hyperparameters at creation time.
    run = wandb.init(project="phishing-websites-detection", name=name, config=config)
    try:
        # Time only the fit; perf_counter is a monotonic clock meant for durations.
        start = time.perf_counter()
        model.fit(X_train, y_train)
        training_time = time.perf_counter() - start
        prediction = model.predict(X_test)

        accuracy = accuracy_score(y_test, prediction) * 100.0
        # Macro-averaged scores, computed once; the first two entries are
        # precision and recall.
        precision, recall = precision_recall_fscore_support(
            y_test, prediction, average='macro'
        )[:2]

        run.log({"accuracy": accuracy,
                 "precision": precision,
                 "recall": recall,
                 "training_time": training_time})
    finally:
        # Close the run explicitly so it does not stay open in the notebook
        # and leak into a later call.
        run.finish()

    # The summary is labelled with the run name: the model passed in is not
    # necessarily a default-hyperparameter classifier.
    print("Accuracy score of the '{0}' run: {1:.2f}%".format(name, accuracy))
    print("\n")
    print("----Classification report of the '{0}' run----".format(name))

    print("\n")
    print(classification_report(y_test, prediction, target_names=["Phishing Websites", "Normal Websites"]))

# Hyperparameters for this run, kept in a plain dict. wandb.config is only
# usable once wandb.init() has started a run, so setting attributes on it here,
# before the pipeline calls wandb.init(), does not work on a fresh kernel.
# Passing the dict to the pipeline records every value (including max_iter)
# with the run.
logreg_params = {"tol": 0.001, "penalty": "l2", "C": 1.0, "max_iter": 250}

# Train the model and print the summary of model performance
logreg = LogisticRegression(**logreg_params)

train_eval_pipeline(logreg, (X_train, y_train), (X_test, y_test),
                    "logistic-regression-random-search", config=logreg_params)

    
    
#wandb.login()
#wandb.init(project="my-phlishing-project")

wandb: You can find your API key in your browser here: https://wandb.ai/authorize
