In [1]:
import pandas as pd 
from sklearn.model_selection import train_test_split
import numpy as np

# Fix the random seed
np.random.seed(7)
# Identify  how many rows and columns we may need to clean 
path=  "./Phishing.csv"  
data=pd.read_csv(path)    
(rows, cols) = data.shape
print ("Read", rows, "rows with", cols, "Columns")
data.rename(columns={"Result": "Class"}, inplace=True)

data["Class"] = data["Class"].map({-1:0, 1:1})
data["Class"].unique()


Read 11055 rows with 31 Columns


array([0, 1], dtype=int64)

In [2]:
#We can look for missing values in the dataset like so -
data.isna().sum()

having_IP_Address              0
URL_Length                     0
Shortining_Service             0
having_At_Symbol               0
double_slash_redirecting       0
Prefix_Suffix                  0
having_Sub_Domain              0
SSLfinal_State                 0
Domain_registeration_length    0
Favicon                        0
port                           0
HTTPS_token                    0
Request_URL                    0
URL_of_Anchor                  0
Links_in_tags                  0
SFH                            0
Submitting_to_email            0
Abnormal_URL                   0
Redirect                       0
on_mouseover                   0
RightClick                     0
popUpWidnow                    0
Iframe                         0
age_of_domain                  0
DNSRecord                      0
web_traffic                    0
Page_Rank                      0
Google_Index                   0
Links_pointing_to_page         0
Statistical_report             0
Class     

In [3]:
# split up into training an test sets . 
# Let's now split the dataset in a 80:20 ratio. 
 
from sklearn.model_selection import train_test_split
 
X = data.iloc[:,0:30].values.astype(int)
y = data.iloc[:,30].values.astype(int)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=np.random.seed(7))


print ("Training DF     has ", X_train.shape[0], "rows with", X_train.shape[1], "Columns")
print ("Test     DF     has ", X_test.shape[0], "rows with", X_test.shape[1], "Columns")
#print ("Training Result has ", len(y_train), "results ")
#print ("Test     Result has ", len(y_test), "results ")


Training DF     has  8844 rows with 30 Columns
Test     DF     has  2211 rows with 30 Columns


In [4]:
# Serialize the numpy arrays
np.save("X_train.npy", X_train), np.save("y_train.npy", y_train)
np.save("X_test.npy", X_train), np.save("y_test.npy", y_train)

(None, None)

In [5]:
# instantiate a logistic regression model, and fit with X and y
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model = model.fit(X_train, y_train)


In [6]:
predicted = model.predict(X_test)
print("predicted" , predicted)
# check the accuracy on the training set


predicted [0 0 0 ... 0 1 0]


In [7]:
model.score(X_train, y_train)
probs = model.predict_proba(X_test) 
print (probs)

[[9.97743414e-01 2.25658600e-03]
 [8.13428411e-01 1.86571589e-01]
 [7.67475752e-01 2.32524248e-01]
 ...
 [9.94115162e-01 5.88483766e-03]
 [1.58703772e-01 8.41296228e-01]
 [9.99856616e-01 1.43384102e-04]]


In [8]:
# generate evaluation metrics
from sklearn import metrics
print ("accuracy score",metrics.accuracy_score(y_test, predicted))
print("roc_auc_score",metrics.roc_auc_score(y_test, probs[:, 1]))


accuracy score 0.9371325192220714
roc_auc_score 0.9839028151502526


In [9]:
# confusion_matrix to see how predictins compared with the real results 
from sklearn import metrics
# True Positives (good)| False Positives (bad)
# False Negatives (bad)| true Negatives(good)
 
print("confusion_matrix\n",metrics.confusion_matrix(y_test, predicted))
# classification 
print("classification_report\n",metrics.classification_report(y_test, predicted))

confusion_matrix
 [[ 896   78]
 [  61 1176]]
classification_report
               precision    recall  f1-score   support

           0       0.94      0.92      0.93       974
           1       0.94      0.95      0.94      1237

    accuracy                           0.94      2211
   macro avg       0.94      0.94      0.94      2211
weighted avg       0.94      0.94      0.94      2211



In [15]:
# try to find a better fit using RandomizedSearchCV
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import numpy as np

from time import time
import scipy.stats as stats
from sklearn.utils.fixes import loguniform
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
 

# Define the grid of values
penalty = ['l1', 'l2'] 
C = (0.8, 0.9, 1.0) 
tol = [0.01, 0.001 ,0.0001]
max_iter = [100, 150, 200, 250]
# Create hyperparameter options
hyperparameters = dict(C=C, penalty=penalty,tol=tol,max_iter=max_iter)
print(hyperparameters)
# Create randomized search 5-fold cross validation and 100 iterations
clf = RandomizedSearchCV(model, hyperparameters, random_state=1, n_iter=100, cv=5, verbose=0, n_jobs=-1)


#
print('expected BestPenalty:   l1')
print('expected Best C:        1.0')
print('expected Best tol:      0.001')
print('expected Best max_iter: 250')

# Fit randomized search
best_model = clf.fit(X_train, y_train)
# View Hyperparameter Values Of Best Model

 
# View best hyperparameters
print('Best Penalty:', best_model.best_estimator_.get_params()['penalty'])
print('Best C:      ', best_model.best_estimator_.get_params()['C'])
print('Best tol:    ', best_model.best_estimator_.get_params()['tol'])
print('Best max_iter', best_model.best_estimator_.get_params()['max_iter'])
 


{'C': (0.8, 0.9, 1.0), 'penalty': ['l1', 'l2'], 'tol': [0.01, 0.001, 0.0001], 'max_iter': [100, 150, 200, 250]}
expected BestPenalty:   l1
expected Best C:        1.0
expected Best tol:      0.001
expected Best max_iter: 250




Best Penalty: l2
Best C:       0.9
Best tol:     0.01
Best max_iter 100


In [16]:
# these results unfortunately do not match what was predicted in the exercise 
from sklearn import metrics
predicted = best_model.predict(X_test)
print("confusion_matrix...\n",metrics.confusion_matrix(y_test, predicted))
# classification 
print("classification_report\n",metrics.classification_report(y_test, predicted))

confusion_matrix...
 [[ 896   78]
 [  61 1176]]
classification_report
               precision    recall  f1-score   support

           0       0.94      0.92      0.93       974
           1       0.94      0.95      0.94      1237

    accuracy                           0.94      2211
   macro avg       0.94      0.94      0.94      2211
weighted avg       0.94      0.94      0.94      2211



In [17]:
# save best params  to wandb
import wandb

wandb.init(project="my-project")
wandb.log({'C': best_model.best_estimator_.get_params()['C']
           , 'penalty': best_model.best_estimator_.get_params()['penalty']
          , 'tol' : best_model.best_estimator_.get_params()['tol']
          , 'max_iter' : best_model.best_estimator_.get_params()['max_iter']}) 
