## Detecting URL Phishing
### Team: MTA_Lightning


In [143]:
# EDA Packages
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score, recall_score, accuracy_score,confusion_matrix,classification_report
import random


# Machine Learning Packages
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [370]:
# Load Url Data 
urls_data = pd.read_csv("final_data_hack_2023.csv")

In [329]:
urls_data['label'][urls_data['label']=='benign'] = 0
urls_data['label'][urls_data['label']=='good'] = 0
urls_data['label'][urls_data['label']=='defacement'] = 1
urls_data['label'][urls_data['label']=='phishing'] = 1
urls_data['label'][urls_data['label']=='malware'] = 1
urls_data['label'][urls_data['label']=='spam'] = 1
urls_data['label'][urls_data['label']=='bad'] = 1

In [331]:
urls_data.label.value_counts()

0    1117944
1     407052
Name: label, dtype: int64

In [332]:
urls_data.url = 'http://'+urls_data.url

In [333]:
urls_data.head()

Unnamed: 0,url,label
0,http://diaryofagameaddict.com,1
1,http://espdesign.com.au,1
2,http://iamagameaddict.com,1
3,http://kalantzis.net,1
4,http://slightlyoffcenter.net,1


### Data Vectorization Using TfidVectorizer
#### Create A tokenizer
 + Split ,Remove Repetitions and "Com"

In [334]:
def makeTokens(f):
    tkns_BySlash = str(f.encode('utf-8')).split('/')	# make tokens after splitting by slash
    total_Tokens = []
    for i in tkns_BySlash:
        tokens = str(i).split('-')	# make tokens after splitting by dash
        tkns_ByDot = []
        for j in range(0,len(tokens)):
            temp_Tokens = str(tokens[j]).split('.')	# make tokens after splitting by dot
            tkns_ByDot = tkns_ByDot + temp_Tokens
        total_Tokens = total_Tokens + tokens + tkns_ByDot
    total_Tokens = list(set(total_Tokens))	#remove redundant tokens
    if 'com' in total_Tokens:
        total_Tokens.remove('com')	#removing .com since it occurs a lot of times and it should not be included in our features
    return total_Tokens

In [354]:
# Labels
y = urls_data["label"].astype(int)

In [356]:
# Features
url_list = urls_data["url"]

In [359]:
# Using Default Tokenizer
#vectorizer = TfidfVectorizer()

# Using Custom Tokenizer
vectorizer = TfidfVectorizer(tokenizer=makeTokens)

In [360]:
# Store vectors into X variable as Our XFeatures
X = vectorizer.fit_transform(url_list)



In [366]:
pickle.dump(vectorizer, open("tfidf_new.pkl", "wb"))

In [349]:
from sklearn.model_selection import StratifiedKFold
kf = StratifiedKFold(n_splits=2,shuffle=True,random_state=3)

In [363]:
import imblearn

spl=RandomUnderSampler()
model = LogisticRegression(max_iter=7600)

model.fit(X,y)
y_pred = model.predict(X)

acc_arr = accuracy_score(y, y_pred)
f1_arr= f1_score(y, y_pred)
print("%0.2f f1 score" %(f1_arr))
print("%0.2f accuracy" %(acc_arr))
print('----------------------')


0.98 f1 score
0.99 accuracy
----------------------


In [364]:
x_real=pd.read_csv('labeled_public_test_hackathon_vcs_2023.csv').url
y_real=pd.read_csv('labeled_public_test_hackathon_vcs_2023.csv').type
# y_real=y_real.map({0:'good',1:'bad'})
x_r = vectorizer.transform(x_real)
print(confusion_matrix(y_real, model.predict(x_r)))  
        

[[44  0]
 [ 5 51]]


In [368]:
import pickle
filename = 'final_model_new.pkl'
pickle.dump(model, open(filename, 'wb'))