<a href="https://colab.research.google.com/github/dieagus/PhishBot/blob/main/phish.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# import modules for data
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import re
from google.colab import drive
# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')

In [None]:
# Load the phishing-email dataset, keeping only the first 30k rows.

import pandas as pd
import numpy as np
import datetime

# Absolute path under the Drive mount point (/content/drive), so the read does
# not depend on the notebook's current working directory.
DATA_PATH = "/content/drive/MyDrive/phish.csv"
# Maximum number of records to load from the CSV.
N_ROWS = 30000

# nrows stops parsing after N_ROWS records instead of reading the whole file
# into memory and slicing it afterwards; the resulting rows are the same.
df = pd.read_csv(DATA_PATH, nrows=N_ROWS)


In [None]:
# Drop the columns that are not used as model inputs.
# errors='ignore' keeps this cell safe to re-run: once the columns are gone, a
# second execution would otherwise raise KeyError.
df = df.drop(columns=['receiver', 'date', 'urls'], errors='ignore')

In [None]:

from sklearn.model_selection import train_test_split


# Input features (raw text columns) and the target column
X = df[['sender', 'subject', 'body']]
# label: 0 = real 1 = spam
y = df['label']

# Hold out 20% of the rows; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Print every split under its own heading
for heading, split in (
    ("Training Features:\n", X_train),
    ("\nTesting Features:\n", X_test),
    ("\nTraining Target:\n", y_train),
    ("\nTesting Target:\n", y_test),
):
    print(heading, split)

Training Features:
                                                   sender  \
21753             hervey shenglu <leandro1@alcoa.com.br>   
251                Christoph Cordes <uiaregi@clamav.net>   
22941       "John M. Dlugosz" <dhcgnd702@sneakemail.com>   
618    Library of Congress <loc@service.govdelivery.com>   
17090                Tarek Ziadé <vgwil.zhzzs@gmail.com>   
...                                                  ...   
29802           "Brian W. Fitzpatrick" <bmlh@google.com>   
5390   Hannah Stover <DellawoodlandBullard@northweste...   
860            InstantBooster <promotebusines@gmail.com>   
15795                    nzvoq@zaphod.in.tu-clausthal.de   
23654                   duploelabs <dieaahq@hotmail.com>   

                                                 subject  \
21753                                    Does it matter?   
251                [clamav-virusdb] Update (daily: 5742)   
22941                       Humorous but serious article   
618    Library of C

In [None]:
# Combine sender, subject and body of each email into one text column.
# A missing field is treated as an empty string: concatenating a string with
# NaN yields NaN, so without the fill a single missing sender/subject/body
# would blank the whole text and the email would be discarded by dropna below.
df['text'] = (
    df['sender'].fillna('')
    + ' '
    + df['subject'].fillna('')
    + ' '
    + df['body'].fillna('')
)

# Drop rows that still cannot be used for training (e.g. no label).
df = df.dropna(subset=['text', 'label'])

In [None]:
# Text cleaning: strips URLs and special characters before vectorization.
# The patterns are compiled once because clean_text runs on every email.
_URL_PATTERN = re.compile(r'http\S+', re.IGNORECASE)
_NON_WORD_PATTERN = re.compile(r'\W')
_WHITESPACE_PATTERN = re.compile(r'\s+')


def clean_text(text):
    """Normalize raw email text into lower-case, space-separated word tokens.

    Steps, in order:
      1. Remove anything starting with "http" up to the next whitespace.
         Matching is case-insensitive so "HTTP://..." / "Https://..." links
         are removed as well (lower-casing only happens at the end).
      2. Replace every non-word character (including newlines) with a space.
      3. Collapse runs of whitespace into a single space.
      4. Lower-case and strip leading/trailing whitespace.

    Args:
        text: The raw text. ``None`` and float NaN (a missing cell) are
            accepted and produce an empty string.

    Returns:
        The cleaned string; ``''`` when the input is missing or contains no
        word characters.

    Raises:
        TypeError: If ``text`` is any other non-string value.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = _URL_PATTERN.sub('', text)
    text = _NON_WORD_PATTERN.sub(' ', text)
    text = _WHITESPACE_PATTERN.sub(' ', text)
    return text.lower().strip()
# Normalize each email's combined text, then hold out 20% of the rows for testing
df['clean_text'] = df['text'].map(clean_text)
X, y = df['clean_text'], df['label']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Vectorize the cleaned text into TF-IDF features.
# max_features=5000 limits the size of the learned vocabulary. No analyzer or
# n-gram range is passed, so the library defaults apply (word-level tokens
# according to the scikit-learn documentation, not phrases or characters).
# The vectorizer is fitted on the training split only; the test split is
# transformed with that already-fitted vocabulary.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [None]:
# Fit a logistic-regression classifier on the TF-IDF training matrix.
# fit() returns the estimator itself, so construction and training are chained.
model = LogisticRegression().fit(X_train_tfidf, y_train)

# Predict labels for the held-out emails
y_pred = model.predict(X_test_tfidf)

# Overall accuracy, then the per-class precision/recall/F1 table
test_accuracy = accuracy_score(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
print("Accuracy:", test_accuracy)
print("\nClassification Report:\n", class_report)

Accuracy: 0.9926617745163442

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      2467
           1       0.99      0.99      0.99      3529

    accuracy                           0.99      5996
   macro avg       0.99      0.99      0.99      5996
weighted avg       0.99      0.99      0.99      5996



In [None]:
# Display the labels the model predicted for the test split.
y_pred

array([1, 1, 1, ..., 0, 1, 1])

In [None]:
# Display the true labels of the test split, for comparison with y_pred.
y_test

Unnamed: 0,label
28580,1
24433,1
9379,1
7765,1
27464,1
...,...
17095,1
19668,0
29949,0
27337,1


In [None]:
# Rank the vectorizer's vocabulary terms by how heavily the fitted model weights them
feature_names = vectorizer.get_feature_names_out()
coefficients = model.coef_[0]

# One row per term; Abs_Coefficient is the magnitude used for the ranking
feature_importance = (
    pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})
    .assign(Abs_Coefficient=lambda frame: np.abs(frame['Coefficient']))
    .sort_values(by='Abs_Coefficient', ascending=False)
)

# Feature importances: the ten terms with the largest absolute coefficient
print(feature_importance.head(10))


       Feature  Coefficient  Abs_Coefficient
3600    python    -5.918660         5.918660
2707      love     5.808831         5.808831
979        com     5.701216         5.701216
4951     wrote    -5.666323         5.666323
4979      your     5.542068         5.542068
3163  opensuse    -4.139091         4.139091
2006     gmail    -3.967222         3.967222
2858       men     3.876570         3.876570
2652      list    -3.847659         3.847659
3316      perl    -3.798888         3.798888
