**Phishing** continues to prove one of the most successful and effective ways for cybercriminals to defraud us and steal our personal and financial information.
Our growing reliance on the internet to conduct much of our day-to-day business has provided fraudsters with the perfect environment to launch targeted phishing attacks. The phishing attacks taking place today are sophisticated and increasingly more difficult to spot. A study conducted by Intel found that 97% of security experts fail at identifying phishing emails from genuine emails.

Website phishing detection is the process of identifying fake or malicious websites that try to trick people into giving away sensitive information, like passwords or credit card numbers.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv(r'/kaggle/input/web-phish/phishing_site_urls.csv')

In [3]:
df.shape

(549346, 2)

In [4]:
df.head(10)

Unnamed: 0,URL,Label
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,bad
1,www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...,bad
2,serviciosbys.com/paypal.cgi.bin.get-into.herf....,bad
3,mail.printakid.com/www.online.americanexpress....,bad
4,thewhiskeydregs.com/wp-content/themes/widescre...,bad
5,smilesvoegol.servebbs.org/voegol.php,bad
6,premierpaymentprocessing.com/includes/boleto-2...,bad
7,myxxxcollection.com/v1/js/jih321/bpd.com.do/do...,bad
8,super1000.info/docs,bad
9,horizonsgallery.com/js/bin/ssl1/_id/www.paypal...,bad


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 549346 entries, 0 to 549345
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   URL     549346 non-null  object
 1   Label   549346 non-null  object
dtypes: object(2)
memory usage: 8.4+ MB


In [6]:
df.isnull().sum()

URL      0
Label    0
dtype: int64

In [7]:
df.Label.value_counts()

Label
good    392924
bad     156422
Name: count, dtype: int64

In [8]:
df.URL

0         nobell.it/70ffb52d079109dca5664cce6f317373782/...
1         www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...
2         serviciosbys.com/paypal.cgi.bin.get-into.herf....
3         mail.printakid.com/www.online.americanexpress....
4         thewhiskeydregs.com/wp-content/themes/widescre...
                                ...                        
549341                                      23.227.196.215/
549342                                   apple-checker.org/
549343                                    apple-iclods.org/
549344                                   apple-uptoday.org/
549345                                    apple-search.info
Name: URL, Length: 549346, dtype: object

In [9]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: nltk
Successfully installed nltk-3.9.1
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [10]:
from nltk.tokenize import RegexpTokenizer

In [11]:
tokenizer=RegexpTokenizer(r'[A-Za-z0-9]+')

In [12]:
df.URL[0]

'nobell.it/70ffb52d079109dca5664cce6f317373782/login.SkyPe.com/en/cgi-bin/verification/login/70ffb52d079109dca5664cce6f317373/index.php?cmd=_profile-ach&outdated_page_tmpl=p/gen/failed-to-load&nav=0.5.1&login_access=1322408526'

In [13]:
tokenizer.tokenize(df.URL[0])

['nobell',
 'it',
 '70ffb52d079109dca5664cce6f317373782',
 'login',
 'SkyPe',
 'com',
 'en',
 'cgi',
 'bin',
 'verification',
 'login',
 '70ffb52d079109dca5664cce6f317373',
 'index',
 'php',
 'cmd',
 'profile',
 'ach',
 'outdated',
 'page',
 'tmpl',
 'p',
 'gen',
 'failed',
 'to',
 'load',
 'nav',
 '0',
 '5',
 '1',
 'login',
 'access',
 '1322408526']

In [14]:
# tokenize the URLs
df['tokens'] = df.URL.map(lambda x: tokenizer.tokenize(x))

In [15]:
df.head()

Unnamed: 0,URL,Label,tokens
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,bad,"[nobell, it, 70ffb52d079109dca5664cce6f3173737..."
1,www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...,bad,"[www, dghjdgf, com, paypal, co, uk, cycgi, bin..."
2,serviciosbys.com/paypal.cgi.bin.get-into.herf....,bad,"[serviciosbys, com, paypal, cgi, bin, get, int..."
3,mail.printakid.com/www.online.americanexpress....,bad,"[mail, printakid, com, www, online, americanex..."
4,thewhiskeydregs.com/wp-content/themes/widescre...,bad,"[thewhiskeydregs, com, wp, content, themes, wi..."


In [16]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")


In [17]:
df['text_stem'] = df.tokens.map(lambda x: [stemmer.stem(y) for y in x])

In [18]:
df.head()

Unnamed: 0,URL,Label,tokens,text_stem
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,bad,"[nobell, it, 70ffb52d079109dca5664cce6f3173737...","[nobel, it, 70ffb52d079109dca5664cce6f31737378..."
1,www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...,bad,"[www, dghjdgf, com, paypal, co, uk, cycgi, bin...","[www, dghjdgf, com, paypal, co, uk, cycgi, bin..."
2,serviciosbys.com/paypal.cgi.bin.get-into.herf....,bad,"[serviciosbys, com, paypal, cgi, bin, get, int...","[serviciosbi, com, paypal, cgi, bin, get, into..."
3,mail.printakid.com/www.online.americanexpress....,bad,"[mail, printakid, com, www, online, americanex...","[mail, printakid, com, www, onlin, americanexp..."
4,thewhiskeydregs.com/wp-content/themes/widescre...,bad,"[thewhiskeydregs, com, wp, content, themes, wi...","[thewhiskeydreg, com, wp, content, theme, wide..."


In [19]:
df['text'] = df['text_stem'].map(lambda x: ' '.join(x))

In [20]:
sns.countplot(df.Label)

NameError: name 'sns' is not defined

In [None]:
df.head()

In [None]:
good_sites = df[df.Label == 'good']
bad_sites = df[df.Label == 'bad']

In [None]:
good_sites.head()

In [None]:
!pip install wordcloud

In [None]:
# word cloud
from wordcloud import WordCloud
good_sites_text = ' '.join(good_sites.text)
good_sites_wc = WordCloud(width=800, height=400, background_color='white').generate(good_sites_text)
plt.figure(figsize=(10, 5))
plt.imshow(good_sites_wc, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud for Good Sites')
plt.show()

In [None]:
bad_sites_text = ' '.join(bad_sites.text)
bad_sites_wc = WordCloud(width=800, height=400, background_color='white').generate(bad_sites_text)
plt.figure(figsize=(10, 5))
plt.imshow(bad_sites_wc, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud for Bad Sites')
plt.show()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
CV= CountVectorizer()

In [None]:
features = CV.fit_transform(df.text)

In [None]:
features[:5]

In [None]:
features[:5].toarray()

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features, df.Label, test_size=0.2, random_state=42)

In [None]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

Model Training

In [None]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()

model.fit(X_train, y_train)

In [None]:
y_pred_train = model.predict(X_train)

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

accuracy = accuracy_score(y_train, y_pred_train)
print(f'Accuracy: {accuracy:.2f}')

In [None]:
y_pred = model.predict(X_test)

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test, y_pred))

In [None]:
# from sklearn.tree import DecisionTreeClassifier
# dt_model = DecisionTreeClassifier()

In [None]:
# dt_model.fit(X_train, y_train)
# y_pred_dt = dt_model.predict(X_test)
# accuracy_dt = accuracy_score(y_test, y_pred_dt)
# print(f'Decision Tree Accuracy: {accuracy_dt:.2f}')

In [None]:
# dt_model.fit(X_train, y_train)
# y_pred_train = dt_model.predict(X_train)
# accuracy_dt = accuracy_score(y_train, y_pred_train)
# print(f'Decision Tree Accuracy: {accuracy_dt:.2f}')

In [None]:
# use random forest classifier
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)

In [None]:
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f'Random Forest Accuracy: {accuracy_rf:.2f}')

In [None]:
rf_model.fit(X_train, y_train)
y_pred_train = rf_model.predict(X_train)
accuracy_rf = accuracy_score(y_train, y_pred_train)
print(f'Random Forest Accuracy for training set: {accuracy_rf:.2f}')

In [None]:
confusion_mat = pd.DataFrame(confusion_matrix(rf_model.predict(X_test), y_test), columns=['Good', 'Bad'], index=['Good', 'Bad'])

In [None]:
confusion_mat

In [None]:
!pip install seaborn

In [None]:
import seaborn as sns
print("Confusion Matrix:")
plt.figure(figsize=(6, 4))
sns.heatmap(confusion_mat, annot=True, fmt='d', cmap='Blues')
plt.show()

In [None]:
import pickle

pickle.dump(model, open('rf_model.pkl', 'wb'))

In [None]:
pickle.dump(CV, open('vectorizer.pkl', 'wb'))

In [None]:
from sklearn.tree import DecisionTreeClassifier
dt_model = DecisionTreeClassifier()

In [None]:
dt_model.fit(X_train, y_train)
y_pred_train = dt_model.predict(X_train)
accuracy_dt = accuracy_score(y_train, y_pred_train)
print(f'Decision Tree Accuracy: {accuracy_dt:.2f}')

In [None]:
dt_model.fit(X_train, y_train)
y_pred_test = dt_model.predict(X_test)
accuracy_dt = accuracy_score(y_test, y_pred_test)
print(f'Decision Tree Accuracy: {accuracy_dt:.2f}')

In [None]:
confusion_mat = pd.DataFrame(confusion_matrix(dt_model.predict(X_test), y_test), columns=['Good', 'Bad'], index=['Good', 'Bad'])
confusion_mat

In [None]:
import seaborn as sns
print("Confusion Matrix:")
plt.figure(figsize=(6, 4))
sns.heatmap(confusion_mat, annot=True, fmt='d', cmap='Blues')
plt.show()

In [None]:
import pickle

pickle.dump(model, open('dt_model.pkl', 'wb'))

In [None]:
df.tail()

In [None]:
trans_data = cv.transform(df['text'][0])

In [None]:
prediction = rf_model.predict(trans_data)
print(prediction)