### Import libraries

In [50]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

### Import phishing_site_urls dataset

In [11]:
# Load the labelled URL dataset; the relative path is resolved against the
# notebook's working directory.
urls_df = pd.read_csv('phishing_site_urls.csv')

# Preview the first five rows.
urls_df.head(n=5)

Unnamed: 0,URL,Label
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,bad
1,www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...,bad
2,serviciosbys.com/paypal.cgi.bin.get-into.herf....,bad
3,mail.printakid.com/www.online.americanexpress....,bad
4,thewhiskeydregs.com/wp-content/themes/widescre...,bad


In [12]:
# Preview the last five rows of the dataset.
urls_df.tail(n=5)

Unnamed: 0,URL,Label
549341,23.227.196.215/,bad
549342,apple-checker.org/,bad
549343,apple-iclods.org/,bad
549344,apple-uptoday.org/,bad
549345,apple-search.info,bad


In [13]:
# Print a summary of the frame: index range, column names, non-null counts,
# dtypes and memory usage.
urls_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 549346 entries, 0 to 549345
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   URL     549346 non-null  object
 1   Label   549346 non-null  object
dtypes: object(2)
memory usage: 8.4+ MB


In [14]:
# Number of missing values in each column.
urls_df.isna().sum()

URL      0
Label    0
dtype: int64

In [15]:
# Count how many URLs carry each class label and keep the result as a
# one-column DataFrame.
#
# The column name and index name are pinned explicitly: pandas 1.x names the
# value_counts() result after the source column ("Label") with an unnamed
# index, whereas pandas >= 2.0 names it "count" and moves "Label" to the index
# name. Pinning both keeps `label_counter.Label` valid on either version.
label_counter = (
    urls_df["Label"]
    .value_counts()
    .rename("Label")
    .rename_axis(None)
    .to_frame()
)

In [26]:
# Take the frame's single column by position rather than by attribute name:
# depending on the pandas version that built `label_counter`, the column is
# called "Label" (1.x) or "count" (>= 2.0), so `label_counter.Label` is not
# portable.
label_counts = label_counter.iloc[:, 0]
print(label_counts)

# Bar chart of the class balance. Plain lists are passed so that Plotly names
# the inputs "x" and "y", which the `labels` mapping then turns into readable
# axis titles.
fig = px.bar(
    x=list(label_counts.index),
    y=list(label_counts.values),
    labels={"x": "Label", "y": "Number of URLs"},
    title="Number of URLs per label",
)
fig.show()

good    392924
bad     156422
Name: Label, dtype: int64


### Preprocessing

In [40]:
# Tokens are maximal runs of ASCII letters; digits, punctuation and every
# other character act as separators and are discarded.
tokenizer = RegexpTokenizer(r'[A-Za-z]+')

# Show the first raw URL as a tokenization example.
urls_df.loc[0, 'URL']

'nobell.it/70ffb52d079109dca5664cce6f317373782/login.SkyPe.com/en/cgi-bin/verification/login/70ffb52d079109dca5664cce6f317373/index.php?cmd=_profile-ach&outdated_page_tmpl=p/gen/failed-to-load&nav=0.5.1&login_access=1322408526'

In [41]:
# Tokenize the first URL to inspect what the tokenizer keeps.
tokenizer.tokenize(urls_df.loc[0, 'URL'])

['nobell',
 'it',
 'ffb',
 'd',
 'dca',
 'cce',
 'f',
 'login',
 'SkyPe',
 'com',
 'en',
 'cgi',
 'bin',
 'verification',
 'login',
 'ffb',
 'd',
 'dca',
 'cce',
 'f',
 'index',
 'php',
 'cmd',
 'profile',
 'ach',
 'outdated',
 'page',
 'tmpl',
 'p',
 'gen',
 'failed',
 'to',
 'load',
 'nav',
 'login',
 'access']

In [53]:
# Split every URL into its alphabetic tokens (one list of strings per row).
urls_df['text_tokenized'] = urls_df['URL'].map(tokenizer.tokenize)

In [54]:
# Spot-check five random rows; no seed is set, so the rows differ on each run.
urls_df.sample(n=5)

Unnamed: 0,URL,Label,text_tokenized
49783,www.mikaylamackaness.com/hold,good,"[www, mikaylamackaness, com, hold]"
275535,amazon.com/Human-Resource-Management-Consultin...,good,"[amazon, com, Human, Resource, Management, Con..."
121578,backtrackantarctica.com.au/googledocuments/dpbx/,bad,"[backtrackantarctica, com, au, googledocuments..."
170546,en.wikipedia.org/wiki/2009_CONCACAF_Gold_Cup,good,"[en, wikipedia, org, wiki, CONCACAF, Gold, Cup]"
376683,linkedin.com/pub/dir/laurence/owen,good,"[linkedin, com, pub, dir, laurence, owen]"


In [55]:
stemmer = SnowballStemmer("english")

# Stem each token of every URL, keeping one list of stems per row.
urls_df['text_stemmed'] = urls_df['text_tokenized'].map(
    lambda tokens: list(map(stemmer.stem, tokens))
)


In [56]:
# Spot-check five random rows; no seed is set, so the rows differ on each run.
urls_df.sample(n=5)

Unnamed: 0,URL,Label,text_tokenized,text_stemmed
302350,cfl-fanatics.com/,good,"[cfl, fanatics, com]","[cfl, fanat, com]"
400044,myspace.com/mikeslamer,good,"[myspace, com, mikeslamer]","[myspac, com, mikeslam]"
320246,ehow.com/list_6006307_bars-uniontown_-pennsylv...,good,"[ehow, com, list, bars, uniontown, pennsylvani...","[ehow, com, list, bar, uniontown, pennsylvania..."
194786,grandview.fox4kc.com/news/health/61468-grandvi...,good,"[grandview, fox, kc, com, news, health, grandv...","[grandview, fox, kc, com, news, health, grandv..."
80066,www.scaramanga.co.uk/firestorm/,good,"[www, scaramanga, co, uk, firestorm]","[www, scaramanga, co, uk, firestorm]"


In [57]:
# Join each row's stems back into a single space-separated string.
urls_df['text_sent'] = urls_df['text_stemmed'].map(' '.join)

In [58]:
# Spot-check five random rows; no seed is set, so the rows differ on each run.
urls_df.sample(n=5)

Unnamed: 0,URL,Label,text_tokenized,text_stemmed,text_sent
242655,springsteenlyrics.com/,good,"[springsteenlyrics, com]","[springsteenlyr, com]",springsteenlyr com
406766,notstarring.com/actors/lisi-virna,good,"[notstarring, com, actors, lisi, virna]","[notstar, com, actor, lisi, virna]",notstar com actor lisi virna
548906,eedbeacdmkfockbn.com /,bad,"[eedbeacdmkfockbn, com]","[eedbeacdmkfockbn, com]",eedbeacdmkfockbn com
64170,www.theregister.co.uk/2001/05/31/manhunt_start...,good,"[www, theregister, co, uk, manhunt, starts, fo...","[www, theregist, co, uk, manhunt, start, for, ...",www theregist co uk manhunt start for sex com
469336,yelp.com/biz/davis-law-firm-san-antonio-2,good,"[yelp, com, biz, davis, law, firm, san, antonio]","[yelp, com, biz, davi, law, firm, san, antonio]",yelp com biz davi law firm san antonio


### Create Model

In [59]:
# Token-count vectorizer created with no arguments, i.e. scikit-learn's
# default settings.
# NOTE(review): presumably fitted on `text_sent` further down — not visible here.
cv = CountVectorizer()