In [55]:
import pandas as pd

## Collection of Data

In [56]:
legitimate_urls = pd.read_csv("legitimate-urls.csv")
phishing_urls = pd.read_csv("phishing-urls.csv")

In [57]:
legitimate_urls.head(10)
phishing_urls.head(10)

Unnamed: 0,Domain,Having_@_symbol,Having_IP,Path,Prefix_suffix_separation,Protocol,Redirection_//_symbol,Sub_domains,URL_Length,age_domain,dns_record,domain_registration_length,http_tokens,label,statistical_report,tiny_url,web_traffic
0,asesoresvelfit.com,0,0,/media/datacredito.co/,0,http,0,0,0,0,0,1,0,1,0,1,1
1,caixa.com.br.fgtsagendesaqueconta.com,0,0,/consulta8523211/principal.php,0,http,0,1,1,0,0,1,0,1,1,0,1
2,hissoulreason.com,0,0,/js/homepage/home/,0,http,0,0,0,0,0,1,0,1,0,0,1
3,unauthorizd.newebpage.com,0,0,/webapps/66fbf/,0,http,0,0,0,0,0,1,0,1,1,0,1
4,133.130.103.10,0,1,/23/,0,http,0,2,0,1,0,1,0,1,0,0,1
5,dj00.co.vu,1,0,/css/,0,http,0,0,2,1,1,1,0,1,1,0,0
6,133.130.103.10,0,1,/21/logar/,0,http,0,2,0,1,0,1,0,1,0,0,1
7,httpssicredi.esy.es,0,0,/servico/sicredi/validarclientes/mobi/index.php,0,http,0,2,2,1,1,1,1,1,1,0,1
8,gamesaty.ga,0,0,/wp-content///yh/en/,0,http,1,0,2,1,0,1,0,1,0,0,1
9,luxuryupgradepro.com,0,0,/ymailNew/ymailNew/,0,http,0,0,0,0,0,1,0,1,0,0,1


## Data PreProcessing
#### Data is in two data frames so we merge them to make one dataframe
Note: two dataframes has same column names

In [58]:
urls = legitimate_urls.append(phishing_urls)


In [59]:
urls.head(5)

Unnamed: 0,Domain,Having_@_symbol,Having_IP,Path,Prefix_suffix_separation,Protocol,Redirection_//_symbol,Sub_domains,URL_Length,age_domain,dns_record,domain_registration_length,http_tokens,label,statistical_report,tiny_url,web_traffic
0,www.liquidgeneration.com,0,0,/,0,http,0,0,0,0,0,1,0,0,0,0,2
1,www.onlineanime.org,0,0,/,0,http,0,0,0,0,0,1,0,0,1,0,1
2,www.ceres.dti.ne.jp,0,0,/~nekoi/senno/senfirst.html,0,http,0,1,0,1,0,1,0,0,0,0,0
3,www.galeon.com,0,0,/kmh/,0,http,0,0,0,0,0,0,0,0,0,0,0
4,www.fanworkrecs.com,0,0,/,0,http,0,0,0,1,1,1,0,0,1,0,1


In [60]:
urls.columns

Index(['Domain', 'Having_@_symbol', 'Having_IP', 'Path',
       'Prefix_suffix_separation', 'Protocol', 'Redirection_//_symbol',
       'Sub_domains', 'URL_Length', 'age_domain', 'dns_record',
       'domain_registration_length', 'http_tokens', 'label',
       'statistical_report', 'tiny_url', 'web_traffic'],
      dtype='object')

#### Removing Unnecessary columns

In [61]:
urls = urls.drop(urls.columns[[0,3,5]],axis=1)

#### Since we merged two dataframes top 1000 rows will have legitimate urls and bottom 1000 rows will have phishing urls. So if we split the data now and create a model for it will overfit so we need to shuffle the rows before splitting the data into training set and test set

In [62]:
# shuffling the rows in the dataset so that when splitting the train and test set are equally distributed
urls = urls.sample(frac=1).reset_index(drop=True)

#### Removing class variable from the dataset

In [63]:
urls_without_labels = urls.drop('label',axis=1)
urls_without_labels.columns
labels = urls['label']


#### splitting the data into train data and test data

In [64]:
from sklearn.model_selection import train_test_split
data_train, data_test, labels_train, labels_test = train_test_split(urls_without_labels, labels, test_size=0.20, random_state=100)

In [65]:
print(len(data_train),len(data_test),len(labels_train),len(labels_test))

1612 403 1612 403


In [66]:
labels_train.value_counts()

#labels_train[labels_train == 0].count()
#labels_train[labels_train == 1].count()

0    820
1    792
Name: label, dtype: int64

In [67]:
labels_test.value_counts()

1    206
0    197
Name: label, dtype: int64

#### creating the model and fitting the data into the model

In [68]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
model.fit(data_train,labels_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

#### predicting the result for test data

In [69]:
pred_label = model.predict(data_test)

In [70]:
#print(pred_label),print(list(labels_test))

#### creating confusion matrix and checking the accuracy

In [71]:
from sklearn.metrics import confusion_matrix,accuracy_score
cm = confusion_matrix(labels_test,pred_label)
print(cm)
accuracy_score(labels_test,pred_label)

[[179  18]
 [ 52 154]]


0.826302729528536