<a href="https://colab.research.google.com/github/dalalRohit/shraddha-proj/blob/master/url_detection_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [24]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import pandas as pd
import numpy as np
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [0]:
df=pd.read_csv('/content/drive/My Drive/dataset_combined.csv')

In [0]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71436 entries, 0 to 71435
Data columns (total 21 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   URL                      71429 non-null  object 
 1   rank_host                71429 non-null  float64
 2   rank_country             71429 non-null  float64
 3   host                     71429 non-null  object 
 4   path                     52659 non-null  object 
 5   Length_of_url            71429 non-null  float64
 6   Length_of_host           71429 non-null  float64
 7   No_of_dots               71429 non-null  float64
 8   avg_token_length         71429 non-null  float64
 9   token_count              71429 non-null  float64
 10  largest_token            71429 non-null  float64
 11  avg_domain_token_length  71429 non-null  float64
 12  domain_token_count       71429 non-null  float64
 13  largest_domain           71429 non-null  float64
 14  avg_path_token        

In [0]:
columns=[
    'rank_host','rank_country','Length_of_url',
    'Length_of_host','No_of_dots','avg_token_length','token_count',
    'largest_token','avg_domain_token_length','domain_token_count',
    'largest_domain','avg_path_token','path_token_count','largest_path',
    'sec_sen_word_cnt','IPaddress_presence',
]

## Pre-processing

In [0]:
#https://stackoverflow.com/questions/34779961/scikit-learn-input-contains-nan-infinity-or-a-value-too-large-for-dtype-flo
df.isnull().any()

URL                        True
rank_host                  True
rank_country               True
host                       True
path                       True
Length_of_url              True
Length_of_host             True
No_of_dots                 True
avg_token_length           True
token_count                True
largest_token              True
avg_domain_token_length    True
domain_token_count         True
largest_domain             True
avg_path_token             True
path_token_count           True
largest_path               True
sec_sen_word_cnt           True
IPaddress_presence         True
ASNno                      True
label                      True
dtype: bool

In [0]:
df=df.fillna(0)

In [0]:
# https://stackoverflow.com/questions/17477979/dropping-infinite-values-from-dataframes-in-pandas
df=df.replace([np.inf,-np.inf],np.nan).dropna(subset=columns,how='all')

In [0]:
df=df.dropna(how='all')

In [0]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 71436 entries, 0 to 71435
Data columns (total 21 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   URL                      71436 non-null  object 
 1   rank_host                71436 non-null  float64
 2   rank_country             71436 non-null  float64
 3   host                     71436 non-null  object 
 4   path                     71436 non-null  object 
 5   Length_of_url            71436 non-null  float64
 6   Length_of_host           71436 non-null  float64
 7   No_of_dots               71436 non-null  float64
 8   avg_token_length         71436 non-null  float64
 9   token_count              71436 non-null  float64
 10  largest_token            71436 non-null  float64
 11  avg_domain_token_length  71436 non-null  float64
 12  domain_token_count       71436 non-null  float64
 13  largest_domain           71436 non-null  float64
 14  avg_path_token        

In [0]:
#Get dataset in X and Y
X=df[columns]
y=df['label']

## Train_test_split

In [0]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42)

In [0]:
print('Training dataset X=>length: [X_train]',len(X_train))
print('Training dataset Y=>length: [y_train] ',len(y_train))

print('\nTesting dataset X=>length: [X_test]',len(X_test))
print('Training dataset Y=>length: [y_test]',len(y_test))


Training dataset X=>length: [X_train] 50005
Training dataset Y=>length: [y_train]  50005

Testing dataset X=>length: [X_test] 21431
Training dataset Y=>length: [y_test] 21431


## Model

In [0]:
clf = LogisticRegression(random_state=0,solver='liblinear')

In [0]:
clf.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [0]:
y_pred=clf.predict(X_test)

In [0]:
clf.score(X_test,y_test)

0.8589893145443517

In [0]:
clf.predict_proba(X_test)

array([[0.25752699, 0.74247301],
       [0.30563718, 0.69436282],
       [0.32884617, 0.67115383],
       ...,
       [0.5948177 , 0.4051823 ],
       [0.30175351, 0.69824649],
       [0.58238754, 0.41761246]])

In [0]:
clf.classes_

array([0., 1.])

## Confusion Matrix Sample
![Confusion Matrix Sample](https://media.geeksforgeeks.org/wp-content/uploads/Confusion_Matrix1_1.png)

In [23]:
confusion_matrix(y_test, y_pred)

array([[ 6204,  1751],
       [ 1271, 12205]])

In [0]:
clf.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': 0,
 'solver': 'liblinear',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}