In [None]:
#import dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plot
from urllib.parse import urlparse
from tld import get_tld, is_tld
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [None]:
#df1 = pd.read_csv('urldata.csv')
#df2 = pd.read_csv('data.csv')
#combined_df = pd.concat([df1, df2], ignore_index=True)
#combined_df.to_csv('final_dataset.csv', index=False)

In [None]:
#load the URL dataset from 'final_dataset.csv' into a DataFrame
urlDataset = pd.read_csv('final_dataset.csv')
#preview the first rows of the dataset
urlDataset.head()

In [None]:
#summarise the columns of the loaded dataset
urlDataset.info()

In [None]:
#Encode the textual URL type numerically: benign/good -> 0, malicious/bad -> 1
urlDataset["urlType_numeric"] = urlDataset["type"].replace(
    to_replace={'benign': 0, 'good': 0, 'malicious': 1, 'bad': 1}
)
urlDataset.head()

In [None]:
import seaborn as sns
#number of rows per numeric URL type
count = urlDataset['urlType_numeric'].value_counts()
count

In [None]:
#bar chart of the class counts; the trailing ';' hides the return value in the cell output
sns.barplot(x=count.index, y=count).set(xlabel='Types', ylabel='Count');

In [None]:
#Lexical feature extraction

#length of URL
def getUrlLength(url):
    """Return the number of characters in the string form of ``url``."""
    url_text = str(url)
    return len(url_text)
    
#compute the URL length for every row of the dataset
urlDataset['urlLength'] = urlDataset['url'].apply(getUrlLength)

urlDataset.head()

In [None]:
#length of hostname
def getHostnameLength(url):
    """Return the length of the network-location (host) part of ``url``.

    ``urlparse`` only recognises a netloc when it is introduced by ``//``,
    so a URL written without a scheme (e.g. ``example.com/path``) yields 0.

    Returns 0 when ``url`` cannot be parsed, e.g. a non-string value such
    as NaN or a malformed bracketed host.
    """
    try:
        return len(urlparse(url).netloc)
    except (ValueError, TypeError, AttributeError):
        # Only parsing failures are treated as "no hostname"; interrupts
        # and unrelated errors are no longer silently swallowed.
        return 0

#compute the hostname length for every row of the dataset
urlDataset['hostnameLength'] = urlDataset['url'].apply(getHostnameLength)

urlDataset.head()

In [None]:
#length of path
def getPathLength(url):
    """Return the length of the path component of ``url``.

    For a URL without a scheme, ``urlparse`` treats the whole string up to
    any query or fragment as the path.

    Returns 0 when ``url`` cannot be parsed (e.g. a non-string value such
    as NaN).
    """
    try:
        return len(urlparse(url).path)
    except (ValueError, TypeError, AttributeError):
        # Only parsing failures map to 0; other errors propagate.
        return 0

#compute the path length for every row of the dataset
urlDataset['pathLength'] = urlDataset['url'].apply(getPathLength)

urlDataset.head()

In [None]:
#length of Top level domain
def getTldLength(url):
    """Return the length of the top-level domain of ``url``.

    ``get_tld`` is called with ``fail_silently=True``, so it hands back
    ``None`` instead of raising when no TLD is found; that case and any
    error raised while parsing both map to 0.
    """
    try:
        tld = get_tld(url, fail_silently=True, fix_protocol=True)
    except Exception:
        # Unparseable input (e.g. a non-string value) counts as "no TLD".
        return 0
    # Handle the silent-failure result explicitly instead of relying on
    # len(None) raising inside a catch-all handler.
    return len(tld) if tld else 0

#compute the top-level-domain length for every row of the dataset
urlDataset['tldLength'] = urlDataset['url'].apply(getTldLength)
urlDataset.head()

In [None]:
#Number of character '-'
def getNumOfHyphen(url):
    """Count the occurrences of '-' in ``url``."""
    hyphen_total = url.count('-')
    return hyphen_total

#count '-' characters for every row of the dataset
urlDataset['"-"'] = urlDataset['url'].apply(getNumOfHyphen)
urlDataset.head()

In [None]:
#Number of character '@'
def getNumOfAt(url):
    """Count the occurrences of '@' in ``url``."""
    at_total = url.count('@')
    return at_total

#count '@' characters for every row of the dataset
urlDataset['"@"'] = urlDataset['url'].apply(getNumOfAt)
urlDataset.head()

In [None]:
#Number of character '?'
def getNumOfQueMark(url):
    """Count the occurrences of '?' in ``url``."""
    question_mark_total = url.count('?')
    return question_mark_total

#count '?' characters for every row of the dataset
urlDataset['"?"'] = urlDataset['url'].apply(getNumOfQueMark)
urlDataset.head()

In [None]:
#Number of character '%'
def getNumOfPercMark(url):
    """Count the occurrences of '%' in ``url``."""
    percent_total = url.count('%')
    return percent_total

#count '%' characters for every row of the dataset
urlDataset['"%"'] = urlDataset['url'].apply(getNumOfPercMark)
urlDataset.head()

In [None]:
#Number of character '/'
def getNumOfSlash(url):
    """Count the occurrences of '/' in ``url``."""
    slash_total = url.count('/')
    return slash_total

#count '/' characters for every row of the dataset
urlDataset['"/"'] = urlDataset['url'].apply(getNumOfSlash)
urlDataset.head()

In [None]:
#Number of character '.'
def getNumOfFullstop(url):
    """Count the occurrences of '.' in ``url``."""
    dot_total = url.count('.')
    return dot_total

#count '.' characters for every row of the dataset
urlDataset['"."'] = urlDataset['url'].apply(getNumOfFullstop)
urlDataset.head()

In [None]:
#Number of character '&'
def getNumOfAmpersand(url):
    """Count the occurrences of '&' in ``url``."""
    ampersand_total = url.count('&')
    return ampersand_total

#count '&' characters for every row of the dataset
urlDataset['"&"'] = urlDataset['url'].apply(getNumOfAmpersand)
urlDataset.head()

In [None]:
#Number of character '_'
def getNumOfUnderscore(url):
    """Count the occurrences of '_' in ``url``."""
    underscore_total = url.count('_')
    return underscore_total

#count '_' characters for every row of the dataset
urlDataset['"_"'] = urlDataset['url'].apply(getNumOfUnderscore)
urlDataset.head()

In [None]:
#Number of character '='
def getNumOfEqual(url):
    """Count the occurrences of '=' in ``url``."""
    equals_total = url.count('=')
    return equals_total

#count '=' characters for every row of the dataset
urlDataset['"="'] = urlDataset['url'].apply(getNumOfEqual)
urlDataset.head()

In [None]:
#Number of character '!'
def getNumOfExclamationMark(url):
    """Count the occurrences of '!' in ``url``."""
    exclamation_total = url.count('!')
    return exclamation_total

#count '!' characters for every row of the dataset
urlDataset['"!"'] = urlDataset['url'].apply(getNumOfExclamationMark)
urlDataset.head()

In [None]:
#Number of digits
def getNumOfDigits(url):
    """Return how many characters of ``url`` satisfy ``str.isdigit``."""
    return sum(1 for ch in url if ch.isdigit())

#count digit characters for every row of the dataset
urlDataset['noOfDigits'] = urlDataset['url'].apply(getNumOfDigits)
urlDataset.head()

In [None]:
#Number of letters
def getNumOfLetters(url):
    """Return how many characters of ``url`` satisfy ``str.isalpha``."""
    return sum(1 for ch in url if ch.isalpha())

#count alphabetic characters for every row of the dataset
urlDataset['noOfLetters'] = urlDataset['url'].apply(getNumOfLetters)
urlDataset.head()

In [None]:
#Number of directories
def getNumOfDirectories(url):
    """Return the number of '/' separators in the path component of ``url``.

    Returns 0 when ``url`` cannot be parsed (e.g. a non-string value such
    as NaN).
    """
    try:
        return urlparse(url).path.count('/')
    except (ValueError, TypeError, AttributeError):
        # Only parsing failures map to 0; other errors propagate.
        return 0

#count path separators for every row of the dataset
urlDataset['noOfDir'] = urlDataset['url'].apply(getNumOfDirectories)
urlDataset.head()

In [None]:
#Use of IP (1-yes, 0-no)
import re
# Matches an IP address written in one of three forms. The alternatives must
# be separated by '|'; without the separator before the IPv6 form, the hex
# form is required to be immediately followed by an IPv6 address, so neither
# form can match on its own.
_IP_ADDRESS_PATTERN = re.compile(
    # dotted-decimal IPv4 followed by '/'
    r'(([01]?\d\d?|2[0-4]\d|25[0-5])\.([01]?\d\d?|2[0-4]\d|25[0-5])\.'
    r'([01]?\d\d?|2[0-4]\d|25[0-5])\.([01]?\d\d?|2[0-4]\d|25[0-5])\/)|'
    # hexadecimal IPv4 followed by '/'
    r'((0x[0-9a-fA-F]{1,2})\.(0x[0-9a-fA-F]{1,2})\.(0x[0-9a-fA-F]{1,2})\.(0x[0-9a-fA-F]{1,2})\/)|'
    # full-form IPv6 (eight colon-separated hex groups)
    r'(?:[a-fA-F0-9]{1,4}:){7}[a-fA-F0-9]{1,4}'
)


def hasIP(url):
    """Return 1 if ``url`` contains an IP address, otherwise 0.

    Recognised forms: dotted-decimal IPv4 followed by '/', hexadecimal
    IPv4 followed by '/', and full-form IPv6.
    """
    return 1 if _IP_ADDRESS_PATTERN.search(url) else 0
        
#flag IP-address usage for every row of the dataset
urlDataset['hasIP'] = urlDataset['url'].apply(hasIP)
urlDataset.head()

In [None]:
#URL entropy - 
from scipy.stats import entropy
import math
def getEntropy(url):
    """Return the Shannon entropy (in bits) of the characters of ``url``.

    The URL is lower-cased first, so 'A' and 'a' count as the same symbol.
    An empty string has entropy 0.
    """
    url = url.lower()
    total = len(url)
    if total == 0:
        return 0.0
    # Tally every character in a single pass rather than calling
    # url.count() once per distinct character.
    counts = {}
    for ch in url:
        counts[ch] = counts.get(ch, 0) + 1
    # H = -sum(p * log2(p)) over the observed character frequencies
    return -sum((n / total) * math.log2(n / total) for n in counts.values())

#compute the character entropy for every row of the dataset
urlDataset['entropy'] = urlDataset['url'].apply(getEntropy)
urlDataset.head()

In [None]:
#has HTTP
def hashttp(url):
    """Return 1 if ``url`` begins with 'http://', otherwise 0."""
    return int(url.startswith('http://'))


#flag the http:// prefix for every row of the dataset
urlDataset['hasHttp'] = urlDataset['url'].apply(hashttp)
urlDataset.head()

In [None]:
#has HTTPS
def hashttps(url):
    """Return 1 if ``url`` begins with 'https://', otherwise 0."""
    return int(url.startswith('https://'))

#flag the https:// prefix for every row of the dataset
urlDataset['hasHttps'] = urlDataset['url'].apply(hashttps)
urlDataset.head()

In [None]:
def fdLength(url):
    """Return the length of the text between the first and second '/' of the URL path.

    Returns 0 when the path contains no '/' or when ``url`` cannot be
    parsed (e.g. a non-string value such as NaN).
    """
    try:
        segments = urlparse(url).path.split('/')
    except (ValueError, TypeError, AttributeError):
        # Only parsing failures map to 0; other errors propagate.
        return 0
    # segments[0] is whatever precedes the first '/'; the first directory,
    # if there is one, is segments[1].
    return len(segments[1]) if len(segments) > 1 else 0
#compute the first-directory length for every row of the dataset
urlDataset['fdLength'] = urlDataset['url'].apply(fdLength)
urlDataset.head()

In [None]:
def numParameters(url):
    """Return the number of '&' separators in ``url``."""
    return url.count('&')
#count '&' separators for every row of the dataset
urlDataset['numParameters'] = urlDataset['url'].apply(numParameters)

In [None]:
def numSubDomains(url):
    """Return the number of '/' characters after the last '//' in ``url``.

    Despite the name, this counts slash separators in the trailing part of
    the URL; it does not count the dots of the hostname.
    """
    after_last_double_slash = url.split('//')[-1]
    return after_last_double_slash.count('/')

#compute the numSubDomains feature for every row of the dataset
urlDataset['numSubDomains'] = urlDataset['url'].apply(numSubDomains)
urlDataset.head()

In [None]:
#features
#NOTE(review): 'urlType_numeric' is the benign/malicious target, yet it is selected here
#together with the features. It is only separated again after SMOTE resampling, so the
#resampler receives the label as one of its input columns - confirm this is intended.
x = urlDataset[['urlType_numeric','urlLength',
       'hostnameLength', 'pathLength', 'tldLength', '"-"', '"?"', '"@"',
       '"%"','"/"', '"."','"&"','"_"', '"="', '"!"','noOfDigits','noOfLetters', 'noOfDir', 'hasIP', 'entropy', 'hasHttp', 'hasHttps',
       'fdLength','numParameters','numSubDomains']]
x.head()

In [None]:
def labels(url):
    """Return the protocol class of ``url``: 2 for https://, 1 for http://, 0 otherwise."""
    if url.startswith("https://"):
        return 2
    if url.startswith("http://"):
        return 1
    return 0

#assign the protocol class to every row of the dataset
urlDataset['labels'] = urlDataset['url'].apply(labels)

In [None]:
#protocol class per URL (0 = no protocol, 1 = http, 2 = https), used as the resampling target
y1 = urlDataset['labels']

In [None]:
import pandas as pd
from imblearn.over_sampling import SMOTE

#Oversample x against the protocol classes in y1 (not against the benign/malicious target).
#NOTE(review): this runs before the train/test split, so synthetic rows can end up in the
#test set - confirm this is acceptable for the evaluation further down.
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(x, y1)

In [None]:
resampled_data = pd.DataFrame(X_resampled)

#protocol counts after SMOTE resampling
http_count = int((resampled_data['hasHttp'] == 1).sum())
https_count = int((resampled_data['hasHttps'] == 1).sum())
no_protocol_count = int(((resampled_data['hasHttp'] == 0) & (resampled_data['hasHttps'] == 0)).sum())

#protocol counts in the original feature frame
http = int((x['hasHttp'] == 1).sum())
https = int((x['hasHttps'] == 1).sum())
no_protocol = int(((x['hasHttp'] == 0) & (x['hasHttps'] == 0)).sum())

#label the two groups so the before/after numbers can be told apart in the output
print("Original HTTP URLs count:", http)
print("Original HTTPS URLs count:", https)
print("Original No Protocol URLs count:", no_protocol)

print("Resampled HTTP URLs count:", http_count)
print("Resampled HTTPS URLs count:", https_count)
print("Resampled No Protocol URLs count:", no_protocol_count)

In [None]:
#target: the benign/malicious code, read back from the resampled frame
y = resampled_data['urlType_numeric']
y.head()

In [None]:
#remove the target column so only feature columns remain
resampled_data = resampled_data.drop(columns=['urlType_numeric'])

In [None]:
#split the resampled features and target into 70% training and 30% testing data (seeded shuffle)
X_train, X_test, Y_train, Y_test = train_test_split(resampled_data, y, test_size=0.3, shuffle=True,random_state=42)
print(f"X_train Shape : {X_train.shape}")
print(f"X_test  Shape : {X_test.shape}")

In [None]:
from collections import Counter

#oversample the minority classes of the training split with SMOTE to reduce class imbalance
count1 = Counter(Y_train)
oversample = SMOTE(random_state=42)  #fixed seed so the synthetic samples are reproducible
X_trainosm, Y_trainosm = oversample.fit_resample(X_train, Y_train)
count2 = Counter(Y_trainosm)

#class counts before and after oversampling
print(count1)
print(count2)

In [None]:
#wrap the oversampled training features in a DataFrame
X_trainosm = pd.DataFrame(X_trainosm)

In [None]:
#number of missing values per training feature column
X_trainosm.isnull().sum()

In [None]:
#Random Forest
from sklearn.ensemble import RandomForestClassifier
#fixed random_state so the trained forest and its metrics are reproducible across runs
rfc = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)

In [None]:
#train the random forest on the oversampled training data
rfc.fit(X_trainosm,Y_trainosm)

In [None]:
#predict the held-out test rows and report the accuracy
rfc_predictions = rfc.predict(X_test)
accuracy_score(Y_test, rfc_predictions)

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
#confusion matrix of the test predictions, plotted with readable class names
cm = confusion_matrix(y_true=Y_test, y_pred=rfc_predictions)
cmDisplay = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['benign', 'malicious'])
cmDisplay.plot()

In [None]:
y_true, y_pred = Y_test, rfc_predictions
#first row of the confusion matrix: true class 0, predicted 0 (TN) and predicted 1 (FP)
TN, FP = cm[0][0], cm[0][1]
#specificity = TN / (TN + FP)
specificity = TN / (TN + FP)
print(specificity)

In [None]:
#second row of the confusion matrix: true class 1, predicted 0 (FN) and predicted 1 (TP)
FN, TP = cm[1][0], cm[1][1]
#sensitivity (recall) = TP / (TP + FN)
sensitivity = TP / (TP + FN)
print(sensitivity)

In [None]:
#per-class precision, recall and F1 on the test set
print(classification_report(Y_test,rfc_predictions,target_names=['benign', 'malicious']))

In [None]:
#importance of each training column according to the fitted forest, smallest first
featureImportance = pd.Series(data=rfc.feature_importances_, index=X_trainosm.columns)
featureImportance.sort_values().plot.barh(figsize=(8, 6))

In [None]:
#column names of the training features, in training order
feature_names = X_trainosm.columns

In [None]:
import joblib
#NOTE(review): joblib.load unpickles the file, which can execute arbitrary code - only load
#model files from a trusted source. The file must already exist on disk: the cell that
#would write it (joblib.dump at the end of this notebook) is commented out. Predictions
#below use this loaded model, not the `rfc` trained above.
urlModel = joblib.load("url_model_final.pkl")

In [None]:
#make predictions
def featureExtraction(url):
    """Return the lexical feature vector of ``url`` as a list.

    The values are produced in the same order as the training columns
    (the columns selected for ``x`` minus 'urlType_numeric'). The model
    receives the features positionally, so this order must not drift:
    '?' comes before '@', exactly as in the training frame.
    NOTE(review): assumes the saved model was fitted on that column order -
    confirm against the pickled model.
    """
    extractors = (
        getUrlLength,
        getHostnameLength,
        getPathLength,
        getTldLength,
        getNumOfHyphen,
        getNumOfQueMark,   # '"?"' precedes '"@"' in the training columns
        getNumOfAt,
        getNumOfPercMark,
        getNumOfSlash,
        getNumOfFullstop,
        getNumOfAmpersand,
        getNumOfUnderscore,
        getNumOfEqual,
        getNumOfExclamationMark,
        getNumOfDigits,
        getNumOfLetters,
        getNumOfDirectories,
        hasIP,
        getEntropy,
        hashttp,
        hashttps,
        fdLength,
        numParameters,
        numSubDomains,
    )
    return [extract(url) for extract in extractors]

In [None]:
def makePrediction(url):
    """Classify ``url`` with the loaded model.

    Returns "Benign" when the model predicts 0, "Malicious" when it
    predicts 1, and None for any other predicted value.
    """
    # single-row feature matrix in the shape the model expects
    feature_row = np.array(featureExtraction(url)).reshape((1, -1))
    predicted_class = int(urlModel.predict(pd.DataFrame(feature_row))[0])
    if predicted_class == 0:
        return "Benign"
    if predicted_class == 1:
        return "Malicious"
    return None

In [None]:
#classify a sample URL and print the predicted status
print(makePrediction('imetrica.net/css/'))

In [None]:
#Save the model to a file
#import joblib
#filename = 'url_model_final.pkl'
#joblib.dump(rfc,filename)