In [1]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.svm import NuSVC

In [2]:
# Read the phishing feature table from the CSV file and preview its first
# five rows (five is also what ``head`` shows by default).
raw_data = pd.read_csv(filepath_or_buffer="Phishing_dataset_03.csv")
raw_data.head(n=5)

Unnamed: 0,qty_dot_url,qty_hyphen_url,qty_underline_url,qty_slash_url,qty_questionmark_url,qty_equal_url,qty_at_url,qty_and_url,qty_exclamation_url,qty_space_url,...,qty_ip_resolved,qty_nameservers,qty_mx_servers,ttl_hostname,tls_ssl_certificate,qty_redirects,url_google_index,domain_google_index,url_shortened,phishing
0,2,0,0,0,0,0,0,0,0,0,...,1,4,2,3598,0,0,0,0,0,0
1,4,0,0,2,0,0,0,0,0,0,...,1,4,1,3977,1,0,0,0,0,0
2,1,0,0,1,0,0,0,0,0,0,...,1,2,1,10788,0,0,0,0,0,0
3,2,0,0,3,0,0,0,0,0,0,...,1,2,1,14339,1,0,0,0,0,1
4,1,1,0,4,0,0,0,0,0,0,...,1,2,1,389,1,1,0,0,0,1


In [3]:
# Number of rows in the freshly loaded frame.
print(raw_data.shape[0])

58645


### List all column names

In [4]:
# Emit every column label on its own line.
print(*raw_data.columns, sep="\n")

qty_dot_url
qty_hyphen_url
qty_underline_url
qty_slash_url
qty_questionmark_url
qty_equal_url
qty_at_url
qty_and_url
qty_exclamation_url
qty_space_url
qty_tilde_url
qty_comma_url
qty_plus_url
qty_asterisk_url
qty_hashtag_url
qty_dollar_url
qty_percent_url
qty_tld_url
length_url
qty_dot_domain
qty_hyphen_domain
qty_underline_domain
qty_slash_domain
qty_questionmark_domain
qty_equal_domain
qty_at_domain
qty_and_domain
qty_exclamation_domain
qty_space_domain
qty_tilde_domain
qty_comma_domain
qty_plus_domain
qty_asterisk_domain
qty_hashtag_domain
qty_dollar_domain
qty_percent_domain
qty_vowels_domain
domain_length
domain_in_ip
server_client_domain
qty_dot_directory
qty_hyphen_directory
qty_underline_directory
qty_slash_directory
qty_questionmark_directory
qty_equal_directory
qty_at_directory
qty_and_directory
qty_exclamation_directory
qty_space_directory
qty_tilde_directory
qty_comma_directory
qty_plus_directory
qty_asterisk_directory
qty_hashtag_directory
qty_dollar_directory
qty_percent_

# Clean Data

In [5]:
# Discard every row that contains at least one missing value, then report
# how many rows are left.
clean_data = raw_data.dropna(axis=0, how="any")
print(clean_data.shape[0])

58645


Note that the number of rows has not changed: `dropna()` removed nothing, so the dataset contains no missing values. Whoop!

In [6]:
# Target: the ``phishing`` column of the cleaned frame.
y = clean_data["phishing"]

# Hand-picked subset of URL-derived columns used as model inputs.
feature_columns = [
    "qty_dot_url",
    "qty_hyphen_url",
    "qty_questionmark_url",
    "qty_and_url",
    "length_url",
    "domain_length",
    "qty_dot_directory",
    "directory_length",
    "file_length",
    "qty_params",
    "url_shortened",
]
X = clean_data.loc[:, feature_columns]

# Split into training and validation parts; the fixed random_state keeps the
# split identical between runs.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

In [None]:
# Build a linear-kernel NuSVC with the shrinking heuristic disabled and train
# it on the training split.  ``fit`` hands back the estimator itself, so the
# chained call leaves the fitted model bound to ``model``.
model = NuSVC(
    kernel="linear",
    gamma="scale",
    shrinking=False,
).fit(train_X, train_y)
model

In [None]:
# Predict labels for the validation features, then show the predictions
# followed by the first few true labels for a quick eyeball comparison.
val_predictions = model.predict(val_X)
print(val_predictions, val_y.head(), sep="\n")

In [None]:
from sklearn.metrics import mean_absolute_error

In [None]:
# Mean absolute error between the true validation labels and the predictions.
val_mae = mean_absolute_error(y_true=val_y, y_pred=val_predictions)
print(val_mae)

I think the MAE in this context represents the fraction of validation samples the model gets wrong (i.e. 1 − accuracy), as the target is a boolean (0/1). Let's try reducing the number of features and see how this affects the MAE.

In [None]:
# Repeat the experiment with a smaller, eight-column feature set.
feature_columns = [
    "qty_dot_url",
    "length_url",
    "domain_length",
    "qty_dot_directory",
    "directory_length",
    "file_length",
    "qty_params",
    "url_shortened",
]
X = clean_data.loc[:, feature_columns]
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

# Refit the existing estimator on the new training split, predict on the
# validation split and report the error.
val_predictions = model.fit(train_X, train_y).predict(val_X)
val_mae = mean_absolute_error(val_predictions, val_y)
print(val_mae)

In [None]:
def get_mae(X, y, model):
    """Fit ``model`` on a training split of ``X``/``y`` and report validation MAE.

    The data is split with ``train_test_split(X, y, random_state=1)``,
    ``model`` is fitted in place on the training part, and the mean absolute
    error of its predictions on the validation part is printed.

    Parameters
    ----------
    X : feature table accepted by ``train_test_split`` and ``model.fit``.
    y : target values aligned with ``X``.
    model : estimator exposing ``fit`` and ``predict``; it is (re)fitted here.

    Returns
    -------
    The validation MAE that was printed, so callers can store and compare
    scores across runs instead of reading them off the output.
    """
    train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
    model.fit(train_X, train_y)
    val_predictions = model.predict(val_X)
    # scikit-learn documents the argument order as (y_true, y_pred).  MAE is
    # symmetric, so the value is unaffected; this just follows the convention.
    val_mae = mean_absolute_error(val_y, val_predictions)
    print(val_mae)
    return val_mae

Hmm, it seemed to have no effect. Let's try calculating the MAE when all columns are used as feature columns. I also created a function to calculate the MAE.

In [None]:
# Use every column as a feature EXCEPT the ones that must not be model inputs:
#   * "phishing"   - the target itself; leaving it in X leaks the answer.
#   * "Unnamed: 0" - the row-index column visible in the ``head()`` preview.
non_feature_columns = {"phishing", "Unnamed: 0"}
feature_for_columns = [
    column for column in clean_data.columns if column not in non_feature_columns
]
X = clean_data[feature_for_columns]
model = NuSVC(kernel='linear', gamma='scale', shrinking=False)
# The evaluation call was left disabled by the author.
# NOTE(review): presumably because fitting on all columns is slow - confirm
# before re-enabling.
#get_mae(X,y,model)

In [None]:
feature_columns = ["qty_dot_url","qty_hyphen_url","qty_equal_url","qty_questionmark_url","qty_and_url","length_url",
                   "domain_length","qty_dot_directory","qty_hyphen_directory","qty_comma_directory","qty_slash_directory","qty_at_directory",
                   "qty_and_directory","directory_length","qty_dot_file","qty_hyphen_file","qty_underline_file","qty_slash_file",
                   "qty_questionmark_file","qty_equal_file","file_length","qty_dot_params"
                    ,"qty_hyphen_params","qty_underline_params","qty_slash_params","qty_params","url_shortened"
                   ,"qty_underline_url","qty_slash_url","email_in_url","time_response","domain_spf","tls_ssl_certificate"
                   ,"qty_redirects"]

# Score progressively shorter prefixes of the list: the full list first, then
# one trailing feature fewer each round, down to a single feature.
# The prefix is taken by slicing instead of calling ``feature_columns.pop()``
# inside ``for feature in feature_columns``: shrinking a list while iterating
# over it ends the loop after only about half of the sizes, and it also
# leaves ``feature_columns`` truncated for any later cell.
# NOTE(review): every round trains a fresh NuSVC, so this cell may take long.
for n_features in range(len(feature_columns), 0, -1):
    X = clean_data[feature_columns[:n_features]]
    model = NuSVC(kernel='linear', gamma='scale', shrinking=False)
    # Label the score that get_mae prints so each line can be tied to its
    # feature count.
    print(n_features, "features:", end=" ")
    get_mae(X, y, model)



In [None]:
# Disabled: would serialise the current ``model`` object to "url_model.pkl".
# NOTE(review): if this is re-enabled, open the file inside a ``with`` block
# so the handle is closed once the dump finishes.
#pickle.dump(model, open("url_model.pkl","wb"))