# SVC SVM Machine Learning with 1-Dimensional URL Dataset

In [16]:
import pandas as pd
from sklearn import svm

## Reading in Benign dataset into Dataframe

In [17]:
df_benign = pd.read_csv('/Users/chrismclearnon/Developer/QUB-CSC3032-FYP/src/data/Malicious-DS/URL/Benign_list_big_final.csv')
df_benign["URLType"] = "Benign"
print(df_benign)

URL URLType
0      http://1337x.to/torrent/1048648/American-Snipe...  Benign
1      http://1337x.to/torrent/1110018/Blackhat-2015-...  Benign
2      http://1337x.to/torrent/1122940/Blackhat-2015-...  Benign
3      http://1337x.to/torrent/1124395/Fast-and-Furio...  Benign
4      http://1337x.to/torrent/1145504/Avengers-Age-o...  Benign
...                                                  ...     ...
35373  https://lastpass.com/signup2.php?ac=1&from_uri...  Benign
35374  https://lastpass.com/signup2.php?ac=1&from_uri...  Benign
35375  https://lastpass.com/signup2.php?ac=1&from_uri...  Benign
35376  https://lastpass.com/signup2.php?ac=1&from_uri...  Benign
35377  https://asana.com/guide/videos/%22//fast.wisti...  Benign

[35378 rows x 2 columns]


## Reading in Malicious dataset into Dataframe

In [18]:
df_malicious = pd.read_csv('/Users/chrismclearnon/Developer/QUB-CSC3032-FYP/src/data/Malicious-DS/URL/Malware_dataset.csv')
df_malicious["URLType"] = "Malicious"
print(df_malicious)

URL    URLType
0      http://gzzax.livechatvalue.com/chat/chatClient...  Malicious
1      http://gzzax.livechatvalue.com/chat/chatClient...  Malicious
2      http://gzzax.livechatvalue.com/chat/chatClient...  Malicious
3      http://gzzax.livechatvalue.com/chat/chatClient...  Malicious
4      http://mtsx.com.cn/UploadFiles/2011-08/admin/%...  Malicious
...                                                  ...        ...
11561  http://www.plastischechirurgie.net/gesicht/fac...  Malicious
11562  http://www.plastischechirurgie.net/haartranspl...  Malicious
11563  http://www.plastischechirurgie.net/korper/gyna...  Malicious
11564  http://dl.get1993desk.com/n/50517366/RealPlaye...  Malicious
11565  http://fb.com.accounts.login.userid.492739.fbs...  Malicious

[11566 rows x 2 columns]


In [19]:
dfs = [df_benign, df_malicious]
df = pd.concat(dfs)
df.reset_index(drop=True, inplace=True)
df = df.sample(frac=1).reset_index(drop=True)
print(df)



URL    URLType
0      http://nguyentandung.org/hai-quan-my-cong-bo-k...     Benign
1      http://taboola.com/sites/default/files/js/js_k...     Benign
2      http://distractify.com/post/related/id/55524aa...     Benign
3      http://searchengineland.com/13-semantic-markup...     Benign
4      http://3cf.ru/2015/07/20/stroitelstvo-energomo...  Malicious
...                                                  ...        ...
46939  http://zozo.jp/shop/adonisgreen/?price=proper&...     Benign
46940  http://9779.info/%E8%B1%86%E5%AD%90%E7%B2%98%E...  Malicious
46941  http://chinacxyy.com/newscodejs.asp?lm2=198&li...  Malicious
46942  http://elcomercio.pe/gastronomia/recetas/sandr...     Benign
46943  http://pikabu.ru/tag/%D0%B7%D0%B5%D0%BC%D0%BB%...     Benign

[46944 rows x 2 columns]


## Encode Binary URL Classifications

Here we are encoding the URLType binary classifications: "Benign" / "Malicious", using the LabelEncoder()

In [20]:
from sklearn import preprocessing

enc = preprocessing.LabelEncoder()
df["URLType"] = enc.fit_transform(df.URLType.values)
print(df)

URL  URLType
0      http://nguyentandung.org/hai-quan-my-cong-bo-k...        0
1      http://taboola.com/sites/default/files/js/js_k...        0
2      http://distractify.com/post/related/id/55524aa...        0
3      http://searchengineland.com/13-semantic-markup...        0
4      http://3cf.ru/2015/07/20/stroitelstvo-energomo...        1
...                                                  ...      ...
46939  http://zozo.jp/shop/adonisgreen/?price=proper&...        0
46940  http://9779.info/%E8%B1%86%E5%AD%90%E7%B2%98%E...        1
46941  http://chinacxyy.com/newscodejs.asp?lm2=198&li...        1
46942  http://elcomercio.pe/gastronomia/recetas/sandr...        0
46943  http://pikabu.ru/tag/%D0%B7%D0%B5%D0%BC%D0%BB%...        0

[46944 rows x 2 columns]


## Extracting Lexical Features from Dataset

Here is where different lexical features are extracted from the full dataset (Features based on the vocabulary/makeup of the actual URL string itself).

In [21]:
from featureprocessing import LexicalFeatureProcessing as lfp
from datacleaning import URLDataCleaning as ucl

feature_list = []
for index, row in df.iterrows():
    row["URL"] = ucl.clean_data(row["URL"])
    feature_dict = lfp.extract(row["URL"])
    feature_dict.update({"URLType": row["URLType"]})
    feature_list.append(feature_dict)

df_ft = pd.DataFrame(feature_list)
print(df_ft)

URLLength  HostLength  TLDLength  DotCount  DashCount  @Count  %Count  \
0            110          17          0         2         16       0       0   
1             91          11          0         2          2       0       0   
2             87          15          0         1          0       0       0   
3             89          20          0         1         10       0       0   
4             82           6          0         1          8       0       0   
...          ...         ...        ...       ...        ...     ...     ...   
46939        108           7          0         1          0       0       0   
46940         63           9          0         1          0       0      15   
46941        170          13          0         3          0       0       0   
46942        145          13          0         1          9       0       0   
46943        103           9          0         1          0       0      26   

       =Count  ?Count  DigitCount  UniqueCharC

In [33]:
from sklearn.model_selection import train_test_split

x = df_ft.drop('URLType', axis=1)
y = df_ft["URLType"]

print(x.shape)
print(y.shape)

(46944, 11)
(46944,)


In [34]:
# df_scaled = preprocessing.scale(df_ft)
# print(df_scaled)
x_scaled = preprocessing.scale(x)
print(x_scaled.shape)

(46944, 11)


In [38]:
from sklearn.model_selection import train_test_split

X_train, x_test, Y_train, y_test = train_test_split(x_scaled, y, test_size = 0.2, random_state = 0)
print(X_train.shape)
print(Y_train.shape)

(37555, 11)
(37555,)


In [39]:
from sklearn.svm import SVC

clf = SVC(gamma='auto')
clf.fit(X_train, Y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [45]:
y_pred = clf.predict(x_test)

from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_pred)

0.9661305783363511