使用一些URL的特征来作为属性进行判断：

In [1]:
from __future__ import division
import os
import sys
import re
import matplotlib
import pandas as pd
import numpy as np
from os.path import splitext
import ipaddress as ip
import tldextract
import whois
import datetime
from urllib.parse import urlparse
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import pickle as pkl

In [2]:
#2016's top most suspicious TLD and words
Suspicious_TLD=['zip','cricket','link','work','party','gq','kim','country','science','tk']
Suspicious_Domain=['luckytime.co.kr','mattfoll.eu.interia.pl','trafficholder.com','dl.baixaki.com.br','bembed.redtube.comr','tags.expo9.exponential.com','deepspacer.com','funad.co.kr','trafficconverter.biz']
#trend micro's top malicious domains 

In [3]:
# Method to count number of dots
def countdots(url):  
    return url.count('.')

In [4]:
# Method to count number of delimeters
def countdelim(url):
    count = 0
    delim=[';','_','?','=','&']
    for each in url:
        if each in delim:
            count = count + 1
    
    return count

In [5]:
# Is IP addr present as th hostname, let's validate

import ipaddress as ip #works only in python 3

def isip(uri):
    try:
        if ip.ip_address(uri):
            return 1
    except:
        return 0

In [6]:
#method to check the presence of @

def isPresentAt(url):
    return url.count('@')

In [7]:
def isPresentDSlash(url):
    return url.count('//')

In [8]:
def countSubDir(url):
    return url.count('/')

In [9]:
def get_ext(url):
    """Return the filename extension from url, or ''."""
    
    root, ext = splitext(url)
    return ext

In [10]:
def countSubDomain(subdomain):
    if not subdomain:
        return 0
    else:
        return len(subdomain.split('.'))

In [11]:
def countQueries(query):
    if not query:
        return 0
    else:
        return len(query.split('&'))

### 导入数据

In [12]:
df = pd.read_csv("dataset.csv", encoding='utf-8')
#df=df.sample(frac=1)
df = df.sample(frac=1).reset_index(drop=True)
df.head()

Unnamed: 0,url,label
0,http://www.wezhike.com/wp-content/plugins/css-...,1
1,http://leesangku.com/pruboard/technote/board/g...,1
2,https://www.zac.sweeny.bmo.verification.inform...,1
3,https://www.commbank.com.au,0
4,https://www.royalbank.com,0


In [13]:
len(df)

7998

利用以上函数统计出每个url的属性

In [14]:
featureSet = pd.DataFrame(columns=('url','no of dots','len of url','presence of at',\
'presence of double slash','no of subdir','no of subdomain','len of domain','no of queries','is IP','presence of Suspicious_TLD',\
'presence of suspicious domain','label'))

In [15]:
from urllib.parse import urlparse
import tldextract
def getFeatures(url, label): 
    result = []
    url = str(url)
    
    #add the url to feature set
    result.append(url)
    
    #parse the URL and extract the domain information
    path = urlparse(url)
    ext = tldextract.extract(url)
    
#     print("1:"+path.scheme)
#    print("2:"+path.netloc)
#     print("3:"+path.path)
#     print("4:"+ext.subdomain)
#     print("5:"+ext.suffix)
    
    #counting number of dots in subdomain    
    result.append(countdots(ext.subdomain))
    
    #length of URL    
    result.append(len(url))
    
    #checking @ in the url    
    result.append(isPresentAt(path.netloc))
    
    #checking presence of double slash    
    result.append(isPresentDSlash(path.path))
    
    #Count number of subdir    
    result.append(countSubDir(path.path))
    
    #number of sub domain    
    result.append(countSubDomain(ext.subdomain))
    
    #length of domain name    
    result.append(len(path.netloc))
    
    #count number of queries    
    result.append(len(path.query))
    
    #Adding domain information
    
    #if IP address is being used as a URL     
    result.append(isip(ext.domain))
    
    #presence of Suspicious_TLD
    result.append(1 if ext.suffix in Suspicious_TLD else 0)
    
    #presence of suspicious domain
    result.append(1 if '.'.join(ext[1:]) in Suspicious_Domain else 0 )
         
    #result.append(get_ext(path.path))
    result.append(str(label))
    return result
                  
 

In [16]:
for i in range(len(df)):
    features = getFeatures(df["url"].loc[i], df["label"].loc[i])    
    featureSet.loc[i] = features

In [17]:
featureSet.head(10)

Unnamed: 0,url,no of dots,len of url,presence of at,presence of double slash,no of subdir,no of subdomain,len of domain,no of queries,is IP,presence of Suspicious_TLD,presence of suspicious domain,label
0,http://www.wezhike.com/wp-content/plugins/css-...,0,250,0,0,6,1,15,165,0,0,0,1
1,http://leesangku.com/pruboard/technote/board/g...,0,86,0,0,6,0,13,0,0,0,0,1
2,https://www.zac.sweeny.bmo.verification.inform...,4,62,0,0,1,5,53,0,0,0,0,1
3,https://www.commbank.com.au,0,27,0,0,0,1,19,0,0,0,0,0
4,https://www.royalbank.com,0,25,0,0,0,1,17,0,0,0,0,0
5,http://redcosb.com/rock/manager/aproval/s/shar...,0,90,0,0,8,0,11,0,0,0,0,1
6,https://horizontemistico.com.br/font-awesome/f...,0,79,0,0,6,0,23,0,0,0,0,1
7,http://192.com/atoz/people/reed/donald/,0,39,0,0,5,0,7,0,0,0,0,0
8,http://proxyprincipal.appspot.com/www.paypal.c...,0,59,0,0,4,1,26,0,0,0,0,1
9,https://onedrive.live.com/view.aspx?resid=73F4...,0,111,0,0,1,1,17,75,0,0,0,1


做一个数据样本的恶意网址和正常网址的统计

In [18]:
featureSet.groupby(featureSet['label']).size()

label
0    3999
1    3999
dtype: int64

### 开始训练

In [19]:
import sklearn.ensemble as ek
from sklearn.model_selection import train_test_split
from sklearn import tree, linear_model
from sklearn.feature_selection import SelectFromModel
from sklearn.externals import joblib
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn import preprocessing
from sklearn import svm



把属性存在 X，类别标签存在 y,drop函数作用为删除列

In [20]:
X = featureSet.drop(['url','label'],axis=1).values
y = featureSet['label'].values

把数据集分为训练集和测试集，其中 test_size=0.2，即测试集占总数据的 20%：

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y ,test_size=0.2)

用决策树训练模型

In [22]:
clf = tree.DecisionTreeClassifier(max_depth=10)
clf.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [23]:
res = clf.predict(X)
mt = confusion_matrix(y, res)

In [24]:
print("%f,%f" % (mt[0][0],mt[0][1]))
print("%f,%f" % (mt[1][0],mt[1][1]))
print("Accuracy rate : %f %%" % (((mt[0][0]+mt[1][1]) / float(mt.sum()))*100))
print('Recall rate : %f %%' % ( (mt[0][0] / float(mt.sum(axis=1)[0])*100)))

3654.000000,345.000000
250.000000,3749.000000
Accuracy rate : 92.560640 %
Recall rate : 91.372843 %


### 测试

In [30]:
result = pd.DataFrame(columns=('url','no of dots','len of url','presence of at',\
'presence of double slash','no of subdir','no of subdomain','len of domain','no of queries','is IP','presence of Suspicious_TLD',\
'presence of suspicious domain','label'))

results = getFeatures('www.baidu.com', '0')
result.loc[0] = results
result = result.drop(['url','label'],axis=1).values
if(clf.predict(result)=="0"):
    print("Benign URL")
else:
    print("Malicious URL")

Benign URL


In [31]:
result = pd.DataFrame(columns=('url','no of dots','len of url','presence of at',\
'presence of double slash','no of subdir','no of subdomain','len of domain','no of queries','is IP','presence of Suspicious_TLD',\
'presence of suspicious domain','label'))

results = getFeatures('dinas.tomsk.ru/err/?paypal.ch/ch/cgi-bin/webscr1.htm?cmd=_login-run&dispatch=5885d80a13c0db1f1ff80d546411d7f8a8350c132bc41e0934cfc023d4r4ere32132', '1')
result.loc[0] = results
result = result.drop(['url','label'],axis=1).values
if(clf.predict(result)=="0"):
    print("Benign URL")
else:
    print("Malicious URL")

Malicious URL
