## Importing Dataset

In [11]:
import pandas as pd

In [12]:
df = pd.read_csv("https://raw.githubusercontent.com/gurpreet0610/DDoS-attack-detection-using-HTTP-packet-clustering-pattern/master/WP_Dataset/wplogs.csv")

In [13]:
len(df)

14469

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14469 entries, 0 to 14468
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   @timestamp           14469 non-null  object 
 1   @timestamp.1         14469 non-null  object 
 2   _id                  14469 non-null  object 
 3   bytes                14469 non-null  object 
 4   clientip             14469 non-null  object 
 5   geoip.country_code3  11535 non-null  object 
 6   httpversion          14453 non-null  float64
 7   request              14453 non-null  object 
 8   response             14469 non-null  int64  
 9   useragent.device     14469 non-null  object 
 10  useragent.name       14469 non-null  object 
 11  useragent.os         14469 non-null  object 
 12  verb                 14453 non-null  object 
 13  useragent.os_name    14469 non-null  object 
dtypes: float64(1), int64(1), object(12)
memory usage: 1.5+ MB


## Data Preprocessing

In [15]:
df.drop(["@timestamp.1","_id",],axis=1,inplace=True)

In [16]:
df = df[df.clientip != "127.0.0.1"]

In [17]:
df["geoip.country_code3"].fillna("unknown",inplace=True)

In [18]:
df["httpversion"].fillna("error",inplace=True)

In [19]:
df["request"].fillna("error",inplace=True)

#### Country Code

In [20]:
freq=df["geoip.country_code3"].value_counts()
cond=freq<300
mask_obs = freq[cond].index
mask_dict = dict.fromkeys(mask_obs, 'others')

In [21]:
df['geoip.country_code3'] = df['geoip.country_code3'].replace(mask_dict)

In [22]:
df["geoip.country_code3"].value_counts()

IN         6177
US         3170
others     2188
unknown     396
Name: geoip.country_code3, dtype: int64

#### Bytes Data Transfer

In [23]:
df['bytes'] = df['bytes'].str.extract('(\d+)', expand=False)
df['bytes']=df['bytes'].astype('int')

#### HTTP Version

In [24]:
df['httpversion'].value_counts()

1.1      11899
error       16
1.0         16
Name: httpversion, dtype: int64

#### HTTP Response

In [25]:
df['response'].value_counts()

200    6353
408    5432
404      45
500      38
301      32
400      18
302      13
Name: response, dtype: int64

#### User Agent    

In [26]:
df['useragent.device'].value_counts()

Other                    10407
Generic Smartphone         455
iPhone                     360
iPad                       108
Samsung SM-G950F            80
Samsung SM-G973F            53
EML-AL00                    34
BlackBerry Playbook         30
Nokia N950                  30
Spider                      30
Samsung SM-T537A            27
A500                        20
Nokia E63                   20
Samsung SGH-A867            20
OnePlus ONEPLUS A5010       18
Samsung SM-G900F            17
Samsung SM-J530F            17
Samsung SM-J337AZ           15
Samsung SCH-R970            10
Lumia 920                   10
Motorola Milestone          10
sdk                         10
Lumia 530                   10
Samsung GT-P7100            10
Nokia 920                   10
Samsung Galaxy S II         10
Nokia 6230i                 10
Samsung SM-G9550            10
Samsung SPH-M900            10
HTC ADR6300                 10
Ericsson K800i              10
Motorola Droid              10
Lumia 92

In [27]:
df =df.drop(['useragent.device'],axis=1)

In [28]:
df['useragent.name'].value_counts()

Chrome                   7982
Firefox                  1352
Chrome Mobile             470
Mobile Safari             381
Other                     273
IE                        217
Opera                     213
Edge                      144
Samsung Internet          133
Android                   110
Chrome Mobile iOS          77
Safari                     74
UC Browser                 54
IE Mobile                  50
QupZilla                   31
Arora                      30
Nokia Browser              30
BlackBerry WebKit          30
WordPress                  26
Chromium                   25
Konqueror                  23
Puffin                     20
Maxthon                    15
Chrome Mobile WebView      15
Coc Coc                    15
Yandex Browser             15
NetFront                   10
UP.Browser                 10
Iceweasel                  10
Vivaldi                    10
Opera Mobile               10
iTunes                     10
Lynx                       10
Edge Mobil

In [29]:
freq=df["useragent.name"].value_counts()
cond=freq<300
mask_obs = freq[cond].index
mask_dict = dict.fromkeys(mask_obs, 'others')
df['useragent.name'] = df['useragent.name'].replace(mask_dict)

In [30]:
df['useragent.name'].value_counts()

Chrome           7982
others           1746
Firefox          1352
Chrome Mobile     470
Mobile Safari     381
Name: useragent.name, dtype: int64

In [31]:
df['useragent.os'].value_counts()

Mac OS X                5743
Windows                 3722
Android                  806
Linux                    519
iOS                      458
Other                    341
Ubuntu                   110
Windows Phone             40
Symbian OS                30
MeeGo                     30
BlackBerry Tablet OS      30
Fedora                    20
OpenBSD                   20
Chrome OS                 12
FreeBSD                   10
Debian                    10
Linux Mint                10
Slackware                 10
Solaris                   10
Name: useragent.os, dtype: int64

In [32]:
freq=df["useragent.os"].value_counts()
cond=freq<350
mask_obs = freq[cond].index
mask_dict = dict.fromkeys(mask_obs, 'others')
df['useragent.os'] = df['useragent.os'].replace(mask_dict)

In [33]:
df['useragent.os'].value_counts()

Mac OS X    5743
Windows     3722
Android      806
others       683
Linux        519
iOS          458
Name: useragent.os, dtype: int64

In [34]:
df['useragent.os_name'].value_counts()

Mac OS X                5743
Windows                 3722
Android                  806
Linux                    519
iOS                      458
Other                    341
Ubuntu                   110
Windows Phone             40
Symbian OS                30
MeeGo                     30
BlackBerry Tablet OS      30
Fedora                    20
OpenBSD                   20
Chrome OS                 12
FreeBSD                   10
Debian                    10
Linux Mint                10
Slackware                 10
Solaris                   10
Name: useragent.os_name, dtype: int64

In [35]:
df =df.drop(['useragent.os_name'],axis=1)

In [36]:
df['verb'].value_counts()

GET        11832
POST          66
CONNECT       10
HEAD           7
Name: verb, dtype: int64

In [37]:
df.columns

Index(['@timestamp', 'bytes', 'clientip', 'geoip.country_code3', 'httpversion',
       'request', 'response', 'useragent.name', 'useragent.os', 'verb'],
      dtype='object')

In [38]:
# df.to_csv("/content/preprocess.csv")

In [39]:
df =pd.get_dummies(df, columns=['geoip.country_code3', 'httpversion',
        'response', 'useragent.name', 'useragent.os', 'verb'],drop_first=True)

In [40]:
df.head()

Unnamed: 0,@timestamp,bytes,clientip,request,geoip.country_code3_US,geoip.country_code3_others,geoip.country_code3_unknown,httpversion_1.1,httpversion_error,response_301,response_302,response_400,response_404,response_408,response_500,useragent.name_Chrome Mobile,useragent.name_Firefox,useragent.name_Mobile Safari,useragent.name_others,useragent.os_Linux,useragent.os_Mac OS X,useragent.os_Windows,useragent.os_iOS,useragent.os_others,verb_GET,verb_HEAD,verb_POST
18,"Jun 20, 2020 @ 14:45:53.000",0,103.224.146.167,error,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0
50,"Jun 20, 2020 @ 14:45:21.000",410,195.54.160.135,/vendor/phpunit/phpunit/src/Util/PHP/eval-stdi...,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0
52,"Jun 20, 2020 @ 14:45:21.000",15,195.54.160.135,/vendor/phpunit/phpunit/src/Util/PHP/eval-stdi...,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0
70,"Jun 20, 2020 @ 14:45:03.000",99,103.224.146.167,/wp-content/themes/amphibious/webfonts/fa-soli...,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0
71,"Jun 20, 2020 @ 14:45:03.000",194,103.224.146.167,/wp-content/themes/amphibious/webfonts/fa-soli...,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0


In [41]:
#df.to_csv("/content/encoded.csv")

In [42]:
edf=df.drop(["@timestamp","request","clientip"],axis=1)

In [43]:
len(df)

11931

## Scaling Dataset and Fitting it

In [44]:
import pandas as pd
from sklearn import preprocessing
x = edf.values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
df_norm = pd.DataFrame(x_scaled,columns=edf.columns)

In [45]:
from sklearn.cluster import KMeans
Kmean = KMeans(n_clusters=4)
Kmean.fit(df_norm)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=4, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [46]:
Kmean.cluster_centers_

array([[ 2.72320682e-02,  6.89434889e-01,  3.49720253e-15,
         8.55036855e-02,  9.89189189e-01,  5.40540541e-03,
         5.89680590e-03,  4.42260442e-03,  6.38820639e-03,
         8.35380835e-03,  6.19164619e-02,  1.47420147e-03,
         2.02948403e-01,  1.34152334e-01,  1.73464373e-01,
         3.20884521e-01,  1.59705160e-01,  1.20393120e-01,
        -1.11022302e-14,  1.91154791e-01,  2.21621622e-01,
         9.68058968e-01,  9.82800983e-04,  2.06388206e-02],
       [ 8.21815180e-05,  3.02535774e-14, -2.28705943e-14,
         6.77710843e-03,  1.00000000e+00, -1.79760720e-16,
        -3.59521440e-16, -6.76542156e-17, -3.49113100e-17,
         4.43221848e-16,  9.93222892e-01,  1.33140027e-16,
         1.65839564e-15,  3.57680723e-03, -2.51187959e-15,
         1.18793864e-14,  1.88253012e-04,  9.99811747e-01,
        -1.28785871e-14,  1.93595140e-15,  4.87804241e-15,
         1.00000000e+00,  2.77555756e-17,  2.70616862e-16],
       [ 1.38235764e-02, -8.88178420e-16,  8.82970137e

## Test

In [47]:
y_kmeans = Kmean.predict(df_norm)

In [48]:
type(df["clientip"].values)

numpy.ndarray

In [49]:
fdf =pd.DataFrame({"ip":df["clientip"].values,"result":y_kmeans})

In [52]:
fdf[fdf["result"]==1]

Unnamed: 0,ip,result
0,103.224.146.167,0
3,103.224.146.167,0
4,103.224.146.167,0
5,103.224.146.167,0
6,103.224.146.167,0
...,...,...
5757,103.224.146.167,1
11920,103.224.146.167,0
11927,103.224.146.167,0
11928,103.224.146.167,0


In [53]:
fdf[fdf["result"]==0]

Unnamed: 0,ip,result
0,103.224.146.167,0
3,103.224.146.167,0
4,103.224.146.167,0
5,103.224.146.167,0
6,103.224.146.167,0
...,...,...
11926,142.93.113.16,0
11927,103.224.146.167,0
11928,103.224.146.167,0
11929,13.233.254.101,0
