In [29]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier,VotingClassifier
from sklearn.tree import DecisionTreeClassifier
import numpy as np
from sklearn.feature_selection import SelectKBest,RFE
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score



In [30]:
col_names = ["duration","protocol_type","service","flag","src_bytes",
    "dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
    "logged_in","num_compromised","root_shell","su_attempted","num_root",
    "num_file_creations","num_shells","num_access_files","num_outbound_cmds",
    "is_host_login","is_guest_login","count","srv_count","serror_rate",
    "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
    "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
    "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
    "dst_host_rerror_rate","dst_host_srv_rerror_rate","label"]

In [3]:
train = pd.read_csv("KDDTrain+_2.csv",names=col_names)
test = pd.read_csv("KDDTest+_2.csv",names=col_names)

In [4]:
# colums that are categorical and not binary yet: protocol_type (column 2), service (column 3), flag (column 4).
# explore categorical features
print('Training set:')
for col_name in train.columns:
    if train[col_name].dtypes == 'object' :
        unique_cat = len(train[col_name].unique())
        print("Feature '{col_name}' has {unique_cat} categories".format(col_name=col_name, unique_cat=unique_cat))

#see how distributed the feature service is, it is evenly distributed and therefore we need to make dummies for all.
print()
print('Distribution of categories in service:')
print(train['service'].value_counts().sort_values(ascending=False).head())

Training set:
Feature 'protocol_type' has 3 categories
Feature 'service' has 70 categories
Feature 'flag' has 11 categories
Feature 'label' has 23 categories

Distribution of categories in service:
http        40338
private     21853
domain_u     9043
smtp         7313
ftp_data     6860
Name: service, dtype: int64


In [5]:
# Test set
print('Test set:')
for col_name in test.columns:
    if test[col_name].dtypes == 'object' :
        unique_cat = len(test[col_name].unique())
        print("Feature '{col_name}' has {unique_cat} categories".format(col_name=col_name, unique_cat=unique_cat))

Test set:
Feature 'protocol_type' has 3 categories
Feature 'service' has 64 categories
Feature 'flag' has 11 categories
Feature 'label' has 38 categories


In [6]:
categorical_features = ['protocol_type','service','flag',]

In [7]:
def one_hot(data,categorical_features=categorical_features):
    for col in categorical_features:
        one_hot =pd.get_dummies(data[col],prefix=col)

        data =  pd.concat([data,one_hot],axis=1)
    # data.drop(categorical_features,axis=1)
    return data

In [8]:
train1 = train.copy()

In [9]:
train1.to_csv('train1.csv',index=False)

In [10]:
train = one_hot(train)
test = one_hot(test)

In [11]:
trainservice=train['service'].tolist()
testservice= test['service'].tolist()
difference=list(set(trainservice) - set(testservice))
string = 'service_'
difference=[string + x for x in difference]
difference

['service_urh_i',
 'service_aol',
 'service_red_i',
 'service_harvest',
 'service_http_8001',
 'service_http_2784']

In [12]:
for col in difference:
    test[col] = 0

In [13]:
train = train.drop(categorical_features,axis=1)
test = test.drop(categorical_features,axis=1)


In [14]:
# print(np.mean(cross_val_score(DS,X,Y,cv=10)))

In [15]:
X = train.drop("label",axis=1)
Y = train['label']
X_test = test.drop("label",axis=1)
Y_test = test['label']

In [16]:
DS = RandomForestClassifier(n_estimators=1000)

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
DS.fit(X,Y)

RandomForestClassifier(n_estimators=800)

In [19]:
from sklearn import tree

In [20]:
test = DS.predict(X_test)

Feature names must be in the same order as they were in fit.



In [21]:
from sklearn.metrics import classification_report

In [22]:
print(classification_report(Y_test,test,labels=train['label'].unique()))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                 precision    recall  f1-score   support

         normal       0.59      0.98      0.74      9711
        neptune       0.99      0.99      0.99      4657
    warezclient       0.00      0.00      0.00         0
        ipsweep       0.99      0.99      0.99       141
      portsweep       0.75      0.97      0.85       157
       teardrop       0.24      0.75      0.36        12
           nmap       1.00      1.00      1.00        73
          satan       0.67      1.00      0.80       735
          smurf       1.00      0.00      0.01       665
            pod       0.88      0.17      0.29        41
           back       1.00      0.75      0.86       359
   guess_passwd       0.00      0.00      0.00      1231
      ftp_write       0.00      0.00      0.00         3
       multihop       0.00      0.00      0.00        18
        rootkit       0.00      0.00      0.00        13
buffer_overflow       0.00      0.00      0.00        20
           imap       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [31]:
accuracy_score(Y_test,test)

0.6871451383960255

In [23]:
import pickle

In [24]:
pickle.dump(DS,open('model.sav','wb'))