In [4]:
import random 
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.metrics import classification_report
from sklearn import svm
from sklearn.naive_bayes import GaussianNB




# Load the flow statistics dataset

In [16]:
# Column names applied to the CSV, in file order. Because the names are passed
# explicitly, pandas reads the file without looking for a header row.
flow_columns = [
    'srcip', 'srcport', 'dstip', 'dstport', 'proto',
    'total_fpackets', 'total_fvolume', 'total_bpackets', 'total_bvolume',
    'min_fpktl', 'mean_fpktl', 'max_fpktl', 'std_fpktl',
    'min_bpktl', 'mean_bpktl', 'max_bpktl', 'std_bpktl',
    'min_fiat', 'mean_fiat', 'max_fiat', 'std_fiat',
    'min_biat', 'mean_biat', 'max_biat', 'std_biat',
    'duration',
    'min_active', 'mean_active', 'max_active', 'std_active',
    'min_idle', 'mean_idle', 'max_idle', 'std_idle',
    'sflow_fpackets', 'sflow_fbytes', 'sflow_bpackets', 'sflow_bbytes',
    'fpsh_cnt', 'bpsh_cnt', 'furg_cnt', 'burg_cnt',
    'total_fhlen', 'total_bhlen',
    'misc',
]

flow_file = "smallFlow_stats.csv"
flow_df = pd.read_csv(flow_file, names=flow_columns)

# Create random data to use for device classification 

In [31]:
#0 is Nest, 1 is TiVo, 2 is Printer, 3 is Domain Controller, 4 is File Server

def create_data(rand_df):
    """Return a copy of ``rand_df`` relabelled with random device endpoints.

    Every row receives two distinct addresses drawn at random from a fixed
    pool of five device IPs; they overwrite ``srcip`` and ``dstip``.  Two
    columns are then derived:

    * ``port``  -- the smaller of ``srcport`` and ``dstport``
    * ``label`` -- the device class mapped from the new ``srcip``

    Device classes: 0 Nest, 1 TiVo, 2 Printer, 3 Domain Controller,
    4 File Server.

    The ``misc`` column is dropped when present.  The input frame itself is
    left untouched, so the function can be called repeatedly on the same
    frame (deleting ``misc`` from the caller's frame made a second call fail
    with ``KeyError``).

    Sampling uses the module-level ``random`` generator; call
    ``random.seed(...)`` beforehand for repeatable output.

    Parameters
    ----------
    rand_df : pandas.DataFrame
        Flow records.  Must contain ``srcport`` and ``dstport``.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``srcip``/``dstip`` replaced, ``port`` and ``label``
        added, and ``misc`` removed.
    """
    ip_dict = {"1.2.3.4": 0, "5.6.7.8": 1, "9.10.11.12": 2, "13.14.15.16": 3,
               "17.18.19.20": 4}
    # random.sample needs a real sequence; dict views are rejected on Python 3.
    device_ips = list(ip_dict)

    labeled = rand_df.copy()

    # One (src, dst) pair per row; sampling without replacement keeps the two
    # endpoints of a flow distinct.
    endpoint_pairs = [random.sample(device_ips, 2) for _ in range(len(labeled))]
    labeled['srcip'] = [pair[0] for pair in endpoint_pairs]
    labeled['dstip'] = [pair[1] for pair in endpoint_pairs]

    # Vectorised equivalents of the per-row min(...) and dictionary lookup.
    labeled['port'] = labeled[['srcport', 'dstport']].min(axis=1)
    labeled['label'] = labeled['srcip'].map(ip_dict)

    if 'misc' in labeled.columns:
        del labeled['misc']

    return labeled

In [32]:
# Build the labelled frame used by every later cell.
df = create_data(flow_df)
# Show a bounded preview rather than echoing the whole frame as cell output.
df.head(10)

Unnamed: 0,srcip,srcport,dstip,dstport,proto,total_fpackets,total_fvolume,total_bpackets,total_bvolume,min_fpktl,...,sflow_bpackets,sflow_bbytes,fpsh_cnt,bpsh_cnt,furg_cnt,burg_cnt,total_fhlen,total_bhlen,port,label
0,5.6.7.8,55950,1.2.3.4,80,6,5,1156,3,619,40,...,3,619,1,1,0,0,212,132,80,1
1,1.2.3.4,55955,17.18.19.20,80,6,5,660,3,407,40,...,3,407,1,1,0,0,212,128,80,0
2,5.6.7.8,55954,1.2.3.4,80,6,13,2369,19,25234,40,...,19,25234,3,4,0,0,532,764,80,1
3,5.6.7.8,58264,1.2.3.4,80,6,5,765,4,388,40,...,4,388,1,1,0,0,212,172,80,1
4,9.10.11.12,58265,17.18.19.20,80,6,6,858,8,6980,40,...,8,6980,1,4,0,0,252,332,80,2
5,1.2.3.4,58272,13.14.15.16,80,6,7,856,8,6916,40,...,8,6916,1,4,0,0,292,332,80,0
6,13.14.15.16,55963,9.10.11.12,80,6,12,1685,13,13111,40,...,13,13111,3,3,0,0,492,532,80,3
7,13.14.15.16,55973,17.18.19.20,80,6,16,1040,26,34256,40,...,26,34256,1,1,0,0,652,1052,80,3
8,17.18.19.20,55960,1.2.3.4,80,6,6,763,4,694,40,...,4,694,1,1,0,0,252,172,80,4
9,5.6.7.8,52201,13.14.15.16,443,6,6,434,5,2255,40,...,5,2255,1,1,0,0,252,212,443,1


# Create training and testing data

In [33]:
# Feature matrix: every column from 'proto' through 'port' (label slices
# include both endpoints).  .loc replaces the deprecated .ix indexer, which
# newer pandas releases no longer provide; for a label slice the result is
# the same.
stats = df.loc[:, 'proto':'port']
# Bounded preview instead of the full frame.
stats.head(10)

Unnamed: 0,proto,total_fpackets,total_fvolume,total_bpackets,total_bvolume,min_fpktl,mean_fpktl,max_fpktl,std_fpktl,min_bpktl,...,sflow_fbytes,sflow_bpackets,sflow_bbytes,fpsh_cnt,bpsh_cnt,furg_cnt,burg_cnt,total_fhlen,total_bhlen,port
0,6,5,1156,3,619,40,231,984,420,40,...,1156,3,619,1,1,0,0,212,132,80
1,6,5,660,3,407,40,132,488,199,40,...,660,3,407,1,1,0,0,212,128,80
2,6,13,2369,19,25234,40,182,692,268,44,...,2369,19,25234,3,4,0,0,532,764,80
3,6,5,765,4,388,40,153,593,246,40,...,765,4,388,1,1,0,0,212,172,80
4,6,6,858,8,6980,40,143,646,246,40,...,858,8,6980,1,4,0,0,252,332,80
5,6,7,856,8,6916,40,122,604,212,40,...,856,8,6916,1,4,0,0,292,332,80
6,6,12,1685,13,13111,40,140,438,179,40,...,1685,13,13111,3,3,0,0,492,532,80
7,6,16,1040,26,34256,40,65,428,96,40,...,1040,26,34256,1,1,0,0,652,1052,80
8,6,6,763,4,694,40,127,551,207,40,...,763,4,694,1,1,0,0,252,172,80
9,6,6,434,5,2255,40,72,222,73,40,...,434,5,2255,1,1,0,0,252,212,443


# Scale data for use with models

In [None]:
# Standardize the feature columns with sklearn's preprocessing.scale.
# NOTE(review): scaling is computed over all rows before the train/test split
# that follows, so held-out rows influence the scaled training features --
# consider fitting the scaling on the training split only.
scaled_stats = preprocessing.scale(stats)
# Preview the first rows instead of echoing the whole array.
scaled_stats[:5]

# Split data into test and train data

In [36]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# partition the same from run to run.
labels = df['label'].values
X_train, X_test, y_train, y_test = train_test_split(
    scaled_stats,
    labels,
    test_size=0.2,
    random_state=41,
)

# Logistic Regression Model

In [37]:
# Fit logistic regression (C=1e5) on the training split and score it on the
# held-out split.
lgs = linear_model.LogisticRegression(C=1e5)
lgs_accuracy = lgs.fit(X_train, y_train).score(X_test, y_test)
# Single-argument print(...) produces the same text under Python 2 and is
# also valid Python 3 (the bare print statement is a SyntaxError there).
print("Logistic regression accuracy: %s" % lgs_accuracy)
lgs_result = lgs.predict(X_test)
print(classification_report(y_test, lgs_result))

Logistic regression accuracy: 0.197916666667
             precision    recall  f1-score   support

          0       0.10      0.04      0.06        25
          1       0.18      0.29      0.22        17
          2       0.26      0.28      0.27        18
          3       0.15      0.20      0.17        20
          4       0.33      0.25      0.29        16

avg / total       0.19      0.20      0.19        96



# Support Vector Machine (SVM) Model

In [38]:
# Fit an SVM classifier with sklearn's default settings and score it on the
# held-out split.
clf = svm.SVC()
clf_accuracy = clf.fit(X_train, y_train).score(X_test, y_test)
# Single-argument print(...) produces the same text under Python 2 and is
# also valid Python 3 (the bare print statement is a SyntaxError there).
print("SVM accuracy: %s" % clf_accuracy)
clf_result = clf.predict(X_test)
print(classification_report(y_test, clf_result))

SVM accuracy: 0.260416666667
             precision    recall  f1-score   support

          0       0.33      0.12      0.18        25
          1       0.29      0.12      0.17        17
          2       0.22      0.50      0.31        18
          3       0.30      0.35      0.33        20
          4       0.25      0.25      0.25        16

avg / total       0.28      0.26      0.24        96



# Naive Bayes Model

In [39]:
# Fit a Gaussian naive Bayes classifier and score it on the held-out split.
gnb = GaussianNB()
gnb_accuracy = gnb.fit(X_train, y_train).score(X_test, y_test)
# Single-argument print(...) produces the same text under Python 2 and is
# also valid Python 3 (the bare print statement is a SyntaxError there).
print("Naive Bayes Classifer %s" % gnb_accuracy)
gnb_result = gnb.predict(X_test)
print(classification_report(y_test, gnb_result))

Naive Bayes Classifer 0.229166666667
             precision    recall  f1-score   support

          0       0.60      0.12      0.20        25
          1       0.33      0.12      0.17        17
          2       0.38      0.28      0.32        18
          3       0.00      0.00      0.00        20
          4       0.18      0.75      0.29        16

avg / total       0.32      0.23      0.19        96

