In [311]:
import csv
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Data PreProcessing

In [312]:
#Read network_intrusion_data.csv file and load data into network_df dataframe 
network_df= pd.read_csv('network_intrusion_data.csv')
#network_df = network_df[0:10000]

In [313]:
#Add column headers to the data in the dataframe
network_df.columns = [
'duration',
'protocol_type',
'service',
'flag',
'src_bytes',
'dst_bytes',
'land',
'wrong_fragment',
'urgent',
'hot',
'num_failed_logins',
'logged_in',
'num_compromised',
'root_shell',
'su_attempted',
'num_root',
'num_file_creations',
'num_shells',
'num_access_files',
'num_outbound_cmds',
'is_host_login',
'is_guest_login',
'count',
'srv_count',
'serror_rate',
'srv_serror_rate',
'rerror_rate',
'srv_rerror_rate',
'same_srv_rate',
'diff_srv_rate',
'srv_diff_host_rate',
'dst_host_count',
'dst_host_srv_count',
'dst_host_same_srv_rate',
'dst_host_diff_srv_rate',
'dst_host_same_src_port_rate',
'dst_host_srv_diff_host_rate',
'dst_host_serror_rate',
'dst_host_srv_serror_rate',
'dst_host_rerror_rate',
'dst_host_srv_rerror_rate',
'outcome'
]

In [314]:
#Select only relevant columns for processing
network_input_df = network_df[['duration',
                         'protocol_type',
                         'service',
                         'src_bytes',
                         'dst_bytes',
                         'flag',
                         'land',
                         'wrong_fragment',
                         'urgent',
                         'count',
                         'srv_count',
                         'serror_rate',
                         'srv_serror_rate',
                         'rerror_rate',
                         'srv_rerror_rate',
                         'same_srv_rate',
                         'diff_srv_rate',
                         'srv_diff_host_rate',
                         'dst_host_count',
                         'dst_host_srv_count',
                         'dst_host_same_srv_rate',
                         'dst_host_diff_srv_rate',
                         'dst_host_same_src_port_rate',
                         'dst_host_srv_diff_host_rate',
                         'dst_host_serror_rate',
                         'dst_host_srv_serror_rate',
                         'dst_host_rerror_rate',
                         'dst_host_srv_rerror_rate']]

In [315]:
outcome_df = network_df[['outcome']]

In [316]:
#Drop any row with missing values
network_input_df = network_input_df.dropna()

In [317]:
#Normalize numeric features

def normalize_numeric_minmax(df, name):
    df[name] = ((df[name] - df[name].min()) / (df[name].max() - df[name].min()))
    
    
normalize_numeric_minmax(network_input_df,"duration") 
normalize_numeric_minmax(network_input_df,"src_bytes") 
normalize_numeric_minmax(network_input_df,"dst_bytes") 
normalize_numeric_minmax(network_input_df,"wrong_fragment") 
normalize_numeric_minmax(network_input_df,"urgent") 
normalize_numeric_minmax(network_input_df,"count") 
normalize_numeric_minmax(network_input_df,"srv_count") 

normalize_numeric_minmax(network_input_df,"serror_rate") 
normalize_numeric_minmax(network_input_df,"srv_serror_rate") 
normalize_numeric_minmax(network_input_df,"rerror_rate") 
normalize_numeric_minmax(network_input_df,"srv_rerror_rate") 
normalize_numeric_minmax(network_input_df,"rerror_rate") 
normalize_numeric_minmax(network_input_df,"srv_rerror_rate") 
normalize_numeric_minmax(network_input_df,"same_srv_rate") 
normalize_numeric_minmax(network_input_df,"diff_srv_rate") 
normalize_numeric_minmax(network_input_df,"srv_diff_host_rate") 


normalize_numeric_minmax(network_input_df,"dst_host_count") 
normalize_numeric_minmax(network_input_df,"dst_host_srv_count") 
normalize_numeric_minmax(network_input_df,"dst_host_same_srv_rate") 
normalize_numeric_minmax(network_input_df,"dst_host_diff_srv_rate") 
normalize_numeric_minmax(network_input_df,"dst_host_same_src_port_rate") 
normalize_numeric_minmax(network_input_df,"dst_host_srv_diff_host_rate") 
normalize_numeric_minmax(network_input_df,"dst_host_serror_rate") 
normalize_numeric_minmax(network_input_df,"dst_host_srv_serror_rate") 
normalize_numeric_minmax(network_input_df,"dst_host_rerror_rate") 
normalize_numeric_minmax(network_input_df,"dst_host_srv_rerror_rate") 



In [318]:
# one-hot cooding of categorical columns

def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = "{}-{}".format(name, x)
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)

encode_text_dummy(network_input_df,"protocol_type") 
encode_text_dummy(network_input_df,"service") 
encode_text_dummy(network_input_df,"flag") 
encode_text_dummy(network_input_df,"land") 



In [319]:
network_input_df

Unnamed: 0,duration,src_bytes,dst_bytes,wrong_fragment,urgent,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,...,flag-RSTOS0,flag-RSTR,flag-S0,flag-S1,flag-S2,flag-S3,flag-SF,flag-SH,land-0,land-1
0,0.0,3.446905e-07,0.000094,0.0,0.0,0.015656,0.015656,0.00,0.00,0.0,...,0,0,0,0,0,0,1,0,1,0
1,0.0,3.389216e-07,0.000259,0.0,0.0,0.015656,0.015656,0.00,0.00,0.0,...,0,0,0,0,0,0,1,0,1,0
2,0.0,3.158461e-07,0.000259,0.0,0.0,0.011742,0.011742,0.00,0.00,0.0,...,0,0,0,0,0,0,1,0,1,0
3,0.0,3.129617e-07,0.000394,0.0,0.0,0.011742,0.011742,0.00,0.00,0.0,...,0,0,0,0,0,0,1,0,1,0
4,0.0,3.129617e-07,0.000394,0.0,0.0,0.011742,0.011742,0.00,0.00,0.0,...,0,0,0,0,0,0,1,0,1,0
5,0.0,3.057506e-07,0.000376,0.0,0.0,0.001957,0.003914,0.00,0.00,0.0,...,0,0,0,0,0,0,1,0,1,0
6,0.0,2.293129e-07,0.000793,0.0,0.0,0.009785,0.009785,0.00,0.00,0.0,...,0,0,0,0,0,0,1,0,1,0
7,0.0,3.028661e-07,0.000029,0.0,0.0,0.015656,0.015656,0.00,0.00,0.0,...,0,0,0,0,0,0,1,0,1,0
8,0.0,3.057506e-07,0.000152,0.0,0.0,0.015656,0.015656,0.00,0.00,0.0,...,0,0,0,0,0,0,1,0,1,0
9,0.0,3.028661e-07,0.000121,0.0,0.0,0.035225,0.035225,0.00,0.00,0.0,...,0,0,0,0,0,0,1,0,1,0


In [320]:
# create a function called encodeLabelBinarizer

encodeLabelBinary = lambda x: 0 if x == 'normal.' else 1


In [321]:
outcome_df['outcome'] = outcome_df['outcome'].apply(encodeLabelBinary)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [322]:
outcome_df['outcome'].shape

(494020,)

In [335]:
#input_matrix = network_input_df.as_matrix()

  """Entry point for launching an IPython kernel.


In [336]:
#input_matrix

array([[0.00000000e+00, 3.44690506e-07, 9.42688423e-05, ...,
        0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 3.38921627e-07, 2.59336301e-04, ...,
        0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 3.15846112e-07, 2.59336301e-04, ...,
        0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
       ...,
       [0.00000000e+00, 2.92770597e-07, 2.32762574e-04, ...,
        0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 4.19685930e-07, 2.32762574e-04, ...,
        0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 3.15846112e-07, 2.39357513e-04, ...,
        0.00000000e+00, 1.00000000e+00, 0.00000000e+00]])

In [337]:
x_train, x_test, y_train, y_test = train_test_split(network_input_df, outcome_df['outcome'] , test_size=0.2, random_state=42)

In [338]:
x_train.shape

(395216, 106)

In [339]:
y_train.shape

(395216,)

In [340]:
x_test.shape

(98804, 106)

In [341]:
y_test.shape

(98804,)

# Training and Prediction using Regression

In [342]:
# linear regression

lin_reg_model = LinearRegression()

lin_reg_model.fit(x_train, y_train)

y_pred_linear = lin_reg_model.predict(x_test)