In [53]:
def import_data(path, sample=0.1):
  """Read a CSV file and return a reproducible random subset of its rows.

  Args:
    path: Location of the CSV file, passed straight to ``pandas.read_csv``.
    sample: Fraction of rows to keep (forwarded as ``frac`` to
      ``DataFrame.sample``).

  Returns:
    The sampled DataFrame with one extra column, ``missing_test``, filled
    entirely with NaN.

  Side effects:
    Sets the global pandas option ``display.max_columns`` to ``None`` so
    every column is shown when a frame is displayed.
  """
  import numpy as np
  import pandas as pd

  pd.set_option('display.max_columns', None)

  # The fixed seed makes repeated calls pick the same rows.
  subset = pd.read_csv(path).sample(frac=sample, random_state=3)

  # Append an all-NaN column; the name suggests it exists to exercise the
  # missing-data handling downstream.
  return subset.assign(missing_test=np.nan)

In [54]:
def bin_categories(df, features=None, cutoff=0.05, replace_with='Other', messages=True):
  """Collapse rare values of non-numeric columns into a single bucket.

  For every requested non-numeric column, each distinct value whose share of
  all rows (missing values included in the denominator) is below ``cutoff``
  is overwritten with ``replace_with``.

  Args:
    df: DataFrame to process. It is modified in place and also returned.
    features: Column name or iterable of column names to consider. ``None``
      or an empty collection means every column of ``df``.
    cutoff: Minimum row share a value needs in order to be kept as is.
    replace_with: Value written in place of the rare values.
    messages: When true, print what was binned and which requested columns
      were not found.

  Returns:
    The same DataFrame object, after binning.
  """
  import pandas as pd

  # A bare string would otherwise be iterated character by character.
  if isinstance(features, str):
    features = [features]
  if features is None or len(features) == 0:
    features = df.columns

  n_rows = df.shape[0]

  for feat in features:
    if feat not in df.columns:
      if messages: print(f'{feat} not found in the DataFrame provided. No binning performed')
      continue

    # Numeric columns have no categories to merge.
    if pd.api.types.is_numeric_dtype(df[feat]):
      continue

    # Count once, then compare each value's share of all rows to the cutoff.
    counts = df[feat].value_counts()
    other_list = counts[counts / n_rows < cutoff].index

    # Nothing is rare enough: leave the column alone and stay quiet rather
    # than reporting a binning that did not happen.
    if len(other_list) == 0:
      continue

    df.loc[df[feat].isin(other_list), feat] = replace_with
    if messages: print(f'{feat} has been binned by setting {other_list} to {replace_with}')

  return df

In [55]:
def Xandy(df, label):
  """Separate a DataFrame into predictors and target.

  Args:
    df: Source DataFrame; it is not modified.
    label: Name of the target column.

  Returns:
    A ``(X, y)`` tuple where ``y`` is the ``label`` column as a Series and
    ``X`` is a new DataFrame holding every other column.
  """
  target = df[label]
  predictors = df.drop(columns=[label])
  return predictors, target

In [56]:
def dummy_code(X):
  """Return ``X`` with its categorical columns expanded into indicator columns.

  The first level of each encoded column is dropped (``drop_first=True``), so
  a column with *n* distinct values yields *n - 1* indicator columns.
  """
  import pandas as pd
  return pd.get_dummies(X, drop_first=True)

In [57]:
def missing_data(df, label, row_thresh=0.7, col_thresh=0.9, random=False):
  """Drop sparse rows/columns and impute whatever is still missing.

  Steps, in order:
    1. Drop rows whose ``label`` value is missing.
    2. Drop columns, then rows, that are entirely missing.
    3. Drop columns with fewer than ``round(n_rows * row_thresh)`` non-missing
       values, then rows with fewer than ``round(n_cols * col_thresh)``
       non-missing values.
    4. If anything is still missing, dummy-code the predictors and fill the
       gaps with scikit-learn's ``IterativeImputer``.

  Args:
    df: Input DataFrame. It is NOT modified; a new frame is returned.
    label: Name of the target column.
    row_thresh: Minimum share of rows in which a column must be populated
      for the column to be kept.
    col_thresh: Minimum share of columns that must be populated in a row for
      the row to be kept.
    random: Selects the imputer seed: 0 when true, 3 otherwise. Both are
      fixed seeds, so the result is deterministic either way.

  Returns:
    The cleaned DataFrame. When step 4 runs, the predictors come back
    dummy-coded as floats with ``label`` as the last column; otherwise the
    original column types are preserved.
  """
  import pandas as pd

  # Each dropna returns a new frame so the caller's DataFrame is untouched.
  df = df.dropna(axis='index', subset=[label])

  # Remove columns and rows that carry no information at all.
  df = df.dropna(axis='columns', thresh=1)
  df = df.dropna(axis='index', thresh=1)

  # Thresholds are recomputed from the current shape after each drop.
  df = df.dropna(axis='columns', thresh=round(df.shape[0] * row_thresh))
  df = df.dropna(axis='index', thresh=round(df.shape[1] * col_thresh))

  # Impute the remaining values
  if df.isna().sum().sum() > 0:
    # Importing enable_iterative_imputer is what makes IterativeImputer importable.
    from sklearn.experimental import enable_iterative_imputer  # noqa: F401
    from sklearn.impute import IterativeImputer

    X, y = Xandy(df, label)
    # NOTE(review): get_dummies encodes a missing category as all-zero
    # indicators, so missing categorical values are not imputed here.
    X = dummy_code(X.copy())

    random_state = 0 if random else 3
    imp = IterativeImputer(max_iter=10, random_state=random_state)
    X = pd.DataFrame(imp.fit_transform(X), columns=X.columns, index=X.index)

    # X and y share the exact same index, so plain assignment re-attaches the
    # label row for row. An index-on-index merge would multiply rows whenever
    # the index holds duplicate labels.
    X[label] = y
    df = X

  return df

In [58]:
def fit_cv_model(df, label, k=5, repeat=True, random=False):
  """Cross-validate several estimators and return the best one, refit on all rows.

  The predictors are dummy-coded, each candidate is scored with k-fold cross
  validation, the mean score of every candidate is printed, and the
  candidate with the highest mean score is fitted on the full data.

  A non-numeric ``label`` is treated as classification (scored by accuracy);
  a numeric ``label`` is treated as regression (scored by R2). Note that
  integer-coded class labels are numeric and therefore take the regression
  path.

  Args:
    df: DataFrame holding predictors and the target; it is not modified.
    label: Name of the target column.
    k: Number of folds.
    repeat: When true use ``RepeatedKFold`` with 5 repeats, otherwise a
      single unshuffled ``KFold``.
    random: Selects the seed used for the estimators and the repeated
      splitter: 0 when true, 3 otherwise. Both are fixed seeds.

  Returns:
    The best-scoring estimator, fitted on all of ``X`` and ``y``. Ties go to
    the candidate listed first.

  Raises:
    ValueError: If every candidate produced a NaN mean score.
  """
  from sklearn.model_selection import KFold, RepeatedKFold, cross_val_score
  import pandas as pd
  from numpy import mean, isnan

  X, y = Xandy(df, label)
  X = dummy_code(X.copy())

  random_state = 0 if random else 3

  if repeat:
    # Seeding the splitter gives every candidate the same folds and makes the
    # printed scores reproducible between runs.
    cv = RepeatedKFold(n_splits=k, n_repeats=5, random_state=random_state)
  else:
    # Unshuffled KFold is already deterministic.
    # NOTE(review): rows sorted by label would produce unbalanced folds here.
    cv = KFold(n_splits=k)

  # Each candidate is (text printed after the metric name, estimator).
  if pd.api.types.is_numeric_dtype(y):
    from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
    from sklearn.linear_model import Ridge, LinearRegression

    metric, scoring = 'R2', 'r2'
    candidates = [
      ('(RandomForest):\t', RandomForestRegressor(random_state=random_state)),
      ('(GradientBoosting):\t', GradientBoostingRegressor(random_state=random_state)),
      ('(Linear):\t\t', LinearRegression()),
      ('(Ridge):\t\t', Ridge(random_state=random_state)),
    ]
  else:
    from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
    from sklearn.linear_model import RidgeClassifier, LogisticRegression

    metric, scoring = 'Accuracy', 'accuracy'
    candidates = [
      ('(RandomForest):\t', RandomForestClassifier(random_state=random_state)),
      ('(GradientBoosting):\t', GradientBoostingClassifier(random_state=random_state)),
      ('(Logistic):\t\t', LogisticRegression(random_state=random_state, max_iter=10000)),
      ('(Ridge):\t\t', RidgeClassifier(random_state=random_state)),
    ]

  best_model, best_score = None, None
  for tag, model in candidates:
    score = mean(cross_val_score(model, X, y, scoring=scoring, cv=cv, n_jobs=-1))
    print(f'{metric} {tag}{score}')

    # A NaN mean (failed fits) can never win the comparison.
    if isnan(score):
      continue
    if best_score is None or score > best_score:
      best_model, best_score = model, score

  if best_model is None:
    raise ValueError('No candidate model produced a valid cross-validation score')

  return best_model.fit(X, y)

In [59]:
def save_model(model, file_name):
  """Serialize ``model`` to ``file_name`` with pickle.

  Args:
    model: Any picklable object (here, a fitted estimator).
    file_name: Destination path; an existing file is overwritten.

  Note:
    Pickle files can execute arbitrary code when loaded, so only load files
    from a trusted source.
  """
  import pickle

  # The context manager guarantees the handle is flushed and closed even if
  # pickling raises.
  with open(file_name, "wb") as handle:
    pickle.dump(model, handle)

In [60]:
# Import data: load a reproducible 1.5% random sample of the CSV
df = import_data('network_traffic.csv', sample=.015)

# Data preparation -> choose the values that give the highest model fit
# With no feature list, bin_categories scans every non-numeric column, so rare
# values of the 'attack' label are also replaced with 'Other'.
df = bin_categories(df, cutoff=0.02, messages=False)
# Rows must have at least 95% of their columns populated to be kept.
df = missing_data(df, 'attack', col_thresh=0.95)

# Modeling (segregation, modeling, evaluation)
# CV: Choose the values that require the least processing power, but do not change the model fit very much
# Algorithms: Choose the values that give the highest model fit
# repeat=False -> a single 5-fold KFold instead of repeated k-fold.
model = fit_cv_model(df, 'attack', k=5, repeat=False)

# Deployment (save the best trained model, pickled to disk)
save_model(model, 'saved_model.sav')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Accuracy (RandomForest):	0.9968253968253968
Accuracy (GradientBoosting):	0.9920634920634921
Accuracy (Logistic):		0.9232804232804233
Accuracy (Ridge):		0.9582010582010583


In [61]:
# Run the data-preparation part of the pipeline (no model fitting) and preview it.
# import_data falls back to its default sample fraction (0.1) here, and the
# row-completeness threshold (col_thresh=0.92) differs from the training cell's 0.95.
# NOTE: this rebinds the global `df` that the training cell also assigns.
df = import_data('network_traffic.csv')
df = bin_categories(df, cutoff = 0.02, messages=False)
df = missing_data(df, 'attack', col_thresh=0.92)
df.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack,last_flag
117958,0,tcp,http,SF,235,1248,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,255,255,1.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,normal,21
25436,0,tcp,Other,S0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,236,20,1.0,1.0,0.0,0.0,0.08,0.06,0.0,255,20,0.08,0.07,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19
58615,0,icmp,ecr_i,SF,1032,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,507,507,0.0,0.0,0.0,0.0,1.0,0.0,0.0,255,160,0.63,0.03,0.63,0.0,0.0,0.0,0.0,0.0,smurf,18
40862,1,tcp,smtp,SF,2875,337,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,5,0.0,0.0,0.0,0.0,1.0,0.0,1.0,108,62,0.12,0.66,0.01,0.03,0.0,0.0,0.61,0.0,normal,21
92386,0,tcp,Other,REJ,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,297,20,0.0,0.0,1.0,1.0,0.07,0.06,0.0,255,20,0.08,0.07,0.0,0.0,0.0,0.0,1.0,1.0,neptune,20
