In [30]:
# CRISP-DM = The CRoss Industry Standard Process for Data Mining
#     - business understanding
#     - API (pull in data, often an older snippet or capture)
#     - Data Understanding
#     - Data preparation
#     - (no data segregation)
#     - Modeling
#     - Evaluation
#     - Deployment

# ML Pipeline
#     - Problem Definition
#     - Data Import (live connection)
#     - (No data understanding?)
#     - Data Preparation
#     - Data Segregation
#         - training and testing sets
#     - Model Training
#     - Model Evaluation
#     - Model Deployment
#     - Performance Monitoring


# in ML Pipeline, EVERYTHING should be a function

In [31]:
def import_data(path):
    """Read the CSV at ``path`` and return it with artificial gaps added.

    After loading, the frame is deliberately corrupted with NaN values so the
    downstream missing-data handling has something to work on. This is
    temporary test scaffolding, not part of the real import step.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file, passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded data with the injected NaN cells and an extra, completely
        empty ``missing_test`` column.
    """
    import numpy as np
    import pandas as pd

    frame = pd.read_csv(path)

    # --- temporary: inject missing data to exercise the missing-data step ---
    # Remove this section once the pipeline is ready.
    frame.iloc[1:4, 3] = np.nan       # rows 1-3 of the fourth column
    frame.iloc[2:3, 3:7] = np.nan     # row 2, fourth through seventh columns
    frame.iloc[0] = np.nan            # every cell of the first row
    frame['missing_test'] = np.nan    # a new column that is 100% missing
    # --- end of temporary section ---

    return frame

In [32]:
def bin_categories(df, features=None, cutoff=0.05, replace_with='Other', messages=True):
    """Collapse rare values of non-numeric columns into a single bucket.

    For each requested column that is not numeric, every value whose share of
    the rows (count divided by the total number of rows in ``df``) is below
    ``cutoff`` is overwritten with ``replace_with``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process. It is modified in place and also returned.
    features : list-like of column names, str, or None
        Columns to consider. ``None`` or an empty collection means every
        column of ``df``. A single column name may be given as a plain string.
    cutoff : float
        Minimum share of rows a value needs in order to be kept as-is.
    replace_with : object
        Value written in place of the rare values.
    messages : bool
        When true, print one line per column that was binned and one line per
        requested column that does not exist in ``df``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    import pandas as pd

    # A bare string would otherwise be iterated character by character.
    if isinstance(features, str):
        features = [features]
    if features is None or len(features) == 0:
        features = df.columns

    n_rows = df.shape[0]

    for feat in features:
        if feat not in df.columns:
            if messages: print(f'{feat} not found in the DataFrame provided. No binning performed')
            continue

        if pd.api.types.is_numeric_dtype(df[feat]):
            continue

        # Shares are relative to all rows, so missing values count against
        # every category.
        counts = df[feat].value_counts()
        other_list = counts[counts / n_rows < cutoff].index

        # Nothing to replace: skip silently rather than writing a no-op and
        # announcing a binning that did not happen.
        rare_rows = df[feat].isin(other_list)
        if not rare_rows.any():
            continue

        # A categorical column rejects values outside its category set, so
        # register the replacement value before assigning it.
        if isinstance(df[feat].dtype, pd.CategoricalDtype) and replace_with not in df[feat].cat.categories:
            df[feat] = df[feat].cat.add_categories([replace_with])

        df.loc[rare_rows, feat] = replace_with
        if messages: print(f'{feat} has been binned by setting {other_list} to {replace_with}')

    return df


In [33]:
def missing_data(df, label, row_thresh=0.7, col_thresh=0.9):
    """Drop rows and columns of ``df`` that are too sparse to be useful.

    Steps, in order:
      1. drop rows whose ``label`` value is missing;
      2. drop columns, then rows, that contain no values at all;
      3. drop columns whose share of non-missing values is below ``col_thresh``;
      4. drop rows whose share of non-missing values is below ``row_thresh``.

    Imputation of the remaining gaps is not implemented yet.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place and also returned.
    label : str
        Name of the label column; a ``KeyError`` is raised by pandas if it is
        not a column of ``df``.
    row_thresh : float
        Minimum fraction of non-missing cells a row needs in order to be kept.
    col_thresh : float
        Minimum fraction of non-missing cells a column needs in order to be kept.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # drop any row that has a missing label
    df.dropna(axis='index', subset=[label], inplace=True)

    # drop rows and columns that are 100% missing
    df.dropna(axis='columns', thresh=1, inplace=True)
    df.dropna(axis='index', thresh=1, inplace=True)

    # drop columns below the column threshold; `thresh` is the required number
    # of non-missing cells, i.e. a fraction of the current row count
    df.dropna(axis='columns', thresh=round(df.shape[0] * col_thresh), inplace=True)

    # drop rows below the row threshold, measured against the columns that
    # survived the step above
    df.dropna(axis='index', thresh=round(df.shape[1] * row_thresh), inplace=True)

    # TODO: impute the remaining values

    return df

In [34]:
# Run the pipeline end to end: import -> bin rare categories -> drop sparse data.
df = import_data('network_traffic.csv')

# No `features` list is given, so every non-numeric column is considered for
# binning; values covering less than 2% of the rows are replaced.
df = bin_categories(df, cutoff=0.02)

# 'attack' is the label column: rows without it are removed.
df = missing_data(df, 'attack')

# Share of each attack label among the rows that remain.
print(df['attack'].value_counts() / len(df))
df.head()

protocol_type has been binned by setting Index([], dtype='object', name='protocol_type') to Other
service has been binned by setting Index(['telnet', 'finger', 'ftp', 'auth', 'Z39_50', 'uucp', 'courier', 'bgp',
       'whois', 'uucp_path', 'iso_tsap', 'time', 'imap4', 'nnsp', 'vmnet',
       'urp_i', 'domain', 'ctf', 'csnet_ns', 'supdup', 'discard', 'http_443',
       'daytime', 'gopher', 'efs', 'systat', 'link', 'exec', 'hostnames',
       'name', 'mtp', 'echo', 'klogin', 'login', 'ldap', 'netbios_dgm',
       'sunrpc', 'netbios_ssn', 'netstat', 'netbios_ns', 'ssh', 'kshell',
       'nntp', 'pop_3', 'sql_net', 'IRC', 'ntp_u', 'rje', 'remote_job',
       'pop_2', 'X11', 'printer', 'shell', 'urh_i', 'tim_i', 'red_i',
       'pm_dump', 'tftp_u', 'http_8001', 'aol', 'harvest', 'http_2784'],
      dtype='object', name='service') to Other
flag has been binned by setting Index(['RSTR', 'RSTO', 'S1', 'SH', 'S2', 'RSTOS0', 'S3', 'OTH'], dtype='object', name='flag') to Other
attack has been bin

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack,last_flag
1,0.0,udp,other,,146.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.0,1.0,0.0,0.0,0.0,0.0,0.08,0.15,0.0,255.0,1.0,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15.0
2,0.0,tcp,private,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,123.0,6.0,1.0,1.0,0.0,0.0,0.05,0.07,0.0,255.0,26.0,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19.0
3,0.0,tcp,http,,232.0,8153.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,5.0,0.2,0.2,0.0,0.0,1.0,0.0,0.0,30.0,255.0,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,21.0
4,0.0,tcp,http,SF,199.0,420.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30.0,32.0,0.0,0.0,0.0,0.0,1.0,0.0,0.09,255.0,255.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,21.0
5,0.0,tcp,private,REJ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,121.0,19.0,0.0,0.0,1.0,1.0,0.16,0.06,0.0,255.0,19.0,0.07,0.07,0.0,0.0,0.0,0.0,1.0,1.0,neptune,21.0
