In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split

pd.set_option('display.max_columns', 6)
pd.set_option('display.max_rows', 5)

train_path = 'KDDTrain+.csv'
test_path = 'KDDTest+.csv'

train_df = pd.read_csv(train_path, header=None)
test_df = pd.read_csv(test_path, header=None)


print("Read {} rows.".format(len(train_df)))
print("Read {} rows.".format(len(test_df)))

train_df.dropna(inplace=True, axis=1)
test_df.dropna(inplace=True, axis=1)


#label names are courtesy of a KDDCup Data file analysis video
train_df.columns = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in',
    'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
    'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login',
    'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
    'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
    'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'outcome', 'difficulty_level'
]

test_df.columns = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in',
    'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
    'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login',
    'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
    'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
    'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'outcome', 'difficulty_level'
]

train_unique_labels = train_df['outcome'].unique()
test_unique_labels = test_df['outcome'].unique()


final_train_df = pd.DataFrame(columns=train_df.columns)
final_test_df = pd.DataFrame(columns=test_df.columns)

for label in train_unique_labels:
    label_train_df = train_df[train_df['outcome'] == label]
    final_train_df = pd.concat([final_train_df, label_train_df])

for label in test_unique_labels:
    label_test_df = test_df[test_df['outcome'] == label]
    final_test_df = pd.concat([final_test_df, label_test_df])

train_labels = final_train_df['outcome']
test_labels = final_test_df['outcome']

print('Training set distribution:')
print(train_labels.value_counts(normalize=True))
print('\nTest set distribution:')
print(test_labels.value_counts(normalize=True))

#outputs the training and testing files with headings. The files have not formally been split to make sure each label is represented, though.
final_train_df.to_csv('train_detailed.csv', index=False)
final_test_df.to_csv('test_detailed.csv', index=False)

print("New training set:")
print(final_train_df.head())
print("\nNew test set:")
print(final_test_df.head())

Read 125973 rows.
Read 22544 rows.


  final_train_df = pd.concat([final_train_df, label_train_df])
  final_test_df = pd.concat([final_test_df, label_test_df])


Training set distribution:
outcome
normal     0.534583
neptune    0.327165
             ...   
perl       0.000024
spy        0.000016
Name: proportion, Length: 23, dtype: float64

Test set distribution:
outcome
normal     0.430758
neptune    0.206574
             ...   
phf        0.000089
imap       0.000044
Name: proportion, Length: 38, dtype: float64
New training set:
   duration protocol_type   service  ... dst_host_srv_rerror_rate outcome  \
0         0           tcp  ftp_data  ...                     0.00  normal   
1         0           udp     other  ...                     0.00  normal   
3         0           tcp      http  ...                     0.01  normal   
4         0           tcp      http  ...                     0.00  normal   
12        0           tcp      http  ...                     0.00  normal   

   difficulty_level  
0                20  
1                15  
3                21  
4                21  
12               21  

[5 rows x 43 columns]

New te