In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

pd.set_option('display.max_columns', 6)
pd.set_option('display.max_rows', 5)

#takes in the detailed train and test files that have been previously made.
train_path = 'train_detailed.csv'
test_path = 'test_detailed.csv'

train_df = pd.read_csv(train_path, header=0)
test_df = pd.read_csv(test_path, header=0)

#the files are merged together
df = pd.concat([train_df, test_df], ignore_index=True)

print("Read {} rows.".format(len(df)))
df.dropna(inplace=True, axis=1)

unique_labels = df['outcome'].unique()

final_train_df = pd.DataFrame(columns=df.columns)
final_test_df = pd.DataFrame(columns=df.columns)

#the files are split to ensure an equal representation of each label in both the training and testing files. 70-30 split is shown.
for label in unique_labels:
    label_df = df[df['outcome'] == label]
    train_label_df, test_label_df = train_test_split(label_df, test_size=0.3, random_state=42)
    
    final_train_df = pd.concat([final_train_df, train_label_df])
    final_test_df = pd.concat([final_test_df, test_label_df])

train_labels = final_train_df['outcome']
test_labels = final_test_df['outcome']

print('Training set distribution:')
print(train_labels.value_counts(normalize=True))
print('\nTest set distribution:')
print(test_labels.value_counts(normalize=True))

#the csv files are downloaded, however preprocessing will still need to be done manually.
final_train_df.to_csv('train-70.csv', index=False)
final_test_df.to_csv('test-30.csv', index=False)
#df.to_csv('combined.csv', index=False)
print("New training set:")
print(final_train_df.head())
print("\nNew test set:")
print(final_test_df.head())

Read 148517 rows.


  final_train_df = pd.concat([final_train_df, train_label_df])
  final_test_df = pd.concat([final_test_df, test_label_df])


Training set distribution:
outcome
normal       0.518909
neptune      0.308910
               ...   
sqlattack    0.000010
udpstorm     0.000010
Name: proportion, Length: 40, dtype: float64

Test set distribution:
outcome
normal       0.518621
neptune      0.308745
               ...   
sqlattack    0.000022
udpstorm     0.000022
Name: proportion, Length: 40, dtype: float64
New training set:
       duration protocol_type   service  ... dst_host_srv_rerror_rate outcome  \
25718         0           tcp      http  ...                     0.99  normal   
63642        76           udp  domain_u  ...                     0.00  normal   
20383         0           udp     ntp_u  ...                     0.00  normal   
40177         0           tcp      http  ...                     0.00  normal   
130802        0           tcp  ftp_data  ...                     0.00  normal   

       difficulty_level  
25718                21  
63642                21  
20383                21  
40177         