In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split

pd.set_option('display.max_columns', 6)
pd.set_option('display.max_rows', 5)

kddcup_path = 'kddcup.data.gz'
corrected_path = 'corrected.gz'

kddcup_df = pd.read_csv(kddcup_path, compression='gzip', header=None)
corrected_df = pd.read_csv(corrected_path, compression='gzip', header=None)

df = pd.concat([kddcup_df, corrected_df], ignore_index=True)

print("Read {} rows.".format(len(df)))
df.dropna(inplace=True, axis=1)

df.columns = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in',
    'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
    'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login',
    'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
    'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
    'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'outcome'
]

unique_labels = df['outcome'].unique()

final_train_df = pd.DataFrame(columns=df.columns)
final_test_df = pd.DataFrame(columns=df.columns)

for label in unique_labels:
    label_df = df[df['outcome'] == label]
    train_label_df, test_label_df = train_test_split(label_df, test_size=0.2, random_state=42)
    
    final_train_df = pd.concat([final_train_df, train_label_df])
    final_test_df = pd.concat([final_test_df, test_label_df])

train_labels = final_train_df['outcome']
test_labels = final_test_df['outcome']

print('Training set distribution:')
print(train_labels.value_counts(normalize=True))
print('\nTest set distribution:')
print(test_labels.value_counts(normalize=True))

final_train_df.to_csv('new_train_data.csv', index=False)
final_test_df.to_csv('new_test_data.csv', index=False)

print("New training set:")
print(final_train_df.head())
print("\nNew test set:")
print(final_test_df.head())


Read 5209460 rows.


  final_train_df = pd.concat([final_train_df, train_label_df])
  final_test_df = pd.concat([final_test_df, test_label_df])


Training set distribution:
outcome
smurf.        5.704985e-01
neptune.      2.169174e-01
                  ...     
worm.         2.399491e-07
sqlattack.    2.399491e-07
Name: proportion, Length: 40, dtype: float64

Test set distribution:
outcome
smurf.        5.704869e-01
neptune.      2.169132e-01
                  ...     
sqlattack.    9.597758e-07
perl.         9.597758e-07
Name: proportion, Length: 40, dtype: float64
New training set:
        duration protocol_type service  ... dst_host_rerror_rate  \
1453274        0           tcp    http  ...                  1.0   
1437726     7248           udp   other  ...                  0.0   
835616         0           tcp    http  ...                  0.0   
1039101        0           tcp    http  ...                  0.0   
7719          20           tcp     ftp  ...                  0.0   

        dst_host_srv_rerror_rate  outcome  
1453274                      1.0  normal.  
1437726                      0.0  normal.  
835616        