# Import Library

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

from scipy.stats import spearmanr
from sklearn.preprocessing import MinMaxScaler
from ydata_profiling import ProfileReport
from scipy.interpolate import PchipInterpolator
from imblearn.combine import SMOTEENN
from sklearn.model_selection import train_test_split

# Import Dataset

In [2]:
# Path to training dataset
train_PATH = 'D:\DAC-2023\Dataset\DataTrain_Preliminary.csv'

# Load dataset
df = pd.read_csv(train_PATH, delimiter=';')
df.shape

(112446, 42)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112446 entries, 0 to 112445
Data columns (total 42 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   duration                     112446 non-null  object 
 1   protocol_type                111778 non-null  object 
 2   service                      111426 non-null  object 
 3   flag                         111593 non-null  object 
 4   src_bytes                    112446 non-null  object 
 5   dst_bytes                    112446 non-null  object 
 6   land                         112446 non-null  object 
 7   wrong_fragment               112446 non-null  object 
 8   urgent                       112446 non-null  object 
 9   hot                          112446 non-null  object 
 10  num_failed_logins            112446 non-null  object 
 11  logged_in                    112446 non-null  object 
 12  num_compromised              112446 non-null  object 
 13 

# Data Cleaning

## 1. Investigate Unique Value

In [4]:
# Initialize a dictionary to store missing value representations
missing_value_representations = ['', ' ', '-', 'NA', 'N/A', '999', '-999', None, np.nan, '*', '9999', '-9999', '99999', '-99999']

# Initialize a set to store unique types of missing values
unique_missing_value_types = set()

# Iterate through columns and check for missing value representations
for column in df.columns:
    for value in df[column]:
        if value in missing_value_representations:
            unique_missing_value_types.add(value)

# Boolean indexing to filter rows with unique missing value types
filtered_df = df[df.apply(lambda row: any(val in unique_missing_value_types for val in row), axis=1)]

# Display the filtered DataFrame
print("Unique types of missing values found in the DataFrame:")
print(unique_missing_value_types)
print("Rows containing unique missing value types:")
filtered_df

Unique types of missing values found in the DataFrame:
{nan, '999', '99999', '*'}
Rows containing unique missing value types:


Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,type_of_attack
1,0,tcp,private,S0,0,0,0,0,0,0,...,5,0.02,0.08,0.00,0,1.00,1,0,0,neptune
2,0,tcp,http,SF,285,3623,0,0,0,0,...,228,1.00,0.00,0.01,*,0.00,0,0,0,normal
3,0,tcp,http,SF,232,584,0,0,0,0,...,255,1.00,0.00,0.17,*,0.00,0,0,0,normal
4,1,tcp,smtp,SF,1080,327,0,0,0,0,...,154,0.58,0.02,0.00,*,0.00,0,0,0,normal
5,0,tcp,discard,S0,0,0,0,0,0,0,...,18,0.07,0.05,0.00,0,1.00,1,0,0,neptune
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112441,0,tcp,http,REJ,0,0,0,0,0,0,...,255,1.00,0.00,0.14,*,0.00,0,*,*,normal
112442,0,tcp,http,SF,309,758,0,0,0,0,...,255,1.00,0.00,0.02,*,0.02,*,0,0,normal
112443,0,tcp,http,SF,363,721,0,0,0,0,...,255,1.00,0.00,0.00,0,0.00,0,0,0,normal
112444,0,tcp,discard,RSTO,0,0,0,0,0,0,...,7,0.03,0.06,0.00,0,0.00,0,1,1,neptune


## 2. Remove NaN Value

In [5]:
# Count rows with NaN values in specific columns
columns_to_check = ['protocol_type', 'service', 'flag', 'type_of_attack']
nan_rows_count = df[columns_to_check].isnull().any(axis=1).sum()

print("Number of rows with NaN values in the specified columns:", nan_rows_count)

Number of rows with NaN values in the specified columns: 2442


In [6]:
# Calculate the percentage of rows with missing values in column 'protocol_type', 'service', 'flag', 'type_of_attack'
percentage_rows_with_missing = (nan_rows_count / len(df)) * 100
print(f"Persentage of rows with NaN value/s: {percentage_rows_with_missing:.2f}%")

Persentage of rows with NaN value/s: 2.17%


Because it is less then 5%, so we could delete it.

In [7]:
df = df.dropna(subset=columns_to_check)
df.shape

(110004, 42)

## 3. Imputation Method

### 3.1. Binary Column

In [8]:
# column that is binary (0=no, 1=yes)
bin_df = df.copy()
bin_df = bin_df[['land','logged_in','root_shell','su_attempted',
                'is_host_login','is_guest_login','serror_rate', 
                'srv_serror_rate','rerror_rate','srv_rerror_rate',
                'same_srv_rate','diff_srv_rate','srv_diff_host_rate',
                'type_of_attack']]

In [9]:
# Function to filter df based on type of attack
def filtered_by_type_of_attack(df, values):
    filtered = df[df['type_of_attack'] == values]
    return filtered

# List of unique attack types
attack_types = bin_df['type_of_attack'].unique()

# Initialize a dictionary to store probabilities for each attack type
probability_dict = {}

# Iterate through each attack type and store probabilities
for attack_type in attack_types:
    filtered_data = filtered_by_type_of_attack(bin_df, attack_type)

    probability_land = filtered_data['land'].value_counts(normalize=True).get('0', 0)
    probability_logged_in = filtered_data['logged_in'].value_counts(normalize=True).get('0', 0)
    probability_root_shell = filtered_data['root_shell'].value_counts(normalize=True).get('0', 0)
    probability_su_attempted = filtered_data['su_attempted'].value_counts(normalize=True).get('0', 0)
    probability_is_host_login = filtered_data['is_host_login'].value_counts(normalize=True).get('0', 0)
    probability_is_guest_login = filtered_data['is_guest_login'].value_counts(normalize=True).get('0', 0)
    probability_serror_rate = filtered_data['serror_rate'].value_counts(normalize=True).get('0', 0)
    probability_srv_serror_rate = filtered_data['srv_serror_rate'].value_counts(normalize=True).get('0', 0)
    probability_rerror_rate = filtered_data['rerror_rate'].value_counts(normalize=True).get('0', 0)
    probability_srv_rerror_rate = filtered_data['srv_rerror_rate'].value_counts(normalize=True).get('0', 0)
    probability_same_srv_rate = filtered_data['same_srv_rate'].value_counts(normalize=True).get('0', 0)
    probability_diff_srv_rate = filtered_data['diff_srv_rate'].value_counts(normalize=True).get('0', 0)
    probability_srv_diff_host_rate = filtered_data['srv_diff_host_rate'].value_counts(normalize=True).get('0', 0)

    # Store probabilities in the dictionary
    probability_dict[attack_type] = {
        'land': probability_land,
        'logged_in': probability_logged_in,
        'root_shell': probability_root_shell,
        'su_attempted': probability_su_attempted,
        'is_host_login': probability_is_host_login,
        'is_guest_login': probability_is_guest_login,
        'serror_rate': probability_serror_rate,
        'srv_serror_rate': probability_srv_serror_rate,
        'rerror_rate': probability_rerror_rate,
        'srv_rerror_rate': probability_srv_rerror_rate,
        'same_srv_rate': probability_same_srv_rate,
        'diff_srv_rate': probability_diff_srv_rate,
        'srv_diff_host_rate': probability_srv_diff_host_rate
    }

    # Function to convert '*' and '99999' based on probability
    def convert_value(value, probability):
      if value == '*' or value == '99999':
        random_number = np.random.rand()
        if random_number <= probability:
          return '0'
        else:
          return '1'
      else:
        return value

    for column in ['land', 'logged_in', 'root_shell', 'su_attempted', 'is_host_login', 'is_guest_login', 
                   'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 
                   'diff_srv_rate', 'srv_diff_host_rate']:
      # Ambil probabilitas yang sesuai dari dictionary
      probability = probability_dict[attack_type][column]

      # Terapkan konversi ke seluruh kolom dalam DataFrame yang sesuai dengan jenis serangan saat ini
      bin_df.loc[bin_df['type_of_attack'] == attack_type, column] = bin_df.loc[bin_df['type_of_attack'] == attack_type, column].apply(lambda x: convert_value(x, probability))

In [10]:
# Delete 'type_of_attack' column
bin_df = bin_df.drop('type_of_attack', axis=1)

# Mengubah tipe data menjadi numerik
bin_df = bin_df.astype(int)

In [11]:
bin_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 110004 entries, 0 to 112445
Data columns (total 13 columns):
 #   Column              Non-Null Count   Dtype
---  ------              --------------   -----
 0   land                110004 non-null  int32
 1   logged_in           110004 non-null  int32
 2   root_shell          110004 non-null  int32
 3   su_attempted        110004 non-null  int32
 4   is_host_login       110004 non-null  int32
 5   is_guest_login      110004 non-null  int32
 6   serror_rate         110004 non-null  int32
 7   srv_serror_rate     110004 non-null  int32
 8   rerror_rate         110004 non-null  int32
 9   srv_rerror_rate     110004 non-null  int32
 10  same_srv_rate       110004 non-null  int32
 11  diff_srv_rate       110004 non-null  int32
 12  srv_diff_host_rate  110004 non-null  int32
dtypes: int32(13)
memory usage: 6.3 MB


### 3.2. Int Column

In [12]:
# column that has int value
int_df = df.copy()
int_df = int_df[['duration','src_bytes','dst_bytes','wrong_fragment',
                 'urgent','hot','num_failed_logins','num_compromised',
                 'num_root','num_file_creations','num_shells','num_access_files',
                 'num_outbound_cmds','count','srv_count','dst_host_count',
                 'dst_host_srv_count','type_of_attack']]

In [13]:
# 1. konversi * dan 99999 jadi nan value
int_df.replace(['*'], '99999', inplace=True)

# 2. ubah tipe data
# Mengubah tipe data menjadi numerik
columns_to_convert = [col for col in int_df.columns if col != 'type_of_attack']
int_df[columns_to_convert] = int_df[columns_to_convert].astype(int)

# 3. parting based on type of attack
# Function to filter df based on type of attack
def filtered_by_type_of_attack(df, values):
    filtered = df[df['type_of_attack'] == values]
    return filtered

nmap = filtered_by_type_of_attack(int_df, 'nmap')
neptune = filtered_by_type_of_attack(int_df, 'neptune')
normal = filtered_by_type_of_attack(int_df, 'normal')
dos = filtered_by_type_of_attack(int_df, 'Denial of Service Attack')
portsweep = filtered_by_type_of_attack(int_df, 'portsweep')
satan = filtered_by_type_of_attack(int_df, 'satan')
ipsweep = filtered_by_type_of_attack(int_df, 'ipsweep')
smurf = filtered_by_type_of_attack(int_df, 'smurf')

# stored df in list
filtered_dfs = [nmap, neptune, normal, dos, portsweep, satan, ipsweep, smurf]

# Loop through each filtered DataFrame and its corresponding index in the list
for filtered_df, attack_type in zip(filtered_dfs, ['nmap', 'neptune', 'normal', 'Denial of Service Attack', 'portsweep', 'satan', 'ipsweep', 'smurf']):
    for column in filtered_df.columns:
        if pd.api.types.is_numeric_dtype(filtered_df[column]):
            # Check if the column contains numeric data
            missing_mask = filtered_df[column] == 99999
            if missing_mask.any():
                # If there are missing values, interpolate
                x = filtered_df.index[~missing_mask]
                y = filtered_df[column][~missing_mask]
                pchip = PchipInterpolator(x, y, extrapolate='periodic')
                # Replace NaN values with interpolated values in the same DataFrame (filtered_df)
                interpolated_values = pchip(filtered_df.index)
                # Ensure the interpolated values are non-negative
                filtered_df[column].loc[missing_mask] = np.maximum(0, interpolated_values[missing_mask])

    # Copy the interpolated values back to the corresponding rows in the original int_df
    int_df.loc[int_df['type_of_attack'] == attack_type, filtered_df.columns] = filtered_df

# Mengubah tipe data menjadi numerik
columns_to_convert = [col for col in int_df.columns if col != 'type_of_attack']
int_df[columns_to_convert] = int_df[columns_to_convert].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df[column].loc[missing_mask] = np.maximum(0, interpolated_values[missing_mask])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df[column].loc[missing_mask] = np.maximum(0, interpolated_values[missing_mask])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df[column].loc[missing_mask] = np.maximum(0, interpolated_values[missing_mask])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http

In [14]:
# Delete 'type_of_attack' column
int_df = int_df.drop('type_of_attack', axis=1)

In [15]:
int_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 110004 entries, 0 to 112445
Data columns (total 17 columns):
 #   Column              Non-Null Count   Dtype
---  ------              --------------   -----
 0   duration            110004 non-null  int32
 1   src_bytes           110004 non-null  int32
 2   dst_bytes           110004 non-null  int32
 3   wrong_fragment      110004 non-null  int32
 4   urgent              110004 non-null  int32
 5   hot                 110004 non-null  int32
 6   num_failed_logins   110004 non-null  int32
 7   num_compromised     110004 non-null  int32
 8   num_root            110004 non-null  int32
 9   num_file_creations  110004 non-null  int32
 10  num_shells          110004 non-null  int32
 11  num_access_files    110004 non-null  int32
 12  num_outbound_cmds   110004 non-null  int32
 13  count               110004 non-null  int32
 14  srv_count           110004 non-null  int32
 15  dst_host_count      110004 non-null  int32
 16  dst_host_srv_count  11000

### 3.3. Float Column

In [16]:
# column that has float value
float_df = df.copy()
float_df = float_df[['dst_host_same_srv_rate','dst_host_diff_srv_rate',
                    'dst_host_same_src_port_rate','dst_host_srv_diff_host_rate',
                    'dst_host_serror_rate','dst_host_srv_serror_rate',
                    'dst_host_rerror_rate','dst_host_srv_rerror_rate',
                    'type_of_attack']]

In [17]:
# 1. konversi * dan 99999 jadi nan value
float_df.replace(['*','99999',99999], np.nan, inplace=True)

# 2. ubah tipe data
# Mengubah tipe data menjadi numerik
columns_to_convert = [col for col in float_df.columns if col != 'type_of_attack']
float_df[columns_to_convert] = float_df[columns_to_convert].astype(float)

# 3. parting based on type of attack
nmap = filtered_by_type_of_attack(float_df, 'nmap')
neptune = filtered_by_type_of_attack(float_df, 'neptune')
normal = filtered_by_type_of_attack(float_df, 'normal')
dos = filtered_by_type_of_attack(float_df, 'Denial of Service Attack')
portsweep = filtered_by_type_of_attack(float_df, 'portsweep')
satan = filtered_by_type_of_attack(float_df, 'satan')
ipsweep = filtered_by_type_of_attack(float_df, 'ipsweep')
smurf = filtered_by_type_of_attack(float_df, 'smurf')

# stored df in list
filtered_dfs = [nmap, neptune, normal, dos, portsweep, satan, ipsweep, smurf]

# Loop through each filtered DataFrame and its corresponding index in the list
for filtered_df, attack_type in zip(filtered_dfs, ['nmap', 'neptune', 'normal', 'Denial of Service Attack', 'portsweep', 'satan', 'ipsweep', 'smurf']):
    for column in filtered_df.columns:
        if pd.api.types.is_numeric_dtype(filtered_df[column]):
            # Check if the column contains numeric data
            missing_mask = filtered_df[column].isnull()
            if missing_mask.any():
                # If there are missing values, interpolate
                x = filtered_df.index[~missing_mask]
                y = filtered_df[column][~missing_mask]
                pchip = PchipInterpolator(x, y, extrapolate='periodic')
                # Replace NaN values with interpolated values in the same DataFrame (filtered_df)
                interpolated_values = pchip(filtered_df.index)
                # Ensure the interpolated values are non-negative
                filtered_df[column].loc[missing_mask] = np.maximum(0, interpolated_values[missing_mask])

    # Copy the interpolated values back to the corresponding rows in the original float_df
    float_df.loc[float_df['type_of_attack'] == attack_type, filtered_df.columns] = filtered_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df[column].loc[missing_mask] = np.maximum(0, interpolated_values[missing_mask])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df[column].loc[missing_mask] = np.maximum(0, interpolated_values[missing_mask])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df[column].loc[missing_mask] = np.maximum(0, interpolated_values[missing_mask])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http

In [18]:
# Delete 'type_of_attack' column
float_df = float_df.drop('type_of_attack', axis=1)

In [19]:
float_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 110004 entries, 0 to 112445
Data columns (total 8 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   dst_host_same_srv_rate       110004 non-null  float64
 1   dst_host_diff_srv_rate       110004 non-null  float64
 2   dst_host_same_src_port_rate  110004 non-null  float64
 3   dst_host_srv_diff_host_rate  110004 non-null  float64
 4   dst_host_serror_rate         110004 non-null  float64
 5   dst_host_srv_serror_rate     110004 non-null  float64
 6   dst_host_rerror_rate         110004 non-null  float64
 7   dst_host_srv_rerror_rate     110004 non-null  float64
dtypes: float64(8)
memory usage: 7.6 MB


### 3.4. Replace the Data in df

In [20]:
# Create a dictionary where keys are column names and values are corresponding DataFrames
replacement_data = {
    'land': bin_df,
    'logged_in': bin_df,
    'root_shell': bin_df,
    'su_attempted': bin_df,
    'is_host_login': bin_df,
    'is_guest_login': bin_df,
    'serror_rate': bin_df,
    'srv_serror_rate': bin_df,
    'rerror_rate': bin_df,
    'srv_rerror_rate': bin_df,
    'same_srv_rate': bin_df,
    'diff_srv_rate': bin_df,
    'srv_diff_host_rate': bin_df,
    'duration': int_df,
    'src_bytes': int_df,
    'dst_bytes': int_df,
    'wrong_fragment': int_df,
    'urgent': int_df,
    'hot': int_df,
    'num_failed_logins': int_df,
    'num_compromised': int_df,
    'num_root': int_df,
    'num_file_creations': int_df,
    'num_shells': int_df,
    'num_access_files': int_df,
    'num_outbound_cmds': int_df,
    'count': int_df,
    'srv_count': int_df,
    'dst_host_count': int_df,
    'dst_host_srv_count': int_df,
    'dst_host_same_srv_rate': float_df,
    'dst_host_diff_srv_rate': float_df,
    'dst_host_same_src_port_rate': float_df,
    'dst_host_srv_diff_host_rate': float_df,
    'dst_host_serror_rate': float_df,
    'dst_host_srv_serror_rate': float_df,
    'dst_host_rerror_rate': float_df,
    'dst_host_srv_rerror_rate': float_df,
}

# Iterate over the columns and replace data in 'df'
for column_name, source_df in replacement_data.items():
    df[column_name] = source_df[column_name]

## 4. Feature Engineering - One Hot Encoding

### 4.1. Change Categorical Data Into One Hot Encoding

In [21]:
def one_hot_encoder(df, cols_to_encode):
    df = df.copy()
    one_hot_encoded = pd.get_dummies(df[cols_to_encode], columns=cols_to_encode, dtype=int)
    df = df.drop(columns=cols_to_encode)
    df = pd.concat([df, one_hot_encoded], axis=1)
    return df

In [22]:
# Kolom-kolom yang ingin di-one-hot encoding
cols_to_encode = ['protocol_type', 'service', 'flag']

# Terapkan one-hot encoding untuk kolom-kolom tersebut
df = one_hot_encoder(df, cols_to_encode)

In [23]:
df.shape

(110004, 123)

## 5. Remove Duplicate Data

In [24]:
df = df.drop_duplicates()

In [25]:
df.shape

(108489, 123)

## 6. Unbalanced Sampling

In [26]:
clean_df = df.copy()
# feature
X = df.drop(columns=['type_of_attack'])
# label
y = df[['type_of_attack']]

In [27]:
# Create the TreeMap
fig = px.treemap(data_frame=df, path=['type_of_attack'])
fig.show()

In [28]:
smote_enn = SMOTEENN(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)

In [None]:
# Menggabungkan data yang sudah diresampling menjadi satu DataFrame
resampled_df = pd.concat([X_resampled, y_resampled], axis=1)

# Buat TreeMap dengan data resampled_df
fig = px.treemap(
    data_frame=resampled_df,
    path=['type_of_attack'],
    color='type_of_attack'
)

# Menampilkan plot
fig.show()

# Exploratory Data Analysis

## 1. Normalization

In [None]:
# minMax = MinMaxScaler()
# for col in X.columns:
#     X[col] = minMax.fit_transform(X[[col]])
# X.head()

## 1. Check Correlation

In [None]:
# Calculate the Spearman correlation matrix
corr = X.corr(method='spearman')

# Change the main diagonal (correlation of columns with themselves) to NaN
for i in range(corr.shape[0]):
    corr.iloc[i, i] = np.nan

# Find columns with correlations > 0.95 or < -0.95
high_corr_cols = (corr.abs() > 0.95).any()

# Get the column names with high correlations and their correlation values
high_corr_columns = X.columns[high_corr_cols]

# Create a dictionary to store the correlated columns and their correlation values
correlation_data = {}

# Calculate and store the correlated columns and their correlation values
for col in high_corr_columns:
    correlated_cols = corr.index[corr[col].abs() > 0.95].tolist()
    if col in correlated_cols:
        correlated_cols.remove(col)  # Remove itself from the list if present
    correlated_data = {}
    for correlated_col in correlated_cols:
        correlation_value, _ = spearmanr(X[col], X[correlated_col])
        correlated_data[correlated_col] = correlation_value
    correlation_data[col] = correlated_data


In [None]:
# Define the file path where you want to save the output
output_file_path = 'high_correlation.txt'

# Open the file for writing
with open(output_file_path, 'w') as file:
    # Iterate through the correlation data dictionary and write it to the file
    for col, correlated_data in correlation_data.items():
        file.write(f"Kolom '{col}' berkorelasi dengan kolom-kolom berikut:\n")
        for correlated_col, correlation_value in correlated_data.items():
            file.write(f"{correlated_col}: {correlation_value:.2f}\n")
        file.write('\n')

In [None]:
# Check heat map with blue colormap
corr_mat = X.corr(method='spearman')
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corr_mat, vmax=.8, square=True, cmap='Blues', ax=ax)
plt.show()

## 2. Unbalanced Sampling

In [None]:
# # percentage based on 'type_of_attack' in df
# df['type_of_attack'].value_counts(normalize=True) * 100

# Train Test Split

In [None]:
# # separate train and test sets
# X_train, X_test, y_train, y_test = train_test_split(
#     resampled_df.drop(labels=['type_of_attack'], axis=1),
#     resampled_df['type_of_attack'],
#     test_size=0.2, 
#     random_state=5)

# X_train.shape, X_test.shape