# Import Libraries

In [None]:
!pip install ydata-profiling



In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.interpolate import PchipInterpolator
from ydata_profiling import ProfileReport

# Load Dataset

In [None]:
# Locations of the two competition CSV files: train_PATH is the file loaded
# as the training data, pred_PATH is the file to generate predictions for.
train_PATH = '/content/drive/Shareddrives/DAC 2023/PRELIMINARY ROUND/DataTrain_Preliminary.csv'
pred_PATH = '/content/drive/Shareddrives/DAC 2023/PRELIMINARY ROUND/Data_Prediction.csv'

In [None]:
# Read the semicolon-separated training CSV into the working DataFrame
df = pd.read_csv(train_PATH, sep=';')
# Bare expression so the notebook renders a preview of the loaded frame
df

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,type_of_attack
0,0,tcp,private,SH,0,0,0,0,0,0,...,1,0.01,0.94,0.95,0,0.95,1,0,0,nmap
1,0,tcp,private,S0,0,0,0,0,0,0,...,5,0.02,0.08,0.00,0,1.00,1,0,0,neptune
2,0,tcp,http,SF,285,3623,0,0,0,0,...,228,1.00,0.00,0.01,*,0.00,0,0,0,normal
3,0,tcp,http,SF,232,584,0,0,0,0,...,255,1.00,0.00,0.17,*,0.00,0,0,0,normal
4,1,tcp,smtp,SF,1080,327,0,0,0,0,...,154,0.58,0.02,0.00,*,0.00,0,0,0,normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112441,0,tcp,http,REJ,0,0,0,0,0,0,...,255,1.00,0.00,0.14,*,0.00,0,*,*,normal
112442,0,tcp,http,SF,309,758,0,0,0,0,...,255,1.00,0.00,0.02,*,0.02,*,0,0,normal
112443,0,tcp,http,SF,363,721,0,0,0,0,...,255,1.00,0.00,0.00,0,0.00,0,0,0,normal
112444,0,tcp,discard,RSTO,0,0,0,0,0,0,...,7,0.03,0.06,0.00,0,0.00,0,1,1,neptune


# Binary

In [None]:
# Independent copy of the 0/1 flag columns together with the attack label
bool_df = df[[
    'land', 'logged_in', 'root_shell', 'su_attempted',
    'is_host_login', 'is_guest_login', 'type_of_attack',
]].copy()

In [None]:
# Keep only the rows that carry one particular attack label
def filtered_by_type_of_attack(df, values):
    """Select the rows of ``df`` labelled with the given attack type.

    Args:
        df: DataFrame that has a ``type_of_attack`` column.
        values: Label compared against every ``type_of_attack`` entry.

    Returns:
        The rows of ``df`` whose ``type_of_attack`` equals ``values``,
        with their original index labels.
    """
    is_requested_attack = df['type_of_attack'].eq(values)
    return df.loc[is_requested_attack]

# Flag columns whose missing entries are imputed below
binary_columns = ['land', 'logged_in', 'root_shell', 'su_attempted',
                  'is_host_login', 'is_guest_login']

# Markers that stand for an unknown value in these columns
missing_markers = ['*', '99999']


# Defined once at top level instead of being re-created on every loop pass
def convert_value(value, probability):
    """Replace a missing marker with a randomly drawn '0' or '1'.

    Args:
        value: Raw cell value.
        probability: Chance of drawing '0'; '1' is drawn otherwise.

    Returns:
        '0' or '1' when ``value`` is '*' or '99999', else ``value`` unchanged.
    """
    if value == '*' or value == '99999':
        return '0' if np.random.rand() <= probability else '1'
    return value


# List of unique attack types
attack_types = bool_df['type_of_attack'].unique()

# Maps attack type -> column -> estimated probability of the value '0'
probability_dict = {}

for attack_type in attack_types:
    # Build the row selector once per attack type and reuse it for every column
    attack_mask = bool_df['type_of_attack'] == attack_type
    probability_dict[attack_type] = {}

    for column in binary_columns:
        column_values = bool_df.loc[attack_mask, column]

        # Estimate P('0') from observed entries only. Leaving the missing
        # markers in the denominator would shrink P('0') and hand their share
        # to '1', because convert_value draws '1' whenever '0' is not drawn.
        # NOTE(review): the '0' lookup assumes the column holds strings
        # (as it does when '*' is present) - confirm for all-numeric columns.
        observed = column_values[~column_values.isin(missing_markers)]
        probability = observed.value_counts(normalize=True).get('0', 0)
        probability_dict[attack_type][column] = probability

        # Apply the conversion to the rows of the current attack type only
        bool_df.loc[attack_mask, column] = column_values.apply(
            convert_value, args=(probability,))

# Integer

In [None]:
# Independent copy of the count/size feature columns plus the attack label
int_df = df[[
    'duration', 'src_bytes', 'dst_bytes', 'wrong_fragment',
    'urgent', 'hot', 'num_failed_logins', 'num_compromised',
    'num_root', 'num_file_creations', 'num_shells', 'num_access_files',
    'num_outbound_cmds', 'count', 'srv_count', 'dst_host_count',
    'dst_host_srv_count', 'type_of_attack',
]].copy()

In [None]:
# 1. Fold the '*' marker into the '99999' code so that every column uses a
#    single missing-value sentinel (no NaN is produced at this step)
int_df.replace('*', '99999', inplace=True)

# 2. Change the data type
# Cast every feature column to integer; the attack label column stays as is
columns_to_convert = int_df.columns[int_df.columns != 'type_of_attack'].tolist()
int_df[columns_to_convert] = int_df[columns_to_convert].astype(int)

# 3. Split the data by attack type
# Helper that keeps only the rows carrying one particular attack label
def filtered_by_type_of_attack(df, values):
    """Select the rows of ``df`` labelled with the given attack type.

    Args:
        df: DataFrame that has a ``type_of_attack`` column.
        values: Label compared against every ``type_of_attack`` entry.

    Returns:
        The rows of ``df`` whose ``type_of_attack`` equals ``values``,
        with their original index labels.
    """
    matches_label = df['type_of_attack'].eq(values)
    return df.loc[matches_label]

# Attack labels that each get their own interpolation pass, in processing
# order. Kept in one list so the frames and their labels cannot drift apart.
attack_labels = ['nmap', 'neptune', 'normal', 'Denial of Service Attack',
                 'portsweep', 'satan', 'ipsweep', 'smurf']

# Explicit copies: writing into a filtered slice of int_df is what triggers
# pandas' "value is trying to be set on a copy of a slice" warning.
filtered_dfs = [filtered_by_type_of_attack(int_df, label).copy()
                for label in attack_labels]

# Keep the individual per-attack names available for later cells
nmap, neptune, normal, dos, portsweep, satan, ipsweep, smurf = filtered_dfs

# Impute the 99999 sentinel per attack type, column by column
for filtered_df, attack_type in zip(filtered_dfs, attack_labels):
    if filtered_df.empty:
        # No rows carry this label, so there is nothing to impute
        continue

    for column in filtered_df.columns:
        # Only numeric columns can hold the 99999 sentinel
        if not pd.api.types.is_numeric_dtype(filtered_df[column]):
            continue

        missing_mask = (filtered_df[column] == 99999).to_numpy()
        if not missing_mask.any():
            continue

        # Known points: row index label as x, observed value as y
        known_x = filtered_df.index[~missing_mask].to_numpy()
        known_y = filtered_df[column].to_numpy()[~missing_mask]

        if known_x.size == 0:
            raise ValueError(
                f"Cannot interpolate column {column!r} for attack type "
                f"{attack_type!r}: every value is missing"
            )

        if known_x.size == 1:
            # PCHIP needs at least two points; fall back to the single
            # observed value
            estimates = np.full(missing_mask.sum(), known_y[0], dtype=float)
        else:
            # NOTE(review): extrapolate='periodic' is kept from the original;
            # confirm that this is the intended treatment for missing rows
            # that lie outside the range of known row indices.
            pchip = PchipInterpolator(known_x, known_y, extrapolate='periodic')
            # Evaluate only where values are missing
            estimates = pchip(filtered_df.index[missing_mask].to_numpy())

        # Clamp to non-negative and truncate to integers before writing, so
        # the integer column keeps its dtype; a single .loc call avoids
        # chained assignment.
        filtered_df.loc[missing_mask, column] = np.maximum(0, estimates).astype(int)

    # Copy the imputed values back to the corresponding rows of int_df
    int_df.loc[int_df['type_of_attack'] == attack_type, filtered_df.columns] = filtered_df

# Make sure every feature column is integer-typed after the write-back
columns_to_convert = [col for col in int_df.columns if col != 'type_of_attack']
int_df[columns_to_convert] = int_df[columns_to_convert].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df[column].loc[missing_mask] = np.maximum(0, interpolated_values[missing_mask])


# Duplicate Dataset

In [None]:
# Build the profiling report for the training frame, then ask it for the
# duplicated rows (rendered as the cell output)
duplicate_report = ProfileReport(df)
duplicate_report.get_duplicates()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,type_of_attack,# duplicates
179,0,tcp,http,REJ,0,0,0,0,0,0,...,1.0,0.0,0.5,*,0.0,0,1,*,normal,27
152,0,tcp,http,REJ,0,0,0,0,0,0,...,1.0,0.0,1.0,*,0.0,0,1,*,normal,25
213,0,tcp,http,REJ,0,0,0,0,0,0,...,1.0,0.0,0.25,*,0.0,0,1,*,normal,16
235,0,tcp,http,REJ,0,0,0,0,0,0,...,1.0,0.0,0.14,*,0.0,0,1,*,normal,15
241,0,tcp,http,REJ,0,0,0,0,0,0,...,1.0,0.0,0.12,*,0.0,0,1,*,normal,15
232,0,tcp,http,REJ,0,0,0,0,0,0,...,1.0,0.0,0.17,*,0.0,0,1,*,normal,12
450,0,tcp,other,REJ,0,0,0,0,0,0,...,0.0,1.0,0.0,0,0.11,0,*,1,satan,12
454,0,tcp,other,REJ,0,0,0,0,0,0,...,0.0,1.0,0.0,0,0.15,0,*,1,satan,12
439,0,tcp,other,REJ,0,0,0,0,0,0,...,0.0,1.0,0.0,0,0.0,0,1,1,satan,11
448,0,tcp,other,REJ,0,0,0,0,0,0,...,0.0,1.0,0.0,0,0.09,0,*,1,satan,11


In [None]:
# Discard fully duplicated rows; the surviving rows keep their index labels
df = df.drop_duplicates()
# Bare expression so the notebook renders the de-duplicated frame
df

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,type_of_attack
0,0,tcp,private,SH,0,0,0,0,0,0,...,1,0.01,0.94,0.95,0,0.95,1,0,0,nmap
1,0,tcp,private,S0,0,0,0,0,0,0,...,5,0.02,0.08,0.00,0,1.00,1,0,0,neptune
2,0,tcp,http,SF,285,3623,0,0,0,0,...,228,1.00,0.00,0.01,*,0.00,0,0,0,normal
3,0,tcp,http,SF,232,584,0,0,0,0,...,255,1.00,0.00,0.17,*,0.00,0,0,0,normal
4,1,tcp,smtp,SF,1080,327,0,0,0,0,...,154,0.58,0.02,0.00,*,0.00,0,0,0,normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112440,5,tcp,smtp,SF,1377,328,0,0,0,0,...,117,0.66,0.03,0.01,0,0.00,0,0,0,normal
112442,0,tcp,http,SF,309,758,0,0,0,0,...,255,1.00,0.00,0.02,*,0.02,*,0,0,normal
112443,0,tcp,http,SF,363,721,0,0,0,0,...,255,1.00,0.00,0.00,0,0.00,0,0,0,normal
112444,0,tcp,discard,RSTO,0,0,0,0,0,0,...,7,0.03,0.06,0.00,0,0.00,0,1,1,neptune


# a
