In [None]:
import pandas as pd
import numpy as np
import gzip
import requests
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder

# from ann_visualizer.visualize import ann_viz    

from keras.models import Sequential
from keras.utils import plot_model
from keras.layers import Dense
from keras.optimizers import Adam

In [None]:
# Load the connection records and the column-name file; both are read
# without a header row, so pandas assigns integer column labels.
df = pd.read_csv('kddcup.data_10_percent.gz', header=None)
cols = pd.read_csv('kddcup.names',header=None)

# Preview the first rows of the raw records.
df.head()

In [None]:
# Read the whole attack-types file into a single string; the handle is
# released as soon as the with-block ends.
with open('training_attack_types', 'r') as attack_file:
    attack_types = attack_file.read()

# Echo the raw file content for inspection.
print(attack_types)

In [None]:
## Creating a dictionary of attack types
# Maps each attack name (first token of a line) to the text that follows it
# on the same line. 'normal' is added explicitly before parsing.
types = {'normal': 'normal'}
for line in attack_types.splitlines():
    # splitlines() handles '\n' and '\r\n' alike; strip() removes padding so
    # whitespace-only lines are skipped instead of crashing the unpacking.
    line = line.strip()
    if not line:
        continue
    # Split on any run of whitespace (spaces or tabs). A line with a single
    # token still raises ValueError, exactly like the original code.
    attack, description = line.split(None, 1)
    types[attack] = description

print(types)

In [None]:
# When the very first cell of the names table is 'back', the first row is
# discarded and the remaining rows are renumbered from zero.
first_cell = cols.loc[0, 0]
if first_cell == 'back':
    cols = cols.iloc[1:].reset_index(drop=True)

# Remove every column that contains at least one missing value.
cols = cols.dropna(axis='columns')
cols.head()

In [None]:
# Split column 0 on ':' into separate columns and store the pieces back as
# columns 0 and 1 of the names table.
name_and_type = cols[0].str.split(':', expand=True)
cols[[0, 1]] = name_and_type

cols.head()

In [None]:
# Column names for the records: the entries of column 0 of the names table,
# followed by a final 'label' entry.
names = list(cols[0]) + ['label']
df.columns = names

In [None]:
# Preview the records now that the columns carry names.
df.head()

In [None]:
# Derive the attack category for each record by looking its label up in
# `types`. Only a trailing '.' is stripped (rstrip), so a label that has no
# trailing dot keeps its last character instead of being truncated.
# An unknown label still raises KeyError.
df['Attack Type'] = df['label'].apply(lambda x: types[x.rstrip('.')])
df.head()

In [None]:
# Number of records per 'Attack Type' value, most frequent first.
AT_count = df['Attack Type'].value_counts()
AT_count

In [None]:
# The same counts expressed as a percentage of all records.
AT_per = AT_count/len(df)*100
AT_per

In [None]:
# Number of records per raw 'label' value, most frequent first.
lab_count = df.label.value_counts()
lab_count

In [None]:
# The same label counts expressed as a percentage of all records.
lab_per = lab_count/len(df)*100
lab_per

In [None]:
# Structural summary of the frame. The "features" figure is simply the
# column count, so it includes every column currently in the frame.
print(f"Shape : {df.shape}")
print(f"Number of features : {df.shape[1]}")
print(f"Number of unique services : {df['service'].nunique()}")
print(f"Number of labels : {df['label'].nunique(dropna=False)}")
print(f"missing values : {df.isna().sum().sum()}")

In [None]:
# Total number of missing cells across the whole frame.
df.isna().sum().sum()

In [None]:
# Collect the object-typed columns, then remove the two target columns so
# that only categorical input features remain in `categorical`.
object_columns = df.select_dtypes(include=['object']).columns
print("Categorical features :", object_columns)

categorical = object_columns.tolist()
print("Categorical features list:", categorical)

# list.remove raises ValueError if a target column is unexpectedly absent.
for target_column in ('label', 'Attack Type'):
    categorical.remove(target_column)
print("Extracted categorical features list:", categorical)

In [None]:
# Set the seaborn style before the axes are created: style settings are
# picked up when axes are built, so calling set_style afterwards would leave
# this figure unstyled.
sns.set_style("darkgrid")
fig, ax = plt.subplots(figsize=(7, 7))
sns.countplot(x='protocol_type', data=df, ax=ax, palette='Blues_d')

# Write each bar's count above it, centred on the bar. The height is a float,
# so it is formatted without decimals.
for p in ax.patches:
    bar_centre = p.get_x() + p.get_width() / 2
    ax.annotate(f"{p.get_height():.0f}", (bar_centre, p.get_height()),
                ha='center', va='bottom')

In [None]:
# Percentage of records per protocol_type value.
df.protocol_type.value_counts()/len(df)*100

In [None]:
# Set the seaborn style before the axes are created so that it applies to
# this figure; calling set_style after plotting only affects later figures.
sns.set_style("dark")
fig, ax = plt.subplots(figsize=(17, 7))
# Bars are ordered from the most to the least frequent service.
sns.countplot(x='service', data=df, ax=ax, palette='Spectral', order=df['service'].value_counts().index,linewidth=0)
# Rotate the tick labels so the service names do not overlap.
plt.xticks(rotation=90)
plt.show()

In [None]:
# Percentage of records per service value.
df.service.value_counts()/len(df)*100

In [None]:
# Count plot of the flag column, bars ordered from most to least frequent.
flag_order = df['flag'].value_counts().index
fig, ax = plt.subplots(figsize=(10, 8))
sns.countplot(
    data=df,
    x='flag',
    order=flag_order,
    palette='Blues_r',
    linewidth=0,
    ax=ax,
)
plt.show()

In [None]:
# Percentage of records per flag value.
df.flag.value_counts()/len(df)*100

In [None]:
# Share (in %) of records that fall outside the three most frequent
# 'Attack Type' values.
# NOTE(review): the original comment named SF, S0 and REJ, which are flag
# values rather than attack types; if the flag share was intended, swap the
# column to df['flag'] — confirm.
# .iloc is used because integer keys on a string-indexed Series are treated
# as labels in newer pandas; positional [0]/[1]/[2] access is deprecated.
attack_type_counts = df['Attack Type'].value_counts()
(attack_type_counts.sum() - attack_type_counts.iloc[:3].sum()) / attack_type_counts.sum() * 100

In [None]:
# Count plot of the attack categories, bars ordered from most to least
# frequent; the same ordering yields the top three categories printed below.
attack_type_order = df['Attack Type'].value_counts().index
fig, ax = plt.subplots(figsize=(15,5))
sns.countplot(
    data=df,
    x='Attack Type',
    order=attack_type_order,
    palette='Greens_r',
    linewidth=0,
    ax=ax,
)
plt.show()
print('Top 3 the attack types are : ', attack_type_order[:3].tolist())

In [None]:
# Heatmap of the missing-value mask: one cell per (row, column) of the frame.
fig, axis = plt.subplots(figsize=(12, 10))
missing_mask = df.isnull()
sns.heatmap(missing_mask, cmap='cool', ax=axis)
print("we can see that there are no missing values in the dataset")
plt.title("Missing values in the dataset")
axis.set(xlabel="Features", ylabel="Rows")
plt.show()

In [None]:
# Keep only the numeric columns.
numeric_only = df.select_dtypes(include=np.number)

# Discard columns that hold a single distinct value.
distinct_counts = numeric_only.nunique()
varying_columns = distinct_counts[distinct_counts > 1].index.tolist()
df_numeric = numeric_only[varying_columns]

# Pairwise correlation matrix of the remaining columns.
corr = df_numeric.corr()

# Draw the matrix as a heatmap.
fig, ax = plt.subplots(figsize=(17, 15))
sns.heatmap(corr, ax=ax, cmap='coolwarm', linewidths=0.1)
plt.title("Correlación entre variables")
plt.show()

In [None]:
# Keep only the strongly correlated entries (|r| > 0.8); all others become NaN.
high_corr = corr[abs(corr) > 0.8] # type: ignore

# List every unordered feature pair exactly once. Only the strict upper
# triangle is kept, which removes the diagonal (a feature with itself) and
# the mirrored duplicates. De-duplicating by value instead would also drop
# distinct pairs that happen to share the same coefficient.
pair_mask = np.triu(np.ones(corr.shape, dtype=bool), k=1)
high_corr_pairs = (
    high_corr.where(pair_mask)
    .unstack()
    .dropna()
    .sort_values(ascending=False)
)

high_corr_pairs

In [None]:
# (rows, columns) of the frame at this point.
df.shape


In [None]:
# dtype of every column.
df.dtypes

In [None]:
# Frequency of each protocol_type value.
df['protocol_type'].value_counts()

In [None]:
# Frequency of each service value.
df['service'].value_counts()

In [None]:
# Frequency of each flag value.
df['flag'].value_counts()

In [None]:
# Replace each categorical column with integer codes. A separate encoder is
# fitted per column and kept in `label_encoders`, so every fitted mapping
# (classes_) stays available for inverse_transform or for encoding new data.
# Refitting a single shared encoder would keep only the last column's mapping.
label_encoders = {}
for column in ('protocol_type', 'service', 'flag'):
    Le = LabelEncoder()
    df[column] = Le.fit_transform(df[column])
    label_encoders[column] = Le
# `Le` ends up bound to the 'flag' encoder, as it was before this change.

In [None]:
# Frequency of each protocol_type code after label encoding.
df['protocol_type'].value_counts()

In [None]:
# Remove the service column from the frame.
# Re-running this cell raises KeyError because the column is already gone.
df = df.drop(columns=['service'])

In [None]:
# Frequency of each flag code after label encoding.
df['flag'].value_counts()

In [None]:
# Preview the frame after encoding and dropping the service column.
df.head()

In [None]:
# Write the processed frame to CSV; index=False leaves the row index out of the file.
df.to_csv('processed_kdd.csv', index=False)

In [None]:
# Inputs are every column except the two target columns; the prediction
# target is the attack category.
target_columns = ['label', 'Attack Type']
X = df.drop(columns=target_columns)
y = df['Attack Type']

# Rescale every feature with min-max scaling.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so the test rows influence the scaling parameters — consider fitting on the
# training rows only.
scaler = MinMaxScaler()
X = scaler.fit(X).transform(X)

In [None]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# One-hot encode the target labels.
y_train = np.array(y_train)
y_test = np.array(y_test)
encoder = OneHotEncoder()
# Fit the category -> column mapping on the training labels only; reshape to
# a single column because the encoder expects 2-D input.
y_train = encoder.fit_transform(y_train.reshape(-1, 1)).toarray()
# Reuse the fitted mapping for the test labels. Refitting here could yield a
# different column set or order whenever the test split lacks a category,
# which would make the train and test targets incompatible.
y_test = encoder.transform(y_test.reshape(-1, 1)).toarray()