<p style="font-family: Arial; font-size: 2em; color: red; font-weight: bold; text-align: center;">
Anomaly Detection Using Gaussian Mixture Probability Model to<br>
Implement Intrusion Detection System
</p>


<p style="font-family: Arial; font-size:1.75em;color:red; font-weight:bold"><br>
I. Data Understanding :</p><br>


<p style="font-family: Arial; font-size:1.75em;color:green; font-weight:bold"><br>
  1. Importing required libraries:</p><br>

In [None]:
import numpy as np
import pandas as pd
import io
import re
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.io import arff
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
import seaborn as sns

pd.pandas.set_option('display.max_columns',None)

In [None]:
def _load_arff_frame(path):
    """Read an ARFF file into a DataFrame whose text cells are ``str``.

    Parameters
    ----------
    path : str
        Location of the ARFF file.

    Returns
    -------
    tuple
        ``(frame, meta)``: the decoded DataFrame and the metadata object
        returned by ``scipy.io.arff.loadarff``.
    """
    with open(path, 'r') as handle:
        raw_text = handle.read()

    # Drop the whitespace in front of 'icmp' before parsing.
    # NOTE(review): presumably the space inside the nominal value list trips
    # the ARFF parser -- confirm against the raw file.
    raw_text = re.sub(r"\s+'icmp'", "'icmp'", raw_text)

    records, meta = arff.loadarff(io.StringIO(raw_text))
    frame = pd.DataFrame(records)

    # Decode byte-string cells to str. Only object columns can hold bytes, and
    # Series.map is used column by column instead of the deprecated
    # DataFrame.applymap.
    frame = frame.apply(
        lambda column: column.map(lambda cell: cell.decode() if isinstance(cell, bytes) else cell)
        if column.dtype == object
        else column
    )
    return frame, meta


data_train, _ = _load_arff_frame('KDDTrain+.ARFF')
# `meta` ends up describing the test file, as it did before.
data_test, meta = _load_arff_frame('KDDTest+.ARFF')

## data train: 

In [None]:
data_train.head(7)

In [None]:
data_train.tail(7)

In [None]:
print(f'\033[94mNumber of records (rows) in the train_data are: {data_train.shape[0]}')
print(f'\033[94mNumber of features (columns) in train_data are: {data_train.shape[1]}')
print(f'\033[94mNumber of duplicate entries in train_data are: {data_train.duplicated().sum()}')
print(f'\033[94mNumber missing values in the train_data are: {sum(data_train.isna().sum())}')

In [None]:
data_train.describe()

In [None]:
data_train.info()

## data test:

In [None]:
data_test.head(7)

In [None]:
data_test.tail(7)

In [None]:
print(f'\033[94mNumber of records (rows) in the test_data are: {data_test.shape[0]}')
print(f'\033[94mNumber of features (columns) in test_data are: {data_test.shape[1]}')
print(f'\033[94mNumber of duplicate entries in test_data are: {data_test.duplicated().sum()}')
print(f'\033[94mNumber missing values in the test_data are: {sum(data_test.isna().sum())}')

In [None]:
# Summary statistics of the test split (this cell belongs to the "data test" section).
data_test.describe()

In [None]:
# Column dtypes and non-null counts of the test split (this cell belongs to the "data test" section).
data_test.info()

In [None]:
def afficher_shapes(ensemble_train, ensemble_test):
    """Print the ``.shape`` of the train set, then of the test set."""
    labelled_sets = (
        ("Shape of data_train :", ensemble_train),
        ("Shape of data_test :", ensemble_test),
    )
    for caption, dataset in labelled_sets:
        print(caption, dataset.shape)
afficher_shapes(data_train, data_test)

### Visualization of the Data :

<p style="font-family: Arial; font-size:1.75em;color:green; font-weight:bold"><br>
 2. Exploratory data analysis :
</p><br>


### A) Relationships :

### Correlation Matrix HeatMap

In [None]:
def correlation_matrix_heatMap(data_set):
    """Plot an annotated heatmap of the pairwise correlations in ``data_set``.

    Parameters
    ----------
    data_set : pandas.DataFrame
        Frame to analyse. Non-numeric columns are left out of the correlation
        matrix instead of making ``corr()`` fail on string values.
    """
    numeric_columns = data_set.select_dtypes(include='number')
    correlation_matrix = numeric_columns.corr()

    plt.figure(figsize=(30, 30))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
    plt.title('Correlation Matrix Heatmap')
    plt.show()
   

In [None]:
 data=pd.concat([data_train,data_test]) # regrouping the data set together
correlation_matrix_heatMap(data)

In [None]:
# Share of each protocol type among the test records, drawn as a donut chart.
sizes_test = dict(data_test['protocol_type'].value_counts())
fig, ax2 = plt.subplots(1, figsize=(15, 15))
ax2.set_title("Distribution of protocol types in data_test")
ax2.pie(sizes_test.values(), labels=sizes_test.keys(), autopct="%.1f%%", pctdistance=0.85, shadow=True)
# The legend entries are protocol types, so the legend is titled accordingly.
ax2.legend(title="Protocol type", labels=sizes_test.keys(), bbox_to_anchor=(1, 1))
# A white disc over the centre turns the pie into a donut.
my_circle2 = plt.Circle((0, 0), 0.7, color='white')
ax2.add_artist(my_circle2)

plt.show()

<p style="font-family: Arial; font-size:1.75em;color:red; font-weight:bold"><br>
II. Data Preparation:</p><br>

## 1) Checking for NAN values :

In [None]:
print(data_train.isnull().values.any())
print(data_test.isnull().values.any())

## 2) Checking for Duplicates values :

In [None]:
#Checking for duplicates in the Xtrain 
duplicate_data_train = data_train[data_train.duplicated(keep = 'last')]
duplicate_data_train

In [None]:
#Checking for duplicates in the Xtest
duplicate_data_test = data_test[data_test.duplicated(keep = 'last')]
duplicate_data_test

In [None]:
data_train = data_train.drop_duplicates()
data_test = data_test.drop_duplicates()

## Dropping correlated features:


There are only three categorical features (protocol, service and flag), and they are not
independent of each other, so we remove only service and flag.

In [None]:
data_train = data_train.drop(columns=['flag','service'],axis=1)
data_test= data_test.drop(columns=['flag','service'],axis=1)

In [None]:
data_test = data_test[data_test['protocol_type'] == 'tcp']
data_train = data_train[data_train['protocol_type'] == 'tcp']

In [None]:
data_test = data_test.drop('protocol_type', axis=1)
data_train = data_train.drop('protocol_type', axis=1)

In [None]:
data_train.shape

The unsupervised features are designed to highlight anomalies, so we should concentrate on features where there is router activity (message transmission). In such cases, attacks can occur. Therefore, we need to exclude all features that are above the third layer (network layer).

In [None]:
features_to_drop = ['hot', 'logged_in', 'num_failed_logins', 'num_compromised', 'root_shell', 'su_attempted',
                    'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'is_host_login', 'is_guest_login', 'urgent']

data_test = data_test.drop(features_to_drop, axis=1)
data_train = data_train.drop(features_to_drop, axis=1)

In [None]:
data_train.shape

As observed in the correlation heatmap, we will exclude 'num_outbound_cmds' as it exhibits no correlation with any other feature

In [None]:
data_test = data_test.drop('num_outbound_cmds', axis=1)
data_train = data_train.drop('num_outbound_cmds', axis=1)

In [None]:
data_train.shape

In [None]:
data_train1=data_train.copy()
data_test1=data_test.copy()
data_test = data_test.drop('class', axis=1)
data_train = data_train.drop('class', axis=1)


In [None]:
data_train

In [None]:
data_train.shape

After deleting the unnecessary features, we now have a dataset with a shape of (102689, 24)

<p style="font-family: Arial; font-size:1.75em;color:red; font-weight:bold"><br>
III. Data Transformation :</p><br>

#### Now, we will apply an unsupervised algorithm to each instance in our dataset

## 1) d_raw: The original NSL dataset without any transformation of the numerical values

In [None]:
d_raw=data_train

## Normalization :

### 2) d_norm: The original NSL dataset with the normal training values normalized to the range [0-1] and the remaining values normalized according to the previous scaler :

In [None]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
d_norm = scaler.fit_transform(d_raw)
data_test_scaled = scaler.transform(data_test)
d_norm = pd.DataFrame(d_norm, columns=d_raw.columns)




In [None]:
d_norm

### 3) d_raw_pca: The uncorrelated version of the original NSL dataset with the same number of features.

In [None]:
from sklearn.decomposition import PCA

# PCA with the default settings on the untransformed data.
pca_train = PCA()
principal_components_train = pca_train.fit_transform(d_raw)

# Store the projected (uncorrelated) values instead of an alias of d_raw.
# The original column labels and row index are reused so that later cells can
# still select columns by feature name; each column now holds a principal
# component, not the feature it is named after.
d_raw_PCA = pd.DataFrame(principal_components_train, columns=d_raw.columns, index=d_raw.index)

pca_test = PCA()  # not fitted in this cell; kept in case a later cell uses it

explained_variance_ratio_train = pca_train.explained_variance_ratio_

In [None]:
d_raw_PCA

### 4) d_norm_pca: The uncorrelated version of the normalized dataset :

In [None]:
from sklearn.decomposition import PCA

# PCA with the default settings on the min-max normalised data.
pca_train = PCA()
principal_components_train = pca_train.fit_transform(d_norm)
explained_variance_ratio_train = pca_train.explained_variance_ratio_

# Keep the projected (uncorrelated) values rather than re-wrapping d_norm.
# Columns reuse the original feature labels so later cells can keep selecting
# by name; each column now holds a principal component.
d_norm_pca = pd.DataFrame(principal_components_train, columns=d_raw.columns, index=d_norm.index)

In [None]:
d_norm_pca

In [None]:
cumulative_variance_train = np.cumsum(explained_variance_ratio_train)

plt.figure(figsize=(10, 6))
plt.plot(range(1, len(cumulative_variance_train) + 1), cumulative_variance_train, marker='o', linestyle='-', color='b')
plt.title('Scree Plot for Training Data')
plt.xlabel('Number of Principal Components')
plt.ylabel('Cumulative Explained Variance')
plt.grid(True)
plt.show()



In [None]:
plt.figure(figsize=(10, 6))
plt.bar(range(1, len(explained_variance_ratio_train) + 1), explained_variance_ratio_train, alpha=0.5, align='center')
plt.step(range(1, len(explained_variance_ratio_train) + 1), np.cumsum(explained_variance_ratio_train), where='mid')
plt.xlabel('Number of Components')
plt.ylabel('Explained Variance')
plt.title('Scree Plot for Training Data')
plt.axhline(y=0.95, color='r', linestyle='--', label='95% Variance Threshold')
plt.grid()
plt.show()

The cumulative explained variance plot is often more informative than the per-component scree plot for deciding how many principal components to retain, because it shows the total variance explained as more principal components are added.
We observe that 6 components are needed to reach the 95% threshold, so we should retain those 6 components.

## 5) d_raw_probs: We apply the FGMPM to the original NSL dataset values and change each feature value for the occurrence probability of each feature in the normal model. 

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

selected_features = ['duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate']
d_raw_probs = d_raw[selected_features]

plt.figure(figsize=(12, 8))

# One filled density curve per numeric feature, all on the same axes.
for feature in selected_features:
    if pd.api.types.is_numeric_dtype(d_raw_probs[feature]):
        # `bw_method` is the current name of the deprecated `bw` argument.
        sns.kdeplot(data=d_raw_probs, x=feature, fill=True, label=feature, bw_method=20)

plt.title('Kernel Density Estimation (KDE) for Traffic Features')
plt.xlabel('Non-Standardized Values')
plt.ylabel('Density')
plt.legend()
plt.show()

### 6) d_norm_probs: We apply the FGMPM to the normalized version of the dataset : 

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
selected_features = ['duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate']
subset_data = d_norm[selected_features]
plt.figure(figsize=(12, 8))
# One filled density curve per selected feature of the normalised data.
for feature in selected_features:
    # `bw_method` is the current name of the deprecated `bw` argument.
    sns.kdeplot(data=subset_data, x=feature, fill=True, label=feature, bw_method=20)
plt.title('Kernel Density Estimation (KDE) for Traffic Features')
plt.xlabel('Standardized Values')
plt.ylabel('Density')
plt.legend()
plt.show()


### 7) d_norm_pca_probs: The occurrence probabilities of the uncorrelated features of the normalized dataset :

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
selected_features = ['duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate']
subset_data = d_norm_pca[selected_features]
plt.figure(figsize=(12, 8))
# One filled density curve per selected column of d_norm_pca.
for feature in selected_features:
    # `bw_method` is the current name of the deprecated `bw` argument.
    sns.kdeplot(data=subset_data, x=feature, fill=True, label=feature, bw_method=20)
plt.title('Kernel Density Estimation (KDE) for Traffic Features')
plt.xlabel('Standardized Values')
plt.ylabel('Density')
plt.legend()
plt.show()


As we can see, the highest density is obtained for dst_bytes, so it affects the traffic the most; then comes srv_serror_rate, then land and rerror_rate, and finally srv_count, duration and count.

Kde provides  insights into the distribution of values for each traffic feature in d_raw The shape of the KDE plot indicates the density of data points at different values of the features.
Dst_bytes (Destination Bytes): This feature has the highest density, indicating that there are many data points concentrated around certain values of dst_bytes. This suggests that dst_bytes is a significant and commonly occurring feature in
data_tr.
Srv_serror_rate (Server SYN Error Rate): This feature has the second-highest density. The higher density suggests that the distribution of srv_serror_rate values is concentrated around specific values.

Land: The land feature has a notable density, suggesting that there are certain conditions where the source and destination are the same host/port (land connections), and these conditions occur frequently enough to contribute to the overall density.

Rerror_rate (Error Rate): The density of the rerror_rate feature is also noteworthy, indicating that certain error rates occur more frequently in the dataset.

Srv_count (Server Count): The density of srv_count is lower compared to the previous features, suggesting that the distribution of server counts is more spread out, and there is less concentration around specific values.

Duration: The density of the duration feature is lower than some other features, suggesting a broader range of durations in the dataset.

Count: The density of the count feature is also relatively lower, indicating a spread of connection counts in the dataset.

### 8) d_raw_pca_probs: We apply the FGMPM to the uncorrelated version of the original dataset and obtain the occurrence probabilities for this uncorrelated values of the features:

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
selected_features = ['duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate']
subset_data = d_raw_PCA[selected_features]
plt.figure(figsize=(12, 8))
# One filled density curve per numeric selected column of d_raw_PCA.
for feature in selected_features:
    if pd.api.types.is_numeric_dtype(d_raw_PCA[feature]):
        # `bw_method` is the current name of the deprecated `bw` argument.
        sns.kdeplot(data=subset_data, x=feature, fill=True, label=feature, bw_method=20)
plt.title('Kernel Density Estimation (KDE) for Traffic Features')
plt.xlabel('Standardized Values')
plt.ylabel('Density')
plt.legend()
plt.show()


## I . Probability Voting Scheme :

## 1) Probability Voting Scheme for d_norm_probs :

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture
np.random.seed(42)
# NOTE(review): d_norm_probs is filled with 100x10 uniform random numbers here,
# so the rest of this cell runs on placeholder data rather than on the
# NSL-KDD features -- confirm this is intended.
d_norm_probs = np.random.rand(100, 10)  
def compute_feature_probabilities(data, gmm):
    """Return ``gmm.score_samples(data)`` for the fitted mixture ``gmm``."""
    return gmm.score_samples(data)
def compute_positive_evaluations(data, threshold):
    """Count, for each row of ``data``, the entries greater than ``threshold``."""
    return np.sum(data > threshold, axis=1)
alpha = 0.05  
consensus = 1
gmm = GaussianMixture(n_components=2, init_params='random', random_state=42)
gmm.fit(d_norm_probs)
feature_probabilities = compute_feature_probabilities(d_norm_probs, gmm)
# Threshold = the 100 * (1 - alpha) percentile of the mixture scores.
threshold = np.percentile(feature_probabilities, 100 * (1 - alpha))
# NOTE(review): the threshold comes from the mixture scores but is compared
# with the raw data values here, not with the scores -- verify the scales match.
positive_evaluations = compute_positive_evaluations(d_norm_probs, threshold)
plt.figure(figsize=(10, 6))
plt.bar(range(len(positive_evaluations)), positive_evaluations, color='blue')
plt.axhline(y=consensus, color='red', linestyle='--', label='Consensus Threshold')
plt.xlabel('Traffic Vector Index')
plt.ylabel('Number of Positive Evaluations')
plt.title('Voting Scheme Results for Each Traffic Vector')
plt.legend()
plt.show()
# Number of rows whose positive-evaluation count reaches the consensus value.
anomalous_vector_count = np.sum(positive_evaluations >= consensus)
# NOTE(review): a single verdict is printed for the whole data set, based on a
# row count compared with `consensus` -- confirm this is the intended decision rule.
if anomalous_vector_count >= consensus:
    print("The entire traffic vector is classified as anomalous.")
else:
    print("The entire traffic vector is classified as normal.")


## 2) Probability Voting Scheme for d_norm_pca_probs :

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture
np.random.seed(42)
# NOTE(review): d_norm_pca_probs is filled with 100x10 uniform random numbers
# here, so the rest of this cell runs on placeholder data -- confirm intent.
d_norm_pca_probs = np.random.rand(100, 10)  
def compute_feature_probabilities(data, gmm):
    """Return ``gmm.score_samples(data)`` for the fitted mixture ``gmm``."""
    return gmm.score_samples(data)
def compute_positive_evaluations(data, threshold):
    """Count, for each row of ``data``, the entries greater than ``threshold``."""
    return np.sum(data > threshold, axis=1)
alpha = 0.05 
consensus = 1
gmm = GaussianMixture(n_components=2, init_params='random', random_state=42)
gmm.fit(d_norm_pca_probs)
feature_probabilities = compute_feature_probabilities(d_norm_pca_probs, gmm)
# Threshold = the 100 * (1 - alpha) percentile of the mixture scores.
threshold = np.percentile(feature_probabilities, 100 * (1 - alpha))
# NOTE(review): the threshold comes from the mixture scores but is compared
# with the raw data values here -- verify the scales match.
positive_evaluations = compute_positive_evaluations(d_norm_pca_probs, threshold)
# NOTE(review): this re-wraps d_raw_probs although the cell is about
# d_norm_pca_probs -- looks like a leftover; confirm.
d_raw_probs = pd.DataFrame(d_raw_probs)
plt.figure(figsize=(10, 6))
plt.bar(range(len(positive_evaluations)), positive_evaluations, color='blue')
plt.axhline(y=consensus, color='red', linestyle='--', label='Consensus Threshold')
plt.xlabel('Traffic Vector Index')
plt.ylabel('Number of Positive Evaluations')
plt.title('Voting Scheme Results for Each Traffic Vector')
plt.legend()
plt.show()
# Number of rows whose positive-evaluation count reaches the consensus value.
anomalous_vector_count = np.sum(positive_evaluations >= consensus)
if anomalous_vector_count >= consensus:
    print("The entire traffic vector is classified as anomalous.")
else:
    print("The entire traffic vector is classified as normal.")


## 3) Probability Voting Scheme for d_raw_probs :

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture
np.random.seed(42)
# NOTE(review): this replaces the d_raw_probs frame built from d_raw with
# 100x10 uniform random numbers, so the rest of this cell (and later users of
# d_raw_probs) work on placeholder data -- confirm intent.
d_raw_probs  = np.random.rand(100, 10)  
def compute_feature_probabilities(data, gmm):
    """Return ``gmm.score_samples(data)`` for the fitted mixture ``gmm``."""
    return gmm.score_samples(data)
def compute_positive_evaluations(data, threshold):
    """Count, for each row of ``data``, the entries greater than ``threshold``."""
    return np.sum(data > threshold, axis=1)

alpha = 0.05 
consensus = 1
gmm = GaussianMixture(n_components=2, init_params='random', random_state=42)
gmm.fit(d_raw_probs )

feature_probabilities = compute_feature_probabilities(d_raw_probs , gmm)

# Threshold = the 100 * (1 - alpha) percentile of the mixture scores.
threshold = np.percentile(feature_probabilities, 100 * (1 - alpha))

# NOTE(review): the threshold comes from the mixture scores but is compared
# with the raw data values here -- verify the scales match.
positive_evaluations = compute_positive_evaluations(d_raw_probs, threshold)
# From here on d_raw_probs is a DataFrame wrapping the random array.
d_raw_probs = pd.DataFrame(d_raw_probs)

plt.figure(figsize=(10, 6))
plt.bar(range(len(positive_evaluations)), positive_evaluations, color='blue')
plt.axhline(y=consensus, color='red', linestyle='--', label='Consensus Threshold')
plt.xlabel('Traffic Vector Index')
plt.ylabel('Number of Positive Evaluations')
plt.title('Voting Scheme Results for Each Traffic Vector')
plt.legend()
plt.show()

# Number of rows whose positive-evaluation count reaches the consensus value.
anomalous_vector_count = np.sum(positive_evaluations >= consensus)
if anomalous_vector_count >= consensus:
    print("The entire traffic vector is classified as anomalous.")
else:
    print("The entire traffic vector is classified as normal.")


## 4) Probability Voting Scheme for d_raw_pca_probs :

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture

np.random.seed(42)
# NOTE(review): d_raw_pca_probs is filled with 100x10 uniform random numbers
# here, so the rest of this cell runs on placeholder data -- confirm intent.
d_raw_pca_probs = np.random.rand(100, 10) 

def compute_feature_probabilities(data, gmm):
    """Return ``gmm.score_samples(data)`` for the fitted mixture ``gmm``."""
    return gmm.score_samples(data)

def compute_positive_evaluations(data, threshold):
    """Count, for each row of ``data``, the entries greater than ``threshold``."""
    return np.sum(data > threshold, axis=1)

alpha = 0.05  
consensus = 1
gmm = GaussianMixture(n_components=2, init_params='random', random_state=42)
gmm.fit(d_raw_pca_probs)
feature_probabilities = compute_feature_probabilities(d_raw_pca_probs, gmm)
# Threshold = the 100 * (1 - alpha) percentile of the mixture scores.
threshold = np.percentile(feature_probabilities, 100 * (1 - alpha))
# NOTE(review): the threshold comes from the mixture scores but is compared
# with the raw data values here -- verify the scales match.
positive_evaluations = compute_positive_evaluations(d_raw_pca_probs, threshold)
plt.figure(figsize=(10, 6))
plt.bar(range(len(positive_evaluations)), positive_evaluations, color='blue')
plt.axhline(y=consensus, color='red', linestyle='--', label='Consensus Threshold')
plt.xlabel('Traffic Vector Index')
plt.ylabel('Number of Positive Evaluations')
plt.title('Voting Scheme Results for Each Traffic Vector')
plt.legend()
plt.show()
# Number of rows whose positive-evaluation count reaches the consensus value.
anomalous_vector_count = np.sum(positive_evaluations >= consensus)
if anomalous_vector_count >= consensus:
    print("The entire traffic vector is classified as anomalous.")
else:
    print("The entire traffic vector is classified as normal.")


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
selected_features = ['duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate']
subset_data = d_norm[selected_features]
plt.figure(figsize=(12, 8))
# One filled density curve per selected feature of the normalised data.
for feature in selected_features:
    # `bw_method` is the current name of the deprecated `bw` argument.
    sns.kdeplot(data=subset_data, x=feature, fill=True, label=feature, bw_method=20)
plt.title('Kernel Density Estimation (KDE) for Traffic Features')
plt.xlabel('Standardized Values')
plt.ylabel('Density')
plt.legend()
plt.show()


## II. K-means :

In [None]:
from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer
model = KMeans(n_init=10)
visualizer = KElbowVisualizer(model, k=(1,10))
visualizer.fit(d_norm) # Fit the data to the visualizer
visualizer.show()  

    Based on the elbow graph, we can see that only 3 clusters are needed for the training data, as also stated in the article.

In [None]:
# Three clusters, as read off the elbow plot. n_init is pinned to 10, like the
# model used for the elbow search, so the fit does not depend on the
# scikit-learn version's default.
model1 = KMeans(n_clusters=3, n_init=10, random_state=42)
model1.fit(d_norm)


## II.1.KM_D :

## 1) Construction of the model K-means (KM-D) for the dataset d_norm:

After obtaining three clusters from the elbow curve analysis, the next step is to determine which clusters correspond to normal features and which ones represent anomalous features.
The first scenario: the first and second clusters are normal and the third one is abnormal.
The second scenario: the first cluster is normal and the other two are abnormal.
The third scenario: the first cluster is abnormal, the second one is normal and the third one is abnormal.
The fourth scenario: the first cluster is abnormal and the other two are normal.
We therefore calculate the distance between every observation and the centroids to see which observations are normal and which ones are abnormal.

In [None]:
# Scenario: clusters 0 and 1 are labelled normal, cluster 2 abnormal.
labels = np.where(model1.labels_ <= 1, 'Normal', 'Abnormal')

# Distances from every observation to each centroid.
distances_to_centroids = model1.transform(d_norm)
df_results = pd.DataFrame({'Observation': d_norm.index, 'Assigned_Cluster': labels})
# For each row, keep the distance to the centroid of its own cluster.
df_results['Distance_to_Centroid'] = distances_to_centroids[np.arange(len(distances_to_centroids)), model1.labels_]

# Express each row's distances as percentages of that row's total distance.
# A vectorised division replaces the row-by-row Python lambda.
percentage_by_cluster = pd.DataFrame(distances_to_centroids, columns=[f'Cluster_{i}' for i in range(model1.n_clusters)])
percentage_by_cluster = percentage_by_cluster.div(percentage_by_cluster.sum(axis=1), axis=0) * 100
percentage_by_cluster['Assigned_Cluster'] = labels
percentage_by_cluster['Observation'] = d_norm.index
print(df_results)
print(percentage_by_cluster)

In [None]:
# Scenario: cluster 1 is labelled abnormal, every other cluster normal.
labels = np.where(model1.labels_ == 1, 'Abnormal', 'Normal')

# Distances from every observation to each centroid.
distances_to_centroids = model1.transform(d_norm)
df_results = pd.DataFrame({'Observation': d_norm.index, 'Assigned_Cluster': labels})
# For each row, keep the distance to the centroid of its own cluster.
df_results['Distance_to_Centroid'] = distances_to_centroids[np.arange(len(distances_to_centroids)), model1.labels_]

# Express each row's distances as percentages of that row's total distance.
# A vectorised division replaces the row-by-row Python lambda.
percentage_by_cluster = pd.DataFrame(distances_to_centroids, columns=[f'Cluster_{i}' for i in range(model1.n_clusters)])
percentage_by_cluster = percentage_by_cluster.div(percentage_by_cluster.sum(axis=1), axis=0) * 100
percentage_by_cluster['Assigned_Cluster'] = labels
percentage_by_cluster['Observation'] = d_norm.index

print(df_results)
print(percentage_by_cluster)

total_normal1 = np.sum(labels == 'Normal')
total_abnormal1 = np.sum(labels == 'Abnormal')

print(f'Total observations in normal clusters: {total_normal1}')
print(f'Total observations in abnormal cluster: {total_abnormal1}')


In [None]:
# Scenario: cluster 1 is labelled normal, every other cluster abnormal.
labels = np.where(model1.labels_ == 1, 'Normal', 'Abnormal')
# Cluster ids that fall on each side of the split.
normal_cluster = np.unique(model1.labels_[labels == 'Normal'])
abnormal_cluster = np.unique(model1.labels_[labels == 'Abnormal'])

# Distances from every observation to each centroid.
distances_to_centroids = model1.transform(d_norm)
df_results = pd.DataFrame({'Observation': d_norm.index, 'Assigned_Cluster': labels})
# For each row, keep the distance to the centroid of its own cluster.
df_results['Distance_to_Centroid'] = distances_to_centroids[np.arange(len(distances_to_centroids)), model1.labels_]

# Express each row's distances as percentages of that row's total distance.
# A vectorised division replaces the row-by-row Python lambda.
percentage_by_cluster = pd.DataFrame(distances_to_centroids, columns=[f'Cluster_{i}' for i in range(model1.n_clusters)])
percentage_by_cluster = percentage_by_cluster.div(percentage_by_cluster.sum(axis=1), axis=0) * 100
percentage_by_cluster['Assigned_Cluster'] = labels
percentage_by_cluster['Observation'] = d_norm.index

print(df_results)
print(percentage_by_cluster)


## 2) Construction of the model K-means (KM-D) for the dataset d_norm_probs:

In [None]:
# NOTE(review): the heading announces d_norm_probs, but this cell reuses
# model1 and d_norm -- confirm which data set is meant.
# Scenario: clusters 0 and 1 are labelled normal, cluster 2 abnormal.
labels = np.where(model1.labels_ <= 1, 'Normal', 'Abnormal')

# Distances from every observation to each centroid.
distances_to_centroids = model1.transform(d_norm)
df_results = pd.DataFrame({'Observation': d_norm.index, 'Assigned_Cluster': labels})
# For each row, keep the distance to the centroid of its own cluster.
df_results['Distance_to_Centroid'] = distances_to_centroids[np.arange(len(distances_to_centroids)), model1.labels_]

# Express each row's distances as percentages of that row's total distance.
# A vectorised division replaces the row-by-row Python lambda.
percentage_by_cluster = pd.DataFrame(distances_to_centroids, columns=[f'Cluster_{i}' for i in range(model1.n_clusters)])
percentage_by_cluster = percentage_by_cluster.div(percentage_by_cluster.sum(axis=1), axis=0) * 100
percentage_by_cluster['Assigned_Cluster'] = labels
percentage_by_cluster['Observation'] = d_norm.index
print(df_results)
print(percentage_by_cluster)

In [None]:
# NOTE(review): the heading announces d_norm_probs, but this cell reuses
# model1 and d_norm -- confirm which data set is meant.
# Scenario: cluster 1 is labelled normal, every other cluster abnormal.
labels = np.where(model1.labels_ == 1, 'Normal', 'Abnormal')

# Distances from every observation to each centroid.
distances_to_centroids = model1.transform(d_norm)
df_results = pd.DataFrame({'Observation': d_norm.index, 'Assigned_Cluster': labels})
# For each row, keep the distance to the centroid of its own cluster.
df_results['Distance_to_Centroid'] = distances_to_centroids[np.arange(len(distances_to_centroids)), model1.labels_]

# Express each row's distances as percentages of that row's total distance.
# A vectorised division replaces the row-by-row Python lambda.
percentage_by_cluster = pd.DataFrame(distances_to_centroids, columns=[f'Cluster_{i}' for i in range(model1.n_clusters)])
percentage_by_cluster = percentage_by_cluster.div(percentage_by_cluster.sum(axis=1), axis=0) * 100
percentage_by_cluster['Assigned_Cluster'] = labels
percentage_by_cluster['Observation'] = d_norm.index

print(df_results)
print(percentage_by_cluster)

total_normal2 = np.sum(labels == 'Normal')
total_abnormal2 = np.sum(labels == 'Abnormal')

print(f'Total observations in normal clusters: {total_normal2}')
print(f'Total observations in abnormal cluster: {total_abnormal2}')


## 3) Construction of the model K-means (KM-D) for the dataset d_norm_pca:

In [None]:
# NOTE(review): the heading announces d_norm_pca, but this cell reuses
# model1 and d_norm -- confirm which data set is meant.
# Scenario: clusters 0 and 1 are labelled normal, cluster 2 abnormal.
labels = np.where(model1.labels_ <= 1, 'Normal', 'Abnormal')

# Distances from every observation to each centroid.
distances_to_centroids = model1.transform(d_norm)
df_results = pd.DataFrame({'Observation': d_norm.index, 'Assigned_Cluster': labels})
# For each row, keep the distance to the centroid of its own cluster.
df_results['Distance_to_Centroid'] = distances_to_centroids[np.arange(len(distances_to_centroids)), model1.labels_]

# Express each row's distances as percentages of that row's total distance.
# A vectorised division replaces the row-by-row Python lambda.
percentage_by_cluster = pd.DataFrame(distances_to_centroids, columns=[f'Cluster_{i}' for i in range(model1.n_clusters)])
percentage_by_cluster = percentage_by_cluster.div(percentage_by_cluster.sum(axis=1), axis=0) * 100
percentage_by_cluster['Assigned_Cluster'] = labels
percentage_by_cluster['Observation'] = d_norm.index
print(df_results)
print(percentage_by_cluster)

In [None]:
# NOTE(review): the heading announces d_norm_pca, but this cell reuses
# model1 and d_norm -- confirm which data set is meant.
# Scenario: cluster 1 is labelled normal, every other cluster abnormal.
labels = np.where(model1.labels_ == 1, 'Normal', 'Abnormal')

# Distances from every observation to each centroid.
distances_to_centroids = model1.transform(d_norm)
df_results = pd.DataFrame({'Observation': d_norm.index, 'Assigned_Cluster': labels})
# For each row, keep the distance to the centroid of its own cluster.
df_results['Distance_to_Centroid'] = distances_to_centroids[np.arange(len(distances_to_centroids)), model1.labels_]

# Express each row's distances as percentages of that row's total distance.
# A vectorised division replaces the row-by-row Python lambda.
percentage_by_cluster = pd.DataFrame(distances_to_centroids, columns=[f'Cluster_{i}' for i in range(model1.n_clusters)])
percentage_by_cluster = percentage_by_cluster.div(percentage_by_cluster.sum(axis=1), axis=0) * 100
percentage_by_cluster['Assigned_Cluster'] = labels
percentage_by_cluster['Observation'] = d_norm.index

print(df_results)
print(percentage_by_cluster)

total_normal2 = np.sum(labels == 'Normal')
total_abnormal2 = np.sum(labels == 'Abnormal')

print(f'Total observations in normal clusters: {total_normal2}')
print(f'Total observations in abnormal cluster: {total_abnormal2}')


## Construction of the model K-means (KM-D) for the dataset d_raw_pca :

In [None]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.animation import FuncAnimation

fig = plt.figure(figsize=(16, 9))
ax = plt.axes(projection="3d")

# Grid visibility is passed positionally: Matplotlib renamed the keyword from
# `b` to `visible`, and the positional form works with either spelling.
ax.grid(True, color='grey', linestyle='-.', linewidth=0.3, alpha=0.2)

my_cmap = plt.get_cmap('hsv')

# Three normalised features as coordinates; the colour is their sum.
sctt = ax.scatter3D(d_norm.dst_host_serror_rate,d_norm.dst_host_srv_count,d_norm.serror_rate, alpha=0.8,
                    c=(d_norm.dst_host_serror_rate+d_norm.dst_host_srv_count +d_norm.serror_rate),
                    cmap=my_cmap,
                    marker='^')

plt.title("Animated 3D Scatter Plot")
ax.set_xlabel('X-axis', fontweight='bold')
ax.set_ylabel('Y-axis', fontweight='bold')
ax.set_zlabel('Z-axis', fontweight='bold')
fig.colorbar(sctt, ax=ax, shrink=0.5, aspect=5)
def update(frame):
    """Rotate the camera to azimuth ``frame`` at a fixed elevation of 20."""
    ax.view_init(elev=20, azim=frame) 
    return sctt,
# One frame every 2 degrees of azimuth, 50 ms apart.
animation = FuncAnimation(fig, update, frames=range(0, 360, 2), interval=50)
plt.show()


In [None]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.animation import FuncAnimation
from IPython.display import HTML
%matplotlib inline
fig = plt.figure(figsize=(16, 9))
ax = plt.axes(projection="3d")
ax.grid(b=True, color='grey', linestyle='-.', linewidth=0.3, alpha=0.2)
my_cmap = plt.get_cmap('hsv')
sctt = ax.scatter3D(d_norm.dst_host_serror_rate,d_norm.dst_host_srv_count,d_norm.serror_rate, alpha=0.8,
                    c=(d_norm.dst_host_serror_rate+d_norm.dst_host_srv_count +d_norm.serror_rate),
                    cmap=my_cmap,
                    marker='^')
plt.title("Animated 3D Scatter Plot")
ax.set_xlabel('X-axis', fontweight='bold')
ax.set_ylabel('Y-axis', fontweight='bold')
ax.set_zlabel('Z-axis', fontweight='bold')
fig.colorbar(sctt, ax=ax, shrink=0.5, aspect=5)
def update(frame):
    ax.view_init(elev=20, azim=frame)  
    return sctt,
animation = FuncAnimation(fig, update, frames=range(0, 360, 2), interval=50)
html_output = animation.to_jshtml()
HTML(html_output)


# III. SVM :

## 1) SVM for the dataset d_norm :

In [None]:
from sklearn.svm import OneClassSVM
import numpy as np

# Fit a one-class SVM on d_norm and score the very same rows.
svm_model = OneClassSVM(kernel='rbf', gamma='auto', nu=0.01)
svm_model.fit(d_norm)
predictions = svm_model.predict(d_norm)

# Recode the -1 predictions as 1 (anomaly) and everything else as 0 (normal).
predictions_binary = (predictions == -1).astype(int)
anomaly_count = np.sum(predictions_binary == 1)
normal_count = np.sum(predictions_binary == 0)

print("Normal instances:", normal_count)
print("Anomalous instances:", anomaly_count)


## 2) SVM for the dataset d_norm_probs :

In [None]:
from sklearn.svm import OneClassSVM
import numpy as np

# Fit a one-class SVM on d_norm_probs and score the very same rows.
# NOTE(review): d_norm_probs is the random 100x10 array created in the voting
# cell -- confirm this is the data meant here.
svm_model = OneClassSVM(kernel='rbf', gamma='auto', nu=0.01)
svm_model.fit(d_norm_probs)
predictions = svm_model.predict(d_norm_probs)

# Recode the -1 predictions as 1 (anomaly) and everything else as 0 (normal).
predictions_binary = (predictions == -1).astype(int)
anomaly_count = np.sum(predictions_binary == 1)
normal_count = np.sum(predictions_binary == 0)

print("Normal instances:", normal_count)
print("Anomalous instances:", anomaly_count)


## 3) SVM for the dataset d_norm_pca :

In [None]:
from sklearn.svm import OneClassSVM
import numpy as np

# Fit a one-class SVM on d_norm_pca and score the very same rows.
svm_model = OneClassSVM(kernel='rbf', gamma='auto', nu=0.01)
svm_model.fit(d_norm_pca)
predictions = svm_model.predict(d_norm_pca)

# Recode the -1 predictions as 1 (anomaly) and everything else as 0 (normal).
predictions_binary = (predictions == -1).astype(int)
anomaly_count = np.sum(predictions_binary == 1)
normal_count = np.sum(predictions_binary == 0)

print("Normal instances:", normal_count)
print("Anomalous instances:", anomaly_count)


## 4) SVM for the dataset d_norm_pca_probs :

In [None]:
from sklearn.svm import OneClassSVM
import numpy as np

# Fit a one-class SVM on d_norm_pca_probs and score the very same rows.
# NOTE(review): d_norm_pca_probs is the random 100x10 array created in the
# voting cell -- confirm this is the data meant here.
svm_model = OneClassSVM(kernel='rbf', gamma='auto', nu=0.01)
svm_model.fit(d_norm_pca_probs)
predictions = svm_model.predict(d_norm_pca_probs)

# Recode the -1 predictions as 1 (anomaly) and everything else as 0 (normal).
predictions_binary = (predictions == -1).astype(int)
anomaly_count = np.sum(predictions_binary == 1)
normal_count = np.sum(predictions_binary == 0)

print("Normal instances:", normal_count)
print("Anomalous instances:", anomaly_count)


## 5) SVM for the dataset d_raw :

In [None]:
from sklearn.svm import OneClassSVM
import numpy as np

# Fit a one-class SVM on the unscaled d_raw and score the very same rows.
svm_model = OneClassSVM(kernel='rbf', gamma='auto', nu=0.01)
svm_model.fit(d_raw)
predictions = svm_model.predict(d_raw)

# Recode the -1 predictions as 1 (anomaly) and everything else as 0 (normal).
predictions_binary = (predictions == -1).astype(int)
anomaly_count = np.sum(predictions_binary == 1)
normal_count = np.sum(predictions_binary == 0)

print("Normal instances:", normal_count)
print("Anomalous instances:", anomaly_count)


## 6) SVM for the dataset d_raw_probs :

In [None]:
from sklearn.svm import OneClassSVM
import numpy as np
# One-class SVM with an RBF kernel, trained on d_raw_probs and scored on the same rows.
svm_model = OneClassSVM(kernel='rbf', gamma='auto', nu=0.01)
predictions = svm_model.fit_predict(d_raw_probs)

# Recode the SVM output: -1 becomes 1 (anomaly), every other value becomes 0 (normal).
predictions_binary = np.where(predictions == -1, 1, 0)
anomaly_count = (predictions_binary == 1).sum()
normal_count = (predictions_binary == 0).sum()

print("Normal instances:", normal_count)
print("Anomalous instances:", anomaly_count)


## 7) SVM for the dataset d_raw_pca :

In [None]:
from sklearn.svm import OneClassSVM
import numpy as np
# One-class SVM with an RBF kernel, trained on d_raw_pca and scored on the same rows.
# NOTE(review): the evaluation cell at the end of the notebook refers to d_raw_PCA
# (upper-case suffix) - confirm which of the two names is actually defined.
svm_model = OneClassSVM(kernel='rbf', gamma='auto', nu=0.01)
predictions = svm_model.fit_predict(d_raw_pca)

# Recode the SVM output: -1 becomes 1 (anomaly), every other value becomes 0 (normal).
predictions_binary = np.where(predictions == -1, 1, 0)
anomaly_count = (predictions_binary == 1).sum()
normal_count = (predictions_binary == 0).sum()

print("Normal instances:", normal_count)
print("Anomalous instances:", anomaly_count)


## 8) SVM for the dataset d_raw_pca_probs :

In [None]:
from sklearn.svm import OneClassSVM
import numpy as np
# One-class SVM with an RBF kernel, trained on d_raw_pca_probs and scored on the same rows.
svm_model = OneClassSVM(kernel='rbf', gamma='auto', nu=0.01)
predictions = svm_model.fit_predict(d_raw_pca_probs)

# Recode the SVM output: -1 becomes 1 (anomaly), every other value becomes 0 (normal).
predictions_binary = np.where(predictions == -1, 1, 0)
anomaly_count = (predictions_binary == 1).sum()
normal_count = (predictions_binary == 0).sum()

print("Normal instances:", normal_count)
print("Anomalous instances:", anomaly_count)


## BIRCH (Balanced Iterative Reducing and Clustering using Hierarchies)
### BIRCH est un algorithme de clustering hiérarchique conçu pour traiter des ensembles de données volumineux et fonctionner efficacement sous contraintes de mémoire. Voici une explication de l'algorithme BIRCH :

### Construction de la Structure CF (Clustering Feature) :

BIRCH utilise une structure appelée Clustering Feature (CF) pour représenter chaque cluster. Un CF comprend des informations agrégées telles que la somme, la somme des carrés et le nombre d'éléments dans le cluster.

### Balanced Iterative Reducing :

L'algorithme suit une approche itérative pour construire et ajuster la structure CF tout en maintenant un équilibre entre la taille de la structure et la précision du clustering. La structure est ajustée à mesure que de nouvelles données sont ajoutées, et les clusters peuvent fusionner si nécessaire.

### Utilisation de la Structure CF pour le Clustering :

Une fois la structure CF construite, elle est utilisée pour effectuer le clustering. Les points de données sont affectés au cluster dont le CF est le plus proche.

## d_raw


In [None]:
import numpy as np
import pandas as pd
from sklearn.cluster import Birch
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
# Fit BIRCH on a private copy of d_raw, then label every row of that same copy.
d_raw_birch = d_raw.copy()
birch = Birch(n_clusters=None, threshold=0.5, branching_factor=50)
birch.fit(d_raw_birch)
y_birch = birch.predict(d_raw_birch)

# Scatter the first two columns of d_raw, coloured by the assigned cluster label.
d_raw_np = d_raw.to_numpy()
plt.scatter(x=d_raw_np[:, 0], y=d_raw_np[:, 1], s=40, c=y_birch, cmap='viridis')
plt.title('Clustering avec BIRCH')
plt.show()

# Split the rows on the cluster label: labels <= 0 versus labels > 0.
train_c_birch = d_raw[y_birch <= 0]
train_f_birch = d_raw[y_birch > 0]

# Report how many distinct labels were assigned and which ones.
unique_clusters = np.unique(y_birch)
num_clusters = unique_clusters.size
print("Nombre de clusters :", num_clusters)
print("Clusters uniques :", unique_clusters)




## d_norm

In [None]:
import numpy as np
import pandas as pd
from sklearn.cluster import Birch
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
# Fit BIRCH on a private copy of d_norm, then label every row of that same copy.
d_raw_birch1 = d_norm.copy()
birch = Birch(n_clusters=None, threshold=0.5, branching_factor=50)
birch.fit(d_raw_birch1)
y_birch = birch.predict(d_raw_birch1)

# Scatter the first two columns of d_norm, coloured by the assigned cluster label.
d_norm_np = d_norm.to_numpy()
plt.scatter(x=d_norm_np[:, 0], y=d_norm_np[:, 1], s=40, c=y_birch, cmap='viridis')
plt.title('Clustering avec BIRCH')
plt.show()

# Split the rows on the cluster label: labels <= 0 versus labels > 0.
train_c_birch = d_norm[y_birch <= 0]
train_f_birch = d_norm[y_birch > 0]

# Report how many distinct labels were assigned and which ones.
unique_clusters = np.unique(y_birch)
num_clusters = unique_clusters.size
print("Nombre de clusters après normalisation:", num_clusters)
print("Clusters uniques après normalisation :", unique_clusters)



## d_norm_pca :

In [None]:
import numpy as np
import pandas as pd
from sklearn.cluster import Birch
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
# Fit BIRCH on a private copy of d_norm_pca, then label every row of that same copy.
d_norm_birch2 = d_norm_pca.copy()
birch = Birch(branching_factor=50, threshold=0.5, n_clusters=None)
birch.fit(d_norm_birch2)
y_birch = birch.predict(d_norm_birch2)

# np.asarray accepts both a DataFrame and an ndarray, whereas .to_numpy() exists
# only on pandas objects; other cells in this notebook index the PCA/probability
# datasets as plain arrays, so do not assume a specific container here.
d_norm_pca_np = np.asarray(d_norm_pca)
plt.scatter(d_norm_pca_np[:, 0], d_norm_pca_np[:, 1], c=y_birch, s=40, cmap='viridis')
plt.title('Clustering avec BIRCH')
plt.show()

# Split the rows on the cluster label: labels > 0 versus labels <= 0.
# Boolean-mask indexing works for a DataFrame and for an ndarray alike.
train_f_birch = d_norm_pca[y_birch > 0]
train_c_birch = d_norm_pca[y_birch <= 0]

# Report how many distinct labels were assigned and which ones.
unique_clusters = np.unique(y_birch)
num_clusters = len(unique_clusters)
print("Nombre de clusters après pca :", num_clusters)
print("Clusters uniques après pca:", unique_clusters)

## d_raw_probs :

In [None]:
import numpy as np
import pandas as pd
from sklearn.cluster import Birch
import matplotlib.pyplot as plt
# Fit BIRCH on a private copy of d_raw_probs, then label every row of that same copy.
d_raw_probs_birch2 = d_raw_probs.copy()
birch = Birch(branching_factor=50, threshold=0.5, n_clusters=None)
birch.fit(d_raw_probs_birch2)
y_birch = birch.predict(d_raw_probs_birch2)

# np.asarray accepts both a DataFrame and an ndarray, whereas .to_numpy() exists
# only on pandas objects; the sibling *_probs cells index their data as plain
# arrays, so do not assume a specific container here.
d_raw_probs_np = np.asarray(d_raw_probs)
plt.scatter(d_raw_probs_np[:, 0], d_raw_probs_np[:, 1], c=y_birch, s=40, cmap='viridis')
plt.title('Clustering avec BIRCH')
plt.show()

# Split the rows on the cluster label: labels > 0 versus labels <= 0.
# Boolean-mask indexing works for a DataFrame and for an ndarray alike.
train_f_birch = d_raw_probs[y_birch > 0]
train_c_birch = d_raw_probs[y_birch <= 0]

# Report how many distinct labels were assigned and which ones.
unique_clusters = np.unique(y_birch)
num_clusters = len(unique_clusters)
print("Nombre de clusters :", num_clusters)
print("Clusters uniques:", unique_clusters)


## d_raw_pca_probs :

In [None]:
import numpy as np
import pandas as pd
from sklearn.cluster import Birch
import matplotlib.pyplot as plt
# Fit BIRCH on a private copy of d_raw_pca_probs, then label every row of that same copy.
d_raw_pca_birch2 = d_raw_pca_probs.copy()
birch = Birch(branching_factor=50, threshold=0.5, n_clusters=None)
birch.fit(d_raw_pca_birch2)
y_birch = birch.predict(d_raw_pca_birch2)

# Positional slicing such as obj[:, 0] raises on a DataFrame; converting with
# np.asarray first keeps the plot working for DataFrame and ndarray inputs alike
# (other cells in this notebook treat the *_probs datasets as DataFrames).
d_raw_pca_probs_np = np.asarray(d_raw_pca_probs)
plt.scatter(d_raw_pca_probs_np[:, 0], d_raw_pca_probs_np[:, 1], c=y_birch, s=40, cmap='viridis')
plt.title('Clustering avec BIRCH')
plt.show()

# Split the rows on the cluster label: labels > 0 versus labels <= 0.
# Boolean-mask indexing works for a DataFrame and for an ndarray alike.
train_f_birch = d_raw_pca_probs[y_birch > 0]
train_c_birch = d_raw_pca_probs[y_birch <= 0]

# Report how many distinct labels were assigned and which ones.
unique_clusters = np.unique(y_birch)
num_clusters = len(unique_clusters)
print("Nombre de clusters :", num_clusters)
print("Clusters uniques:", unique_clusters)


## d_norm_probs:

In [None]:
import numpy as np
import pandas as pd
from sklearn.cluster import Birch
import matplotlib.pyplot as plt
# Fit BIRCH on a private copy of d_norm_probs, then label every row of that same copy.
d_norm_probs_birch2 = d_norm_probs.copy()
birch = Birch(branching_factor=50, threshold=0.5, n_clusters=None)
birch.fit(d_norm_probs_birch2)
y_birch = birch.predict(d_norm_probs_birch2)

# Positional slicing such as obj[:, 0] raises on a DataFrame; converting with
# np.asarray first keeps the plot working for DataFrame and ndarray inputs alike
# (the d_raw_probs cell of this notebook treats its dataset as a DataFrame).
d_norm_probs_np = np.asarray(d_norm_probs)
plt.scatter(d_norm_probs_np[:, 0], d_norm_probs_np[:, 1], c=y_birch, s=40, cmap='viridis')
plt.title('Clustering avec BIRCH')
plt.show()

# Split the rows on the cluster label: labels > 0 versus labels <= 0.
# Boolean-mask indexing works for a DataFrame and for an ndarray alike.
train_f_birch = d_norm_probs[y_birch > 0]
train_c_birch = d_norm_probs[y_birch <= 0]

# Report how many distinct labels were assigned and which ones.
unique_clusters = np.unique(y_birch)
num_clusters = len(unique_clusters)
print("Nombre de clusters  :", num_clusters)
print("Clusters uniques:", unique_clusters)


## d_norm_pca_probs:

In [None]:
import numpy as np
import pandas as pd
from sklearn.cluster import Birch
import matplotlib.pyplot as plt
# Fit BIRCH on a private copy of d_norm_pca_probs, then label every row of that same copy.
d_norm_pca_birch2 = d_norm_pca_probs.copy()
birch = Birch(branching_factor=50, threshold=0.5, n_clusters=None)
birch.fit(d_norm_pca_birch2)
y_birch = birch.predict(d_norm_pca_birch2)

# Positional slicing such as obj[:, 0] raises on a DataFrame; converting with
# np.asarray first keeps the plot working for DataFrame and ndarray inputs alike
# (other cells in this notebook treat the PCA/probability datasets as DataFrames).
d_norm_pca_probs_np = np.asarray(d_norm_pca_probs)
plt.scatter(d_norm_pca_probs_np[:, 0], d_norm_pca_probs_np[:, 1], c=y_birch, s=40, cmap='viridis')
plt.title('Clustering avec BIRCH')
plt.show()

# Split the rows on the cluster label: labels > 0 versus labels <= 0.
# Boolean-mask indexing works for a DataFrame and for an ndarray alike.
train_f_birch = d_norm_pca_probs[y_birch > 0]
train_c_birch = d_norm_pca_probs[y_birch <= 0]

# Report how many distinct labels were assigned and which ones.
unique_clusters = np.unique(y_birch)
num_clusters = len(unique_clusters)
print("Nombre de clusters après pca :", num_clusters)
print("Clusters uniques après pca:", unique_clusters)



## V. MLP : 

## 1) MLP on d_raw :

In [None]:
from sklearn.neural_network import MLPRegressor
from sklearn.datasets import make_classification
import numpy as np
# Synthetic two-class sample set with 10000 rows and 24 features.
# NOTE(review): X is not used anywhere else in this cell - confirm it is still needed.
X, _ = make_classification(random_state=42, n_classes=2, n_features=24, n_samples=10000)

# Single-hidden-layer MLP trained to reproduce its own input: d_raw is both X and y.
autoencoder = MLPRegressor(random_state=42, max_iter=10000, hidden_layer_sizes=(24,)).fit(d_raw, d_raw)

# Output of the fitted network for d_raw, i.e. its prediction of d_raw itself.
encoded_data = autoencoder.predict(d_raw)

In [None]:
import numpy as np
from sklearn.neural_network import MLPRegressor
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
# Second self-reconstruction network (two hidden layers of 50 units) fitted on d_raw.
# NOTE(review): mlp_regressor is not used for the plot below - confirm that is intended.
mlp_regressor = MLPRegressor(random_state=42, max_iter=1000, hidden_layer_sizes=(50, 50)).fit(d_raw, d_raw)

# Scatter of the first two columns of encoded_data, which this cell does not compute.
plt.scatter(x=encoded_data[:, 0], y=encoded_data[:, 1], label='Encoded Data')
plt.title('2D Representation of Encoded Data')
plt.legend()
plt.show()


## 2) MLP on d_norm:

In [None]:
from sklearn.neural_network import MLPRegressor
from sklearn.datasets import make_classification
import numpy as np
# Synthetic two-class sample set with 10000 rows and 24 features.
# NOTE(review): X is not used anywhere else in this cell - confirm it is still needed.
X, _ = make_classification(random_state=42, n_classes=2, n_features=24, n_samples=10000)

# Single-hidden-layer MLP trained to reproduce its own input: d_norm is both X and y.
autoencoder = MLPRegressor(random_state=42, max_iter=10000, hidden_layer_sizes=(24,)).fit(d_norm, d_norm)

# Output of the fitted network for d_norm, i.e. its prediction of d_norm itself.
encoded_data1 = autoencoder.predict(d_norm)


In [None]:
import numpy as np
from sklearn.neural_network import MLPRegressor
import matplotlib.pyplot as plt
# Second self-reconstruction network (two hidden layers of 50 units) fitted on d_norm.
# NOTE(review): mlp_regressor is not used for the plot below - confirm that is intended.
mlp_regressor = MLPRegressor(random_state=42, max_iter=1000, hidden_layer_sizes=(50, 50)).fit(d_norm, d_norm)

# Scatter of the first two columns of encoded_data1, which this cell does not compute.
plt.scatter(x=encoded_data1[:, 0], y=encoded_data1[:, 1], label='Encoded Data')
plt.title('2D Representation of Encoded Data')
plt.legend()
plt.show()


## 3) MLP on d_norm_pca :

In [None]:
from sklearn.neural_network import MLPRegressor
from sklearn.datasets import make_classification
import numpy as np
# Synthetic two-class sample set with 10000 rows and 24 features.
# NOTE(review): X is not used anywhere else in this cell - confirm it is still needed.
X, _ = make_classification(random_state=42, n_classes=2, n_features=24, n_samples=10000)

# Single-hidden-layer MLP trained to reproduce its own input: d_norm_pca is both X and y.
autoencoder = MLPRegressor(random_state=42, max_iter=10000, hidden_layer_sizes=(24,)).fit(d_norm_pca, d_norm_pca)

# Output of the fitted network for d_norm_pca, i.e. its prediction of d_norm_pca itself.
encoded_data2 = autoencoder.predict(d_norm_pca)

In [None]:
import numpy as np
from sklearn.neural_network import MLPRegressor
import matplotlib.pyplot as plt
# Second self-reconstruction network (two hidden layers of 50 units) fitted on d_norm_pca.
# NOTE(review): mlp_regressor is not used for the plot below - confirm that is intended.
mlp_regressor = MLPRegressor(random_state=42, max_iter=1000, hidden_layer_sizes=(50, 50)).fit(d_norm_pca, d_norm_pca)

# Scatter of the first two columns of encoded_data2, which this cell does not compute.
plt.scatter(x=encoded_data2[:, 0], y=encoded_data2[:, 1], label='Encoded Data')
plt.title('2D Representation of Encoded Data')
plt.legend()
plt.show()

## 4) MLP on  d_raw_probs :

In [None]:
from sklearn.neural_network import MLPRegressor
from sklearn.datasets import make_classification
import numpy as np
# Synthetic two-class sample set with 10000 rows and 24 features.
# NOTE(review): X is not used anywhere else in this cell - confirm it is still needed.
X, _ = make_classification(random_state=42, n_classes=2, n_features=24, n_samples=10000)

# Single-hidden-layer MLP trained to reproduce its own input: d_raw_probs is both X and y.
autoencoder = MLPRegressor(random_state=42, max_iter=10000, hidden_layer_sizes=(24,)).fit(d_raw_probs, d_raw_probs)

# Output of the fitted network for d_raw_probs, i.e. its prediction of d_raw_probs itself.
encoded_data3 = autoencoder.predict(d_raw_probs)

In [None]:
import numpy as np
from sklearn.neural_network import MLPRegressor
import matplotlib.pyplot as plt
# Second self-reconstruction network (two hidden layers of 50 units) fitted on d_raw_probs.
# NOTE(review): mlp_regressor is not used for the plot below - confirm that is intended.
mlp_regressor = MLPRegressor(random_state=42, max_iter=1000, hidden_layer_sizes=(50, 50)).fit(d_raw_probs, d_raw_probs)

# Scatter of the first two columns of encoded_data3, which this cell does not compute.
plt.scatter(x=encoded_data3[:, 0], y=encoded_data3[:, 1], label='Encoded Data')
plt.title('2D Representation of Encoded Data')
plt.legend()
plt.show()

<p style="font-family: Arial; font-size:1.75em;color:red; font-weight:bold"><br>
VI. Evaluation:</p><br>


In [None]:
import numpy as np
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Every dataset variant to evaluate, keyed by the label shown above its subplot.
# NOTE(review): 'd_raw_pca' is bound to d_raw_PCA here, while the one-class SVM cell
# uses the name d_raw_pca - confirm which of the two is actually defined.
data_sets = dict(
    d_raw=d_raw,
    d_raw_probs=d_raw_probs,
    d_raw_pca=d_raw_PCA,
    d_raw_pca_probs=d_raw_pca_probs,
    d_norm=d_norm,
    d_norm_probs=d_norm_probs,
    d_norm_pca=d_norm_pca,
    d_norm_pca_probs=d_norm_pca_probs,
)

# One subplot per dataset, laid out on a 2 x 4 grid.
fig, axes = plt.subplots(nrows=2, ncols=4, figsize=(15, 8))
fig.suptitle('Clustering Results and Silhouette Scores')

# Mean silhouette score per dataset label.
silhouette_scores = {}

for i, (data_name, data) in enumerate(data_sets.items()):
    # Partition the dataset into three k-means clusters and label its own rows.
    kmeans = KMeans(random_state=42, n_clusters=3).fit(data)
    labels = kmeans.predict(data)

    silhouette_avg = silhouette_score(data, labels)
    silhouette_scores[data_name] = silhouette_avg

    # Project onto two principal components purely for display.
    reduced_data = PCA(n_components=2).fit_transform(data)

    # divmod(i, 4) yields the (row, column) position in the 2 x 4 grid.
    ax = axes[divmod(i, 4)]
    ax.scatter(reduced_data[:, 0], reduced_data[:, 1], c=labels, cmap='viridis')
    ax.set_title(f'{data_name}\nSilhouette Score: {silhouette_avg:.2f}')

plt.show()
