# Compare Perform PCA and LDA & CNN

# Install and import relevant libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, Flatten, MaxPooling1D, Dropout

# Load Data for PCA and LDA

In [3]:
df = pd.read_csv("stand_norm_e1.txt", delimiter='\s+')
new_values = pd.read_csv("y2_e1.txt", header=None, names=['New_Class'], delimiter='\s+')
df['Class'] = new_values['New_Class']

# Load Data for CNN with 2 dataset concatenated

In [32]:
dataset_e1 = pd.read_csv("stand_norm_e1.txt", delimiter='\s+')
dataset_e2 = pd.read_csv("stand_norm_e2.txt", delimiter='\s+')
dataset = pd.concat([dataset_e1, dataset_e2], axis=0)
#dataset = pd.read_csv("stand_norm_e1.txt", delimiter='\s+')
new_values = pd.read_csv("y2_e1.txt", header=None, names=['New_Class'], delimiter='\s+')
dataset['Class'] = new_values['New_Class']

# Split Data into Features and Target variable for PCA and LDA

In [33]:
X_pca_lda = df.drop('Class', axis=1)
y_pca_lda = df['Class']

# Split Data for CNN

In [34]:
X_cnn = dataset.iloc[:, 1:5].values
y_cnn = dataset['Class'].values
le = LabelEncoder()
y_cnn_encoded = le.fit_transform(y_cnn)
y_cnn_encoded = tf.keras.utils.to_categorical(y_cnn_encoded)

# Standardize Features for PCA and LDA

In [35]:
scaler = StandardScaler()
X_scaled_pca_lda = scaler.fit_transform(X_pca_lda)

# Split Data into Training and Testing sets for PCA and LDA

In [36]:
X_train_pca_lda, X_test_pca_lda, y_train_pca_lda, y_test_pca_lda = train_test_split(X_scaled_pca_lda, y_pca_lda, test_size=0.2)

# Split Data into Training and Testing sets for CNN

In [37]:
X_train_cnn, X_test_cnn, y_train_cnn, y_test_cnn = train_test_split(X_cnn, y_cnn_encoded, test_size=0.2, random_state=42)

# Define CNN Model

In [38]:
model = Sequential()
model.add(Conv1D(filters=128, kernel_size=3, activation='relu', padding='same', input_shape=(X_train_cnn.shape[1], 1)))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(filters=128, kernel_size=3, activation='relu', padding='same'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(100, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(y_train_cnn.shape[1], activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train_cnn, y_train_cnn, epochs=20, batch_size=20, verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7a63d4e8f040>

# Define PCA Model

In [39]:
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train_pca_lda)
X_test_pca = pca.transform(X_test_pca_lda)

# Define LDA Model

In [40]:
lda = LinearDiscriminantAnalysis(n_components=2)
X_train_lda = lda.fit_transform(X_train_pca_lda, y_train_pca_lda)
X_test_lda = lda.transform(X_test_pca_lda)

# Evaluate Models

In [41]:
pca_score = accuracy_score(y_test_pca_lda, lda.predict(X_test_pca_lda))
lda_score = accuracy_score(y_test_pca_lda, lda.predict(X_test_pca_lda))
cnn_loss, cnn_accuracy = model.evaluate(X_test_cnn, y_test_cnn, verbose=0)

# Compare Performance

In [42]:
if pca_score > lda_score and pca_score > cnn_accuracy:
    print("PCA performs the best with a score of {:.2f}".format(pca_score))
elif lda_score > pca_score and lda_score > cnn_accuracy:
    print("LDA performs the best with a score of {:.2f}".format(lda_score))
else:
    print("CNN performs the best with an accuracy of {:.2f}%".format(cnn_accuracy*100))

CNN performs the best with an accuracy of 77.91%


Lors de la concaténation des deux ensembles de données, à savoir stand_norm_e1 et *stand_norm_e2*, nous observons une diminution des performances, avec une précision maximale obtenue par le réseau de neurones convolutifs (CNN) de **77,91 %**. En revanche, lorsque le premier ensemble de données stand_norm_e1 est utilisé isolément, la performance du CNN s'améliore, atteignant une précision de **78,81 %.**