In [5]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, accuracy_score, classification_report, confusion_matrix
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
sns.set(style="white", color_codes=True)
warnings.filterwarnings('ignore')

In [6]:
# Read CC GENERAL dataset and handle missing values
df = pd.read_csv("CC GENERAL.csv")
df = df.fillna(df.iloc[:, 1:].mean())
df

Unnamed: 0,CUST_ID,BALANCE,BALANCE_FREQUENCY,PURCHASES,ONEOFF_PURCHASES,INSTALLMENTS_PURCHASES,CASH_ADVANCE,PURCHASES_FREQUENCY,ONEOFF_PURCHASES_FREQUENCY,PURCHASES_INSTALLMENTS_FREQUENCY,CASH_ADVANCE_FREQUENCY,CASH_ADVANCE_TRX,PURCHASES_TRX,CREDIT_LIMIT,PAYMENTS,MINIMUM_PAYMENTS,PRC_FULL_PAYMENT,TENURE
0,C10001,40.900749,0.818182,95.40,0.00,95.40,0.000000,0.166667,0.000000,0.083333,0.000000,0,2,1000.0,201.802084,139.509787,0.000000,12
1,C10002,3202.467416,0.909091,0.00,0.00,0.00,6442.945483,0.000000,0.000000,0.000000,0.250000,4,0,7000.0,4103.032597,1072.340217,0.222222,12
2,C10003,2495.148862,1.000000,773.17,773.17,0.00,0.000000,1.000000,1.000000,0.000000,0.000000,0,12,7500.0,622.066742,627.284787,0.000000,12
3,C10004,1666.670542,0.636364,1499.00,1499.00,0.00,205.788017,0.083333,0.083333,0.000000,0.083333,1,1,7500.0,0.000000,864.206542,0.000000,12
4,C10005,817.714335,1.000000,16.00,16.00,0.00,0.000000,0.083333,0.083333,0.000000,0.000000,0,1,1200.0,678.334763,244.791237,0.000000,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8945,C19186,28.493517,1.000000,291.12,0.00,291.12,0.000000,1.000000,0.000000,0.833333,0.000000,0,6,1000.0,325.594462,48.886365,0.500000,6
8946,C19187,19.183215,1.000000,300.00,0.00,300.00,0.000000,1.000000,0.000000,0.833333,0.000000,0,6,1000.0,275.861322,864.206542,0.000000,6
8947,C19188,23.398673,0.833333,144.40,0.00,144.40,0.000000,0.833333,0.000000,0.666667,0.000000,0,5,1000.0,81.270775,82.418369,0.250000,6
8948,C19189,13.457564,0.833333,0.00,0.00,0.00,36.558778,0.000000,0.000000,0.000000,0.166667,2,0,500.0,52.549959,55.755628,0.250000,6


In [16]:
# Apply PCA on CC GENERAL dataset
x = df.iloc[:, 1:-1]
y = df.iloc[:, -1]

pca = PCA(3)
x_pca = pca.fit_transform(x)

df2 = pd.DataFrame(data=x_pca, columns=['principal component 1', 'principal component 2', 'principal component 3'])
df2 = pd.concat([df2, df.iloc[:, -1]], axis=1)
df2


Unnamed: 0,principal component 1,principal component 2,principal component 3,TENURE
0,-4326.383979,921.566882,183.708383,12
1,4118.916665,-2432.846346,2369.969289,12
2,1497.907641,-1997.578694,-2125.631328,12
3,1394.548536,-1488.743453,-2431.799649,12
4,-3743.351896,757.342657,512.476492,12
...,...,...,...,...
8945,-4208.357725,1122.443291,136.925895,6
8946,-4123.923788,951.683820,634.880037,6
8947,-4379.443989,911.504583,101.257055,6
8948,-4791.117531,1032.540961,358.038214,6


In [17]:
# Apply k-means algorithm on the original dataset
#x = df3.iloc[:, 0:-1]
#y = df3.iloc[:, -1]

n = 3  # number of clusters
k_means = KMeans(n_clusters=n)
k_means.fit(x)
y_cluster_kmeans = k_means.predict(x)

# Calculate silhouette score
score = silhouette_score(x, y_cluster_kmeans)
print("Silhouette Score without PCA: ", score)

Silhouette Score without PCA:  0.4668190896235943


In [19]:
# Perform Scaling+PCA+K-Means and report performance
scaler = StandardScaler()
x_scale = scaler.fit_transform(x)

pca2 = PCA(3)
x_pca2 = pca.fit_transform(x_scale)

n = 3
km = KMeans(n_clusters=n)
km.fit(x_pca2)

y_cluster_kmeans = km.predict(x_pca2)
score_pca = silhouette_score(x_pca2, y_cluster_kmeans)
print("Silhouette Score with Scaling+PCA+K-Means: ", score_pca)

if score_pca > score:
    print("Silhouette score improved with PCA.")
elif score_pca < score:
    print("Silhouette score decreased with PCA.")
else:
    print("Silhouette score remained the same with PCA.")

Silhouette Score with Scaling+PCA+K-Means:  0.38168437550650613
Silhouette score decreased with PCA.


In [10]:
# Read pd_speech_features dataset
df_pd = pd.read_csv("pd_speech_features.csv")

# Perform Scaling
scaler = StandardScaler()
x_scale = scaler.fit_transform(x)

In [11]:
# Apply PCA (k=3)
pca = PCA(3)
x_pca = pca.fit_transform(x_scale)

principalDf = pd.DataFrame(data=x_pca, columns=['principal component 1', 'principal component 2', 'Principal Component 3'])
finalDf = pd.concat([principalDf, df_pd[['class']]], axis=1)

In [12]:
# Use SVM to report performance
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)
svm_classifier = SVC()
svm_classifier.fit(x_train, y_train)
y_pred = svm_classifier.predict(x_test)

print(classification_report(y_test, y_pred, zero_division=1))
print(confusion_matrix(y_test, y_pred))
acc_svc = accuracy_score(y_pred, y_test)
print('SVM accuracy:', acc_svc)


              precision    recall  f1-score   support

           6       1.00      0.00      0.00        55
           7       1.00      0.00      0.00        45
           8       1.00      0.00      0.00        54
           9       1.00      0.00      0.00        48
          10       1.00      0.00      0.00        73
          11       1.00      0.00      0.00        92
          12       0.86      1.00      0.93      2318

    accuracy                           0.86      2685
   macro avg       0.98      0.14      0.13      2685
weighted avg       0.88      0.86      0.80      2685

[[   0    0    0    0    0    0   55]
 [   0    0    0    0    0    0   45]
 [   0    0    0    0    0    0   54]
 [   0    0    0    0    0    0   48]
 [   0    0    0    0    0    0   73]
 [   0    0    0    0    0    0   92]
 [   0    0    0    0    0    0 2318]]
SVM accuracy: 0.8633147113594041


In [13]:
# Read Iris dataset
df_iris = pd.read_csv("Iris.csv")
print(df_iris.head())

# Check for missing values
df_iris.isnull().any()

# Prepare data for LDA
x = df_iris.iloc[:, :-1].values
y = df_iris.iloc[:, -1].values

# Encode target labels
le = LabelEncoder()
y = le.fit_transform(y)

# Split the dataset into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

# Standardize the data
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)
print(x_train.shape,x_test.shape)
# Apply LDA on the Iris dataset to reduce dimensionality of data to k=2
lda = LDA(n_components=2)
x_train_lda = lda.fit_transform(x_train, y_train)
x_test_lda = lda.transform(x_test)
print(x_train_lda.shape,x_test_lda.shape)

   Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species
0   1            5.1           3.5            1.4           0.2  Iris-setosa
1   2            4.9           3.0            1.4           0.2  Iris-setosa
2   3            4.7           3.2            1.3           0.2  Iris-setosa
3   4            4.6           3.1            1.5           0.2  Iris-setosa
4   5            5.0           3.6            1.4           0.2  Iris-setosa
(105, 5) (45, 5)
(105, 2) (45, 2)
