In [45]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Load the data into a DataFrame
df = pd.read_csv('cleaned_data.dat', header=None, sep=' ')

# Drop the extra column
# df = df.drop(columns=[19])

# Rename the last column as the target variable
df.rename(columns={18: 'target'}, inplace=True)

# Convert the target variable to string datatype
df['target'] = df['target'].astype(str)

# Scale the integer features
scaler = StandardScaler()
X = np.abs(scaler.fit_transform(df.iloc[:, :-1]))

# Perform feature selection
selector = SelectKBest(chi2, k=10)
X_new = selector.fit_transform(X, df['target'])

# Perform dimensionality reduction
pca = PCA(n_components=2)
X_new = pca.fit_transform(X_new)

# Handle class imbalance
smote = SMOTE()
X_resampled, y_resampled = smote.fit_resample(X_new, df['target'])

# Train a k-NN classifier
k = 5  # number of neighbors
knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(X_resampled, y_resampled)

# Make predictions on the test data
y_pred = knn.predict(X_resampled)

# Calculate the accuracy and confusion matrix
print("Accuracy:", accuracy_score(y_resampled, y_pred))
print("Confusion matrix:", confusion_matrix(y_resampled, y_pred))

Accuracy: 0.6016746411483254
Confusion matrix: [[159  10  13  27]
 [ 29 122  37  21]
 [ 36  42 107  24]
 [ 31  34  29 115]]
