In [None]:
import sklearn

# Report the installed scikit-learn release so results can be tied to a version.
sklearn_release = sklearn.__version__
print(sklearn_release)

In [None]:
import pandas as pd
import numpy as np
# from imblearn.under_sampling import RandomUnderSampler 
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import warnings
warnings.filterwarnings("ignore")

# **Data adjustment**

In [None]:
# Load the raw table. The rest of the notebook addresses columns by integer
# label (df[10], drop(columns=10)); pandas only assigns integer labels when
# the file is read without a header row, so header=None is required here.
df = pd.read_csv('magic04.csv', header=None)

# Encode the class label held in column 10 as integers.
le = LabelEncoder()
df[10] = le.fit_transform(df[10])

# Preview the first rows instead of rendering the whole frame.
df.head()

Cleaning and balancing the dataset

In [None]:
# Count rows that exactly repeat an earlier row.
duplicate_mask = df.duplicated()
duplicate_mask.sum()

In [None]:
# Keep only the first occurrence of each row.
df = df.loc[~df.duplicated()]

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Column 10 is the target; every other column is a feature.
y = df.loc[:, 10]
X = df.drop(10, axis="columns")

# Class distribution before any balancing.
y.value_counts()

In [None]:
# Summary statistics of the features before balancing (X is already a DataFrame here).
X.describe()

In [None]:
# Balance the classes by random undersampling: every class is cut down to the
# size of the rarest one. This is done with pandas because RandomUnderSampler
# is not imported in this notebook (its import line is commented out), so the
# original call failed with a NameError on a fresh kernel.
minority_size = y.value_counts().min()
kept_labels = pd.concat(
    [
        y[y == label].sample(n=minority_size, random_state=42)
        for label in sorted(y.unique())
    ]
)

# Select features through the sampled labels' index so rows stay aligned,
# then renumber both from zero.
X = X.loc[kept_labels.index].reset_index(drop=True)
y = kept_labels.reset_index(drop=True)

# Class distribution after balancing (all counts should now be equal).
y.value_counts()

In [None]:
# Summary statistics of the features after balancing.
balanced_features = pd.DataFrame(X)
balanced_features.describe()

Vectorization and normalization

In [None]:
# Switch from pandas objects to plain NumPy arrays.
X, y = np.array(X), np.array(y)

In [None]:
# Fit the scaler on X once, then standardise X with the fitted statistics.
scaler = StandardScaler().fit(X)
x = scaler.transform(X)

In [None]:
# Summary statistics of the standardised features.
scaled_features = pd.DataFrame(x)
scaled_features.describe()

In [None]:
# Hold out 30% of the rows for testing. The split is made on the raw features
# and the scaler is fitted on the training part only, so statistics of the
# test rows never influence the preprocessing (avoids data leakage).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
train_scaler = StandardScaler().fit(X_train_raw)
X_train = train_scaler.transform(X_train_raw)
X_test = train_scaler.transform(X_test_raw)

# **Logistic Regression**

In [None]:
# Baseline logistic regression with default hyper-parameters.
lr_model = LogisticRegression()
lr_model.fit(X=X_train, y=y_train)

In [None]:
# Predict the held-out rows and show the shape of the prediction array.
y_lr_pred = lr_model.predict(X_test)
np.shape(y_lr_pred)

In [None]:
# Per-class precision / recall / F1 of the baseline logistic regression.
lr_cr = classification_report(y_true=y_test, y_pred=y_lr_pred)
print(lr_cr)

In [None]:
# Grid search over the regularisation settings of logistic regression,
# scored on four metrics and refitted on the best accuracy.
model = LogisticRegression()
params = dict(
    penalty=[None, 'l2'],
    C=[0.01, 0.1, 1, 10, 100, 1000],
)
lg_cv = GridSearchCV(
    estimator=model,
    param_grid=params,
    scoring=['accuracy', 'precision', 'recall', 'f1'],
    refit='accuracy',
    cv=10,
    return_train_score=True,
)

In [None]:
# Run the logistic-regression grid search on the training split.
lg_cv.fit(X_train,y_train)

In [None]:
# Parameter combination selected by the logistic-regression search.
best_lr_params = lg_cv.best_params_
print(best_lr_params)

In [None]:
# Full cross-validation log of the logistic-regression search as a table.
result = pd.DataFrame(data=lg_cv.cv_results_)
result

In [None]:
# Keep only the parameter columns and the mean test metrics.
lr_summary_columns = [
    'params', 'param_penalty', 'param_C',
    "mean_test_accuracy", "mean_test_precision", "mean_test_f1", "mean_test_recall",
]
required_results = result.loc[:, lr_summary_columns]
required_results

In [None]:
# Display the parameters chosen by the logistic-regression grid search.
lg_cv.best_params_

In [None]:
# Evaluate the refitted best logistic regression on the held-out rows
# and draw its confusion matrix.
best_lr_model = lg_cv.best_estimator_
y_test_prediction = best_lr_model.predict(X_test)
matrix = confusion_matrix(y_true=y_test, y_pred=y_test_prediction)
cmatrix = ConfusionMatrixDisplay(confusion_matrix=matrix)
cmatrix.plot()

In [None]:
# Per-class metrics for the predictions currently held in y_test_prediction.
tuned_report = classification_report(y_true=y_test, y_pred=y_test_prediction)
print(tuned_report)

# **KNN**

In [None]:
# Baseline k-nearest-neighbours: 5 neighbours, Minkowski metric with p=2.
knn_settings = {"n_neighbors": 5, "metric": 'minkowski', "p": 2}
knn_model = KNN(**knn_settings)
knn_model.fit(X_train, y_train)

In [None]:
# Predict the held-out rows and show the shape of the prediction array.
y_knn_pred = knn_model.predict(X_test)
np.shape(y_knn_pred)

In [None]:
# Per-class precision / recall / F1 of the baseline KNN model.
knn_cr = classification_report(y_true=y_test, y_pred=y_knn_pred)
print(knn_cr)

In [None]:
# Grid search over odd neighbourhood sizes 3..19 for KNN,
# scored on four metrics and refitted on the best accuracy.
model = KNN()  # default 5
params = {
    "n_neighbors": [k for k in range(3, 21) if k % 2 == 1]
}
knn_cv = GridSearchCV(
    estimator=model,
    param_grid=params,
    scoring=["accuracy", "f1", "recall", "precision"],
    refit="accuracy",
    cv=10,
    return_train_score=True,
)

In [None]:
# Run the KNN grid search on the training split.
knn_cv.fit(X_train,y_train)

In [None]:
# Parameter combination selected by the KNN search.
best_knn_params = knn_cv.best_params_
print(best_knn_params)

In [None]:
# Full cross-validation log of the KNN search as a table.
result = pd.DataFrame(data=knn_cv.cv_results_)
result

In [None]:
# Keep only the neighbour count and the mean test metrics.
knn_summary_columns = [
    "param_n_neighbors",
    "mean_test_accuracy", "mean_test_precision", "mean_test_f1", "mean_test_recall",
]
knn_selected_results = result.loc[:, knn_summary_columns]
knn_selected_results

In [None]:
# Display the parameters chosen by the KNN grid search.
knn_cv.best_params_

In [None]:
# Evaluate the refitted best KNN model on the held-out rows
# and draw its confusion matrix.
best_knn_model = knn_cv.best_estimator_
y_test_prediction = best_knn_model.predict(X_test)
matrix = confusion_matrix(y_true=y_test, y_pred=y_test_prediction)
cmatrix = ConfusionMatrixDisplay(confusion_matrix=matrix)
cmatrix.plot()

In [None]:
# K-Fold Cross Validation

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

k = 10
kf = KFold(n_splits=k, shuffle=True, random_state=42)

# X holds the raw (unscaled) features, while both models were configured and
# evaluated on standardised data everywhere else in the notebook. Wrapping each
# model in a pipeline standardises inside every fold, using only that fold's
# training rows, so the comparison is consistent and free of leakage.
# cross_val_score clones the estimator it is given, so the already fitted
# lr_model / knn_model objects are left untouched.

# Logistic regression model
lr_cv_pipeline = make_pipeline(StandardScaler(), lr_model)
lr_model_scores = cross_val_score(lr_cv_pipeline, X, y, cv=kf)

# KNN model
knn_cv_pipeline = make_pipeline(StandardScaler(), knn_model)
knn_model_scores = cross_val_score(knn_cv_pipeline, X, y, cv=kf)


print("Logistic regression models' average accuracy:", np.mean(lr_model_scores))
print("KNN models' average accuracy:", np.mean(knn_model_scores))

In [None]:
# Per-fold scores returned by cross_val_score for the logistic-regression model.
lr_model_scores