# GridSearch Classification NP

In [1]:
from pandas import read_excel
from pandas.plotting import scatter_matrix
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline
import numpy as np
import os
import pandas as pd

## 0. Import Data

In [2]:
# Load dataset
url = "../../data/processed/den/tanpabola3-1.xlsx"
names = ['lam_max1', 'csc_max1', 'lam_min', 'csc_min','lam_fwhm1','c_mid','lam_max2','csc_max2','lam_fwhm2','fwhm','posisi1','posisi2','posisi3','posisi4','posisi5','arah_k','arah_E','sb_putar','sudut1','sudut2','ket'] 
dataset = read_excel(url, names=names, header=None)

# Define input and output data
X = dataset.iloc[:,0:10].values
y = dataset.iloc[:,20].values

## 1. Model Selection

In [None]:
# Spot check algorithms
models = {
    'LR' : make_pipeline(StandardScaler(), LogisticRegression(solver='liblinear', multi_class='ovr')),
    'LDA' : LinearDiscriminantAnalysis(),
    'KNN' : KNeighborsClassifier(),
    'CART' : DecisionTreeClassifier(),
    'NB' : GaussianNB(),
    'SVM' : SVC(gamma='auto')
}

In [None]:
# Spot check algorithms
models = {
    'LR' : make_pipeline(StandardScaler(), LogisticRegression(solver='liblinear', multi_class='ovr')),
    'LDA' : LinearDiscriminantAnalysis(),
    'KNN' : make_pipeline(MinMaxScaler(), KNeighborsClassifier()),
    'CART' : DecisionTreeClassifier(),
    'NB' : GaussianNB(),
    'SVM' : make_pipeline(StandardScaler(), SVC(gamma='auto'))
}

In [3]:
# Spot check algorithms
models = {
    'LR' : make_pipeline(StandardScaler(), LogisticRegression(solver='liblinear', multi_class='ovr')),
    'LDA' : LinearDiscriminantAnalysis(),
    'KNN' : make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=9, weights='uniform', metric='manhattan')),
    'CART' : DecisionTreeClassifier(max_depth=7, min_samples_leaf=1, min_samples_split=2),
    'NB' : GaussianNB(),
    'SVM' : make_pipeline(StandardScaler(), SVC(gamma=0.1, kernel='rbf'))
}

In [4]:
#Cross-validation
kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=2)

# Evaluation storage
results = {name: {'accuracy': [], 'precision': []} for name in models}

# Training and evaluation
for name, model in models.items():
    for train_idx, test_idx in kf.split(X, y):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        acc = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred, average=None)

        results[name]['accuracy'].append(acc)
        results[name]['precision'].append(prec)

# Print results
for name, scores in results.items():
    print(f"Model: {name}")
    print(f"  Mean Accuracy: {np.mean(scores['accuracy']):.4f}")
    print(f"  Mean Precision: {np.mean(scores['precision']):.4f}")
    print()


Model: LR
  Mean Accuracy: 0.8818
  Mean Precision: 0.8699

Model: LDA
  Mean Accuracy: 0.8808
  Mean Precision: 0.8643

Model: KNN
  Mean Accuracy: 0.9488
  Mean Precision: 0.9400

Model: CART
  Mean Accuracy: 0.9714
  Mean Precision: 0.9768

Model: NB
  Mean Accuracy: 0.6158
  Mean Precision: 0.6522

Model: SVM
  Mean Accuracy: 0.9616
  Mean Precision: 0.9587



## 2. Data Training: `tanpa3bola`

In [5]:
# Load dataset
url = "../../data/processed/den/tanpabola3-1.xlsx"
names = ['lam_max1', 'csc_max1', 'lam_min', 'csc_min','lam_fwhm1','c_mid','lam_max2','csc_max2','lam_fwhm2','fwhm','posisi1','posisi2','posisi3','posisi4','posisi5','arah_k','arah_E','sb_putar','sudut1','sudut2','ket'] 
dataset = read_excel(url, names=names, header=None)

# Define input and output data
X = dataset.iloc[:,0:10].values
y = dataset.iloc[:,20].values

In [6]:
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=1,
    random_state=10
)

print(len(y_train))
print(len(y_valid))
print(len(y_valid)/(len(y_train) + len(y_valid)))

1014
1
0.0009852216748768472


### 2.0 `tanpa3bola` with `test_size = 0.5`

In [None]:
# Write results to a text file
with open("output_tanpabola3_50-50.txt", "w") as f:
    for name, model in models.items():
        model.fit(X_train,y_train)
        y_predict = model.predict(X_valid)
        report = classification_report(y_valid, y_predict)
        cf = confusion_matrix(y_valid, y_predict)

        f.write(f"Model: {name}\n")
        f.write(f"confusion matrix:\n {cf}\n")
        f.write(f"classification report: \n {report}\n")
        f.write(f"=====================================================\n")
print("Evaluation results exported to .txt file") 

### 2.1 Data Testing: `3bola`

In [7]:
# Load dataset
url = "../../data/processed/den/3_sph.xlsx"
names = ['lam_max1', 'csc_max1', 'lam_min', 'csc_min','lam_fwhm1','c_mid','lam_max2','csc_max2','lam_fwhm2','fwhm','posisi1','posisi2','posisi3','posisi4','posisi5','arah_k','arah_E','sb_putar','sudut1','sudut2','ket'] 
bola3 = read_excel(url, names=names, header=None)

# Define input and output data
X_bola3 = bola3.iloc[:,0:10].values
y_bola3 = bola3.iloc[:,20].values

print(len(X_bola3))

639


In [8]:
y_valid = y_bola3
X_valid = X_bola3

#y_valid = y_train
#X_valid = X_train
# Write results to a text file
with open("output_tanpa3bola_bola3.txt", "w") as f:
    for name, model in models.items():
        model.fit(X_train,y_train)
        y_predict = model.predict(X_valid)
        report = classification_report(y_valid, y_predict)
        cf = confusion_matrix(y_valid, y_predict)

        f.write(f"Model: {name}\n")
        f.write(f"confusion matrix:\n {cf}\n")
        f.write(f"classification report: \n {report}\n")
        f.write(f"=====================================================\n")
print("Evaluation results exported to .txt file")  

Evaluation results exported to .txt file


### 2.2 Data Train memiliki jumlah kelas yang sama

In [None]:
class_0 = dataset[dataset['ket']==0].sample(n=189, random_state=42)
class_1 = dataset[dataset['ket']==1]
class_2 = dataset[dataset['ket']==2].sample(n=189, random_state=42)

balance_df = pd.concat([class_0, class_1, class_2])

balance_df = balance_df.sample(frac=1, random_state=42).reset_index(drop=True)

dataset = balance_df
X = dataset.iloc[:,0:10].values
y = dataset.iloc[:,20].values

In [None]:
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=1,
    random_state=1
)

print(len(y_train))
print(len(y_valid))
print(len(y_valid)/(len(y_train) + len(y_valid)))

In [None]:
# Write results to a text file
with open("output_tanpabola3_sameClass.txt", "w") as f:
    for name, model in models.items():
        model.fit(X_train,y_train)
        y_predict = model.predict(X_valid)
        report = classification_report(y_valid, y_predict)
        cf = confusion_matrix(y_valid, y_predict)

        f.write(f"Model: {name}\n")
        f.write(f"confusion matrix:\n {cf}\n")
        f.write(f"classification report: \n {report}\n")
        f.write(f"=====================================================\n")
print("Evaluation results exported to .txt file") 

In [None]:
y_valid = y_bola3
X_valid = X_bola3

#y_valid = y_train
#X_valid = X_train
# Write results to a text file
with open("output_tanpa3bola_bola3_sameClass_Scale_rbf.txt", "w") as f:
    for name, model in models.items():
        model.fit(X_train,y_train)
        y_predict = model.predict(X_valid)
        report = classification_report(y_valid, y_predict)
        cf = confusion_matrix(y_valid, y_predict)

        f.write(f"Model: {name}\n")
        f.write(f"confusion matrix:\n {cf}\n")
        f.write(f"classification report: \n {report}\n")
        f.write(f"=====================================================\n")
print("Evaluation results exported to .txt file") 

### 2.3 Gridsearch for SVM

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
param_grid = {
    'C': [0.1, 1, 1.1, 5, 10],
    'gamma': [0.1,'auto'],
    'kernel': ['rbf']
}

grid = GridSearchCV(SVC(), param_grid, cv=5)
grid.fit(X_train, y_train)

print('Best parameter:', grid.best_params_)
print('Best Score:', grid.best_score_)

In [None]:
from sklearn.model_selection import learning_curve

In [None]:
train_sizes, train_scores, test_scores = learning_curve(SVC(C=5, gamma=0.1), X, y, cv=5)

train_mean = train_scores.mean(axis=1)
test_mean = test_scores.mean(axis=1)

plt.plot(train_sizes, train_mean, label='Train score')
plt.plot(train_sizes, test_mean, label='Test score')
plt.xlabel('Size of Data')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

### 2.4 GridSearch for CART

In [None]:
param_grid = {
    'max_depth': [3, 5, 7],
    'min_samples_leaf': [1, 5, 10],
    'min_samples_split': [2, 10, 20]
}

grid = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5)
grid.fit(X_train, y_train)

print('Best parameter:', grid.best_params_)
print('Best Score:', grid.best_score_)

In [None]:
train_sizes, train_scores, test_scores = learning_curve(DecisionTreeClassifier(max_depth=7, min_samples_leaf=1, min_samples_split=2), X, y, cv=5)

train_mean = train_scores.mean(axis=1)
test_mean = test_scores.mean(axis=1)

plt.plot(train_sizes, train_mean, label='Train score')
plt.plot(train_sizes, test_mean, label='Test score')
plt.xlabel('Size of Data')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

### 2.5 GridSearch for KNN

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),          # Preprocessing
    ('knn', KNeighborsClassifier())        # Classifier
])

# Param grid untuk dicari
param_grid = {
    'knn__n_neighbors': list(range(1, 31)),           # cari k dari 1 sampai 30
    'knn__weights': ['uniform', 'distance'],          # bobot tetangga
    'knn__metric': ['euclidean', 'manhattan']         # metrik jarak
}

# Grid search dengan cross-validation
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1_macro', n_jobs=-1)
grid_search.fit(X_train, y_train)

In [None]:
print("Best parameters:", grid_search.best_params_)
print("Best cross-val score:", grid_search.best_score_)

# Evaluasi di data uji
y_pred = grid_search.predict(X_test)
print(classification_report(y_test, y_pred))


### 2.6 Sampling Data from `3bola` Data

In [10]:
y_valid = y_bola3
X_valid = X_bola3
# Write results to a text file
with open("output_validation_bola3_20-50_new.txt", "w") as f:
    for i in range(int(np.round(len(y_bola3)/50))):
        row = i*50
        itv = 20 + (i*50)
        X_valid = X_bola3[row:itv,:] 
        y_valid = y_bola3[row:itv]

        f.write(f"=========================({row+1}-{itv})==========================\n")
        for name, model in models.items():
            model.fit(X_train,y_train)
            y_predict = model.predict(X_valid)
            report = classification_report(y_valid, y_predict, output_dict=True)
            cf = confusion_matrix(y_valid, y_predict)

            filtered_report = {k:v for k, v in report.items() if k not in ('accuracy','macro avg', 'weighted avg', 'micro avg')}
            df = pd.DataFrame(filtered_report).T
            acc = report['accuracy']
            f.write(f"Model: {name}\n")
            f.write(f"confusion matrix:\n {cf}\n")
            f.write(f"classification report: \n accuracy = {acc}\n {df[['precision', 'recall', 'f1-score']]}\n")
            f.write(f"------------------------------------------------------------\n")
print("Evaluation results exported to .txt file")



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

Evaluation results exported to .txt file


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### 2.7 Experiment Data

In [None]:
# Load dataset
url = "../../data/processed/den/2_sph.xlsx"
names = ['lam_max1', 'csc_max1', 'lam_min', 'csc_min','lam_fwhm1','c_mid','lam_max2','csc_max2','lam_fwhm2','fwhm','posisi1','posisi2','posisi3','posisi4','posisi5','arah_k','arah_E','sb_putar','sudut1','sudut2','ket'] 
dataset = read_excel(url, names=names, header=None)

# Define input and output data
X = dataset.iloc[:,0:10].values
y = dataset.iloc[:,20].values

In [None]:
#test_size=1

X_train = X
y_train = y

In [None]:
# Load dataset
url = "../../data/processed/den/Exp_2bola.xlsx"
names = ['lam_max1', 'csc_max1', 'lam_min', 'csc_min','lam_fwhm1','c_mid','lam_max2','csc_max2','lam_fwhm2','fwhm','posisi1','posisi2','posisi3','posisi4','posisi5','arah_k','arah_E','sb_putar','sudut1','sudut2','ket'] 
exp = read_excel(url, names=names, header=None)

# Define input and output data
X_exp = exp.iloc[:,0:10].values
y_exp = exp.iloc[:,20].values

In [None]:
y_valid = y_exp
X_valid = X_exp
# Write results to a text file
with open("output_validation_experiment_4.txt", "w") as f:
    for name, model in models.items():
        model.fit(X_train,y_train)
        y_predict = model.predict(X_valid)
        report = classification_report(y_valid, y_predict)
        cf = confusion_matrix(y_valid, y_predict)

        f.write(f"Model: {name}\n")
        f.write(f"confusion matrix:\n {cf}\n")
        f.write(f"classification report: \n {report}\n")
        f.write(f"=====================================================\n")
print("Evaluation results exported to .txt file")