In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import ADASYN, SMOTE

In [20]:
features = pd.read_csv('./Dyt-desktop.csv', delimiter=';').columns.values[:-1]

def load_data(path):
    data = pd.read_csv(path, delimiter=';')

    # Extract data points (features) as a NumPy array
    X = data.iloc[:, :-1].values

    # Extract labels as a NumPy array
    y = data['Dyslexia'].values
    y = np.where(y == 'Yes', 1, 0)

    return X, y

def pre_process(data, labels):
    X, y = data, labels

    # Replace NaN values with 0s
    for i in range(X.shape[0]):
        for j in range(X.shape[1]):
            X[i][j] = np.nan_to_num(X[i][j])

    # Encode 'Male' to 0 and 'Female' to 1
    X[:, 0] = np.where(X[:, 0] == 'Male', 0, 1)

    # Encode 'Yes' to 1 and 'No' to 0
    X[:, 1] = np.where(X[:, 1] == 'Yes', 1, 0)
    X[:, 2] = np.where(X[:, 2] == 'Yes', 1, 0)

    # Perform Min-Max scaling for non-'Accuracy' columns
    scaler = MinMaxScaler(feature_range=(0, 1))
    for i, feature in enumerate(features):
        if not feature.startswith('Accuracy'):
            column_values = X[:, i].astype(float).reshape(-1, 1)
            X[:, i] = scaler.fit_transform(column_values).flatten()
    
    return X, y

def cross_validate(X, y, n_folds=10, threshold=0.5, seed=42, oversampling=None):
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=seed)
    accuracies = []
    recalls = []
    precisions = []
    rocs = []
    f1_scores = []
    
    for train, test in kf.split(X):
        X_train, y_train = X[train], y[train]
        X_test, y_test = X[test], y[test]

        if oversampling == 'smote':
            oversampler = SMOTE(random_state=seed)
        if oversampling == 'adasyn':
            oversampler = ADASYN(random_state=seed)
        if oversampling is not None:
            X_train, y_train = oversampler.fit_resample(X_train, y_train)
    return X_train,y_train,X_test,y_test

In [56]:
data, labels = load_data('./Dyt-desktop.csv')
data, labels = pre_process(data, labels)
X_train,y_train,X_test,y_test = cross_validate(data,labels,oversampling='adasyn')

In [None]:
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils import shuffle



# Initialize the CatBoostClassifier
model = CatBoostClassifier(
    iterations=1500,  # Number of trees
    depth=6,  # Tree depth
    learning_rate=0.03,  # Controls step size
    loss_function='Logloss',  # For binary classification
    eval_metric='Accuracy',
    verbose=100,
)

# Train the model
model.fit(X_train, y_train, eval_set=(X_test, y_test), early_stopping_rounds=50)

# Make predictions
# y_pred = model.predict(X_test)

# # Evaluate model performance
# accuracy = accuracy_score(y_test, y_pred)
# print(f"Test Accuracy: {accuracy:.4f}")
# print(classification_report(y_test, y_pred))
y_prob = model.predict_proba(X_test)[:, 1]  # Get probabilities for the positive class ("Yes")
threshold = 0.3  # Adjust this based on precision-recall tuning
y_pred_threshold = (y_prob >= threshold).astype(int)

print(classification_report(y_test, y_pred_threshold))




0:	learn: 0.8240885	test: 0.7664835	best: 0.7664835 (0)	total: 19.6ms	remaining: 29.4s
100:	learn: 0.9578365	test: 0.8928571	best: 0.9010989 (72)	total: 2.19s	remaining: 30.4s
200:	learn: 0.9716606	test: 0.9175824	best: 0.9230769 (175)	total: 4.22s	remaining: 27.3s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.9230769231
bestIteration = 175

Shrink model to first 176 iterations.
              precision    recall  f1-score   support

           0       0.96      0.90      0.93       321
           1       0.49      0.74      0.59        43

    accuracy                           0.88       364
   macro avg       0.73      0.82      0.76       364
weighted avg       0.91      0.88      0.89       364



In [58]:
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV

# Define a larger search space for hyperparameters
param_grid = {
    'C': [1, 10,100],  # More values for regularization
    'gamma': [1, 0.1, 0.01],  # Wider range for RBF gamma
    'kernel': ['rbf'],  # Keeping the best performing kernel
}

# Initialize SVC
svm_model = SVC(probability=True)

# Using RandomizedSearchCV for better hyperparameter tuning
'''random_search = RandomizedSearchCV(
    svm_model, param_distributions=param_grid,
    n_iter=20, cv=10, verbose=2, random_state=42, n_jobs=-1
)'''
random_search = GridSearchCV(
    svm_model, param_grid,
    cv=5, verbose=2, n_jobs=-1
)
random_search.fit(X_train, y_train)

# Best Parameters
print(f"Best Parameters: {random_search.best_params_}")
best_model = random_search.best_estimator_
# y_pred = best_model.predict(X_test)
# accuracy = accuracy_score(y_test, y_pred)
# print(f"Accuracy: {accuracy * 100:.2f}")

# report = classification_report(y_test, y_pred)
# print(report)

# def predict_data(info):
#     print(best_model.predict(info))
y_prob = model.predict_proba(X_test)[:, 1]  # Get probabilities for the positive class ("Yes")
threshold = 0.3  # Adjust this based on precision-recall tuning
y_pred_threshold = (y_prob >= threshold).astype(int)

print(classification_report(y_test, y_pred_threshold))

Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best Parameters: {'C': 10, 'gamma': 1, 'kernel': 'rbf'}
              precision    recall  f1-score   support

           0       0.96      0.90      0.93       321
           1       0.49      0.74      0.59        43

    accuracy                           0.88       364
   macro avg       0.73      0.82      0.76       364
weighted avg       0.91      0.88      0.89       364



In [61]:
model = RandomForestClassifier(n_estimators=200, class_weight=None, random_state=42)
model.fit(X_train,y_train)
y_prob = model.predict_proba(X_test)[:, 1]  # Get probabilities for the positive class ("Yes")
threshold = 0.3  # Adjust this based on precision-recall tuning
y_pred_threshold = (y_prob >= threshold).astype(int)
print(classification_report(y_test, y_pred_threshold))

              precision    recall  f1-score   support

           0       0.96      0.85      0.91       321
           1       0.41      0.77      0.54        43

    accuracy                           0.84       364
   macro avg       0.69      0.81      0.72       364
weighted avg       0.90      0.84      0.86       364

