In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from ucimlrepo import fetch_ucirepo 
from imblearn.pipeline import Pipeline as ImbPipeline

In [2]:
# Fetch the dataset using the given ID
# fetch dataset 
student_academics_performance = fetch_ucirepo(id=467) 
  
# data (as pandas dataframes) 
X = student_academics_performance.data.features 
y = student_academics_performance.data.targets 

# Combine features and target into a single DataFrame for convenience
student_data_df = pd.concat([X, y], axis=1)
print(student_data_df.head())

# Split the data into features and target
X = student_data_df.drop('esp', axis=1)
y = student_data_df['esp']

# Preprocess the Data
categorical_features = X.columns.tolist()  # Assuming all features are categorical
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_features)
    ]
)

# Applying SMOTE for class balancing
smote = SMOTE(sampling_strategy='auto', random_state=42, k_neighbors=3)


# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)


  ge   cst   tnp   twp iap   esp arr         ms ls    as  ...  fq  mq  \
0  F     G  Good  Good  Vg  Good   Y  Unmarried  V  Paid  ...  Um  10   
1  M   OBC    Vg    Vg  Vg    Vg   N  Unmarried  V  Paid  ...  Um  Il   
2  F   OBC  Good  Good  Vg  Good   N  Unmarried  V  Paid  ...  12  10   
3  M  MOBC  Pass  Good  Vg  Good   N  Unmarried  V  Paid  ...  12  Um   
4  M     G  Good  Good  Vg    Vg   N  Unmarried  V  Paid  ...  10  12   

         fo         mo       nf       sh       ss   me       tt      atd  
0    Farmer  Housewife    Large     Poor     Govt  Asm    Small     Good  
1   Service    Service    Small     Poor     Govt  Asm  Average  Average  
2   Service  Housewife  Average  Average     Govt  Asm    Large     Good  
3  Business   Business    Large     Poor     Govt  Asm  Average  Average  
4   Service  Housewife    Large     Poor  Private  Asm    Small     Good  

[5 rows x 22 columns]


In [3]:

# Models and parameters for grid search
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC(),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
}

param_grids = {
    'Logistic Regression': {
        'model__C': [0.1, 1, 10, 100],
        'model__solver': ['liblinear', 'lbfgs'],
    },
    'Random Forest': {
        'model__n_estimators': [50, 100, 200],
        'model__max_depth': [None, 10, 20],
        'model__min_samples_split': [2, 5, 10],
    },
    'SVM': {
        'model__C': [0.1, 1, 10],
        'model__kernel': ['linear', 'rbf'],
    },
    'XGBoost': {
        'model__n_estimators': [50, 100, 200],
        'model__learning_rate': [0.01, 0.1, 0.2],
        'model__max_depth': [3, 5, 7],
    }
}

# Pipeline for preprocessing and training with SMOTE
def create_pipeline(model):
    pipeline = ImbPipeline(steps=[
        ('preprocessor', preprocessor),
        ('smote', smote),
        ('model', model)
    ])
    return pipeline

# Dictionary of models to be tuned
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC(),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
}

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)


In [4]:
from sklearn.preprocessing import LabelEncoder

# Encode the target labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Perform grid search for each model
best_models = {}
for model_name, model in models.items():
    print(f"Performing grid search for {model_name}...")
    pipeline = create_pipeline(model)
    param_grid = param_grids[model_name]
    grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, n_jobs=-1, scoring='accuracy')
    
    # Use the encoded target labels for XGBoost
    if model_name == 'XGBoost':
        grid_search.fit(X_train, y_train_encoded)
    else:
        grid_search.fit(X_train, y_train)
        
    best_models[model_name] = grid_search.best_estimator_
    print(f"Best parameters for {model_name}: {grid_search.best_params_}")
    print(f"Best cross-validation accuracy for {model_name}: {grid_search.best_score_:.4f}")

# Evaluate the best models on the test set
for model_name, model in best_models.items():
    # Use the encoded target labels for evaluation if it's XGBoost
    if model_name == 'XGBoost':
        test_accuracy = model.score(X_test, y_test_encoded)
    else:
        test_accuracy = model.score(X_test, y_test)
    print(f"Test accuracy for {model_name}: {test_accuracy:.4f}")

Performing grid search for Logistic Regression...
Best parameters for Logistic Regression: {'model__C': 10, 'model__solver': 'lbfgs'}
Best cross-validation accuracy for Logistic Regression: 0.6443
Performing grid search for Random Forest...
Best parameters for Random Forest: {'model__max_depth': None, 'model__min_samples_split': 2, 'model__n_estimators': 200}
Best cross-validation accuracy for Random Forest: 0.6624
Performing grid search for SVM...
Best parameters for SVM: {'model__C': 1, 'model__kernel': 'rbf'}
Best cross-validation accuracy for SVM: 0.6619
Performing grid search for XGBoost...


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Best parameters for XGBoost: {'model__learning_rate': 0.2, 'model__max_depth': 7, 'model__n_estimators': 100}
Best cross-validation accuracy for XGBoost: 0.5862
Test accuracy for Logistic Regression: 0.5556
Test accuracy for Random Forest: 0.6296
Test accuracy for SVM: 0.6296
Test accuracy for XGBoost: 0.5926
