### Import Necessary Libraries

In [26]:
%pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [27]:
import numpy as np
import pandas as pd
import pickle

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, matthews_corrcoef

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier


### Dataset Selection and Loading

**Requirements:**
- ≥500 samples
- ≥12 features
- Public dataset (UCI/Kaggle)
- Regression OR Classification problem

In [38]:
# Step 1: Load your dataset
# Dataset - https://archive.ics.uci.edu/dataset/697/predict+students+dropout+and+academic+success

train_data_path = "data/train_data.csv"
test_data_path = "data/test_data.csv"
print("Path to dataset files:", train_data_path, ",", test_data_path)
train_data = pd.read_csv(train_data_path, delimiter=',')
test_data = pd.read_csv(test_data_path, delimiter=',')
print("Dataset loaded successfully.")

Path to dataset files: data/train_data.csv , data/test_data.csv
Dataset loaded successfully.


### Data Pre-processing

In [39]:
# Step 2: Preprocess your data

# Step 2.1. Separate features (X) and target (y)
print(f"Columns in the loaded dataset: {list(train_data.columns)}\n")
print("Number of columns in the dataset:", len(train_data.columns))
print("Number of rows in the dataset:", len(train_data))

train_X = train_data.drop('Target', axis=1)
test_X = test_data.drop('Target', axis=1)

# Use mapping to create numeric target
train_y = train_data['Target']
test_y = test_data['Target']

print(f"First 5 entries of features data:")
display(train_X.head())
print(f"\nFirst 5 corresponding entries of target data:")
print(train_y[:5])

Columns in the loaded dataset: ['Marital status', 'Application mode', 'Application order', 'Course', 'Daytime/evening attendance\t', 'Previous qualification', 'Previous qualification (grade)', 'Nationality', "Mother's qualification", "Father's qualification", "Mother's occupation", "Father's occupation", 'Admission grade', 'Displaced', 'Educational special needs', 'Debtor', 'Tuition fees up to date', 'Gender', 'Scholarship holder', 'Age at enrollment', 'International', 'Curricular units 1st sem (credited)', 'Curricular units 1st sem (enrolled)', 'Curricular units 1st sem (evaluations)', 'Curricular units 1st sem (approved)', 'Curricular units 1st sem (grade)', 'Curricular units 1st sem (without evaluations)', 'Curricular units 2nd sem (credited)', 'Curricular units 2nd sem (enrolled)', 'Curricular units 2nd sem (evaluations)', 'Curricular units 2nd sem (approved)', 'Curricular units 2nd sem (grade)', 'Curricular units 2nd sem (without evaluations)', 'Unemployment rate', 'Inflation rate

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nationality,Mother's qualification,Father's qualification,...,Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP
0,4,7,1,9147,1,3,130.0,1,19,1,...,0,0,5,5,0,0.0,0,11.1,0.6,2.02
1,1,39,1,9085,1,1,130.0,1,37,37,...,0,0,6,14,2,11.333333,0,11.1,0.6,2.02
2,1,1,6,9070,1,6,119.0,1,1,1,...,0,0,6,8,6,13.625,0,10.8,1.4,1.74
3,2,39,1,9238,1,19,133.1,1,37,37,...,0,0,6,0,0,0.0,0,15.5,2.8,-4.06
4,1,1,3,9500,1,1,142.0,1,37,38,...,0,0,7,7,6,13.9,0,7.6,2.6,0.32



First 5 corresponding entries of target data:
0     Dropout
1    Enrolled
2    Graduate
3     Dropout
4    Graduate
Name: Target, dtype: object


In [40]:
# Step 2.2 Handle missing values if any
print(f"Basic information about the train dataset:")
train_data.info()
print(f"Number of missing values in the dataset: {train_data.isnull().sum().sum()}")
print(f"Number of duplicated rows in the dataset: {train_data.duplicated().sum()}")

Basic information about the train dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3539 entries, 0 to 3538
Data columns (total 37 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Marital status                                  3539 non-null   int64  
 1   Application mode                                3539 non-null   int64  
 2   Application order                               3539 non-null   int64  
 3   Course                                          3539 non-null   int64  
 4   Daytime/evening attendance	                     3539 non-null   int64  
 5   Previous qualification                          3539 non-null   int64  
 6   Previous qualification (grade)                  3539 non-null   float64
 7   Nationality                                     3539 non-null   int64  
 8   Mother's qualification                          3539 non-null   int64  
 9 

### Train-Test Split of Data

In [None]:
# Step 3.1: Analyze class distribution
print("\nTrain set class distribution:\n")
print(pd.Series(train_y).value_counts())

print("\nTest set class distribution:\n")
print(pd.Series(test_y).value_counts())


Train set class distribution:

Target
Graduate    1791
Dropout     1105
Enrolled     643
Name: count, dtype: int64

Test set class distribution:

Target
Graduate    418
Dropout     316
Enrolled    151
Name: count, dtype: int64


In [None]:
# Step 3.2 Encode target variable
target_class_encoding = {
    cls: idx for idx, cls in enumerate(train_data['Target'].unique(), start=0)
}
# Save target class encoding mapping to CSV for use in deployment
mapping_df = pd.DataFrame(list(target_class_encoding.items()), columns=["class", "encoded"]) 
mapping_df.to_csv('model/target_class_encoding.csv', index=False)

train_y = train_y.map(target_class_encoding)
test_y = test_y.map(target_class_encoding)
print("✓ Target variable encoded successfully.")

✓ Target variable encoded successfully.


### Model Training & Evaluation

**Models to Implement:**
1. Logistic Regression
2. Decision Tree Classifier
3. K-Nearest Neighbor Classifier
4. Naive Bayes Classifier - Gaussian or Multinomial
5. Ensemble Model - Random Forest
6. Ensemble Model - XGBoost

In [43]:
# Step 4: Train various classification models and evaluate in training data

models = {
    "Logistic Regression": LogisticRegression(max_iter=10000, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42, criterion='entropy'),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(eval_metric='logloss', random_state=42)
}

results = []

for name, model in models.items():
    model.fit(train_X, train_y)
    pred_y = model.predict(test_X)
    prob_y = model.predict_proba(test_X) if hasattr(model, "predict_proba") else None

    results.append({
        "Model": name,
        "Accuracy": accuracy_score(test_y, pred_y),
        "AUC": roc_auc_score(test_y, prob_y, multi_class='ovr', average="weighted") if prob_y is not None else None,
        "Precision": precision_score(test_y, pred_y, average='weighted', zero_division=0),
        "Recall": recall_score(test_y, pred_y, average='weighted', zero_division=0),
        "F1": f1_score(test_y, pred_y, average='weighted', zero_division=0),
        "MCC": matthews_corrcoef(test_y, pred_y)
    })

    # Save the model
    with open(f"model/{name.replace(' ', '_')}.pkl", 'wb') as file:
        pickle.dump(model, file)

results_df = pd.DataFrame(results)
results_df

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=10000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,Model,Accuracy,AUC,Precision,Recall,F1,MCC
0,Logistic Regression,0.748023,0.888693,0.728983,0.748023,0.732054,0.585799
1,Decision Tree,0.681356,0.752944,0.679706,0.681356,0.679964,0.484426
2,KNN,0.60904,0.722862,0.597317,0.60904,0.597702,0.353674
3,Naive Bayes,0.699435,0.824013,0.687721,0.699435,0.686834,0.505626
4,Random Forest,0.760452,0.887001,0.745148,0.760452,0.744572,0.607062
5,XGBoost,0.763842,0.893988,0.754142,0.763842,0.755302,0.613874
