### Import Necessary Libraries

In [6]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, matthews_corrcoef, confusion_matrix, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier


### Dataset Selection and Loading

**Requirements:**
- ≥500 samples
- ≥12 features
- Public dataset (UCI/Kaggle)
- Regression OR Classification problem

In [7]:
# Step 1: Read the raw dataset from disk
# Source: https://archive.ics.uci.edu/dataset/697/predict+students+dropout+and+academic+success

path = "data/data.csv"
print(f"Path to dataset files: {path}")
# The file uses ';' as its field separator rather than the default ','.
data = pd.read_csv(path, sep=';')
print("Dataset loaded successfully.")

Path to dataset files: data/data.csv
Dataset loaded successfully.


### Data Pre-processing

In [8]:
# Step 2: Preprocess your data

# Step 2.1. Split the frame into a feature matrix (X) and an encoded target (y)
n_rows, n_cols = data.shape
print(f"Columns in the loaded dataset: {data.columns.tolist()}\n")
print(f"Number of columns in the dataset: {n_cols}")
print(f"Number of rows in the dataset: {n_rows}")

# The 'Target' column holds string class labels; LabelEncoder turns them into
# integer codes and is kept around so the codes can be mapped back later.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data['Target'])

# Everything except the target column is treated as a feature.
X = data.drop(columns='Target')

print("First 5 entries of features data:")
display(X.head())
print("\nFirst 5 corresponding entries of target data (Label encoded):")
print(y[:5])

Columns in the loaded dataset: ['Marital status', 'Application mode', 'Application order', 'Course', 'Daytime/evening attendance\t', 'Previous qualification', 'Previous qualification (grade)', 'Nacionality', "Mother's qualification", "Father's qualification", "Mother's occupation", "Father's occupation", 'Admission grade', 'Displaced', 'Educational special needs', 'Debtor', 'Tuition fees up to date', 'Gender', 'Scholarship holder', 'Age at enrollment', 'International', 'Curricular units 1st sem (credited)', 'Curricular units 1st sem (enrolled)', 'Curricular units 1st sem (evaluations)', 'Curricular units 1st sem (approved)', 'Curricular units 1st sem (grade)', 'Curricular units 1st sem (without evaluations)', 'Curricular units 2nd sem (credited)', 'Curricular units 2nd sem (enrolled)', 'Curricular units 2nd sem (evaluations)', 'Curricular units 2nd sem (approved)', 'Curricular units 2nd sem (grade)', 'Curricular units 2nd sem (without evaluations)', 'Unemployment rate', 'Inflation rate

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP
0,1,17,5,171,1,1,122.0,1,19,12,...,0,0,0,0,0,0.0,0,10.8,1.4,1.74
1,1,15,1,9254,1,1,160.0,1,1,3,...,0,0,6,6,6,13.666667,0,13.9,-0.3,0.79
2,1,1,5,9070,1,1,122.0,1,37,37,...,0,0,6,0,0,0.0,0,10.8,1.4,1.74
3,1,17,2,9773,1,1,122.0,1,38,37,...,0,0,6,10,5,12.4,0,9.4,-0.8,-3.12
4,2,39,1,8014,0,1,100.0,1,37,38,...,0,0,6,6,6,13.0,0,13.9,-0.3,0.79



First 5 corresponding entries of target data (Label encoded):
[0 2 0 2 2]


In [9]:
# Step 2.2 Check the dataset for missing values and duplicated rows
print("Basic information about the dataset:")
data.info()

# Total count of null cells across every column.
missing_total = data.isnull().sum().sum()
# Rows that are exact copies of an earlier row.
duplicate_rows = data.duplicated().sum()

print(f"Number of missing values in the dataset: {missing_total}")
print(f"Number of duplicated rows in the dataset: {duplicate_rows}")

Basic information about the dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4424 entries, 0 to 4423
Data columns (total 37 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Marital status                                  4424 non-null   int64  
 1   Application mode                                4424 non-null   int64  
 2   Application order                               4424 non-null   int64  
 3   Course                                          4424 non-null   int64  
 4   Daytime/evening attendance	                     4424 non-null   int64  
 5   Previous qualification                          4424 non-null   int64  
 6   Previous qualification (grade)                  4424 non-null   float64
 7   Nacionality                                     4424 non-null   int64  
 8   Mother's qualification                          4424 non-null   int64  
 9   Fath

### Train-Test Split of Data

In [10]:
# Hold out 20% of the rows as a test set.
# stratify=y keeps the class proportions the same in both partitions, and the
# fixed random_state makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Only two partitions are produced here (train and test); there is no separate
# validation set, so the status message says exactly that.
print('✓ Dataset split into train and test sets successfully')

print("\nTrain set class distribution:")
print(pd.Series(y_train).value_counts())

print("\nTest set class distribution:")
print(pd.Series(y_test).value_counts())

✓ Dataset split into train, validation, and test sets successfully

Train set class distribution:
2    1767
0    1137
1     635
dtype: int64

Test set class distribution:
2    442
0    284
1    159
dtype: int64


In [None]:
# Save the test data with original labels for streamlit app use

# Single source of truth for the output location, so the confirmation message
# below always reports the path that was actually written.
test_data_path = 'data/test_data.csv'

# Work on a copy so X_test itself is not given a 'Target' column.
test_data = X_test.copy()
# Convert the integer class codes back to the original string labels.
test_data['Target'] = label_encoder.inverse_transform(y_test)

# Save to CSV (index=False: the row index is not written as a column)
test_data.to_csv(test_data_path, index=False)
print(f"✓ Test data saved to '{test_data_path}'")
print("\nFirst few rows of test data:")
display(test_data.head())

✓ Test data saved to 'test_data.csv'

First few rows of test data:


Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
1853,1,44,1,9003,1,39,160.0,1,3,3,...,0,6,7,6,14.666667,0,12.4,0.5,1.79,Graduate
2399,1,17,1,9500,1,1,131.0,1,1,1,...,0,8,10,8,11.7125,0,16.2,0.3,-0.92,Graduate
510,1,1,2,9070,1,1,121.0,1,4,1,...,0,6,9,0,0.0,0,9.4,-0.8,-3.12,Enrolled
242,1,1,2,9147,1,1,118.0,1,37,19,...,0,5,5,5,13.6,0,12.4,0.5,1.79,Graduate
3392,1,1,1,9070,1,1,133.1,1,1,1,...,0,6,6,6,12.666667,0,16.2,0.3,-0.92,Graduate


### Model Training & Evaluation

**Models to Implement:**
1. Logistic Regression
2. Decision Tree Classifier
3. K-Nearest Neighbor Classifier
4. Naive Bayes Classifier - Gaussian or Multinomial
5. Ensemble Model - Random Forest
6. Ensemble Model - XGBoost

In [15]:
# Train each candidate classifier on the training split, score it on the
# held-out test split, and persist every fitted model for later reuse.

# joblib.dump does not create missing folders, so make sure the output
# directory exists before the first artefact is written (keeps the notebook
# runnable from a fresh checkout).
model_dir = Path("model")
model_dir.mkdir(parents=True, exist_ok=True)

# NOTE(review): recent scikit-learn releases appear to deprecate the
# 'liblinear' solver for targets with more than two classes — confirm against
# the installed version before upgrading.
# NOTE(review): 'logloss' is normally the binary metric in XGBoost ('mlogloss'
# being the multiclass one); it is only consulted when an eval_set is passed,
# which this cell does not do — verify if early stopping is added later.
models = {
    "Logistic Regression": LogisticRegression(solver='liblinear', max_iter=10000, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42, criterion='entropy'),
    "KNN": KNeighborsClassifier(n_neighbors=10),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(eval_metric='logloss', random_state=42)
}

# One dict of metrics per model; turned into a DataFrame once at the end.
results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Class probabilities are needed for AUC; skipped if a model lacks them.
    y_prob = model.predict_proba(X_test) if hasattr(model, "predict_proba") else None

    results.append({
        "Model": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        # One-vs-rest AUC, since the target has more than two classes.
        "AUC": roc_auc_score(y_test, y_prob, multi_class='ovr') if y_prob is not None else None,
        # 'weighted' averages the per-class scores by class support.
        "Precision": precision_score(y_test, y_pred, average='weighted'),
        "Recall": recall_score(y_test, y_pred, average='weighted'),
        "F1": f1_score(y_test, y_pred, average='weighted'),
        "MCC": matthews_corrcoef(y_test, y_pred)
    })

    # Save the model (spaces in the display name become underscores in the file name)
    joblib.dump(model, model_dir / f"{name.replace(' ', '_')}.pkl")

# Save label encoder and feature names
joblib.dump(label_encoder, model_dir / 'label_encoder.pkl')
joblib.dump(list(X.columns), model_dir / 'feature_names.pkl')

results_df = pd.DataFrame(results)
results_df

Unnamed: 0,Model,Accuracy,AUC,Precision,Recall,F1,MCC
0,Logistic Regression,0.758192,0.868678,0.730419,0.758192,0.72922,0.597449
1,Decision Tree,0.649718,0.704741,0.656268,0.649718,0.65249,0.436606
2,KNN,0.60904,0.708689,0.585808,0.60904,0.588098,0.337162
3,Naive Bayes,0.659887,0.790761,0.634672,0.659887,0.643205,0.430308
4,Random Forest,0.767232,0.884911,0.754445,0.767232,0.753903,0.61323
5,XGBoost,0.764972,0.882434,0.760599,0.764972,0.761514,0.613642
