In [16]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier


In [17]:
df = pd.read_csv(r"C:\Users\chint\Downloads\student_prediction_dataset_1500.csv")

print("Shape:", df.shape)
print(df.head())


Shape: (1500, 10)
   student_id  study_hours  attendance  previous_score  assignments_completed  \
0           1          6.7          90              85                     99   
1           2          9.1          59              59                     50   
2           3          1.4          63              92                     91   
3           4          8.9          70              73                     87   
4           5          7.8          96              73                     80   

   sleep_hours  extracurriculars  parental_support  teacher_rating  \
0          9.2                 3                 4               8   
1          7.1                 0                10               6   
2          8.6                 5                 3               1   
3          5.3                 5                 3               8   
4          9.8                10                 4               5   

   final_score  
0           80  
1           56  
2           94  
3     

In [18]:
# Function to map scores into categories
def score_to_grade(score):
    if score >= 85:
        return "Excellent"
    elif score >= 70:
        return "Good"
    elif score >= 50:
        return "Average"
    else:
        return "Poor"

# Apply mapping
y = df["final_score"].apply(score_to_grade)

# Features = drop target + ID
X = df.drop(columns=["final_score", "student_id"])

print("Target distribution:\n", y.value_counts())


Target distribution:
 final_score
Average      475
Excellent    398
Good         373
Poor         254
Name: count, dtype: int64


In [19]:
num_features = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
cat_features = X.select_dtypes(include=["object"]).columns.tolist()

print("Numeric features:", num_features)
print("Categorical features:", cat_features)


Numeric features: ['study_hours', 'attendance', 'previous_score', 'assignments_completed', 'sleep_hours', 'extracurriculars', 'parental_support', 'teacher_rating']
Categorical features: []


In [20]:
numeric_transformer = Pipeline([
    ("scaler", StandardScaler())
])

categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

preprocessor = ColumnTransformer([
    ("num", numeric_transformer, num_features),
    ("cat", categorical_transformer, cat_features)
])


In [22]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)



In [25]:
models = {
    "RandomForest": Pipeline([
        ("prep", preprocessor),
        ("clf", RandomForestClassifier(n_estimators=300, random_state=42))
    ]),
    "kNN": Pipeline([
        ("prep", preprocessor),
        ("clf", KNeighborsClassifier(n_neighbors=7))
    ]),
    "NaiveBayes": Pipeline([
        ("prep", preprocessor),
        ("clf", GaussianNB())
    ]),
    "LogisticRegression": Pipeline([
        ("prep", preprocessor),
        ("clf", LogisticRegression(max_iter=1000))
    ]),
    "SVM": Pipeline([
        ("prep", preprocessor),
        ("clf", SVC(kernel="rbf", C=1.0, gamma="scale", probability=False))
    ]),
    "NeuralNetwork": Pipeline([
        ("prep", preprocessor),
        ("clf", MLPClassifier(hidden_layer_sizes=(128,), activation="relu",
                              max_iter=1000, random_state=42))
    ])
}



In [27]:
for name, pipe in models.items():
    print(f"\n🔹 Training {name} ...")
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    acc = accuracy_score(y_test, y_pred) * 100
    print(f"{name} Test Accuracy: {acc:.2f}%")
    print(classification_report(y_test, y_pred))




🔹 Training RandomForest ...
RandomForest Test Accuracy: 28.67%
              precision    recall  f1-score   support

     Average       0.33      0.48      0.39        95
   Excellent       0.19      0.19      0.19        80
        Good       0.29      0.28      0.29        74
        Poor       0.33      0.08      0.13        51

    accuracy                           0.29       300
   macro avg       0.29      0.26      0.25       300
weighted avg       0.29      0.29      0.27       300


🔹 Training kNN ...
kNN Test Accuracy: 24.67%
              precision    recall  f1-score   support

     Average       0.29      0.40      0.34        95
   Excellent       0.22      0.25      0.23        80
        Good       0.26      0.20      0.23        74
        Poor       0.05      0.02      0.03        51

    accuracy                           0.25       300
   macro avg       0.20      0.22      0.21       300
weighted avg       0.22      0.25      0.23       300


🔹 Training NaiveBay

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


NeuralNetwork Test Accuracy: 28.33%
              precision    recall  f1-score   support

     Average       0.38      0.39      0.38        95
   Excellent       0.26      0.23      0.24        80
        Good       0.28      0.30      0.29        74
        Poor       0.15      0.16      0.15        51

    accuracy                           0.28       300
   macro avg       0.27      0.27      0.27       300
weighted avg       0.28      0.28      0.28       300





In [28]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for name, pipe in models.items():
    scores = cross_val_score(pipe, X, y, cv=cv, scoring="accuracy", n_jobs=-1)
    print(f"{name} CV accuracy mean: {scores.mean():.4f} | scores: {scores}")


RandomForest CV accuracy mean: 0.2833 | scores: [0.27333333 0.29333333 0.28333333 0.27333333 0.29333333]
kNN CV accuracy mean: 0.2860 | scores: [0.29666667 0.29       0.28       0.29666667 0.26666667]
NaiveBayes CV accuracy mean: 0.3040 | scores: [0.32333333 0.31666667 0.26       0.30333333 0.31666667]
LogisticRegression CV accuracy mean: 0.2973 | scores: [0.33333333 0.31       0.25666667 0.3        0.28666667]
SVM CV accuracy mean: 0.3067 | scores: [0.28333333 0.3        0.3        0.33666667 0.31333333]
NeuralNetwork CV accuracy mean: 0.2693 | scores: [0.25666667 0.22       0.28666667 0.30333333 0.28      ]


In [30]:
results = []

for name, pipe in models.items():
    # --- Test Accuracy ---
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    test_acc = accuracy_score(y_test, y_pred) * 100

    # --- Cross-validation Accuracy ---
    scores = cross_val_score(pipe, X, y, cv=cv, scoring="accuracy", n_jobs=-1)
    cv_acc = scores.mean() * 100

    results.append({"Model": name,
                    "Test Accuracy (%)": round(test_acc, 2),
                    "CV Accuracy (%)": round(cv_acc, 2)})

results_df = pd.DataFrame(results).sort_values(by="Test Accuracy (%)", ascending=False)
print(results_df)




                Model  Test Accuracy (%)  CV Accuracy (%)
4                 SVM              32.67            30.67
2          NaiveBayes              31.00            30.40
3  LogisticRegression              29.67            29.73
0        RandomForest              28.67            28.33
5       NeuralNetwork              28.33            26.93
1                 kNN              24.67            28.60


In [31]:
print(y.value_counts(normalize=True))


final_score
Average      0.316667
Excellent    0.265333
Good         0.248667
Poor         0.169333
Name: proportion, dtype: float64


In [32]:
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

imb_pipe = ImbPipeline([
    ("prep", preprocessor),
    ("smote", SMOTE(random_state=42)),
    ("clf", RandomForestClassifier(n_estimators=300, random_state=42))
])

scores = cross_val_score(imb_pipe, X, y, cv=5, scoring="accuracy")
print("RF + SMOTE mean CV accuracy:", scores.mean())


RF + SMOTE mean CV accuracy: 0.2533333333333333


In [34]:
from sklearn.model_selection import RandomizedSearchCV

param_grid = {
    "clf__n_estimators": [100, 300, 500],
    "clf__max_depth": [10, 20, 30, None],
    "clf__min_samples_split": [2, 5, 10]
}

search = RandomizedSearchCV(
    Pipeline([("prep", preprocessor), ("clf", RandomForestClassifier(random_state=42))]),
    param_distributions=param_grid,
    n_iter=10,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
    random_state=42
)

search.fit(X_train, y_train)
print("Best RF params:", search.best_params_)
print("Best CV accuracy:", search.best_score_)


Best RF params: {'clf__n_estimators': 500, 'clf__min_samples_split': 10, 'clf__max_depth': 10}
Best CV accuracy: 0.2908333333333334
