# Predicting Student Academic Performance Using Regression-Based Machine Learning Models

In [15]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split

# Load the dataset
df = pd.read_excel("Students_Performance_data_set.xlsx")

# Identify target and one-hot feature
target_col = 'What is your current CGPA?'
one_hot_col = 'Status of your English language proficiency'

# Rows without a target value cannot be used for training or evaluation.
df = df.dropna(subset=[target_col]).reset_index(drop=True)

# Separate numerical and categorical columns
numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

# Remove the target and one-hot column from the regular processing list
numerical_cols.remove(target_col)
categorical_cols.remove(one_hot_col)

# Decide the Train (70%) / Validation (15%) / Test (15%) rows *before* fitting
# any transformer, so imputation and scaling statistics come from the training
# rows only and nothing about the held-out rows leaks into preprocessing.
# Splitting row positions with the same seed selects the same rows as
# splitting the final X/y frames would.
row_positions = np.arange(len(df))
train_pos, temp_pos = train_test_split(
    row_positions, test_size=0.30, random_state=42
)
val_pos, test_pos = train_test_split(
    temp_pos, test_size=0.50, random_state=42
)

# Handle missing values (if any): fit on training rows, apply to all rows.
# Each column group gets its own imputer so every fitted object stays usable.
num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')
one_hot_imputer = SimpleImputer(strategy='most_frequent')

num_imputer.fit(df.iloc[train_pos][numerical_cols])
cat_imputer.fit(df.iloc[train_pos][categorical_cols])
one_hot_imputer.fit(df.iloc[train_pos][[one_hot_col]])

df[numerical_cols] = num_imputer.transform(df[numerical_cols])
df[categorical_cols] = cat_imputer.transform(df[categorical_cols])
df[[one_hot_col]] = one_hot_imputer.transform(df[[one_hot_col]])

# Convert all categorical features to string
df[categorical_cols + [one_hot_col]] = df[categorical_cols + [one_hot_col]].astype(str)

# Label Encoding for regular categorical columns.
# The encoders are fitted on every row on purpose: they only build a
# category -> integer vocabulary (no target values or statistics involved),
# and LabelEncoder would raise on a category first seen in validation/test.
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# One-hot encode 'Status of your English language proficiency'
df = pd.get_dummies(df, columns=[one_hot_col], prefix="English")

# Min-Max Scaling for numerical columns: min/max are learned from the training
# rows only, so held-out values may fall slightly outside [0, 1].
# The target is scaled together with the features, which means every metric
# computed on y is expressed in scaled-CGPA units; use scaler.inverse_transform
# to get back to the original scale.
scaled_cols = numerical_cols + [target_col]
scaler = MinMaxScaler()
scaler.fit(df.iloc[train_pos][scaled_cols])
df[scaled_cols] = scaler.transform(df[scaled_cols])

# Define features and target
X = df.drop(columns=[target_col])
y = df[target_col]

# Materialise the splits chosen above: Train (70%) / Temp (30%)
X_train, X_temp = X.iloc[train_pos], X.iloc[temp_pos]
y_train, y_temp = y.iloc[train_pos], y.iloc[temp_pos]

# Temp is divided into Validation (15%) and Test (15%)
X_val, X_test = X.iloc[val_pos], X.iloc[test_pos]
y_val, y_test = y.iloc[val_pos], y.iloc[test_pos]

# Output final shapes
print("Train:", X_train.shape)
print("Validation:", X_val.shape)
print("Test:", X_test.shape)

Train: (835, 32)
Validation: (179, 32)
Test: (180, 32)


In [16]:
# Preview the first rows of df as left by the preprocessing cell
# (label-encoded categoricals, min-max scaled numerics, English_* dummy columns).
df.head()

Unnamed: 0,University Admission year,Gender,Age,H.S.C passing year,Program,Current Semester,Do you have meritorious scholarship ?,Do you use University transportation?,How many hour do you study daily?,How many times do you seat for study in a day?,...,With whom you are living with?,Do you have any health issues?,What was your previous SGPA?,Do you have any physical disabilities?,What is your current CGPA?,How many Credit did you have completed?,What is your monthly family income?,English_Advance,English_Basic,English_Intermediate
0,0.5,1,0.666667,0.25,0,0.478261,1,0,0.230769,0.285714,...,0,1,0.67,0,0.7875,0.517241,0.010521,False,False,True
1,0.8,1,0.444444,0.5,0,0.130435,1,1,0.230769,0.285714,...,1,1,0.67,0,0.7875,0.248276,0.048096,False,True,False
2,0.7,0,0.333333,0.4375,0,0.173913,0,0,0.230769,0.428571,...,0,1,0.67,0,0.7875,0.344828,0.023046,False,False,True
3,0.8,1,0.222222,0.5,0,0.130435,1,0,0.076923,0.428571,...,0,2,0.67,0,0.7875,0.248276,0.029303,False,False,True
4,0.8,1,0.444444,0.4375,0,0.130435,1,0,0.230769,0.142857,...,0,2,0.67,0,0.7875,0.248276,0.023046,False,False,True


In [17]:
# Summary statistics of the preprocessed frame; pandas' describe() reports
# numeric columns by default, so the boolean English_* dummies are not listed.
df.describe()

Unnamed: 0,University Admission year,Gender,Age,H.S.C passing year,Program,Current Semester,Do you have meritorious scholarship ?,Do you use University transportation?,How many hour do you study daily?,How many times do you seat for study in a day?,...,What is you interested area?,What is your relationship status?,Are you engaged with any co-curriculum activities?,With whom you are living with?,Do you have any health issues?,What was your previous SGPA?,Do you have any physical disabilities?,What is your current CGPA?,How many Credit did you have completed?,What is your monthly family income?
count,1194.0,1194.0,1194.0,1194.0,1194.0,1194.0,1194.0,1194.0,1194.0,1194.0,...,1194.0,1194.0,1194.0,1194.0,1194.0,1194.0,1194.0,1194.0,1194.0,1194.0
mean,0.751508,0.562814,0.371487,0.4509,0.0,0.240842,0.438023,0.416248,0.240884,0.277219,...,14.086265,3.630653,0.422948,0.537688,1.177554,0.670986,0.022613,0.7914,0.495466,0.029303
std,0.135495,0.496247,0.17926,0.08631,0.0,0.165647,0.496352,0.493142,0.126892,0.122329,...,3.79373,0.741602,0.494234,0.498786,0.390969,0.217897,0.148729,0.187532,0.331547,0.038163
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.7,0.0,0.222222,0.4375,0.0,0.086957,0.0,0.0,0.153846,0.142857,...,12.0,4.0,0.0,0.0,1.0,0.55,0.0,0.725,0.158621,0.013026
50%,0.8,1.0,0.333333,0.5,0.0,0.26087,0.0,0.0,0.230769,0.285714,...,16.0,4.0,0.0,1.0,1.0,0.67,0.0,0.8025,0.537931,0.023046
75%,0.9,1.0,0.444444,0.5,0.0,0.391304,1.0,1.0,0.307692,0.285714,...,16.0,4.0,1.0,1.0,1.0,0.845,0.0,0.9175,0.806897,0.03507
max,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,...,24.0,4.0,1.0,1.0,3.0,1.0,1.0,1.0,1.0,1.0


In [18]:
pip install lightgbm



In [19]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neural_network import MLPRegressor
import xgboost as xgb
import lightgbm as lgb  # Uncomment if installed
import torch
import torch.nn as nn
import torch.optim as optim

# -----------------------------
# Evaluation Function
# -----------------------------
def evaluate_model(name, y_true, y_pred):
    """Print and return the regression metrics of one model.

    Args:
        name: Label shown in the printed report and stored under "Model".
        y_true: Ground-truth target values.
        y_pred: Predicted target values, aligned with ``y_true``.

    Returns:
        dict with the keys "Model", "MSE", "RMSE" and "R2".
    """
    squared_error = mean_squared_error(y_true, y_pred)
    root_error = np.sqrt(squared_error)
    determination = r2_score(y_true, y_pred)

    # One print call, one report line per argument; the trailing "\n" on the
    # last line leaves a blank line after each report.
    print(
        f"{name} Evaluation:",
        f"  MSE:  {squared_error:.4f}",
        f"  RMSE: {root_error:.4f}",
        f"  R²:   {determination:.4f}\n",
        sep="\n",
    )

    return {
        "Model": name,
        "MSE": squared_error,
        "RMSE": root_error,
        "R2": determination,
    }

# -----------------------------
# Results List
# -----------------------------
# One metrics dict per model, as returned by evaluate_model.
results = []

# NOTE(review): every model below is scored on the test split; the validation
# split (X_val / y_val) is never used here. If these scores drive model or
# hyper-parameter selection, compare on the validation split instead and keep
# the test split for the final report.

# -----------------------------
# 1. XGBoost
# -----------------------------
xgb_model = xgb.XGBRegressor(n_estimators=100, max_depth=4, learning_rate=0.1)
xgb_model.fit(X_train, y_train)
xgb_preds = xgb_model.predict(X_test)
results.append(evaluate_model("XGBoost", y_test, xgb_preds))

# -----------------------------
# 2. LightGBM
# -----------------------------
# Give LightGBM space-free feature names via renamed *copies*. Overwriting
# X_train.columns / X_test.columns in place would silently change the frames
# for every later cell, leave X_val with different names, and make XGBoost
# above see different feature names on a re-run of this cell.
X_train_lgb = X_train.rename(columns=lambda name: name.replace(" ", "_"))
X_test_lgb = X_test.rename(columns=lambda name: name.replace(" ", "_"))

# Fit LightGBM model with tuning
lgb_model = lgb.LGBMRegressor(
    n_estimators=200,
    learning_rate=0.05,
    num_leaves=31,
    min_child_samples=10,
    random_state=42
)
lgb_model.fit(X_train_lgb, y_train)
lgb_preds = lgb_model.predict(X_test_lgb)

# Evaluate
results.append(evaluate_model("LightGBM", y_test, lgb_preds))

# -----------------------------
# 3. MLP (Scikit-learn)
# -----------------------------
mlp_model = MLPRegressor(hidden_layer_sizes=(64,), activation='relu', max_iter=1000, random_state=42)
mlp_model.fit(X_train, y_train)
mlp_preds = mlp_model.predict(X_test)
results.append(evaluate_model("MLP (Scikit-learn)", y_test, mlp_preds))

# -----------------------------
# Summary Table
# -----------------------------
results_df = pd.DataFrame(results)
print("\nFinal Summary:\n")
print(results_df)


XGBoost Evaluation:
  MSE:  0.0096
  RMSE: 0.0979
  R²:   0.6963

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000321 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 504
[LightGBM] [Info] Number of data points in the train set: 835, number of used features: 31
[LightGBM] [Info] Start training from score 0.788756
LightGBM Evaluation:
  MSE:  0.0093
  RMSE: 0.0965
  R²:   0.7049

MLP (Scikit-learn) Evaluation:
  MSE:  0.0183
  RMSE: 0.1353
  R²:   0.4206


Final Summary:

                Model       MSE      RMSE        R2
0             XGBoost  0.009593  0.097943  0.696251
1            LightGBM  0.009319  0.096533  0.704930
2  MLP (Scikit-learn)  0.018299  0.135273  0.420580


In [20]:
# -------------------------------------------------
# Additional Models: RF, SVM
# -------------------------------------------------
from sklearn.ensemble import RandomForestRegressor

from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
import numpy as np
import pandas as pd

# Metrics for the additional models, collected in the same dict format as the
# first batch by reusing evaluate_model (MSE is computed once per model and
# RMSE derived from it; the helper also prints a short per-model report).
additional_results = []

# -----------------------------
# 1. Random Forest (Regression)
# -----------------------------
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_preds = rf_model.predict(X_test)
additional_results.append(evaluate_model("Random Forest", y_test, rf_preds))

# -----------------------------
# 2. SVM (Regression)
# -----------------------------
# NOTE(review): SVR is left at its defaults. The target was min-max scaled
# during preprocessing, so the default epsilon of 0.1 is a wide tube relative
# to the target range, and the label-encoded features are not on the same
# scale as the min-max scaled ones. Both likely hurt the RBF kernel; consider
# tuning epsilon/C and scaling all features before comparing SVR to the others.
svm_model = SVR(kernel='rbf')
svm_model.fit(X_train, y_train)
svm_preds = svm_model.predict(X_test)
additional_results.append(evaluate_model("SVM (Regression)", y_test, svm_preds))

# -----------------------------
# Combine and Display Results
# -----------------------------
additional_results_df = pd.DataFrame(additional_results)
# Header lists only the models actually trained in this cell.
print("\n--- Additional Model Evaluation (RF, SVM) ---\n")
print(additional_results_df)



--- Additional Model Evaluation (RF, LR, SVM) ---

              Model       MSE      RMSE        R2
0     Random Forest  0.009380  0.096849  0.702996
1  SVM (Regression)  0.029633  0.172143  0.061684


In [21]:
# Combine all results.
# The base table is rebuilt from the `results` list instead of from results_df
# itself, so re-running this cell does not append the additional rows again.
results_df = pd.concat([pd.DataFrame(results), additional_results_df], ignore_index=True)
print("\n--- Final Summary with Additional Models ---\n")
print(results_df)


--- Final Summary with Additional Models ---

                Model       MSE      RMSE        R2
0             XGBoost  0.009593  0.097943  0.696251
1            LightGBM  0.009319  0.096533  0.704930
2  MLP (Scikit-learn)  0.018299  0.135273  0.420580
3       Random Forest  0.009380  0.096849  0.702996
4    SVM (Regression)  0.029633  0.172143  0.061684
