In [None]:
# Load the dataset from a CSV file
import pandas as pd

file_path = '/content/synthetic_student_performance.csv'
df = pd.read_csv(file_path)

df.head()


Unnamed: 0,StudentID,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA,GradeClass
0,1001,17,1,1,3,16,13,0,2,0,0,0,0,3.158425,1
1,1002,18,1,2,0,17,28,0,2,1,1,1,0,2.794655,2
2,1003,15,1,0,1,13,18,1,4,0,0,0,1,3.774847,0
3,1004,17,1,0,3,6,4,0,3,1,0,0,0,3.040574,1
4,1005,17,0,1,2,6,20,0,4,0,0,0,0,3.38231,1


In [None]:
# Transform student GPA dataset into a user-item matrix format for collaborative filtering

import pandas as pd

df = pd.read_csv('/content/synthetic_student_performance.csv')

# Create item list
subjects = ["Math", "Science", "Reading", "Writing"]

# Expand dataset: one row per student * per subject
rows = []

for _, row in df.iterrows():
    for i, subject in enumerate(subjects):
        rows.append({
            "user_id": row["StudentID"],
            "item_id": i,                # subject index
            "subject": subject,
            "rating": row["GPA"] / 4.0,  # normalize 0‚Äì1
        })

cf_df = pd.DataFrame(rows)
cf_df.head()


Unnamed: 0,user_id,item_id,subject,rating
0,1001.0,0,Math,0.789606
1,1001.0,1,Science,0.789606
2,1001.0,2,Reading,0.789606
3,1001.0,3,Writing,0.789606
4,1002.0,0,Math,0.698664


In [None]:
# Convert student GPA dataset into a user-activity matrix for collaborative filtering
# Items based on activities

activity_items = ["Tutoring", "Extracurricular", "Sports", "Music", "Volunteering"]

rows = []
for _, row in df.iterrows():
    for i, act in enumerate(activity_items):
        rows.append({
            "user_id": row["StudentID"],
            "item_id": i,
            "activity": act,
            "rating": row["GPA"] / 4.0
        })

cf_df = pd.DataFrame(rows)

cf_df.head()

Unnamed: 0,user_id,item_id,activity,rating
0,1001.0,0,Tutoring,0.789606
1,1001.0,1,Extracurricular,0.789606
2,1001.0,2,Sports,0.789606
3,1001.0,3,Music,0.789606
4,1001.0,4,Volunteering,0.789606


In [None]:
# Display dataset structure, summary statistics, and first few rows for initial exploration
# exploratory data analysis (EDA)

# Inspect dataset structure
print(df.info())
# Basic numeric summary (GPA, Absences, StudyTimeWeekly, etc.)
print(df.describe())
# Quick look at first few rows (optional but useful)
print(df.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   StudentID          5000 non-null   int64  
 1   Age                5000 non-null   int64  
 2   Gender             5000 non-null   int64  
 3   Ethnicity          5000 non-null   int64  
 4   ParentalEducation  5000 non-null   int64  
 5   StudyTimeWeekly    5000 non-null   int64  
 6   Absences           5000 non-null   int64  
 7   Tutoring           5000 non-null   int64  
 8   ParentalSupport    5000 non-null   int64  
 9   Extracurricular    5000 non-null   int64  
 10  Sports             5000 non-null   int64  
 11  Music              5000 non-null   int64  
 12  Volunteering       5000 non-null   int64  
 13  GPA                5000 non-null   float64
 14  GradeClass         5000 non-null   int64  
dtypes: float64(1), int64(14)
memory usage: 586.1 KB
None
         StudentID 

In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m99.2/99.2 MB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [None]:
# Import classifiers and preprocessing tools

from sklearn.naive_bayes import GaussianNB


# Classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, auc
from sklearn.ensemble import RandomForestClassifier


In [None]:
# Prepare dataset for machine learning by separating features/target, normalizing features, and creating train/test sets

# Split features and target
X = df.drop("GradeClass", axis=1)
y = df["GradeClass"]

# Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-test split
# X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [None]:
# Train multiple classifiers, evaluate their accuracy, and identify models that are not overfitting

# Techniques used in training
from sklearn.metrics import classification_report
import pandas as pd

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "AdaBoost": AdaBoostClassifier(),
    "Naive Bayes": GaussianNB(),
    "SVM": SVC(probability=True),
    "KNN": KNeighborsClassifier(),
    "XGBoost": XGBClassifier(eval_metric="mlogloss", use_label_encoder=False),
    "LightGBM": LGBMClassifier(),
}

results = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred, output_dict=True)
    accuracy = report["accuracy"]
    results[name] = accuracy

    # Only print if accuracy is less than 95%
    if accuracy <= 0.98:
        print(f"üìå {name}")
        print(classification_report(y_test, y_pred))
        print("-" * 60)

# Convert to DataFrame
df_results = pd.DataFrame(results, index=["Accuracy"]).T

# Filter out models with accuracy <= 98%
filtered_results = df_results[df_results["Accuracy"] <= 0.98]

print("üìå Models *NOT* overfitting (Accuracy <= 98%):")
print(filtered_results.sort_values(by="Accuracy", ascending=False))


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


üìå Logistic Regression
              precision    recall  f1-score   support

           0       0.82      0.96      0.88       421
           1       0.62      0.44      0.51       236
           2       0.53      0.56      0.55       204
           3       0.58      0.52      0.55       139

    accuracy                           0.69      1000
   macro avg       0.64      0.62      0.62      1000
weighted avg       0.68      0.69      0.68      1000

------------------------------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


üìå AdaBoost
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       421
           1       0.00      0.00      0.00       236
           2       0.24      1.00      0.38       204
           3       1.00      1.00      1.00       139

    accuracy                           0.34      1000
   macro avg       0.31      0.50      0.35      1000
weighted avg       0.19      0.34      0.22      1000

------------------------------------------------------------
üìå Naive Bayes
              precision    recall  f1-score   support

           0       1.00      0.99      0.99       421
           1       0.98      0.99      0.98       236
           2       0.96      0.98      0.97       204
           3       0.97      0.94      0.96       139

    accuracy                           0.98      1000
   macro avg       0.98      0.97      0.97      1000
weighted avg       0.98      0.98      0.98      1000

--------------------------------------

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


üìå SVM
              precision    recall  f1-score   support

           0       0.42      1.00      0.59       421
           1       0.00      0.00      0.00       236
           2       0.00      0.00      0.00       204
           3       0.00      0.00      0.00       139

    accuracy                           0.42      1000
   macro avg       0.11      0.25      0.15      1000
weighted avg       0.18      0.42      0.25      1000

------------------------------------------------------------
üìå KNN
              precision    recall  f1-score   support

           0       0.47      0.69      0.56       421
           1       0.22      0.17      0.19       236
           2       0.27      0.18      0.21       204
           3       0.25      0.11      0.15       139

    accuracy                           0.38      1000
   macro avg       0.30      0.29      0.28      1000
weighted avg       0.34      0.38      0.34      1000

---------------------------------------------------

In [None]:
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
import gradio as gr


# --- Step 1: Select the best model ---
best_model_name = filtered_results["Accuracy"].idxmax()
best_model = models[best_model_name]
print(f"Best model selected: {best_model_name} with accuracy {filtered_results.loc[best_model_name, 'Accuracy']}")

# --- Step 2: Define encoders for categorical columns ---
categorical_cols = ["Gender","Ethnicity","ParentalEducation","Tutoring",
                    "ParentalSupport","Extracurricular","Sports","Music","Volunteering"]

encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    le.fit(X_train[col])
    encoders[col] = le

# --- Step 3: Define recommendation function ---
def recommend_courses(age, gender, ethnicity, parental_education, study_time_weekly,
                       absences, tutoring, parental_support, extracurricular,
                       sports, music, volunteering):

    input_data = {
        "StudentID": 0,          # dummy or placeholder
        "Age": age,
        "Gender": gender,
        "Ethnicity": ethnicity,
        "ParentalEducation": parental_education,
        "StudyTimeWeekly": study_time_weekly,
        "Absences": absences,
        "Tutoring": tutoring,
        "ParentalSupport": parental_support,
        "Extracurricular": extracurricular,
        "Sports": sports,
        "Music": music,
        "Volunteering": volunteering,
        "GPA": 0.0               # dummy/numeric default
    }

    input_df = pd.DataFrame([input_data])

    pred_class = best_model.predict(input_df)[0]

    # Map predicted class to course/activity recommendations
    recommendations_map = {
        0: ["Math Club", "Algebra Booster"],
        1: ["Science Fair", "Robotics Club"],
        2: ["Art Workshop", "Creative Writing"],
        3: ["Sports Team", "Fitness Activities"]
    }

    return recommendations_map.get(pred_class, ["No recommendation available"])

Best model selected: Naive Bayes with accuracy 0.98
