# **SchedWiz: AI-Powered Study Scheduler**


## Initial Setup

In [1]:
import os
import sys
import streamlit as st
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.preprocessing import OrdinalEncoder
from datetime import date, timedelta
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StandardScaler, Imputer
from pyspark.ml import Pipeline

# Configuring PySpark to run with Streamlit:
# point both the Spark workers (PYSPARK_PYTHON) and the Spark driver
# (PYSPARK_DRIVER_PYTHON) at the interpreter running this notebook, so
# both sides use the same Python installation and packages.
os.environ["PYSPARK_PYTHON"] = sys.executable
os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable


## Ngrok Authentication Token

In [2]:
!ngrok config add-authtoken 2xoDBn5Ii9vasE8s0PSuJe2tj7l_31yAMercJ42t26zSGqc7P

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


## MLP Classifier Model

In [3]:
%%writefile train_agent.py

# Imports
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.preprocessing import OrdinalEncoder
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StandardScaler, Imputer
from pyspark.ml import Pipeline
from datetime import date, timedelta

# Neural network Module
class MLP(nn.Module):
    """Feed-forward classifier: input -> 64 -> 32 -> num_classes.

    The two hidden layers use ReLU activations; the output layer returns
    raw logits (no softmax applied).
    """

    def __init__(self, input_size, num_classes):
        super().__init__()
        # Layers are created in the same order as before so seeded
        # initialisation draws the same random numbers.
        self.fc1 = nn.Linear(input_size, 64)
        self.fc2 = nn.Linear(64, 32)
        self.out = nn.Linear(32, num_classes)

    def forward(self, x):
        """Return class logits for the input batch ``x``."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = F.relu(layer(hidden))
        logits = self.out(hidden)
        return logits

# Data Loading and Feature Engineering
def prepare_student_dataset(data_dir="/content/drive/MyDrive/open+university+learning+analytics+dataset"):
    """Load the student CSV files and build the per-(student, module) training table.

    Args:
        data_dir: Directory containing ``studentInfo.csv``,
            ``studentAssessment.csv``, ``assessments.csv`` and
            ``studentVle.csv``. Defaults to the original Google Drive location.

    Returns:
        tuple ``(df, features)`` where ``df`` is a pandas DataFrame with one
        row per (id_student, code_module) that has at least two scored
        assessments and was not withdrawn, and ``features`` is the list of
        column names used as model inputs. ``df['target_class']`` encodes
        Fail/Pass/Distinction as 0/1/2 (NaN for any other ``final_result``).
    """
    from pathlib import Path  # local import: the module does not import pathlib

    base = Path(data_dir)
    info = pd.read_csv(base / "studentInfo.csv")
    assessment = pd.read_csv(base / "studentAssessment.csv")
    meta = pd.read_csv(base / "assessments.csv")
    vle = pd.read_csv(base / "studentVle.csv")

    assessment = assessment.merge(meta[['id_assessment', 'code_module', 'date', 'weight']], on='id_assessment')
    assessment['score'] = pd.to_numeric(assessment['score'], errors='coerce')

    # Order each student's assessments chronologically so that "last_score"
    # (iloc[-1] within the group) really is the most recent assessment rather
    # than whatever happened to come last in the CSV. The index is reset so
    # the positional lookups in `weighted` stay aligned with the sorted frame.
    assessment = assessment.sort_values(
        ['id_student', 'code_module', 'date'], kind='stable'
    ).reset_index(drop=True)

    def weighted(x):
        # Weight-averaged score for the group; falls back to the plain mean
        # when every assessment in the group has zero/missing weight.
        w = assessment.loc[x.index, 'weight'].fillna(0)
        return np.average(x.fillna(0), weights=w) if w.sum() > 0 else x.mean()

    agg = assessment.groupby(['id_student', 'code_module']).agg(
        avg_score=('score', 'mean'),
        std_score=('score', 'std'),
        count=('score', 'count'),
        last_score=('score', lambda x: x.iloc[-1]),
        weighted_score=('score', weighted)
    ).reset_index()

    # Slope of a straight-line fit through each student's scores in date
    # order (missing scores counted as 0); 0 when there is a single score.
    trend = assessment.sort_values(['id_student', 'date']).groupby('id_student')['score'].apply(
        lambda x: np.polyfit(range(len(x)), x.fillna(0), 1)[0] if len(x) > 1 else 0).reset_index(name='score_trend')

    # Per-student click activity. Note: "avg_clicks_per_day" is the total
    # clicks divided by the number of VLE rows for the student.
    vle_feat = vle.groupby('id_student').agg(
        total_clicks=('sum_click', 'sum'),
        active_days=('date', 'nunique'),
        avg_clicks_per_day=('sum_click', lambda x: x.sum()/len(x)),
        click_std=('sum_click', 'std')).reset_index()

    f14 = vle[vle['date'] <= 14].groupby('id_student')['sum_click'].sum().reset_index(name='clicks_first_14_days')
    # "Last 7 days" is measured against the largest date in the whole VLE table.
    l7 = vle[vle['date'] >= vle['date'].max()-7].groupby('id_student')['sum_click'].sum().reset_index(name='clicks_last_7_days')

    # Explicit copy: assigning into a column slice of `info` would otherwise
    # trigger pandas' SettingWithCopyWarning and may not write through.
    demo_cols = ['age_band', 'highest_education', 'imd_band']
    demo = info[['id_student', 'code_module', 'final_result'] + demo_cols].copy()
    demo[demo_cols] = OrdinalEncoder().fit_transform(demo[demo_cols].astype(str))

    # Inner joins: students missing from any of the feature tables are dropped.
    df = agg.merge(demo, on=['id_student', 'code_module'])
    df = df.merge(vle_feat, on='id_student')
    df = df.merge(trend, on='id_student')
    df = df.merge(f14, on='id_student')
    df = df.merge(l7, on='id_student')
    df = df[df['final_result'] != 'Withdrawn']
    df = df[df['count'] >= 2]

    df['target_class'] = df['final_result'].map({'Fail': 0, 'Pass': 1, 'Distinction': 2})
    df['score_click_interaction'] = df['avg_score'] * df['total_clicks']
    # Small epsilon avoids division by zero for students with no clicks.
    df['click_std_ratio'] = df['click_std'] / (df['avg_clicks_per_day'] + 1e-3)

    features = ['avg_score', 'std_score', 'count', 'last_score', 'score_trend', 'weighted_score',
                'total_clicks', 'active_days', 'avg_clicks_per_day', 'click_std',
                'clicks_first_14_days', 'clicks_last_7_days',
                'age_band', 'highest_education', 'imd_band',
                'score_click_interaction', 'click_std_ratio']

    return df, features

# Training Pipeline
# Runs at import time: builds the dataset, fits the Spark preprocessing
# pipeline, trains the MLP, and exposes `model`, `fitted_pipeline`,
# `features` and `feature_means` for the dashboard to import.
spark = SparkSession.builder.appName("StudentNN").getOrCreate()
data_df, features = prepare_student_dataset()
sdf = spark.createDataFrame(data_df.dropna(subset=['target_class']))

# Impute missing feature values, assemble them into one vector, then scale.
pipeline = Pipeline(stages=[
    Imputer(inputCols=features, outputCols=features),
    VectorAssembler(inputCols=features, outputCol="features_vec"),
    StandardScaler(inputCol="features_vec", outputCol="features")
])

fitted_pipeline = pipeline.fit(sdf)
final_df = fitted_pipeline.transform(sdf).select("features", "target_class")

# Collect features and labels from the same rows in a single Spark job so
# X[i] and y[i] are guaranteed to belong to the same student record.
rows = final_df.collect()
X = np.array([row["features"].toArray() for row in rows])
y = np.array([int(row["target_class"]) for row in rows])
X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.long)

model = MLP(X.shape[1], 3)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fn = nn.CrossEntropyLoss()

# Full-batch training for 100 epochs.
model.train()
for _ in range(100):
    optimizer.zero_grad()
    out = model(X_tensor)
    loss = loss_fn(out, y_tensor)
    loss.backward()
    optimizer.step()
# The exported model is only used for inference from here on.
model.eval()

# Map each feature name to its column mean. (Zipping `features` against the
# dict returned by `.to_dict()` would pair names with the dict's *keys*,
# yielding {name: name} instead of the means.)
feature_means = data_df[features].mean().to_dict()

# Scheduler Agent
class SchedulerAgent:
    """Turn per-subject predictions and exam dates into a study schedule.

    Args:
        predicted_scores: dict of subject -> predicted class (0, 1 or 2).
        exam_dates: dict of subject -> exam date as an ISO ``YYYY-MM-DD`` string.
        total_daily_hours: number of study hours available per day.
        subject_inputs: dict of subject -> dict that may contain
            ``"difficulty_level"`` (default 2) and ``"last_score"`` (default 60).
    """

    def __init__(self, predicted_scores, exam_dates, total_daily_hours, subject_inputs):
        self.predicted_scores = predicted_scores
        self.exam_dates = exam_dates
        self.total_daily_hours = total_daily_hours
        self.subject_inputs = subject_inputs

    def _priority(self):
        """Return subject -> priority, min-max scaled to roughly 1..10.

        The raw score grows with a lower predicted class, a higher difficulty
        and a lower last score. Returns an empty dict when there are no subjects.
        """
        if not self.predicted_scores:
            return {}

        raw = {}
        for subj, pred_class in self.predicted_scores.items():
            diff = self.subject_inputs[subj].get("difficulty_level", 2)
            score = self.subject_inputs[subj].get("last_score", 60)
            raw[subj] = (2 - pred_class) * 3 + diff * 2 + (100 - score) / 20

        lo, hi = min(raw.values()), max(raw.values())
        # Epsilon keeps the division defined when all raw scores are equal
        # (every subject then gets priority 1).
        span = hi - lo + 1e-5
        return {s: round(1 + 9 * ((v - lo) / span), 2) for s, v in raw.items()}

    def _days_left(self):
        """Return subject -> whole days until its exam, never less than 1."""
        today = date.today()
        return {s: max(1, (date.fromisoformat(d) - today).days) for s, d in self.exam_dates.items()}

    def run(self):
        """Return a per-subject summary DataFrame sorted by priority (descending).

        "Total Hours Assigned" is the subject's priority share of the daily
        budget multiplied by the days remaining; a subject whose exam is one
        day away is assigned the full daily budget. With no subjects the
        result is an empty frame with the same columns.
        """
        p_scores = self._priority()
        d_left = self._days_left()
        total_priority = sum(p_scores.values())  # loop-invariant
        daily_hours_map = {}

        for s, priority in p_scores.items():
            if d_left[s] == 1:
                daily_hours_map[s] = self.total_daily_hours
            else:
                daily_hours_map[s] = round((priority / total_priority) * self.total_daily_hours * d_left[s], 2)

        subjects = list(self.predicted_scores)
        return pd.DataFrame({
            "Subject": subjects,
            "Predicted Class": [self.predicted_scores[s] for s in subjects],
            "Priority Score": [p_scores[s] for s in subjects],
            "Days Until Exam": [d_left[s] for s in subjects],
            "Total Hours Assigned": [daily_hours_map[s] for s in subjects]
        }).sort_values(by="Priority Score", ascending=False).reset_index(drop=True)

    def generate_daily_plan(self):
        """Return a day-by-day plan with columns Date, Subject, Hours.

        Each day's budget (``total_daily_hours``) is split by priority among
        the subjects whose exam is still ahead. If any of them has its exam
        one day away, the whole day goes to those urgent subjects and the
        remaining ones get 0 hours that day, so the hours planned for a day
        never exceed the daily budget. Returns an empty frame (same columns)
        when there are no subjects.
        """
        columns = ["Date", "Subject", "Hours"]
        summary = self.run()
        if summary.empty:
            return pd.DataFrame(columns=columns)

        today = date.today()
        plan = []
        horizon = int(summary["Days Until Exam"].max())

        for offset in range(horizon):
            current_day = (today + timedelta(days=offset)).isoformat()
            active = summary[summary["Days Until Exam"] > offset]

            # Urgent subjects (exam one day away) take the whole day's budget.
            urgent = active[active["Days Until Exam"] == 1]
            funded = active if urgent.empty else urgent
            funded_subjects = set(funded["Subject"])
            total_priority = funded["Priority Score"].sum()

            for _, row in active.iterrows():
                if row["Subject"] in funded_subjects:
                    hrs = round((row["Priority Score"] / total_priority) * self.total_daily_hours, 2)
                else:
                    hrs = 0.0
                plan.append({"Date": current_day, "Subject": row["Subject"], "Hours": hrs})

        return pd.DataFrame(plan, columns=columns)


Overwriting train_agent.py


# Streamlit Dashboard App

In [4]:
%%writefile app.py

import streamlit as st
import pandas as pd
import torch
import numpy as np
from datetime import date
from train_agent import model, fitted_pipeline, SchedulerAgent, features, feature_means
from pyspark.sql import SparkSession
import plotly.express as px

# Helper Function to predict outcome for subject
def predict(model, pipeline, subject_input_dict):
    """Predict an outcome class for every subject in one batched pass.

    Args:
        model: Trained torch module that maps a float32 feature tensor to
            per-class logits.
        pipeline: Fitted Spark pipeline whose ``transform`` adds a
            ``"features"`` vector column.
        subject_input_dict: dict of subject code -> dict of feature values.
            Features missing from a subject's dict fall back to
            ``feature_means``.

    Returns:
        dict of subject code -> predicted class index (int), in the same
        order as ``subject_input_dict``. Empty input returns ``{}`` without
        starting Spark.
    """
    if not subject_input_dict:
        return {}

    spark = SparkSession.builder.getOrCreate()
    codes = list(subject_input_dict)
    records = [
        {k: feats.get(k, feature_means[k]) for k in features}
        for feats in subject_input_dict.values()
    ]

    # One Spark job for all subjects instead of one job per subject. The
    # explicit row id ties each transformed row back to its subject code
    # regardless of the order in which Spark returns the rows.
    pdf = pd.DataFrame(records, columns=features)
    pdf.insert(0, "_row_id", range(len(codes)))
    sdf = spark.createDataFrame(pdf)
    collected = pipeline.transform(sdf).select("_row_id", "features").collect()
    collected.sort(key=lambda row: row["_row_id"])

    X = np.array([row["features"].toArray() for row in collected])
    X_tensor = torch.tensor(X, dtype=torch.float32)
    with torch.no_grad():
        logits = model(X_tensor)
        classes = torch.argmax(logits, dim=1).tolist()

    return dict(zip(codes, classes))

# Main Dashboard Function
def main():
    """Render the SchedWiz dashboard.

    Collects the user's subjects, exam dates, difficulty and last score,
    then on button click predicts an outcome class per subject, builds the
    schedule with ``SchedulerAgent`` and shows the summary table, a scatter
    chart, the daily plan and CSV download buttons.
    """
    st.set_page_config(layout="wide")
    st.title("🧙‍♂️ SchedWiz: Your AI Study Wizard")
    st.caption("Turn your stress into strategy — Powered by PyTorch & Spark ✨")

    # Subject selection
    subject_codes = ['AAA', 'BBB', 'CCC', 'DDD', 'EEE', 'FFF', 'GGG']
    subject_map = {
        'AAA': 'Social Sciences', 'BBB': 'Business Analytics', 'CCC': 'Computer Science',
        'DDD': 'Mathematics', 'EEE': 'Engineering Design', 'FFF': 'AI Ethics', 'GGG': 'Data Structures'
    }

    st.markdown("### 📝 Select Your Subjects")
    selected = st.multiselect("Choose your subjects (max 7)", options=subject_codes, format_func=lambda x: subject_map[x])

    # Per-subject inputs: exam date (ISO string) and the feature dict passed
    # to the model and scheduler.
    exams, inputs = {}, {}
    for s in selected:
        with st.expander(f"📘 {subject_map.get(s, s)}", expanded=False):
            exams[s] = st.date_input(f"🗓️ When is your exam for {subject_map.get(s, s)}?", value=date.today(), key=f"date_{s}").isoformat()
            diff = st.slider(f"📊 How hard is {subject_map.get(s, s)}?", 1, 3, 2, help="1 = Easy, 3 = Hard", key=f"diff_{s}")
            score = st.slider(f"📈 What was your last score in {subject_map.get(s, s)}?", 0, 100, 60, help="Used to estimate risk of failing", key=f"score_{s}")
            # Every model feature starts at 0.0; only last_score is overridden
            # from the UI. difficulty_level is used by the scheduler only.
            d = {k: 0.0 for k in features}
            d["difficulty_level"] = diff
            d["last_score"] = score
            inputs[s] = d

    hrs = st.slider("⏳ How many hours can you study per day?", 1, 12, 4)

    # Generating Plan On Click
    if st.button("✨ Generate My Smart Schedule"):
        # Without any subject there is nothing to schedule; tell the user
        # instead of surfacing an internal "empty sequence" error.
        if not selected:
            st.warning("Please select at least one subject to generate a schedule.")
            return

        try:
            preds = predict(model, fitted_pipeline, inputs)
            agent = SchedulerAgent(preds, exams, hrs, inputs)
            plan_df = agent.run()
            plan_df['Subject'] = plan_df['Subject'].map(subject_map).fillna(plan_df['Subject'])

            st.markdown("---")
            st.markdown("### 📋 What This Means")
            st.info("""
- **Predicted Class**:
  - 0 = ❌ Fail (score < 30)
  - 1 = ✅ Pass (30–79)
  - 2 = 🌟 Distinction (80+)

- **Priority Score**: Combines AI risk + difficulty + past score, scaled 1–10
""")

            # Creating a plan summary table
            st.markdown("#### 🧠 Smart Summary of Your Subjects")
            st.dataframe(
                plan_df.style
                    .bar(subset=["Total Hours Assigned"], color='#FFA07A')
                    .format({
                        "Total Hours Assigned": "{:.2f}",
                        "Priority Score": "{:.2f}"
                    }),
                use_container_width=True
            )

            # Bubble Chart
            st.markdown("#### 🎯 Visual Allocation of Study Time")
            fig = px.scatter(plan_df, x="Subject", y="Total Hours Assigned", color="Priority Score",
                             size="Priority Score", title="Lollipop-Style Study Allocation", size_max=20)
            st.plotly_chart(fig, use_container_width=True)

            # Daily Plan
            st.markdown("#### 📅 Daily Wizard Plan")
            daily = agent.generate_daily_plan()
            daily['Subject'] = daily['Subject'].map(subject_map).fillna(daily['Subject'])
            # Two decimals (matching the summary table) so fractional hours
            # such as 1.33 are not displayed as "1".
            styled = daily.style.background_gradient(subset=["Hours"], cmap="BuGn").format({"Hours": "{:.2f}"})
            st.dataframe(styled, use_container_width=True)

            # Download Buttons
            st.download_button("⬇️ Download Full Schedule", plan_df.to_csv(index=False), file_name="schedule.csv")
            st.download_button("⬇️ Download Daily Plan", daily.to_csv(index=False), file_name="daily_schedule.csv")

        except Exception as e:
            # Top-level UI boundary: show the failure to the user rather than
            # crashing the app.
            st.error(f"❌ Oops! Something went wrong: {e}")

# Entry Point
# Render the dashboard when this file is executed as a script
# (the notebook launches it with `streamlit run app.py`).
if __name__ == '__main__':
    main()


Overwriting app.py


# Streamlit + ngrok Launch Cell for Colab

In [5]:
import threading
import time
from pyngrok import ngrok

def run_streamlit():
    """Launch the Streamlit app (app.py) through the shell, discarding its output.

    The shell call does not return while the server is running, which is
    why the caller invokes this function on a background thread.
    """
    !streamlit run app.py &> /dev/null

# Starting Streamlit in a background thread
# (daemon=True so the thread does not keep the process alive on shutdown)
thread = threading.Thread(target=run_streamlit, daemon=True)
thread.start()

# Waiting briefly for Streamlit to boot
# NOTE: this is a fixed 5-second delay, not a readiness check.
time.sleep(5)

# Opening a tunnel to port 8501
# NOTE(review): assumes Streamlit is listening on 8501 (no --server.port is
# passed when launching app.py) — confirm if the port is ever changed.
public_url = ngrok.connect(addr="http://localhost:8501", proto="http")
print(f"Your SchedWiz app is live at: {public_url}")


Your SchedWiz app is live at: NgrokTunnel: "https://0a97-34-82-54-174.ngrok-free.app" -> "http://localhost:8501"
