In [1]:
import os
import json
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

try:
    import tensorflow as tf
    from tensorflow import keras
    TF_AVAILABLE = True
except Exception:
    TF_AVAILABLE = False
    tf = None
    keras = None

import lightgbm as lgb

# --------------------------
# 1) DATA (from parsed PDF)
# --------------------------
# We use the same small data we discussed earlier (manually constructed)
# This guarantees reproducibility of the demo.
# Student roster: one tuple per student, fields in the order given by `columns`.
students = pd.DataFrame(
    [
        ("S1", "Daniel", "gaming, music", "English", "Python", "beginner"),
        ("S2", "Aisha", "reading, chess", "French", "JavaScript", "beginner"),
        ("S3", "Kelvin", "football, coding", "Spanish", "Python", "intermediate"),
        ("S4", "Maria", "art, movies", "English", "Java", "beginner"),
        ("S5", "Zainab", "music, blogging", "German", "Python", "advanced"),
        ("S6", "Chidi", "coding, robotics", "French", "C++", "intermediate"),
        ("S7", "Fatima", "fashion, makeup", "English", "JavaScript", "beginner"),
        ("S8", "John", "gaming, fitness", "Spanish", "Python", "intermediate"),
        ("S9", "Leah", "reading, k-drama", "Korean", "JavaScript", "beginner"),
        ("S10", "Emmanuel", "robotics, coding", "English", "Rust", "advanced"),
    ],
    columns=["student_id", "name", "hobbies", "learning_language", "programming_focus", "level"],
)

# Tutor roster: one tuple per tutor, fields in the order given by `columns`.
tutors = pd.DataFrame(
    [
        ("T1", "Mr. Smith", "English", "Python", "structured", "gaming"),
        ("T2", "Ms. Claire", "French", "JavaScript", "interactive", "reading"),
        ("T3", "Juan Pablo", "Spanish", "Python", "conversational", "football"),
        ("T4", "Linda", "English", "Java", "project-based", "art"),
        ("T5", "Hans", "German", "Python", "structured", "blogging"),
        ("T6", "Pierre", "French", "C++", "analytical", "robotics"),
        ("T7", "Yvonne", "English", "JavaScript", "interactive", "fashion"),
        ("T8", "Carlos", "Spanish", "Python", "hands-on", "fitness"),
        ("T9", "Min-ji", "Korean", "JavaScript", "visual", "k-drama"),
        ("T10", "Adrian", "English", "Rust", "advanced-concepts", "coding"),
    ],
    columns=["tutor_id", "name", "expertise_language", "programming_specialty", "teaching_style", "hobbies"],
)

# Small helper to tokenize hobbies -> set for overlap detection
def tokenize(text):
    """Split a comma-separated string into a set of trimmed, lower-cased tokens.

    Args:
        text: Comma-separated values such as ``"gaming, music"``. ``None`` and
            float NaN (what pandas stores for an empty cell) are accepted.

    Returns:
        set: The unique non-empty tokens; an empty set when ``text`` is
        missing or contains no tokens.
    """
    # Missing cells have no .split(); treat them as "no tokens" instead of crashing.
    # `text != text` is the NaN test (NaN is the only float unequal to itself).
    if text is None or (isinstance(text, float) and text != text):
        return set()
    return {token.strip().lower() for token in text.split(",") if token.strip()}

# Parse each free-text hobbies column once so later overlap checks are plain set operations.
students['hobby_set'] = students['hobbies'].map(tokenize)
tutors['hobby_set'] = tutors['hobbies'].map(tokenize)

# Pedagogical prior: which tutor teaching styles are treated as a good fit for each student level.
level_style_preference = dict(
    beginner={"structured", "interactive", "visual"},
    intermediate={"hands-on", "conversational", "project-based", "interactive"},
    advanced={"advanced-concepts", "analytical", "project-based"},
)



2025-11-21 10:58:57.309263: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1763722737.337665    2828 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1763722737.346407    2828 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

In [2]:
# 2) PSEUDO-LABELS (heuristic)
# --------------------------
# We use the same interpretable heuristic from before to create integer relevance labels.
# Why? Because real labels (student ratings, accepts) were not provided. Pseudo-labels
# let the ML model learn the heuristic pattern and generalize.
def heuristic_score(student_row, tutor_row):
    """Integer relevance of a tutor for a student, used as a pseudo-label.

    The score is a weighted sum of four signals:
      * +3 when the student's learning language equals the tutor's expertise
        language (case-insensitive),
      * +4 when the programming focus equals the tutor's programming
        specialty (case-insensitive),
      * +1 for every hobby token the two ``hobby_set`` values share,
      * +2 when the tutor's teaching style is listed in
        ``level_style_preference`` for the student's level.

    Args:
        student_row: Mapping-like row with ``learning_language``,
            ``programming_focus``, ``hobby_set`` and ``level``.
        tutor_row: Mapping-like row with ``expertise_language``,
            ``programming_specialty``, ``hobby_set`` and ``teaching_style``.

    Returns:
        int: The heuristic relevance score.

    Raises:
        KeyError: If the student's level is not a key of
            ``level_style_preference``.
    """
    same_language = student_row["learning_language"].lower() == tutor_row["expertise_language"].lower()
    same_programming = student_row["programming_focus"].lower() == tutor_row["programming_specialty"].lower()
    shared_hobbies = student_row['hobby_set'].intersection(tutor_row['hobby_set'])
    style_fits = tutor_row["teaching_style"].lower() in level_style_preference[student_row["level"]]
    # Booleans multiply as 0/1, so each matched signal contributes exactly its weight.
    return 3 * same_language + 4 * same_programming + len(shared_hobbies) + 2 * style_fits

# Build pairwise dataset: for each student, one row per tutor (this is how ranking datasets are usually framed)
def _pair_record(s, t):
    """Build the feature/label record for one (student row, tutor row) pair."""
    same_language = s["learning_language"].lower() == t["expertise_language"].lower()
    same_programming = s["programming_focus"].lower() == t["programming_specialty"].lower()
    shared_hobbies = s['hobby_set'].intersection(t['hobby_set'])
    style_fits = t["teaching_style"].lower() in level_style_preference[s["level"]]
    return {
        "student_id": s["student_id"],
        "tutor_id": t["tutor_id"],
        "language_match": int(same_language),
        "programming_match": int(same_programming),
        "hobby_overlap": len(shared_hobbies),
        "teaching_style_match": int(style_fits),
        "student_level": s["level"],
        "student_prog": s["programming_focus"],
        "tutor_prog": t["programming_specialty"],
        "label": heuristic_score(s, t),   # integer relevance label
    }

# Cross every student with every tutor; students form the outer loop, so each
# student's candidate rows end up contiguous in the resulting frame.
pair_rows = [
    _pair_record(s, t)
    for _, s in students.iterrows()
    for _, t in tutors.iterrows()
]

df = pd.DataFrame(pair_rows)

# Persist the pairwise table so it can be inspected outside the notebook.
os.makedirs("/kaggle/working/model_outputs", exist_ok=True)
df.to_csv("/kaggle/working/model_outputs/pairwise_dataset.csv", index=False)



In [3]:
# --------------------------
# 3) FEATURE ENCODING
# --------------------------
# Use both simple binary/numeric features and small categorical encodings.
# Label encoding for small categorical variables is sufficient for tree models; we scale for NN.
# Fit both encoders on the combined student + tutor vocabulary. Fitting each column
# separately would give the same language different integer codes whenever the two
# columns do not contain exactly the same set of languages, making the two encoded
# features incomparable. (For the current data the vocabularies coincide, so the
# codes are unchanged.)
prog_vocab = pd.concat([df['student_prog'], df['tutor_prog']], ignore_index=True)
label_student_prog = LabelEncoder().fit(prog_vocab)
label_tutor_prog = LabelEncoder().fit(prog_vocab)

df['student_prog_le'] = label_student_prog.transform(df['student_prog'])
df['tutor_prog_le'] = label_tutor_prog.transform(df['tutor_prog'])

# Model inputs: four match/overlap signals plus the two integer-coded language columns.
features = ["language_match", "programming_match", "hobby_overlap", "teaching_style_match", "student_prog_le", "tutor_prog_le"]
X = df[features].values
y = df['label'].values



In [4]:
# --------------------------
# 4) TRAIN / TEST SPLIT (grouped by student)
# --------------------------
# For ranking you must keep all items of a query (student) together.
def _group_sizes(frame):
    """Return per-student row counts in the order the students appear in ``frame``.

    LightGBM interprets ``group`` positionally: the first ``group[0]`` rows are
    query 1, the next ``group[1]`` rows query 2, and so on. The sizes must
    therefore follow the row order of the data, not the sorted order of the ids.
    """
    ids = frame['student_id']
    # A new run starts wherever the id differs from the previous row; if a student's
    # rows were split into several runs, positional group sizes could not describe them.
    assert (ids != ids.shift()).sum() == ids.nunique(), "rows of each student must be contiguous"
    return frame.groupby('student_id', sort=False).size().tolist()

# NOTE: ids are strings, so sorted() orders them lexicographically ("S1", "S10", "S2", ...).
# The hold-out is the last two ids in *that* order, not the last two students in the data.
student_ids = sorted(df['student_id'].unique())
train_students = student_ids[:8]
test_students = student_ids[8:]

# Boolean filtering keeps the original row order, so each student's rows stay together.
train_df = df[df['student_id'].isin(train_students)].reset_index(drop=True)
test_df  = df[df['student_id'].isin(test_students)].reset_index(drop=True)

X_train = train_df[features].values
y_train = train_df['label'].values
X_test  = test_df[features].values
y_test  = test_df['label'].values

# LightGBM requires group sizes (number of items per query), listed in row order.
group_train = _group_sizes(train_df)
group_test  = _group_sizes(test_df)



In [5]:
# --------------------------
# 5) LIGHTGBM LAMBDARANK (true learning-to-rank)
# --------------------------
# Why lambdarank? It optimizes ranking metrics (NDCG) directly and is standard for ranking tasks.
# Wrap the arrays as LightGBM datasets; `group` carries the per-student row counts.
lgb_train = lgb.Dataset(X_train, label=y_train, group=group_train)
lgb_val = lgb.Dataset(X_test, label=y_test, group=group_test, reference=lgb_train)

lgb_params = dict(
    objective="lambdarank",
    metric="ndcg",
    ndcg_eval_at=[1, 3],   # report NDCG@1 and NDCG@3
    learning_rate=0.1,
    num_leaves=31,
    min_data_in_leaf=1,
    verbosity=-1,
    seed=42,
)

# Up to 100 boosting rounds, stopping once the validation metric has not improved
# for 10 rounds. NOTE: the validation set here is built from the held-out test
# students, so the stopping point is chosen on the same data that is evaluated later.
model = lgb.train(
    lgb_params,
    lgb_train,
    num_boost_round=100,
    valid_sets=[lgb_train, lgb_val],
    valid_names=['train', 'valid'],
    callbacks=[lgb.early_stopping(stopping_rounds=10)],
)

# Persist the trained booster as a text model file.
model.save_model("/kaggle/working/model_outputs/lambdarank_model.txt")



Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[3]	train's ndcg@1: 1	train's ndcg@3: 0.997598	valid's ndcg@1: 1	valid's ndcg@3: 1


<lightgbm.basic.Booster at 0x79a8d0b3c390>

In [6]:
# --------------------------
# 6) EVALUATION: NDCG@K (per student)
# --------------------------
def dcg_at_k(rels, k):
    """Discounted cumulative gain over the first ``k`` relevance values.

    Uses the exponential gain ``2**rel - 1`` and a ``log2(rank + 1)`` discount
    with ranks starting at 1.

    Args:
        rels: Relevance values already arranged in ranked order (best-ranked first).
        k: Number of leading positions to include.

    Returns:
        float: The DCG value; ``0.0`` when there is nothing to score.
    """
    # np.asfarray was removed in NumPy 2.0; asarray with dtype=float is the equivalent.
    top = np.asarray(rels, dtype=float)[:k]
    if top.size == 0:
        return 0.0
    # Positions 1..n map to discounts log2(2)..log2(n + 1).
    discounts = np.log2(np.arange(2, top.size + 2))
    return np.sum((2**top - 1) / discounts)

def ndcg_at_k(true_rels, pred_scores, k):
    """Normalised DCG@k of a ranking induced by predicted scores.

    Args:
        true_rels: Ground-truth relevance per item.
        pred_scores: Predicted score per item, aligned with ``true_rels``;
            items are ranked by descending score.
        k: Cut-off rank.

    Returns:
        float: DCG@k of the predicted ordering divided by DCG@k of the ideal
        ordering, or ``0.0`` when the ideal DCG is zero.
    """
    best_possible = dcg_at_k(np.flip(np.sort(true_rels)), k)
    if best_possible == 0:
        return 0.0
    # Indices of the items from highest to lowest predicted score.
    ranking = np.flip(np.argsort(pred_scores))
    achieved = dcg_at_k(np.asarray(true_rels)[ranking], k)
    return achieved / best_possible

def evaluate_by_student(model_predict_fn, df_group, k=3):
    """Compute NDCG@k separately for every student in ``df_group``.

    Args:
        model_predict_fn: Callable taking a feature matrix and returning one
            score per row.
        df_group: Frame with a ``student_id`` column, a ``label`` column and
            the columns named in the module-level ``features`` list.
        k: Cut-off rank for NDCG.

    Returns:
        pandas.DataFrame: One row per student (in sorted id order) with the
        columns ``student_id`` and ``ndcg@{k}``.
    """
    metric_col = f"ndcg@{k}"
    ids = df_group['student_id']
    records = []
    for student in sorted(ids.unique()):
        candidates = df_group[ids == student]
        scores = model_predict_fn(candidates[features].values)
        records.append({
            "student_id": student,
            metric_col: ndcg_at_k(candidates['label'].values, scores, k),
        })
    return pd.DataFrame(records)



In [None]:
# LightGBM predictions wrapper
lgb_pred_fn = lambda X: model.predict(X, num_iteration=model.best_iteration)

lgb_eval = evaluate_by_student(lgb_pred_fn, test_df, k=3)
lgb_mean_ndcg3 = lgb_eval[f"ndcg@3"].mean()

# --------------------------
# 7) NEURAL NETWORK (pointwise ranking / deep regressor)
# --------------------------
# A pointwise NN learns to predict the label (relevance score); ranking follows from predicted scores.
# Both results default to None so later cells can test them even when the NN is skipped.
nn_eval = None
nn_mean_ndcg3 = None
if TF_AVAILABLE:
    # Seed the Python, NumPy and TensorFlow RNGs so weight initialisation and batch
    # shuffling are repeatable across Restart & Run All.
    keras.utils.set_random_seed(42)

    # Standardise features for the NN; the scaler is fitted on the training rows only.
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled  = scaler.transform(X_test)

    # small MLP regressor (dense layers). We keep it simple due to small dataset.
    def build_mlp(inp_dim):
        """Return a compiled 64-32-1 dense regressor (Adam, lr=0.01, MSE loss) for ``inp_dim`` features."""
        model_nn = keras.Sequential([
            keras.layers.Input(shape=(inp_dim,)),
            keras.layers.Dense(64, activation='relu'),
            keras.layers.Dense(32, activation='relu'),
            keras.layers.Dense(1, activation='linear')  # regress relevance
        ])
        model_nn.compile(optimizer=keras.optimizers.Adam(learning_rate=0.01), loss='mse')
        return model_nn

    mlp = build_mlp(X_train_scaled.shape[1])
    # Stop after 10 epochs without val_loss improvement and roll back to the best weights.
    es = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    # NOTE: validation_split holds out the trailing 10% of the training rows (taken
    # before shuffling), i.e. rows belonging to the last training student(s).
    mlp.fit(X_train_scaled, y_train, validation_split=0.1, epochs=200, batch_size=8, callbacks=[es], verbose=0)

    # Save NN and scaler
    mlp.save("/kaggle/working/model_outputs/nn_pointwise_regressor.h5")
    import joblib
    joblib.dump(scaler, "/kaggle/working/model_outputs/nn_scaler.pkl")

    # NN predict wrapper
    def nn_pred_fn(X):
        """Scale raw features with the fitted scaler and return a flat array of NN scores."""
        # verbose=0 keeps one progress bar per call out of the notebook output.
        return mlp.predict(scaler.transform(X), verbose=0).reshape(-1)

    nn_eval = evaluate_by_student(nn_pred_fn, test_df, k=3)
    nn_mean_ndcg3 = nn_eval["ndcg@3"].mean()
else:
    print("TensorFlow not installed — skipping neural network training. Install TensorFlow to run the NN block.")



In [9]:
# --------------------------
# 8) Save evaluation summary and top-K recommendations
# --------------------------
# Assemble the metrics report key by key (insertion order is the order written to JSON).
summary = {}
summary["lambdarank_ndcg_by_student"] = lgb_eval.to_dict(orient='records')
summary["lambdarank_mean_ndcg@3"] = float(lgb_mean_ndcg3)
summary["nn_available"] = bool(TF_AVAILABLE)
# NN metrics are only present when the network was actually trained and evaluated.
if TF_AVAILABLE and nn_eval is not None:
    summary["nn_ndcg_by_student"] = nn_eval.to_dict(orient='records')
    summary["nn_mean_ndcg@3"] = float(nn_mean_ndcg3)

with open("/kaggle/working/model_outputs/evaluation_summary.json", "w") as f:
    f.write(json.dumps(summary, indent=2))

# Save top-3 per test student for each trained model
def topk_for_student(model_predict_fn, df_group, k=3):
    """Return the ``k`` highest-scoring candidate rows of ``df_group``.

    Args:
        model_predict_fn: Callable taking a feature matrix and returning one
            score per row.
        df_group: Candidate rows (the callers pass one student's rows) holding
            the columns named in the module-level ``features`` list plus
            ``student_id``, ``tutor_id`` and ``label``.
        k: Number of rows to keep.

    Returns:
        pandas.DataFrame: The top-``k`` rows by descending ``pred`` with the
        columns ``student_id``, ``tutor_id``, ``pred`` and ``label``. The input
        frame is not modified.
    """
    scores = model_predict_fn(df_group[features].values)
    # assign() works on a copy, so the caller's frame never gains a 'pred' column.
    ranked = df_group.assign(pred=scores).sort_values('pred', ascending=False)
    return ranked.head(k)[['student_id', 'tutor_id', 'pred', 'label']]

# Models to export recommendations for: LightGBM always, the NN only when it was trained.
rec_models = [('lambdarank', lgb_pred_fn)]
if TF_AVAILABLE and nn_eval is not None:
    rec_models.append(('nn_pointwise', nn_pred_fn))

# Top-3 tutors per held-out student, grouped model by model.
all_recs = []
for model_name, predict_fn in rec_models:
    for sid in sorted(test_df['student_id'].unique()):
        grp = test_df[test_df['student_id'] == sid]
        recs = topk_for_student(predict_fn, grp, k=3)
        recs['model'] = model_name
        all_recs.append(recs)

if all_recs:
    recs_df = pd.concat(all_recs, ignore_index=True)
    recs_df.to_csv("/kaggle/working/model_outputs/test_recommendations.csv", index=False)

print("Finished training. Outputs written to /kaggle/working/model_outputs")
print(os.listdir("/kaggle/working/model_outputs"))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
Finished training. Outputs written to /kaggle/working/model_outputs
['pairwise_dataset.csv', 'lambdarank_model.txt', 'test_recommendations.csv', 'nn_scaler.pkl', 'nn_pointwise_regressor.h5', 'evaluation_summary.json']
