In [None]:
!pip install requests

In [None]:
import requests
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset

import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Function to load data
def populate_df(slug, name, max_reviews=None):
  """Fetch one professor's reviews from the PlanetTerp API as a DataFrame.

  Args:
    slug: Value sent as the ``slug`` query parameter.
    name: Value sent as the ``name`` query parameter.
    max_reviews: Optional cap for quick test runs. When given, only the first
      ``max_reviews`` entries of the API's ``reviews`` list are used.
      ``None`` (the default) keeps every review.

  Returns:
    A DataFrame with columns ``review`` and ``rating`` and a 0..n-1 index,
    one row per review, in the reverse of the order the API returned them.
    Column dtypes are whatever pandas infers from the values.

  Raises:
    requests.HTTPError: if the API answers with an error status.
    KeyError: if the JSON payload has no ``reviews`` entry.
  """
  # Get the JSON from Planet Terp
  url = "https://planetterp.com/api/v1/professor"
  response = requests.get(
      url,
      params={"slug": slug, "name": name, "reviews": "true"},
      timeout=30,  # don't let one stalled request hang the whole notebook
  )

  print("Status Code:", response.status_code)
  # Fail here with a clear HTTP error instead of a confusing JSON/KeyError below
  response.raise_for_status()

  payload = response.json()

  # Extract text and numeric rating from the JSON
  reviews = payload['reviews']
  if max_reviews is not None:
    reviews = reviews[:max_reviews]

  # Reverse the API order (the author did this so that the most recent reviews
  # show up first when validating the frame by eye), and build the frame in
  # one shot rather than growing it row by row.
  rows = [(entry['review'], entry['rating']) for entry in reversed(reviews)]
  return pd.DataFrame(rows, columns=['review', 'rating'])

In [None]:
# Populate dataframes

# (slug, name) pairs in fetch order. The "good" / "mid" / "bad" groupings are
# informal: no statistical analysis was done to back up the claims on the
# quality of these professors.
professors = [
    # The "good" professors
    ("padua-perez", "Nelson Padua-Perez"),
    ("wyss-gallifent", "Justin Wyss-Gallifent"),
    # "mid" professor
    ("sahinoglu", "Hatice Sahinoglu"),
    # The "bad" professors
    ("mignerey", "Alice Mignerey"),
    ("rosca", "Raluca Rosca"),
    # More professors with... "balanced" reviews to even out the class imbalance, hopefully
    ("cukier", "Michel Cukier"),
    ("yoon_ilchul", "Ilchul Yoon"),
    ("kruskal", "Clyde Kruskal"),
    ("kleiss", "Michael Kleiss"),
    ("li_liyi", "Liyi Li"),
    ("salako", "Olubukola Salako"),
    ("pilachowski", "Timothy Pilachowski"),
    # More professors to fill in the 2/3/4 star area
    ("bakalian", "Cliff Bakalian"),
    ("koppel", "Monique Koppel"),
    ("baxter_ashley", "Ashley Baxter"),
    ("rainbolt_james", "James Rainbolt"),
    ("guest_christiana", "Christiana Guest"),
    ("fernandes", "Jonathan Fernandes"),
    ("herman_larry", "Larry Herman"),
]

# One API call per professor, in the order listed above
frames = [populate_df(slug, name) for slug, name in professors]

# Keep the individual per-professor frames addressable by name
(df_nelson, df_justin, df_hatice, df_mignerey, df_raluca,
 df_cukier, df_yoon, df_kruskal, df_kleiss, df_li, df_salako, df_pilachowski,
 df_cliff, df_koppel, df_baxter, df_rainbolt, df_guest, df_fernandes, df_herman,
 ) = frames

# Stack everything into a single frame with a fresh 0..n-1 index
df_total = pd.concat(frames, ignore_index=True)

In [None]:
# Data exploration

# Class imbalance: share of all reviews that falls on each star rating
print(df_total['rating'].value_counts()/len(df_total))

model_name = "Qwen/Qwen2.5-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize every review once and reuse the counts for both reports below
# (encoding the whole corpus is the slow part of this cell).
token_counts = df_total['review'].apply(lambda x: len(tokenizer.encode(x)))

# Max tokens: 1527 (the author's observation from an earlier run)
print(token_counts.describe())
# Find # of responses that exceed max token count of 512
percent_above_512 = (token_counts > 512).mean() * 100
print(f"Percentage of reviews longer than 512 tokens: {percent_above_512:.2f}%")

In [None]:
# Data cleaning
# Since they make up a small part of the data and increasing the token count would slow down computation significantly, just throw them out
fits_in_context = df_total['review'].apply(lambda x: len(tokenizer.encode(x))) <= 512
# reset_index: boolean filtering leaves gaps in the index, and Hugging Face's
# Dataset.from_pandas stores any non-Range index as an extra column by default.
df_total = df_total[fits_in_context].reset_index(drop=True)

In [None]:
# Code taken from the following:
# https://medium.com/@prabhatzade/freezing-layers-and-fine-tuning-transformer-models-in-pytorch-a-simple-guide-119cad0980c6

model = AutoModelForCausalLM.from_pretrained(model_name,
                                             device_map="auto", # uses GPU if available, CPU if not
                                             dtype=torch.float32 # safest, works everywhere
                                            )
print(model)
# Convert the df to a dataset for use in the trainer
dataset = Dataset.from_pandas(df_total)

# Test train split via hugging face method.
# The seed is fixed so that a Restart & Run All reproduces the same held-out
# reviews (and therefore comparable accuracy numbers) on every run.
dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = dataset["train"]
test_dataset = dataset["test"]
# pandas copy of the held-out split, taken before any prompt/tokenizer columns are added
df_test = test_dataset.to_pandas()

# reformat data as a prompt for the trainer
def create_prompt(row, training):
  """Attach a chat-formatted prompt to ``row`` under the ``'text'`` key.

  The prompt holds a system instruction, the review followed by the star
  question as the user turn, and the opening of the assistant turn. For
  training rows the true rating and the closing ``<|im_end|>`` marker are
  appended so the model can learn to emit the rating; for evaluation rows the
  assistant turn is left open so the label does not leak into the input.

  Args:
    row: Mapping with ``'review'`` and ``'rating'`` entries; modified in place.
    training: Whether to append the true rating to the prompt.

  Returns:
    The same ``row`` object, now carrying ``'text'``.
  """
  # Pieces shared by both variants, up to and including the assistant header
  prompt_parts = [
      "<|im_start|>system\n",
      "You output a single digit 1–5. No words. No explanation.\n",
      "<|im_end|>\n",
      "<|im_start|>user\n",
      f"{row['review']}\nHow many stars, from 1-5, did this review give?\n",
      "<|im_end|>\n",
      "<|im_start|>assistant\n",
  ]

  # Only training rows get the answer and the end-of-turn marker
  if training:
    prompt_parts.append(f"{row['rating']}\n")
    prompt_parts.append("<|im_end|>")

  row['text'] = "".join(prompt_parts)
  return row

# tokenize data
def _find_sublist(sequence, pattern, start=0):
    """Return the first index >= ``start`` where ``pattern`` occurs in ``sequence``, or None."""
    width = len(pattern)
    for i in range(start, len(sequence) - width + 1):
        if sequence[i:i + width] == pattern:
            return i
    return None


def tokenize(batch):
    """Tokenize a batch of prompts and build loss labels for the assistant answer only.

    Args:
      batch: Mapping whose ``"text"`` entry is the list of prompt strings.

    Returns:
      The tokenizer output (padded/truncated to 512 tokens) with an added
      ``"labels"`` entry: one list per example, the same length as its
      ``input_ids``. Every position is -100 (the ignore index for the
      causal-LM loss) except the non-padding tokens between the
      ``<|im_start|>assistant`` header and the next ``<|im_end|>`` (or the end
      of the sequence when there is no closing marker).
    """
    tokenized = tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=512
    )
    # NOTE(review): if a formatted prompt is longer than max_length, truncation
    # may cut off the assistant answer, leaving that example with little or no
    # unmasked label -- confirm the prompts fit.

    # The marker encodings are identical for every example, so compute them once
    assistant_tokens = tokenizer.encode("<|im_start|>assistant", add_special_tokens=False)
    end_token = tokenizer.encode("<|im_end|>", add_special_tokens=False)
    pad_id = tokenizer.pad_token_id

    # Mask everything except the star prediction.
    # Author's note: not rigorously tested with CV given the time taken to train
    # a model, but no accuracy improvement was noticed compared with masking
    # only the padding tokens.
    labels = []
    for ids in tokenized["input_ids"]:
        label_ids = [-100] * len(ids)  # mask everything by default

        header_at = _find_sublist(ids, assistant_tokens)
        if header_at is not None:
            # The answer starts right after the assistant header ...
            start_index = header_at + len(assistant_tokens)
            # ... and runs up to the next end marker, or to the end of the sequence
            end_index = _find_sublist(ids, end_token, start_index)
            if end_index is None:
                end_index = len(ids)

            # Unmask only the assistant answer tokens (never the padding)
            for i in range(start_index, end_index):
                if ids[i] != pad_id:
                    label_ids[i] = ids[i]

        labels.append(label_ids)

    tokenized["labels"] = labels
    return tokenized

# Build the prompt text for each split, then tokenize it.
# Only the training prompts include the true rating.
train_dataset = (
    train_dataset
    .map(lambda row: create_prompt(row, training=True))
    .map(tokenize, batched=True)
)
test_dataset = (
    test_dataset
    .map(lambda row: create_prompt(row, training=False))
    .map(tokenize, batched=True)
)

# Have both datasets hand back torch tensors
train_dataset.set_format("torch")
test_dataset.set_format("torch")

# Freeze the first few blocks in model.model.layers to avoid overfitting and
# leave the remaining blocks trainable. Parameters outside model.model.layers
# are not touched here.
n_frozen_layers = 6
for layer_idx, layer in enumerate(model.model.layers):
    trainable = layer_idx >= n_frozen_layers
    for param in layer.parameters():
        param.requires_grad = trainable

# Define training arguments
training_args = TrainingArguments(
    # Where outputs and logs are written
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    report_to="none",
    # Batch sizing
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4, # Effectively have 4x batch size
    # Training length
    num_train_epochs=4,
)

# Define a Trainer around the partially frozen model
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Start training
trainer.train()

# Save the fine-tuned weights and the tokenizer side by side
trainer.save_model("./results")
tokenizer.save_pretrained("./results")

# Reload both from the saved directory, so inference uses exactly what was
# written to disk and does not depend on a second hard-coded model name.
model = AutoModelForCausalLM.from_pretrained("./results", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("./results")

In [None]:
# Some code courtesy of Maksym Morawski
# Predictions
def predict_review(review):
  """Ask the model for the star rating of one review.

  Builds the same prompt used for evaluation rows (assistant turn left open),
  generates exactly one new token without sampling, and decodes it.

  Args:
    review: The review text to rate.

  Returns:
    The decoded text of the single generated token, with special tokens
    dropped and surrounding whitespace stripped.
  """
  prompt = "".join([
      "<|im_start|>system\n",
      "You output a single digit 1–5. No words. No explanation.\n",
      "<|im_end|>\n",
      "<|im_start|>user\n",
      f"{review}\nHow many stars, from 1-5, did this review give?\n",
      "<|im_end|>\n",
      "<|im_start|>assistant\n",
  ])

  # Tokenize and move the tensors to wherever the model lives
  encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
  prompt_length = encoded["input_ids"].shape[1]

  # No gradients are needed at inference time
  with torch.no_grad():
    generated = model.generate(**encoded, max_new_tokens=1, do_sample=False)

  # generate() returns prompt + continuation; keep only the continuation
  new_tokens = generated[0][prompt_length:]
  return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

# Score every held-out review; a prediction is correct only on an exact star match.
UNPARSABLE_PREDICTION = 0  # recorded when the model's output is not an integer

predictions = []
n_correct = 0
n_unparsable = 0

# Iterate positionally over the two columns so this does not depend on
# df_test having a 0..n-1 index.
for review_text, true_rating in zip(df_test['review'], df_test['rating']):
  raw_output = predict_review(review_text)
  try:
    prediction = int(raw_output)
  except ValueError:
    # The model emitted something that is not a number (e.g. a word or an
    # empty string). Count it as a wrong answer instead of aborting the run.
    prediction = UNPARSABLE_PREDICTION
    n_unparsable += 1
  predictions.append(prediction)
  if prediction == true_rating:
    n_correct += 1

if n_unparsable:
  print(f"Warning: {n_unparsable} prediction(s) were not numeric; "
        f"recorded as {UNPARSABLE_PREDICTION} and scored as incorrect")

acc = n_correct / len(df_test)

print(acc)

In [None]:
# Confusion matrix of actual vs. predicted stars, as a percentage of the scored reviews.
star_labels = [1, 2, 3, 4, 5]

# labels= pins the matrix to the five star classes, so it always lines up with
# the tick labels below even if a class is missing from the test split or the
# predictions. normalize='all' returns fractions of the total, so scale by 100
# to match the "(%)" in the title.
cm = confusion_matrix(df_test['rating'], predictions, labels=star_labels, normalize='all') * 100

fontcol = "#000000"
plt.rcParams.update({
    'font.family': 'DejaVu Sans',
    'font.stretch': 'condensed',
    'font.weight': 'bold',
    'text.color': fontcol,
    'axes.labelcolor': fontcol,
    'xtick.color': fontcol,
    'ytick.color': fontcol,
})

label_font = {
    'family': 'DejaVu Sans',   # bundled with matplotlib
    'weight': 'bold',
    'stretch': 'condensed',    # makes it narrower (closest to Anton look)
    'size': 14
}

tick_labels = [str(star) for star in star_labels]

fig, ax = plt.subplots(figsize=(6, 5), facecolor="none")
sns.heatmap(cm, annot=True, fmt=".1f", cmap="YlOrBr", cbar=True,
            xticklabels=tick_labels, yticklabels=tick_labels, ax=ax)

ax.set_xlabel("Predicted Stars", fontdict=label_font)
ax.set_ylabel("Actual Stars", fontdict=label_font)
ax.set_title("Confusion Matrix (%)", fontdict=label_font)
plt.show()