# In [None]:
from email.mime import text
import planetterp
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import Dataset
from torch.utils.data import DataLoader
import torch
import evaluate
from transformers import AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments


# Download each professor's PlanetTerp record, reviews included.

professor1 = planetterp.professor(name="Michael McCourt", reviews=True)
professor2 = planetterp.professor(name="Samuel Kerstein", reviews=True)
professor3 = planetterp.professor(name="Antong Liu", reviews=True)
professor4 = planetterp.professor(name="Hallie Liberto", reviews=True)
professor5 = planetterp.professor(name="Peter Carruthers", reviews=True)

# Flatten all reviews into one list of training records: the review text is
# the model input and the rating (cast to float) is the regression target.

professor_records = (professor1, professor2, professor3, professor4, professor5)

data = [
    {"text": entry["review"], "label": float(entry["rating"])}
    for record in professor_records
    for entry in record["reviews"]
]


# Load the tokenizer of the checkpoint that is actually fine-tuned further
# down ("distilbert-base-uncased"), so tokenizer and model always match.
# A BERT classification model used to be loaded here as well, but it was
# overwritten before ever being used, so that redundant load is gone.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Wrap the list of {"text", "label"} records in a Hugging Face Dataset.
dataset = Dataset.from_list(data)


def tokenize(batch):
    """Tokenize the ``"text"`` field of ``batch`` with the module-level tokenizer.

    Intended for ``Dataset.map(..., batched=True)``. Sequences are truncated
    and padded to a fixed length of 256 tokens (``padding="max_length"``).

    Args:
        batch: Mapping that holds the texts to encode under the key ``"text"``.

    Returns:
        The encoding produced by the tokenizer call.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 256,
    }
    texts = batch["text"]
    return tokenizer(texts, **encode_options)

# Tokenize the whole dataset once. The target column is renamed to "labels"
# and only the tensors the model consumes are exposed in torch format.
tokenized = dataset.map(tokenize, batched=True)
tokenized = tokenized.rename_column("label", "labels")
tokenized.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# Splitting dataset
# The already-tokenized dataset is split (instead of splitting the raw data
# and tokenizing everything a second time). The fixed seed keeps the
# train/eval partition identical between runs so metrics are comparable.

split_datasets = tokenized.train_test_split(test_size=0.2, seed=42)

train_dataset = split_datasets['train']
test_dataset = split_datasets['test']

# Kept as separate names because the Trainer setup refers to them.
tokenized_train = train_dataset
tokenized_eval = test_dataset

# Build the loaders on the tokenized splits so they yield tensor batches
# rather than raw review strings.
train_loader = DataLoader(tokenized_train, batch_size=8, shuffle=True)
eval_loader = DataLoader(tokenized_eval, batch_size=8)

# Setting up optimizer
# No optimizer is built by hand; the Trainer creates its own from the
# learning_rate / weight_decay values given in TrainingArguments below.


# Loading model
# num_labels=1 gives the classification head a single output, which is used
# here as the predicted rating.

model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=1)

# Training the model

num_train_epochs = 10
train_batch_size = 16

# Warm up over roughly the first 10% of the optimizer steps. A fixed
# warmup_steps=500 can exceed the total number of steps on a small review
# set, in which case the schedule never leaves the warmup phase.
# (Estimate for a single device without gradient accumulation.)
steps_per_epoch = max(1, -(-len(tokenized_train) // train_batch_size))  # ceiling division
warmup_steps = max(1, (steps_per_epoch * num_train_epochs) // 10)

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=64,
    warmup_steps=warmup_steps,
    # 0.01 is a conventional decay for fine-tuning; the previous value of 1
    # shrinks the pretrained weights far too aggressively.
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Fine-tuning a pretrained transformer needs a small step size (about
    # 2e-5 to 5e-5); the previous 4e-3 is large enough to wreck the
    # pretrained weights.
    learning_rate=2e-5,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
)

# Run fine-tuning on the training split.
trainer.train()

# Evaluating the model

metrics = trainer.evaluate()

# Print the heading and the metrics on separate lines.
print("Evaluation metrics:", metrics, sep="\n")

def predict_review(review_text):
    """Predict a rating for a single review with the module-level ``model``.

    The model is switched to evaluation mode as a side effect.

    Args:
        review_text: The review text (one string) to score.

    Returns:
        float: The model's single output value for the review.
    """
    # Follow the device the model currently lives on instead of assuming
    # CUDA, so the function also works on CPU-only machines.
    device = model.device

    inputs = tokenizer(
        review_text,
        padding=True,
        truncation=True,
        max_length=256,  # same cap that tokenize() applies to the training data
        return_tensors="pt",
    )

    # The model is called without token_type_ids; drop them if the tokenizer
    # produced any.
    inputs.pop("token_type_ids", None)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Evaluation mode disables dropout, so the prediction is deterministic.
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)

    # A single review gives a single logit; squeeze it to a scalar and
    # convert it to a Python float.
    prediction = outputs.logits.squeeze().item()
    return prediction

# Example prediction

# Use the GPU when one is available and stay on the CPU otherwise; an
# unconditional .cuda() raises on machines without CUDA.
model = model.to("cuda" if torch.cuda.is_available() else "cpu")
BadReviewText = "The workload was quite heavy, really didn't like this professor. Dont take him."
GoodReviewText = "This professor was amazing! Loved the way he taught and made the class engaging."
BadReview = predict_review(BadReviewText)
print(f"Predicted rating for the Bad review: {BadReview}")
GoodReview = predict_review(GoodReviewText)
print(f"Predicted rating for the Good review: {GoodReview}")