# Project Part 3 - RateMyProfessor Deep Learning Model

[![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://kaggle.com/kernels/welcome?src=https://github.com/eboyer221/CS39AA-project/blob/main/Project%20Part%203.ipynb)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/eboyer221/CS39AA-project/blob/main/Project%20Part%203.ipynb)

For Part 3 of this Project I will be using the 'BertForSequenceClassification' model for binary classification. This code will attempt to fine-tune BERT using the 'transformers' library by HuggingFace.

In [19]:
# Install the Hugging Face transformers library into the notebook environment
# (the leading "!" is an IPython shell escape, so this runs pip as a shell command)
!pip install transformers

In [20]:
# Install PyTorch, which the tensor, model and training cells in this notebook import as `torch`
!pip install torch

In [17]:
# Import the packages used throughout the notebook
import pandas as pd
import nltk
import torch
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset, random_split
from torch.nn import CrossEntropyLoss
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [12]:
# Load the RateMyProfessor ratings dataset.
# The CSV is read directly from the project's GitHub repository, so this cell needs network access.
data_path = 'https://raw.githubusercontent.com/eboyer221/CS39AA-Project/main/merged_data.csv'
df_1 = pd.read_csv(data_path)

In [14]:
# Clean the raw ratings table: discard incomplete rows, then drop the columns
# this notebook does not use.

# A row is only kept when both the star rating and the comment text are present.
columns_to_check = ['student_star', 'comments']

# Columns that are removed from the working frame.
columns_to_remove = [
    'school_name', 'local_name', 'state_name',
    'year_since_first_review', 'take_again', 'diff_index',
    'tag_professor', 'post_date', 'name_onlines', 'attence',
    'for_credits', 'would_take_agains', 'grades', 'stu_tags',
    'help_useful', 'help_not_useful',
]

# Drop rows with a null in either checked column and renumber the index from 0
# in one chained step.
df_1 = df_1.dropna(subset=columns_to_check).reset_index(drop=True)

# Build the narrower working frame; df_1 itself keeps every column.
df = df_1.drop(labels=columns_to_remove, axis='columns')

# Widen the pandas display so more of each comment is visible.
pd.set_option("display.max_colwidth", 370)

df.head()

Unnamed: 0,professor_name,department_name,star_rating,num_student,student_star,student_difficult,comments
0,Robert Olshansky,Urban & Regional Planning department,3.5,1,3.5,2.0,"Good guy, laid back and interested in his field. Class can get... a little..... slllllllloooooowwwwwwww during his junior workshop."
1,Marshall Levett,Counseling department,5.0,2,5.0,1.0,such a fun professor. really helpful and knows his stuff
2,Marshall Levett,Counseling department,5.0,2,5.0,1.0,Such a easy class. It\'s simple. Do your homework and pay attention and you will fly right by or be the person that blames him for not leaarning. He wont let you fail. just ask for help....
3,Soazig Le Bihan,Philosophy department,3.6,4,5.0,5.0,"A very hard class, and a massive amount of work. But, Soazig is also very good about explaining difficult concepts, gives excellent feedback, and is very accessible for extra assistance."
4,Soazig Le Bihan,Philosophy department,3.6,4,1.0,4.0,"Took 100 level class for Ethics offered online as an option to fill a core requirement She was terrible! Did not seem to have a grasp of the English language nor does she seem to have a grasp on reality as she insisted many times that failure in an ENTRY LEVEL, OPTIONAL class is very common due to the ""difficulty"" of material, very full of herself"


In [18]:
# Function to clean up comments text using lemmatization
from functools import lru_cache


@lru_cache(maxsize=1)
def _lemmatization_resources():
    """Return the English stop-word set and a lemmatizer, built only once.

    Loading the NLTK stop-word list and constructing a ``WordNetLemmatizer``
    is identical for every comment, so the pair is cached instead of being
    rebuilt on each call of ``clean_comments_lemm``.
    """
    return frozenset(stopwords.words('english')), WordNetLemmatizer()


def clean_comments_lemm(text):
    """Turn one raw comment into a list of cleaned, lemmatized tokens.

    Args:
        text: The raw comment. Any value that is not a string, and the
            string ``'nan'`` in any casing, is treated as a missing comment.

    Returns:
        A list of lowercase words with every non-letter character removed,
        English stop words filtered out, and each remaining word
        lemmatized. Missing comments yield an empty list.
    """
    # Guard clauses: nothing to tokenize for non-strings (e.g. float NaN)
    # or for the literal text 'nan'.
    if not isinstance(text, str):
        return []
    text = text.lower()
    if text == 'nan':
        return []

    # Keep only letters and whitespace (drops digits, punctuation, symbols).
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    stop_words, lemmatizer = _lemmatization_resources()
    # split() with no argument also collapses runs of whitespace.
    return [
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    ]


# Run clean_comments_lemm on every value of the 'comments' column and store
# the resulting token list for each row in a new 'tokens_lemm' column
df['tokens_lemm'] = df['comments'].apply(clean_comments_lemm)

# Preview the first rows, including the new column
df.head()

Unnamed: 0,professor_name,department_name,star_rating,num_student,student_star,student_difficult,comments,tokens_lemm
0,Robert Olshansky,Urban & Regional Planning department,3.5,1,3.5,2.0,"Good guy, laid back and interested in his field. Class can get... a little..... slllllllloooooowwwwwwww during his junior workshop.","[good, guy, laid, back, interested, field, class, get, little, slllllllloooooowwwwwwww, junior, workshop]"
1,Marshall Levett,Counseling department,5.0,2,5.0,1.0,such a fun professor. really helpful and knows his stuff,"[fun, professor, really, helpful, know, stuff]"
2,Marshall Levett,Counseling department,5.0,2,5.0,1.0,Such a easy class. It\'s simple. Do your homework and pay attention and you will fly right by or be the person that blames him for not leaarning. He wont let you fail. just ask for help....,"[easy, class, simple, homework, pay, attention, fly, right, person, blame, leaarning, wont, let, fail, ask, help]"
3,Soazig Le Bihan,Philosophy department,3.6,4,5.0,5.0,"A very hard class, and a massive amount of work. But, Soazig is also very good about explaining difficult concepts, gives excellent feedback, and is very accessible for extra assistance.","[hard, class, massive, amount, work, soazig, also, good, explaining, difficult, concept, give, excellent, feedback, accessible, extra, assistance]"
4,Soazig Le Bihan,Philosophy department,3.6,4,1.0,4.0,"Took 100 level class for Ethics offered online as an option to fill a core requirement She was terrible! Did not seem to have a grasp of the English language nor does she seem to have a grasp on reality as she insisted many times that failure in an ENTRY LEVEL, OPTIONAL class is very common due to the ""difficulty"" of material, very full of herself","[took, level, class, ethic, offered, online, option, fill, core, requirement, terrible, seem, grasp, english, language, seem, grasp, reality, insisted, many, time, failure, entry, level, optional, class, common, due, difficulty, material, full]"


In [None]:
# Convert the lists of tokens to strings (one space-separated string per comment)
X = df['tokens_lemm'].apply(' '.join)

# Identify the response variable as y.
# The frame previewed earlier in this notebook has no 'rating_result' column,
# so indexing it directly raises KeyError. When the column is absent, a binary
# label is derived from the student's star rating instead.
# NOTE(review): the cut-off below is an assumption — TODO confirm it matches
# the labelling rule used when 'rating_result' was originally defined.
POSITIVE_STAR_THRESHOLD = 3.0
if 'rating_result' not in df.columns:
    df['rating_result'] = (df['student_star'] >= POSITIVE_STAR_THRESHOLD).astype(int)
y = df['rating_result']

In [None]:
# Split the data into training and testing sets.
# 20% of the rows are held out for testing; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Vectorize text data using TF-IDF.
# The vectorizer is fitted on the training text only; the test text is then
# transformed with that same fitted vectorizer.
# NOTE(review): TF-IDF features are floating-point weights, whereas the model
# defined later in this notebook is BertForSequenceClassification — confirm
# these features are what should be fed to it.
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [None]:
# Convert the sparse matrices to dense NumPy arrays
# NOTE(review): a dense array holds one value per (row, vocabulary term) —
# presumably large for this corpus; verify it fits in memory.
X_train_vectorized = X_train_vectorized.toarray()
X_test_vectorized = X_test_vectorized.toarray()

In [None]:
# Convert data to PyTorch tensors
#
# BERT looks up *integer* WordPiece token ids in an embedding table and
# supports at most 512 positions per sequence, so float TF-IDF vectors (one
# column per vocabulary term) cannot be used as its input_ids. The comment
# text is therefore encoded with the tokenizer that matches the pretrained
# 'bert-base-uncased' checkpoint.
MAX_SEQ_LENGTH = 128  # tokens kept per comment; longer comments are truncated
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def encode_comments(texts, max_length=MAX_SEQ_LENGTH):
    """Encode an iterable of strings as a LongTensor of BERT input ids.

    Args:
        texts: Iterable of comment strings (e.g. a pandas Series).
        max_length: Fixed sequence length; shorter texts are padded and
            longer ones truncated to this many tokens.

    Returns:
        A tensor of shape (number of texts, max_length). Padding uses the
        tokenizer's [PAD] id, which is 0 for 'bert-base-uncased', so code
        that builds the attention mask as ``inputs != 0`` stays correct.
    """
    encoded = tokenizer(
        list(texts),
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
    )
    return encoded['input_ids']


X_train_tensor = encode_comments(X_train)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
X_test_tensor = encode_comments(X_test)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)

In [None]:
# A small wrapper module around BERT's sequence-classification model that
# returns only the logits
class BertClassifier(torch.nn.Module):
    """Pretrained 'bert-base-uncased' configured with two output labels."""

    def __init__(self):
        super().__init__()
        self.bert = BertForSequenceClassification.from_pretrained(
            'bert-base-uncased',
            num_labels=2,
        )

    def forward(self, input_ids, attention_mask):
        """Run the wrapped model on a batch and return its raw logits."""
        return self.bert(input_ids, attention_mask=attention_mask).logits


# Create the model instance used by the training and validation cells
model = BertClassifier()

In [None]:
# Define training parameters
epochs = 3  # number of passes over the training loader
batch_size = 8  # examples per batch drawn from the data loader
learning_rate = 2e-5  # learning rate handed to the optimizer

In [None]:
# Prepare the data loader
# Pair every feature row with its label, split that dataset randomly into an
# 80% part and a 20% part, and batch only the 80% part for training.
train_data = TensorDataset(X_train_tensor, y_train_tensor)
_n_total = len(train_data)
_n_fit = int(0.8 * _n_total)
train_sampler = random_split(train_data, [_n_fit, _n_total - _n_fit])
_fit_subset = train_sampler[0]  # train_sampler[1] (the remaining 20%) is not used here
train_loader = DataLoader(_fit_subset, batch_size=batch_size, shuffle=True)

In [None]:
# Define loss function and optimizer
criterion = CrossEntropyLoss()
# Use PyTorch's own AdamW: the AdamW class shipped with `transformers` is
# deprecated in favour of it. eps and weight_decay are passed explicitly with
# the values the transformers implementation used by default, so the
# optimisation settings stay the same.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=learning_rate,
    eps=1e-6,
    weight_decay=0.0,
)

In [None]:
# Training loop
# Runs `epochs` passes over `train_loader`, updating the model after every
# batch, and prints the mean batch loss at the end of each pass.
for epoch in range(epochs):
    model.train()  # switch the module to training mode
    total_loss = 0  # running sum of the batch losses for this epoch
    for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{epochs}'):
        inputs, labels = batch
        optimizer.zero_grad()  # clear gradients accumulated by the previous step
        # The attention mask is True wherever the input value is non-zero
        outputs = model(inputs, attention_mask=(inputs != 0))
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Average of the per-batch losses over the number of batches in the loader
    print(f'Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(train_loader)}')

In [None]:
# Validation
# Score the test set in mini-batches: pushing every test example through BERT
# in a single forward pass can exhaust memory on a realistically sized test
# set. The loader is not shuffled, so predictions stay aligned with y_test.
model.eval()
test_loader = DataLoader(TensorDataset(X_test_tensor), batch_size=batch_size)
batch_logits = []
with torch.no_grad():  # inference only, no gradients needed
    for (test_inputs,) in test_loader:
        batch_logits.append(
            model(test_inputs, attention_mask=(test_inputs != 0))
        )

# Logits for the whole test set, in the original row order
outputs = torch.cat(batch_logits)
# Predicted class = index of the largest logit in each row
predictions = torch.argmax(outputs, dim=1).numpy()

In [None]:
# Evaluate the model by comparing the predicted labels with the test labels
accuracy = accuracy_score(y_test, predictions)  # fraction of predictions that match y_test
report = classification_report(y_test, predictions)  # per-class text report
conf_matrix = confusion_matrix(y_test, predictions)  # counts of true label vs. predicted label

In [None]:
# Print the evaluation results computed in the previous cell
print(f"Accuracy: {accuracy}")
print("Classification Report:\n", report)
# The closing parenthesis was missing here, which made the cell a SyntaxError
print("Confusion Matrix:\n", conf_matrix)