For Part 3 of this Project I will be using the 'BertForSequenceClassification' model for binary classification. This code will attempt to fine-tune BERT using the 'transformers' library by HuggingFace.

In [None]:
#install the transformers library
!pip install transformers

In [None]:
# Import packages
import re
from functools import lru_cache

import pandas as pd
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader, TensorDataset, random_split
from tqdm import tqdm
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

In [None]:
# Read the ratemyprofessor ratings CSV from its raw GitHub URL into a DataFrame
data_path = 'https://raw.githubusercontent.com/eboyer221/CS39AA-Project/main/merged_data.csv'
df = pd.read_csv(filepath_or_buffer=data_path)

In [None]:
# Apply cleaning steps to the dataset.
# Rows with a null in either of these columns are discarded.
columns_to_check = ['student_star', 'comments']

# Start from the freshly loaded `df`: drop the incomplete rows, then renumber
# the index from 0 so it is contiguous again.
df_1 = df.dropna(subset=columns_to_check).reset_index(drop=True)

# Columns that are dropped before modelling
columns_to_remove = [
    'school_name', 'local_name', 'state_name',
    'year_since_first_review', 'take_again', 'diff_index',
    'tag_professor', 'post_date', 'name_onlines', 'attence',
    'for_credits', 'would_take_agains', 'grades', 'stu_tags',
    'help_useful', 'help_not_useful',
]

# Drop the specified columns; `df` now refers to the cleaned frame
df = df_1.drop(columns=columns_to_remove)

# Change the pandas default column width to view more of the comments field
pd.set_option("display.max_colwidth", 370)

In [None]:
# Function to clean up comments text using lemmatization
@lru_cache(maxsize=1)
def _english_stop_words():
    """Return the English stop-word list as a frozenset, loaded only once."""
    # NOTE(review): `stopwords` is not imported in the visible cells;
    # presumably nltk.corpus.stopwords -- confirm and add the import.
    return frozenset(stopwords.words('english'))


@lru_cache(maxsize=1)
def _shared_lemmatizer():
    """Return a single lemmatizer instance reused across calls."""
    # NOTE(review): `WordNetLemmatizer` is not imported in the visible cells;
    # presumably nltk.stem.WordNetLemmatizer -- confirm and add the import.
    return WordNetLemmatizer()


def clean_comments_lemm(text):
    """Turn one raw comment into a list of lemmatized, stop-word-free tokens.

    Args:
        text: The comment to clean. Anything that is not a ``str``, or a
            string spelling ``nan`` in any letter case, is treated as missing.

    Returns:
        A list of lowercase lemmatized words, or an empty list when the
        comment is missing.
    """
    # Missing values: non-strings and the literal text 'nan'
    if not isinstance(text, str) or text.lower() == 'nan':
        return []

    # Lowercase, then keep only letters and whitespace
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())

    # The stop-word set and the lemmatizer are cached so that applying this
    # function to every row does not reload them once per row.
    stop_words = _english_stop_words()
    lemmatize = _shared_lemmatizer().lemmatize

    # split() with no argument also collapses runs of whitespace
    return [
        lemmatize(word)
        for word in letters_only.split()
        if word not in stop_words
    ]


# Clean every entry of the 'comments' column and keep the resulting token
# lists in a new 'tokens_lemm' column
comment_tokens = df['comments'].apply(clean_comments_lemm)
df['tokens_lemm'] = comment_tokens

# Preview the first rows
df.head()

In [None]:
# Features: each token list joined back into one space-separated string
X = df['tokens_lemm'].apply(' '.join)
# Response variable
y = df['rating_result']

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Encode the text with BERT's own tokenizer.
# BertForSequenceClassification looks its inputs up in an embedding table, so
# `input_ids` must be integer token ids from the matching vocabulary; TF-IDF
# float vectors cannot be used as model input.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# The training and validation cells build the attention mask as `inputs != 0`,
# which is only correct when padding is encoded as id 0.
if tokenizer.pad_token_id != 0:
    raise ValueError(
        f"Expected pad token id 0, got {tokenizer.pad_token_id}; "
        "the `inputs != 0` attention mask would be wrong."
    )

# Upper bound on tokens per comment; longer comments are truncated.
max_length = 128


def _encode_texts(texts):
    """Return a (n_texts, seq_len) int64 tensor of padded BERT token ids."""
    encoded = tokenizer(
        texts.tolist(),
        padding=True,          # pad to the longest sequence in this set
        truncation=True,       # cut anything longer than max_length
        max_length=max_length,
        return_tensors='pt',
    )
    return encoded['input_ids']


# Convert data to PyTorch tensors
X_train_tensor = _encode_texts(X_train)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
X_test_tensor = _encode_texts(X_test)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)

In [None]:
# Define a simple BERT-based model for sequence classification
class BertClassifier(torch.nn.Module):
    """Wrap a pretrained BertForSequenceClassification and expose raw logits.

    Args:
        model_name: Checkpoint name or path handed to ``from_pretrained``.
        num_labels: Number of output classes of the classification head.
    """

    def __init__(self, model_name='bert-base-uncased', num_labels=2):
        super().__init__()
        self.bert = BertForSequenceClassification.from_pretrained(
            model_name, num_labels=num_labels
        )

    def forward(self, input_ids, attention_mask=None):
        """Run the wrapped model and return only its ``logits`` tensor.

        Args:
            input_ids: Token ids passed positionally to the wrapped model.
            attention_mask: Optional mask forwarded unchanged to the wrapped
                model.
        """
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        return outputs.logits


# Initialize the model with the default checkpoint and two labels
model = BertClassifier()

In [None]:
# Training hyperparameters
learning_rate = 2e-5  # step size handed to the optimizer
batch_size = 8        # examples per optimisation step
epochs = 3            # full passes over the training loader

In [None]:
# Prepare the data loader
train_data = TensorDataset(X_train_tensor, y_train_tensor)

# Randomly split the dataset 80/20; only the first (80%) part is fed to the
# loader, the second part is kept in train_sampler[1] and not used here.
n_fit = int(0.8 * len(train_data))
n_rest = len(train_data) - n_fit
train_sampler = random_split(train_data, [n_fit, n_rest])

train_loader = DataLoader(train_sampler[0], batch_size=batch_size, shuffle=True)

In [None]:
# Define loss function and optimizer
criterion = CrossEntropyLoss()

# Use PyTorch's AdamW: the implementation shipped with `transformers` is
# deprecated in favour of torch.optim.AdamW. eps and weight_decay are pinned
# to the values the transformers version used by default, so the optimisation
# behaviour stays the same.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=learning_rate,
    eps=1e-6,
    weight_decay=0.0,
)

In [None]:
# Training loop: one pass over train_loader per epoch
for epoch_idx in range(epochs):
    model.train()
    running_loss = 0
    progress = tqdm(train_loader, desc=f'Epoch {epoch_idx + 1}/{epochs}')
    for inputs, labels in progress:
        optimizer.zero_grad()
        # Positions holding a non-zero value are the ones the model attends to
        logits = model(inputs, attention_mask=(inputs != 0))
        batch_loss = criterion(logits, labels)
        batch_loss.backward()
        optimizer.step()
        running_loss += batch_loss.item()

    # Report the mean per-batch loss for this epoch
    mean_loss = running_loss / len(train_loader)
    print(f'Epoch {epoch_idx + 1}/{epochs}, Loss: {mean_loss}')

In [None]:
# Validation
# Score the test set in mini-batches: pushing every test row through the model
# in a single forward pass can exhaust memory.
model.eval()
batch_predictions = []
with torch.no_grad():
    for start in range(0, len(X_test_tensor), batch_size):
        batch_inputs = X_test_tensor[start:start + batch_size]
        logits = model(batch_inputs, attention_mask=(batch_inputs != 0))
        # Predicted class = index of the largest logit in each row
        batch_predictions.append(torch.argmax(logits, dim=1))

# Stitch the per-batch results back together in the original row order
predictions = torch.cat(batch_predictions).numpy()

In [None]:
# Evaluate the model: compare the held-out labels with the predicted classes
y_true, y_pred = y_test, predictions
accuracy = accuracy_score(y_true, y_pred)
report = classification_report(y_true, y_pred)
conf_matrix = confusion_matrix(y_true, y_pred)

In [None]:
# Print the evaluation metrics
print(f"Accuracy: {accuracy}")
print("Classification Report:\n", report)
print("Confusion Matrix:\n", conf_matrix)