# 6. Testing the supervised learning topic modelling methods on a subset of the reviews

In [1]:
from google.colab import drive

# Mounting Google Drive
drive.mount('/content/drive', force_remount=True) # Adding force_remount=True to force the remounting process


import pandas as pd
from gensim import corpora
from gensim.models import LdaModel
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from transformers import pipeline
import nltk
import os
import torch  # Import torch for saving .pth files

# Download NLTK stopwords
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
stop_words = set(stopwords.words('english'))

# Load the data
df = pd.read_csv('/content/drive/MyDrive/train_data.csv')
df_reviews = df.iloc[:100] # Assume this is your original DataFrame with 39,000 reviews

# Define batch size and checkpoint directory
batch_size = 100  # Start with 50 for better efficiency
checkpoint_dir = "/content/drive/MyDrive/checkpoints_topic_labels_supervised_final"
os.makedirs(checkpoint_dir, exist_ok=True)

# Define BART classifier and topics
zero_shot_classifier = pipeline('zero-shot-classification', model='facebook/bart-large-mnli')
topics = ["Assignments", "Exams", "Lectures", "Course Material", "Grading", "Instructor", "Workload"]

# Preprocessing function
def preprocess(text):
    tokens = word_tokenize(text.lower())
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    return tokens

# LDA function to get top topics
def get_top_topics(lda_model, corpus, top_n=3):
    top_topics = []
    for row in lda_model[corpus]:
        row = sorted(row, key=lambda x: x[1], reverse=True)
        top_n_topics = [topic[0] for topic in row[:top_n]]
        top_topics.append(top_n_topics)
    return top_topics

# Function to apply BART zero-shot classification
def classify_review_multiple(review, top_n=3):
    result = zero_shot_classifier(review, candidate_labels=topics)
    return result['labels'][:top_n]

# Dictionary for LDA topic mapping
topic_mapping = {
    0: 'Assignments', 1: 'Exams', 2: 'Lectures', 3: 'Course Material',
    4: 'Grading and Feedback', 5: 'Instructor/Professor',
    6: 'Class Participation and Engagement', 7: 'Online Resources',
    8: 'Course Structure and Organization', 9: 'Facilities and Equipment',
    10: 'Workload and Difficulty', 11: 'Learning Outcomes and Skills',
    12: 'Class Environment and Culture', 13: 'Career Relevance and Application'
}

def map_topics_to_categories(topics, topic_mapping):
    return [topic_mapping[topic] for topic in topics if topic in topic_mapping]

# Process the dataset in batches with checkpointing
for i in range(0, len(df_reviews), batch_size):
    batch_num = i // batch_size
    checkpoint_file = os.path.join(checkpoint_dir, f"batch_{batch_num}.pth")

    # Skip processing if checkpoint already exists
    if os.path.exists(checkpoint_file):
        print(f"Skipping batch {batch_num}, checkpoint found.")
        continue

    # Slice the batch
    batch_df = df_reviews.iloc[i:i + batch_size].copy()

    # LDA processing
    batch_df['processed_reviews'] = batch_df['reviews'].apply(preprocess)
    dictionary = corpora.Dictionary(batch_df['processed_reviews'])
    corpus = [dictionary.doc2bow(review) for review in batch_df['processed_reviews']]
    lda_model = LdaModel(corpus, num_topics=14, id2word=dictionary, passes=15)
    batch_df['LDA_Topic'] = get_top_topics(lda_model, corpus, top_n=3)
    batch_df['LDA_Topic'] = batch_df['LDA_Topic'].apply(lambda x: map_topics_to_categories(x, topic_mapping))

    # BART processing
    batch_df['BART_Topic'] = batch_df['reviews'].apply(classify_review_multiple)

    # Save the batch to a checkpoint file as .pth
    checkpoint_data = {
        'LDA_Topic': batch_df['LDA_Topic'].tolist(),
        'BART_Topic': batch_df['BART_Topic'].tolist()
    }
    torch.save(checkpoint_data, checkpoint_file)
    print(f"Processed and saved batch {batch_num}.")

# Load all checkpoints and concatenate them
all_batches = []
for file in sorted(os.listdir(checkpoint_dir)):
    if file.startswith("batch_") and file.endswith(".pth"):
        checkpoint_data = torch.load(os.path.join(checkpoint_dir, file))

        # Create a DataFrame from the checkpoint data
        batch_df = pd.DataFrame({
            'LDA_Topic': checkpoint_data['LDA_Topic'],
            'BART_Topic': checkpoint_data['BART_Topic']
        })
        all_batches.append(batch_df)

# Concatenate all processed batches into a single DataFrame
processed_df = pd.concat(all_batches, ignore_index=True)

# Ensure that the final DataFrame includes all original columns plus the new topic labels
# This should match the structure and order of the original DataFrame
final_df = pd.concat([df_reviews.reset_index(drop=True), processed_df], axis=1)

# Display final DataFrame with all original columns plus new topic labels
print(final_df.head())


Mounted at /content/drive


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Skipping batch 0, checkpoint found.


  checkpoint_data = torch.load(os.path.join(checkpoint_dir, file))


   Unnamed: 0.2  Unnamed: 0.1  Unnamed: 0  \
0       46979.0       43342.0     52189.0   
1       15087.0       27963.0     33570.0   
2       32265.0        3290.0      4397.0   
3       44070.0       43900.0     52762.0   
4       18016.0       34850.0     42219.0   

                                             reviews date_reviews  rating  \
0  Andrew Ng has provided a fantastic resource fo...   2017-09-12     5.0   
1  Peer graded assingments take a lot of time bec...   2020-06-14     4.0   
2  This course was the first actual course for Pr...   2019-01-31     5.0   
3  The best course to start with if you are enter...   2017-08-20     5.0   
4  I wanted to learn more about Functional Progra...   2020-07-12     4.0   

               course_id    month       departments  \
0       machine-learning  2017-09  Computer Science   
1       python-databases  2020-06  Computer Science   
2                 python  2019-01  Computer Science   
3       machine-learning  2017-08  Computer Sc

In [2]:
final_df.drop(columns=['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.2'], inplace=True)

In [5]:
df_to_save = final_df.iloc[:100]

In [6]:
df_to_save.to_csv('topic_modelling_supervised_subset.csv', index=False)