In [1]:
import os
from scraper.reddit_scraper import get_reddit_object, get_qa_object, \
                                   get_trump_reddit_posts_and_comments, \
                                    get_kamala_reddit_posts_and_comments, \
                                    read_paths_create_df
from scraper.youtube_scraper import scrape_youtube
from utilities.util import create_folder_if_not_exists
from preprocessor.preprocess import rename_df_cols, set_post_title_as_parent_comment_if_na, \
                                    unify_youtube_and_reddit_comments, combine_reddit_comment_and_post_df,\
                                    preprocess_dataset, split_comments_to_sentence,\
                                    assign_level
from preprocessor.coreference_resolution import coref_resolve
from preprocessor.subjectivity_classifier import is_subjective

from model.prediction import predict_with_models, AspectBasedSentimentModel

from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel, \
                        AutoModelForSeq2SeqLM, BartForConditionalGeneration
import torch
from torch import nn

import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  self._model.load_state_dict(torch.load(filelike, map_location=device))
  model.load_state_dict(torch.load(filelike, map_location=device))
  return torch.load(checkpoint_file, map_location="cpu")


In [2]:
import warnings
warnings.filterwarnings("ignore")

# Scrape Reddit

In [3]:
# Alter file paths if necessary
REDDIT_SAVE_DIR = './data/reddit/'
YOUTUBE_SAVE_DIR = './data/youtube/'
FINAL_FILE_SAVE_DIR = './data/'

CONSOLIDATED_REDDIT_COMMENT_SAVE_FILE = f'{REDDIT_SAVE_DIR}/consolidated_reddit_comments.csv'
CONSOLIDATED_REDDIT_POST_SAVE_FILE = f'{REDDIT_SAVE_DIR}/consolidated_reddit_posts.csv'
YOUTUBE_FILE_NAME = 'youtube_comments.csv'

COMBINED_COMMENT_LEVEL_FILE = 'comment_level.csv'
COMBINED_SENTENCE_LEVEL_FILE = 'sentence_level.csv'

ENTITY_MODEL = 'destonedbob/nusiss-election-project-entity-model-distilbert-base-cased'
ASPECT_MODEL_DISTIL = './model/multilabel_aspect_distil_4epochs_lr3e-5_without_test_set_split_keep_same_sent_together.pth'
ASPECT_MODEL_SEQ2SEQ = 'destonedbob/nusiss-election-project-aspect-seq2seq-model-facebook-bart-large'
SENTIMENT_MODEL_DISTIL = './model/sentiment_model_val_acc_6162_lr4.5e-5_wtdecay_1e-4_epochs4_256_256_256_256_smoothed_weight_warmup_and_reducelr_freeze4layers.pth'
SENTIMENT_MODEL_SEQ2SEQ = 'destonedbob/nusiss-election-project-sentiment-seq2seq-model-facebook-bart-large'
DISTILBERT_BASE_CASED = 'distilbert-base-cased'


In [4]:
entity_idx_map = {k:v for v, k in enumerate(['kamala', 'trump', 'others'])}
idx_entity_map = {v:k for k, v in entity_idx_map.items()}
aspect_idx_map = {k:v for v, k in enumerate(['campaign', 'communication', 'competence', 'controversies',
       'ethics and integrity', 'leadership', 
       'personality trait', 'policies', 'political ideology',
       'public image', 'public service record',
       'relationships and alliances', 'voter sentiment', 'others'])}
idx_aspect_map = {v:k for k, v in aspect_idx_map.items()}
sentiment_idx_map = {k:v for v, k in enumerate(['negative', 'neutral', 'positive'])}
idx_sentiment_map = {v:k for k, v in sentiment_idx_map.items()}
idx_sentiment_map2 = {v-1:k for k, v in sentiment_idx_map.items()}

In [4]:
for path in [REDDIT_SAVE_DIR, YOUTUBE_SAVE_DIR, FINAL_FILE_SAVE_DIR]:
    create_folder_if_not_exists(path)

In [11]:
reddit = get_reddit_object()
qa_obj = get_qa_object()

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [None]:
REDDIT_POST_TO_SCRAPE = 1 # Change to alter number of posts to scrape comments from
trump_post_df, trump_comment_df = get_trump_reddit_posts_and_comments(reddit, qa_obj, REDDIT_SAVE_DIR, max_posts_to_collect=REDDIT_POST_TO_SCRAPE)
kamala_post_df, kamala_comment_df = get_kamala_reddit_posts_and_comments(reddit, qa_obj, REDDIT_SAVE_DIR, max_posts_to_collect=REDDIT_POST_TO_SCRAPE)

In [22]:
scraped_reddit_comment_file_paths = [REDDIT_SAVE_DIR+file for file in os.listdir(REDDIT_SAVE_DIR) if '_comment_data_' in file and 'test' not in file]
scraped_reddit_post_file_paths = [REDDIT_SAVE_DIR+file for file in os.listdir(REDDIT_SAVE_DIR) if '_post_data_' in file and 'test' not in file]

scraped_reddit_comment_df = read_paths_create_df(scraped_reddit_comment_file_paths)
scraped_reddit_comment_df.to_csv(CONSOLIDATED_REDDIT_COMMENT_SAVE_FILE, index=False)
scraped_reddit_post_df = read_paths_create_df(scraped_reddit_post_file_paths)
scraped_reddit_post_df.to_csv(CONSOLIDATED_REDDIT_POST_SAVE_FILE, index=False)


In [4]:
scraped_reddit_comment_df = pd.read_csv(CONSOLIDATED_REDDIT_COMMENT_SAVE_FILE)
scraped_reddit_post_df = pd.read_csv(CONSOLIDATED_REDDIT_POST_SAVE_FILE)

# Youtube Scrape TODO

In [6]:
youtube_comment_df = scrape_youtube(YOUTUBE_SAVE_DIR+YOUTUBE_FILE_NAME, videos_to_scrape=1)

YouTube comments have been saved to ./data/youtube/youtube_comments.csv


In [5]:
youtube_comment_df = pd.read_csv(YOUTUBE_SAVE_DIR+YOUTUBE_FILE_NAME)

# Unifying Reddit + Youtube Data

In [6]:
df_reddit_comment = rename_df_cols(scraped_reddit_comment_df, 'reddit_comments')
df_reddit_post = rename_df_cols(scraped_reddit_post_df, 'reddit_posts')
df_youtube = rename_df_cols(youtube_comment_df, 'youtube')

df_reddit_comment = df_reddit_comment.drop_duplicates('comment_id', keep='first')
df_youtube = df_youtube.drop_duplicates('comment_id', keep='first')
df_reddit_post = df_reddit_post.drop_duplicates('post_id', keep='first')

df_reddit = combine_reddit_comment_and_post_df(df_reddit_comment, df_reddit_post)
combined_df = unify_youtube_and_reddit_comments(df_reddit, df_youtube)


# Preprocess and reduce scraped comment

Below preprocess_dataset function will handle:
1) Fix unicode encoding issues using the html and unicodedata library 
2) Remove non-english text using facebook/fasttext-language-identification
3) Replace misformatted punctuation e.g. '“', '”'
4) Remove non-human readable characters using regex, e.g. empty string characters
5) Replace URL with placeholder using regex [URL]
6) Assign level to comment (e.g. root comment = 1, reply to root = 2). Will be used for coreference resolution later.

In [7]:
# Preprocess as above
combined_df = preprocess_dataset(combined_df)

In [15]:
combined_df = coref_resolve(combined_df)


Error in output string: That's fine and all, but what about that one time when they loaded up a C17 with a bunch of "supplies" that were being "flown to North Carolina" so that Kamala could come and grab a photo op, only to have the "supplies" unloaded from the plane? [ NEXT_COMMENT] You say, "that one time when they loaded up a C17 with a bunch of "supplies" that were being "flown to North Carolina" so that Kamala could come and grab a photo op, only to have the "supplies" unloaded from the plane? [ " as if that one time when they loaded up a C17 with a bunch of "supplies" that were being "flown to North Carolina" so that Kamala could come and grab a photo op, only to have the "supplies" unloaded from the plane? [ were quite some time ago and one amongst many. Did you by chance Huff Dust Off?Or did you by chance partake in eating the dogs and cats in Ohio? I heard they were all fed psychedelics before they were cooked so that might explain it...


In [17]:
# combined_df.to_csv(FINAL_FILE_SAVE_DIR+COMBINED_COMMENT_LEVEL_FILE, index=False, encoding='utf-8-sig')
combined_df = pd.read_csv(FINAL_FILE_SAVE_DIR+COMBINED_COMMENT_LEVEL_FILE, encoding='utf-8-sig')

# Sentence Level

In [18]:
sentence_df = split_comments_to_sentence(combined_df[~combined_df.comment.isna()])
sentence_df.head()

Unnamed: 0,comment_id,post_id,post_title,post_timestamp,parent_comment_id,parent_comment,comment,comment_timestamp,number_of_comment_votes,sentence,previous_sentence
0,lrt56j4,1g34hu5,Bryan Cranston campaigning for Kamala Harris i...,2024-10-14 00:00:00+00:00,,,It looks like this post is about Politics. Var...,2024-10-14 00:00:00+00:00,1,It looks like this post is about Politics.,
1,lrt56j4,1g34hu5,Bryan Cranston campaigning for Kamala Harris i...,2024-10-14 00:00:00+00:00,,,It looks like this post is about Politics. Var...,2024-10-14 00:00:00+00:00,1,Various methods of filtering out content relat...,It looks like this post is about Politics.
2,lrt56j4,1g34hu5,Bryan Cranston campaigning for Kamala Harris i...,2024-10-14 00:00:00+00:00,,,It looks like this post is about Politics. Var...,2024-10-14 00:00:00+00:00,1,Please [contact the moderators of this subredd...,Various methods of filtering out content relat...
3,lrt56j4,1g34hu5,Bryan Cranston campaigning for Kamala Harris i...,2024-10-14 00:00:00+00:00,,,It looks like this post is about Politics. Var...,2024-10-14 00:00:00+00:00,1,*,Please [contact the moderators of this subredd...
4,lrt9w4u,1g34hu5,Bryan Cranston campaigning for Kamala Harris i...,2024-10-14 00:00:00+00:00,,,"""I am the one who votes!""",2024-10-14 00:00:00+00:00,4509,"""I am the one who votes!""",


### Remove objective sentences

In [24]:
# Using cffl/bert-base-styleclassification-subjective-neutral
sentence_df['is_subjective'] = sentence_df.sentence.apply(is_subjective) 

In [49]:
sentence_df = sentence_df[sentence_df.is_subjective == 1]

In [5]:
# sentence_df.to_csv(FINAL_FILE_SAVE_DIR+COMBINED_SENTENCE_LEVEL_FILE, index=False, encoding='utf-8-sig')
sentence_df = pd.read_csv(FINAL_FILE_SAVE_DIR+COMBINED_SENTENCE_LEVEL_FILE, encoding='utf-8-sig')

# Inference

In [6]:
# df = sentence_df.copy()
df = sentence_df.sample(5, random_state=666)

predict_with_models is a function that consolidates the below code. As long as you pass it a df with "sentence" column, it should work.


In [7]:
predict_with_models(df[['sentence']]) 

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Unnamed: 0,sentence,entity_ids,final_aspect_labels,entity_category,final_aspect_categories,final_sentiment_prediction
0,Did people honestly think trump actually worke...,"[0, 1, 0]","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",trump,controversies,-1
1,Better than Kamala any day of the week.,"[1, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]",kamala,voter sentiment,1
2,Who gives a fuck,"[0, 0, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]",others,voter sentiment,-1
3,Hillary is not in Prison is she?,"[0, 0, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]",others,others,0


# Entity Extraction

In [7]:
model = AutoModelForSequenceClassification.from_pretrained(ENTITY_MODEL)
tokenizer = AutoTokenizer.from_pretrained(ENTITY_MODEL)
model.to('cuda')

def get_probabilities(texts, score=False):
    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
    inputs.to('cuda')
    outputs = model(**inputs)
    logits = outputs.logits
    probabilities = torch.sigmoid(logits)
    if not score:
        return np.array(list(map(lambda x: 1 if x > 0.65 else 0, probabilities.cpu().detach().numpy()[0].tolist())))
    else:
        return probabilities.cpu().detach().numpy()[0]

In [8]:
df['entity_ids'] = df.sentence.apply(get_probabilities)
df_columns = df.columns.tolist()
# Expand the DataFrame for each entity
expanded_rows = []
for index, row in df.iterrows():
    entity_labels = row['entity_ids']
    if entity_labels[0] == 1:
        dict_row = dict()
        for col in df.columns:
            dict_row[col] = row[col]
            dict_row['entity_category'] = 'kamala'
            dict_row['entity_id'] = entity_idx_map['kamala']

        expanded_rows.append(dict_row)
        # expanded_rows.append({
        #     'comment': row['comment'],
        #     'parent_comment': row['parent_comment'],
        #     'comment_id': row['comment_id'],
        #     'parent_comment_id': row['parent_comment_id'],
        #     'comment_timestamp': row['comment_timestamp'],
        #     'level': row['level'],
        #     'post_id': row['post_id'],
        #     'post_title': row['post_title'],
        #     'post_timestamp': row['post_timestamp'],
        #     'number_of_comment_votes': row['number_of_comment_votes'],
        #     'platform': row['platform'],
        #     'sentence': row['sentence'],
        #     'previous_sentence': row['previous_sentence'],
        #     'sentence_idx': row['sentence_idx'],
        #     'contains_trump_mentions': row['contains_trump_mentions'],
        #     'contains_kamala_mentions': row['contains_kamala_mentions'],
        #     'is_subjective': row['is_subjective'],
        #     'entity_ids': row['entity_ids'],
        #     'entity_category': 'kamala',
        #     'entity_id': entity_idx_map['kamala']
        # })
    if entity_labels[1] == 1:
        dict_row = dict()
        for col in df.columns:
            dict_row[col] = row[col]
            dict_row['entity_category'] = 'trump'
            dict_row['entity_id'] = entity_idx_map['trump']
        expanded_rows.append(dict_row)

        # expanded_rows.append({
        #     'comment': row['comment'],
        #     'parent_comment': row['parent_comment'],
        #     'comment_id': row['comment_id'],
        #     'parent_comment_id': row['parent_comment_id'],
        #     'comment_timestamp': row['comment_timestamp'],
        #     'level': row['level'],
        #     'post_id': row['post_id'],
        #     'post_title': row['post_title'],
        #     'post_timestamp': row['post_timestamp'],
        #     'number_of_comment_votes': row['number_of_comment_votes'],
        #     'platform': row['platform'],
        #     'sentence': row['sentence'],
        #     'previous_sentence': row['previous_sentence'],
        #     'sentence_idx': row['sentence_idx'],
        #     'contains_trump_mentions': row['contains_trump_mentions'],
        #     'contains_kamala_mentions': row['contains_kamala_mentions'],
        #     'is_subjective': row['is_subjective'],
        #     'entity_ids': row['entity_ids'],
        #     'entity_category': 'trump',
        #     'entity_id': entity_idx_map['trump']
        # })
    if entity_labels[2] == 1:
        dict_row = dict()
        for col in df.columns:
            dict_row[col] = row[col]
            dict_row['entity_category'] = 'others'
            dict_row['entity_id'] = entity_idx_map['others']
        expanded_rows.append(dict_row)

        # expanded_rows.append({
        #     'comment': row['comment'],
        #     'parent_comment': row['parent_comment'],
        #     'comment_id': row['comment_id'],
        #     'parent_comment_id': row['parent_comment_id'],
        #     'comment_timestamp': row['comment_timestamp'],
        #     'level': row['level'],
        #     'post_id': row['post_id'],
        #     'post_title': row['post_title'],
        #     'post_timestamp': row['post_timestamp'],
        #     'number_of_comment_votes': row['number_of_comment_votes'],
        #     'platform': row['platform'],
        #     'sentence': row['sentence'],
        #     'previous_sentence': row['previous_sentence'],
        #     'sentence_idx': row['sentence_idx'],
        #     'contains_trump_mentions': row['contains_trump_mentions'],
        #     'contains_kamala_mentions': row['contains_kamala_mentions'],
        #     'is_subjective': row['is_subjective'],
        #     'entity_ids': row['entity_ids'],
        #     'entity_category': 'others',
        #     'entity_id': entity_idx_map['others']
        # })
          

# Create a new DataFrame from expanded rows
df = pd.DataFrame(expanded_rows)
df = df[df_columns + ['entity_category', 'entity_id']]

In [9]:
# Remove sentences without Kamala / Trump
df = df[df['entity_category'] != 'others']

# Aspect Extraction

### Model 1

In [12]:
model_name = 'distilbert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
num_aspects = 13

class MultiLabelClassifier(nn.Module):
    def __init__(self, num_labels):
        super(MultiLabelClassifier, self).__init__()
        # self.distilbert = DistilBertModel.from_pretrained(model_name)
        self.distilbert = AutoModel.from_pretrained(model_name)
        self.fc1 = nn.Linear(self.distilbert.config.hidden_size + 1, 256)  # +1 for entity_ids
        self.fc2 = nn.Linear(256, num_labels)
        self.dropout = nn.Dropout(0.3)
    
    def forward(self, input_ids, attention_mask, entity_ids):
        outputs = self.distilbert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs[0][:, 0]  # Use the [CLS] token representation
        
        entity_ids_expanded = entity_ids.view(entity_ids.size(0), -1)  # Added

        # Concatenate pooled_output with entity_ids
        # combined_output = torch.cat((pooled_output, entity_ids.unsqueeze(1)), dim=1)
        combined_output = torch.cat((pooled_output, entity_ids_expanded), dim=1)
        
        x = self.fc1(combined_output)
        x = self.dropout(x)
        x = self.fc2(x)
        
        return torch.sigmoid(x)  # Use sigmoid for multi-label classification
    
def predict_scores(model, tokenizer, dataframe, max_length=512):
    model.eval()  # Set the model to evaluation mode
    
    scores = []  # List to hold the scores for each row
    
    for _, row in dataframe.iterrows():
        # Tokenize the text and prepare inputs
        tokenized = tokenizer(
            row['sentence'],
            truncation=True,
            padding='max_length',
            max_length=max_length,
            return_tensors='pt'
        )

        # Extract the entity_id and prepare it as a LongTensor
        entity_id_tensor = torch.tensor([row['entity_id']], dtype=torch.long).to('cuda')  # Shape: (1, 1)

        # Move tokenized inputs to the device
        input_ids = tokenized['input_ids'].to('cuda')
        attention_mask = tokenized['attention_mask'].to('cuda')

        # Perform inference
        with torch.no_grad():
            output = model(input_ids=input_ids, attention_mask=attention_mask, entity_ids=entity_id_tensor)
            scores.append(output.cpu().numpy().flatten().tolist())  # Flatten to a list of length 14

    # Convert scores to a list of lists and add to DataFrame
    dataframe['distil_aspect_scores'] = scores
    
    return dataframe


def get_aspect_category(x, map):
    if sum(x) == 0:
        return ['others']
    else:
        result = []
        for idx, value in enumerate(x):
            if value == 1:
                result.append(map[idx])
            
        return result

model = MultiLabelClassifier(num_labels=num_aspects).to('cuda')
model.load_state_dict(torch.load(ASPECT_MODEL_DISTIL))

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<All keys matched successfully>

In [13]:
df = predict_scores(model, tokenizer, df)
df['distil_aspect_labels'] = df.distil_aspect_scores.apply(lambda x: np.where(np.array(x) >= 0.35, 1.0, 0.0)) # Returns list of length 13, if all 0 then its others.
df['distil_aspect_categories'] = df['distil_aspect_labels'].apply(lambda x: get_aspect_category(x, idx_aspect_map))


In [14]:
df

Unnamed: 0,comment_id,post_id,post_title,post_timestamp,parent_comment_id,parent_comment,comment,comment_timestamp,number_of_comment_votes,sentence,previous_sentence,is_subjective,entity_ids,entity_category,entity_id,distil_aspect_scores,distil_aspect_labels,distil_aspect_categories
0,lt2ikja,1g88apd,It was all STAGED!! Trump did not work. McDona...,2024-10-20 00:00:00+00:00,,,Did people honestly think trump actually worke...,2024-10-21 00:00:00+00:00,1,Did people honestly think trump actually worke...,,1,"[0, 1, 0]",trump,1,"[0.028811892494559288, 0.021681832149624825, 0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[others]
1,lsyikl3,1g88apd,It was all STAGED!! Trump did not work. McDona...,2024-10-20 00:00:00+00:00,,,Better than Kamala any day of the week. Both a...,2024-10-21 00:00:00+00:00,-2,Better than Kamala any day of the week.,,1,"[1, 0, 0]",kamala,0,"[0.0308123379945755, 0.019607435911893845, 0.0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[voter sentiment]


### Model 2

In [15]:
df['sentence2'] = df.apply(lambda row: 'entity of interest: ' + row['entity_category'].replace('others', 'neither trump nor kamala') + ' [SEP] ' + row['sentence'], axis=1)

In [16]:
# model_name = "facebook/bart-large" 
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForSeq2SeqLM.from_pretrained('./model/aspect_model_seq2seq_facebooklargebart_lr5e-5_epochs5_w_additional_val_acc_4683')
# model.to('cuda')

model_name = 'destonedbob/nusiss-election-project-aspect-seq2seq-model-facebook-bart-large'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model.to('cuda')


BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50265, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50265, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): LayerN

In [17]:
def predict_aspect_model2b(texts, batch_size=32, return_conf=False, return_aspect_list=False):
    predictions = []

    # Tokenize texts in batches
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i + batch_size]
        
        # Tokenize the batch
        inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
        inputs.to('cuda')  # Move to GPU

        if return_conf:
            with torch.no_grad():  # Disable gradient calculation
                output_ids = model.generate(**inputs, output_scores=True, return_dict_in_generate=True)
                predictions.append(output_ids.sequences_scores.cpu().numpy())  # Collect scores
        
        else:
            with torch.no_grad():  # Disable gradient calculation
                output_ids = model.generate(**inputs)

            # Decode the batch of generated sequences
            output_texts = tokenizer.batch_decode(output_ids, skip_special_tokens=True)

            if return_aspect_list:
                for output_text in output_texts:
                    aspect_lst = output_text.split(';')
                    aspect_lst_result = [0] * len(aspect_idx_map)
                    for aspect in aspect_lst:
                        processed_aspect = aspect.lower().strip()
                        aspect_id = aspect_idx_map.get(processed_aspect, 9999)
                        if aspect_id == 9999:
                            print(processed_aspect)
                            continue
                        aspect_lst_result[aspect_id] = 1

                    predictions.append(aspect_lst_result)
            else:
                predictions.extend(output_texts)  # Append the decoded texts

    return predictions

In [18]:
mask = df['distil_aspect_categories'].apply(lambda x: x == ['others'])

# Get the sentences where the condition is met
texts_to_predict = df.loc[mask, 'sentence2'].tolist()

# Call the combined predict function
if texts_to_predict:
    predicted_aspect_lists = predict_aspect_model2b(texts_to_predict, batch_size=32, return_aspect_list=True)
    # Assign the predictions back to the relevant rows using .loc
    print(len(texts_to_predict))
    print(len(predicted_aspect_lists))
    df.loc[mask, 'bart_aspect_labels'] = pd.Series(predicted_aspect_lists, index=df[mask].index)

1
1


In [19]:
def get_aspect_category_model2(x, map):
    if type(x) != list:
        return np.nan
    
    if sum(x) == 0:
        return ['others']
    else:
        result = []
        for idx, value in enumerate(x):
            if value == 1:
                result.append(map[idx])
            
        return result
    
df['bart_aspect_categories'] = df['bart_aspect_labels'].apply(lambda x: get_aspect_category_model2(x, idx_aspect_map))
df['final_aspect_categories'] = df['bart_aspect_categories'].fillna(df['distil_aspect_categories'])
df['final_aspect_labels'] = df['bart_aspect_labels'].fillna(df['distil_aspect_labels'].apply(lambda x: [int(each) for each in x] + [1] if sum(x) == 0 else [int(each) for each in x] + [0]))

In [20]:
df = df.explode('final_aspect_categories').reset_index(drop=True)

In [21]:
df

Unnamed: 0,comment_id,post_id,post_title,post_timestamp,parent_comment_id,parent_comment,comment,comment_timestamp,number_of_comment_votes,sentence,...,entity_category,entity_id,distil_aspect_scores,distil_aspect_labels,distil_aspect_categories,sentence2,bart_aspect_labels,bart_aspect_categories,final_aspect_categories,final_aspect_labels
0,lt2ikja,1g88apd,It was all STAGED!! Trump did not work. McDona...,2024-10-20 00:00:00+00:00,,,Did people honestly think trump actually worke...,2024-10-21 00:00:00+00:00,1,Did people honestly think trump actually worke...,...,trump,1,"[0.028811892494559288, 0.021681832149624825, 0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[others],entity of interest: trump [SEP] Did people hon...,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",[controversies],controversies,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
1,lsyikl3,1g88apd,It was all STAGED!! Trump did not work. McDona...,2024-10-20 00:00:00+00:00,,,Better than Kamala any day of the week. Both a...,2024-10-21 00:00:00+00:00,-2,Better than Kamala any day of the week.,...,kamala,0,"[0.0308123379945755, 0.019607435911893845, 0.0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[voter sentiment],entity of interest: kamala [SEP] Better than K...,,,voter sentiment,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]"


# Sentiment Prediction

### Model 1

In [22]:
df['final_aspect_ids'] = df['final_aspect_categories'].apply(lambda x: aspect_idx_map[x])

In [23]:
model_name = "distilbert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

class AspectBasedSentimentModel(nn.Module):
    def __init__(self, pretrained_model_name):
        super(AspectBasedSentimentModel, self).__init__()

        num_extra_dims = 2
        self.config = AutoConfig.from_pretrained(pretrained_model_name)
        self.bert = AutoModel.from_pretrained(pretrained_model_name, config=self.config)

        for param in self.bert.transformer.layer[:4].parameters():  # Freeze first 4 layers
            param.requires_grad = False

        num_hidden_size = self.bert.config.hidden_size 
        self.entity_embedding = nn.Embedding(num_embeddings=3, embedding_dim=256)
        self.aspect_embedding = nn.Embedding(num_embeddings=14, embedding_dim=256)
        self.dropout = nn.Dropout(0.2)
        # self.classifier = torch.nn.Linear(num_hidden_size+num_extra_dims, 3)
        # self.classifier = torch.nn.Linear(num_hidden_size + 1028 + 1028, 3)
        self.entity_labels_embedding = nn.Embedding(num_embeddings=2, embedding_dim=256)  # Embedding for binary 0/1 values
        self.aspect_labels_embedding = nn.Embedding(num_embeddings=2, embedding_dim=256)  # Embedding for binary 0/1 values

        self.classifier = nn.Linear(num_hidden_size + 256 + 256 + (256 * 3) + (256 * 14), 514)  # First hidden layer
        self.relu = nn.ReLU()  # Activation function
        # Second linear layer (output layer)
        self.output_layer = nn.Linear(514, 3)  # Final layer to output class probabilities




    def forward(self, input_ids, attention_mask, entity_cat, aspect_cat, entity_labels, aspect_labels):
        
        hidden_states = self.bert(input_ids=input_ids, attention_mask=attention_mask)  # [batch size, sequence length, hidden size]
        cls_embeddings = hidden_states.last_hidden_state[:, 0, :] # [batch size, hidden size]
        # concat = torch.cat((cls_embeddings, entity_cat.unsqueeze(1), aspect_cat.unsqueeze(1)), axis=1) # [batch size, hidden size+num extra dims]
        
        entity_embed = self.entity_embedding(entity_cat.type(torch.IntTensor).to('cuda'))
        aspect_embed = self.aspect_embedding(aspect_cat.type(torch.IntTensor).to('cuda'))
        # print((cls_embeddings.shape, entity_embed.shape, aspect_embed.shape))
        entity_labels_embed = self.entity_labels_embedding(entity_labels.type(torch.LongTensor).to('cuda')).view(entity_labels.shape[0], -1)  # Flatten [batch_size, 3, 50] to [batch_size, 150]
        aspect_labels_embed = self.aspect_labels_embedding(aspect_labels.type(torch.LongTensor).to('cuda')).view(aspect_labels.shape[0], -1)  # Flatten [batch_size, 14, 50] to [batch_size, 700]
        
        # Concatenate embeddings with CLS token output
        concat = torch.cat((cls_embeddings, entity_embed, aspect_embed, entity_labels_embed, aspect_labels_embed), axis=1)
        hidden_output = self.relu(self.classifier(self.dropout(concat)))  # [batch size, 128]

        # logits = self.output_layer(self.dropout(hidden_output))  # [batch size, num labels]
        logits = self.output_layer(hidden_output)  # [batch size, num labels]


        # logits = self.classifier(self.dropout(concat)) # [batch size, num labels]

        return logits
    

def predict_sentiment(row, tokenizer, model, return_conf=False, return_both=False):
    model.eval()
    inputs = tokenizer(row['sentence'], return_tensors='pt', truncation=True, padding=True, max_length=512)
    input_ids = inputs['input_ids'].to('cuda')
    attention_mask = inputs['attention_mask'].to('cuda')
    entity_cat = torch.tensor([row['entity_id']]).to('cuda')
    aspect_cat = torch.tensor([row['final_aspect_ids']]).to('cuda')
    entity_labels = torch.tensor([row['entity_ids']]).to('cuda')
    aspect_labels = torch.tensor([row['final_aspect_labels']]).to('cuda')
    
    with torch.no_grad():
        logits = model(input_ids, attention_mask, entity_cat, aspect_cat, entity_labels, aspect_labels)
    
    if return_both:
        predicted_label = torch.argmax(logits, dim=-1).item()
        return (predicted_label - 1, torch.max(logits).item())
    
    if return_conf:
        return torch.max(logits).item()
    else:
        predicted_label = torch.argmax(logits, dim=-1).item()
        return predicted_label - 1
    
    
model = torch.load(SENTIMENT_MODEL_DISTIL)
model.to('cuda')

AspectBasedSentimentModel(
  (bert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1

In [24]:
df['distil_sentiment_prediction_and_confidence_score'] = df.apply(lambda x: predict_sentiment(x, tokenizer, model, return_both=True), axis=1)
df['distil_sentiment_prediction'] = df['distil_sentiment_prediction_and_confidence_score'].apply(lambda x: x[0])
df['distil_sentiment_confidence_score'] = df['distil_sentiment_prediction_and_confidence_score'].apply(lambda x: x[1])

### Model 2

In [27]:
def create_sentence_for_sentiment_seq2seq(row):
    return f"entity of interest: {row['entity_category'].replace('others', 'neither trump nor kamala')} [SEP] aspect of interest: {row['final_aspect_categories']} [SEP] {row['sentence']}"


def predict_sentiment_model2_batch(texts, batch_size=32, return_conf=False, return_both=False):
    predictions = []
    confidences = []

    # Tokenize texts in batches
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i + batch_size]

        # Tokenize the batch
        inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
        inputs = inputs.to('cuda')  # Move to GPU

        if return_both:
            with torch.no_grad():  # Disable gradient calculation
                output_ids = model.generate(**inputs, output_scores=True, return_dict_in_generate=True)
                confidences.extend(output_ids.sequences_scores.cpu().tolist())

            output_texts = tokenizer.batch_decode(output_ids.sequences, skip_special_tokens=True)
            for output_text in output_texts:
                try:
                    # Extract label and map it to sentiment index
                    final_label = sentiment_idx_map[output_text.split(': ')[1]] - 1
                except:
                    print(output_text)
                    final_label = 0
                predictions.append(final_label)
            
        elif return_conf:
            with torch.no_grad():  # Disable gradient calculation
                output_ids = model.generate(**inputs, output_scores=True, return_dict_in_generate=True)
                # Collect sequence scores for confidence
                predictions.extend(output_ids.sequences_scores.cpu().tolist())
        else:
            with torch.no_grad():  # Disable gradient calculation
                output_ids = model.generate(**inputs)

            # Decode the batch of generated sequences
            output_texts = tokenizer.batch_decode(output_ids, skip_special_tokens=True)

            for output_text in output_texts:
                try:
                    # Extract label and map it to sentiment index
                    final_label = sentiment_idx_map[output_text.split(': ')[1]] - 1
                except:
                    print(output_text)
                    final_label = 0
                predictions.append(final_label)

    if return_both:
        return predictions, confidences
    else:
        return predictions
    
    
df['sentence3'] = df.apply(create_sentence_for_sentiment_seq2seq, axis=1)

In [28]:
model = AutoModelForSeq2SeqLM.from_pretrained('destonedbob/nusiss-election-project-sentiment-seq2seq-model-facebook-bart-large')
tokenizer = AutoTokenizer.from_pretrained('destonedbob/nusiss-election-project-sentiment-seq2seq-model-facebook-bart-large')
model.to('cuda')

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50265, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50265, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): LayerN

In [29]:
texts_to_predict = df.sentence3.tolist()
if texts_to_predict:
    predicted_data = predict_sentiment_model2_batch(texts_to_predict, batch_size=32, return_both=True)
df['seq2seq_sentiment_prediction'] = pd.Series(predicted_data[0])
df['seq2seq_sentiment_confidence_score'] = pd.Series(predicted_data[1])

In [30]:
def get_final_sentiment_pred(row):
    seq2seq_conf = row['seq2seq_sentiment_confidence_score']
    distil_conf = row['distil_sentiment_confidence_score']

    if distil_conf >= 0.85: # 0.6
        return row['distil_sentiment_prediction']
    elif seq2seq_conf >= -0.36: # -0.32499999999999996
        return row['seq2seq_sentiment_prediction']
    else:
        return row['distil_sentiment_prediction']

df['final_sentiment_prediction'] = df.apply(get_final_sentiment_pred, axis=1)


In [31]:
df.columns

Index(['comment_id', 'post_id', 'post_title', 'post_timestamp',
       'parent_comment_id', 'parent_comment', 'comment', 'comment_timestamp',
       'number_of_comment_votes', 'sentence', 'previous_sentence',
       'is_subjective', 'entity_ids', 'entity_category', 'entity_id',
       'distil_aspect_scores', 'distil_aspect_labels',
       'distil_aspect_categories', 'sentence2', 'bart_aspect_labels',
       'bart_aspect_categories', 'final_aspect_categories',
       'final_aspect_labels', 'final_aspect_ids',
       'distil_sentiment_prediction_and_confidence_score',
       'distil_sentiment_prediction', 'distil_sentiment_confidence_score',
       'sentence3', 'seq2seq_sentiment_prediction',
       'seq2seq_sentiment_confidence_score', 'final_sentiment_prediction'],
      dtype='object')

In [32]:
df[['sentence', 'entity_ids', 'final_aspect_labels', 'entity_category', 'final_aspect_categories',  'final_sentiment_prediction']]

Unnamed: 0,sentence,entity_ids,final_aspect_labels,entity_category,final_aspect_categories,final_sentiment_prediction
0,Did people honestly think trump actually worke...,"[0, 1, 0]","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",trump,controversies,-1
1,Better than Kamala any day of the week.,"[1, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]",kamala,voter sentiment,1
