# Installs and Imports

In [None]:
!pip install transformers pandas scikit-learn torch matplotlib seaborn wordcloud gensim transformers[torch]

In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import XLMRobertaForSequenceClassification, AutoTokenizer, Trainer
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from gensim import corpora
from gensim.models import LdaModel
import gensim
from collections import Counter
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Drive Mounting

In [None]:
# Mount Google Drive at /content/drive (Colab only); the model and dataset
# paths used in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

# Dataset Loading

In [None]:
# Tokenizer for the "xlm-roberta-base" checkpoint; the same instance is used
# for both the ND and DN texts.
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

In [None]:
# Load the two sequence-classification checkpoints stored on Drive
# (ND_final and DN_final), one per sheet.
model_nd = XLMRobertaForSequenceClassification.from_pretrained('/content/drive/My Drive/Research/SentimentAnalysisDivorce/Models/ND_final')
model_dn = XLMRobertaForSequenceClassification.from_pretrained('/content/drive/My Drive/Research/SentimentAnalysisDivorce/Models/DN_final')

In [None]:
# Locations of the two Excel workbooks read below; each is read with an
# 'ND' and a 'DN' sheet.
annotated_and_pseudolabeled_path = '/content/drive/My Drive/Research/SentimentAnalysisDivorce/Dataset/ANNOTATED_AND_PSEUDOLABELED_DATA_01.xlsx'
cleaned_preprocessed_path = '/content/drive/My Drive/Research/SentimentAnalysisDivorce/Dataset/CLEANED_PREPROCESSED_DATA_05.xlsx'

In [None]:
# Read both sheets of each workbook in a single pass: passing a list as
# sheet_name makes read_excel return a {sheet name: DataFrame} dict, so each
# file is opened and parsed once instead of once per sheet.
_annotated_sheets = pd.read_excel(annotated_and_pseudolabeled_path, sheet_name=['ND', 'DN'])
_cleaned_sheets = pd.read_excel(cleaned_preprocessed_path, sheet_name=['ND', 'DN'])

df_annotated_nd, df_annotated_dn = _annotated_sheets['ND'], _annotated_sheets['DN']
df_cleaned_nd, df_cleaned_dn = _cleaned_sheets['ND'], _cleaned_sheets['DN']

In [None]:
# Keep only the cleaned comments whose text does not already appear in the
# annotated sheet. The explicit .copy() makes these independent frames, so the
# column assignments done on them later do not raise SettingWithCopyWarning.
df_unlabeled_nd = df_cleaned_nd[~df_cleaned_nd['text'].isin(df_annotated_nd['text'])].copy()
df_unlabeled_dn = df_cleaned_dn[~df_cleaned_dn['text'].isin(df_annotated_dn['text'])].copy()

# Sentiment Analysis

In [None]:
def preprocess_data(df, tokenizer, max_length=512):
    """Tokenize the ``text`` column of *df* as one padded batch.

    Args:
        df: DataFrame with a ``text`` column.
        tokenizer: Callable tokenizer that accepts a list of strings plus the
            ``truncation``, ``padding``, ``max_length`` and ``return_tensors``
            keyword arguments.
        max_length: Maximum sequence length forwarded to the tokenizer.

    Returns:
        Whatever *tokenizer* returns for the batch, with one entry per row of
        *df* in row order.
    """
    # Cells read from a spreadsheet may be numbers or missing values, which a
    # text tokenizer rejects. Coerce them instead of dropping rows so that the
    # batch stays aligned one-to-one with the rows of df (callers attach the
    # predictions back onto df by position).
    texts = ["" if pd.isna(value) else str(value) for value in df['text']]
    return tokenizer(texts, truncation=True, padding=True, max_length=max_length, return_tensors="pt")

In [None]:
# Tokenize the not-yet-labeled comments of each sheet with the shared tokenizer
# (default max_length).
inputs_unlabeled_nd = preprocess_data(df_unlabeled_nd, tokenizer)
inputs_unlabeled_dn = preprocess_data(df_unlabeled_dn, tokenizer)

In [None]:
class CustomDataset(torch.utils.data.Dataset):
    """Dataset view over a batch of tokenizer outputs.

    ``inputs`` is a mapping from field name to an indexable batch of values.
    It must contain an ``'input_ids'`` entry, whose length defines the number
    of examples.
    """

    def __init__(self, inputs):
        self.inputs = inputs

    def __len__(self):
        # The example count is taken from the 'input_ids' field.
        input_ids = self.inputs['input_ids']
        return len(input_ids)

    def __getitem__(self, idx):
        # Build one example by picking position `idx` from every field.
        example = {}
        for field, values in self.inputs.items():
            example[field] = values[idx]
        return example

In [None]:
# Wrap the tokenized batches so they can be indexed one example at a time.
dataset_unlabeled_nd = CustomDataset(inputs_unlabeled_nd)
dataset_unlabeled_dn = CustomDataset(inputs_unlabeled_dn)

In [None]:
def predict_sentiments(model, dataset):
    """Return the arg-max class index for every example in *dataset*.

    A ``Trainer`` built with default arguments is used purely as an inference
    driver: its ``predict`` output's ``predictions`` array is reduced along
    axis 1 (taken here as the class axis).
    """
    runner = Trainer(model=model)
    output = runner.predict(dataset)
    scores = output.predictions
    return np.argmax(scores, axis=1)

In [None]:
# Predict a class index for every not-yet-labeled comment, pairing each sheet's
# dataset with the model loaded for that sheet.
preds_unlabeled_nd = predict_sentiments(model_nd, dataset_unlabeled_nd)
preds_unlabeled_dn = predict_sentiments(model_dn, dataset_unlabeled_dn)

In [None]:
# Attach the predictions as a new column. This is a positional assignment, so
# it relies on the predictions being in the same row order as the frame they
# were computed from.
df_unlabeled_nd['predicted_label'] = preds_unlabeled_nd
df_unlabeled_dn['predicted_label'] = preds_unlabeled_dn

In [None]:
# Tag every ND row with where its label came from, then stack both frames.
df_annotated_nd['source'] = 'annotated'
df_unlabeled_nd['source'] = 'unlabeled'
# ignore_index=True gives the result a fresh 0..n-1 index. Without it the row
# labels of the two inputs are kept as-is and can collide, leaving duplicate
# index values in the combined frame.
df_combined_nd = pd.concat([df_annotated_nd, df_unlabeled_nd], ignore_index=True)

In [None]:
# Tag every DN row with where its label came from, then stack both frames.
df_annotated_dn['source'] = 'annotated'
df_unlabeled_dn['source'] = 'unlabeled'
# ignore_index=True gives the result a fresh 0..n-1 index. Without it the row
# labels of the two inputs are kept as-is and can collide, leaving duplicate
# index values in the combined frame.
df_combined_dn = pd.concat([df_annotated_dn, df_unlabeled_dn], ignore_index=True)

In [None]:
# Class index -> sentiment name. The position in this list IS the class index
# (0 = 'Very Positive' ... 6 = 'Very Negative').
label_mapping = dict(enumerate([
    'Very Positive',
    'Positive',
    'Slightly Positive',
    'Neutral',
    'Slightly Negative',
    'Negative',
    'Very Negative',
]))

In [None]:
# Replace numeric class indices with their sentiment names.
# NOTE(review): Series.map with a dict yields NaN for any value that is not a
# key of label_mapping, so rows whose 'predicted_label' is missing or is not an
# integer 0-6 end up without a label. Confirm the annotated sheets carry a
# numeric 'predicted_label' column.
df_combined_nd['predicted_label'] = df_combined_nd['predicted_label'].map(label_mapping)
df_combined_dn['predicted_label'] = df_combined_dn['predicted_label'].map(label_mapping)

# Stopword Removal

In [None]:
# First part of the stop-word list (predominantly Tagalog function words).
_TAGALOG_STOP_WORDS = {
    'akin', 'aking', 'ako', 'alin', 'am', 'amin', 'aming', 'ang', 'ano', 'anumang', 'apat', 'at', 'atin', 'ating', 'ay',
    'bababa', 'bago', 'bakit', 'bawat', 'bilang', 'dahil', 'dalawa', 'dapat', 'din', 'dito', 'doon', 'gagawin',
    'gayunman', 'ginagawa', 'ginawa', 'ginawang', 'gumawa', 'gusto', 'habang', 'hanggang', 'hindi', 'huwag', 'iba',
    'ibaba', 'ibabaw', 'ibig', 'ikaw', 'ilagay', 'ilalim', 'ilan', 'inyong', 'isa', 'isang', 'itaas', 'ito', 'iyo',
    'iyon', 'iyong', 'ka', 'kahit', 'kailangan', 'kailanman', 'kami', 'kanila', 'kanilang', 'kanino', 'kanya', 'kanyang',
    'kapag', 'kapwa', 'karamihan', 'katiyakan', 'katulad', 'kaya', 'kaysa', 'ko', 'kong', 'kulang', 'kumuha', 'kung',
    'laban', 'lahat', 'lamang', 'likod', 'lima', 'maaari', 'maaaring', 'maging', 'mahusay', 'makita', 'marami', 'marapat',
    'masyado', 'may', 'mayroon', 'mga', 'minsan', 'mismo', 'mula', 'muli', 'na', 'nabanggit', 'naging', 'nagkaroon',
    'nais', 'nakita', 'namin', 'napaka', 'narito', 'nasaan', 'ng', 'ngayon', 'ni', 'nila', 'nilang', 'nito', 'niya',
    'niyang', 'noon', 'o', 'pa', 'paano', 'pababa', 'paggawa', 'pagitan', 'pagkakaroon', 'pagkatapos', 'palabas',
    'pamamagitan', 'panahon', 'pangalawa', 'para', 'paraan', 'pareho', 'pataas', 'pero', 'pumunta', 'pumupunta', 'sa',
    'saan', 'sabi', 'sabihin', 'sarili', 'sila', 'sino', 'siya', 'tatlo', 'tayo', 'tulad', 'tungkol', 'una', 'walang',
}

# Second part of the stop-word list (English function words).
_ENGLISH_STOP_WORDS = {
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves',
    'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their',
    'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was',
    'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the',
    'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against',
    'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in',
    'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why',
    'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some'
}

# All-lowercase words dropped from comments before the word-level analyses.
stop_words = _TAGALOG_STOP_WORDS | _ENGLISH_STOP_WORDS

In [None]:
def remove_stop_words(text):
    """Return *text* with every stop word removed.

    Words are the whitespace-separated tokens of *text*. A token is dropped
    when its lowercase form is in the module-level ``stop_words`` set; the
    remaining tokens keep their original casing and are re-joined with single
    spaces.

    Missing values (``None`` / NaN) yield an empty string and other non-string
    values are converted with ``str`` first, so the function can be applied to
    a DataFrame column that contains empty or numeric cells without raising.
    """
    if not isinstance(text, str):
        if pd.isna(text):
            return ""
        text = str(text)
    return " ".join(word for word in text.split() if word.lower() not in stop_words)

In [None]:
# Overwrite the 'text' column in place with its stop-word-free version. This
# runs after prediction, so it only affects the word-level analyses below.
df_combined_nd['text'] = df_combined_nd['text'].apply(remove_stop_words)
df_combined_dn['text'] = df_combined_dn['text'].apply(remove_stop_words)

# Data Visualization

In [None]:
# Display order of the sentiment classes (most positive first), used for plot
# axes and for the per-label loops below.
label_order = ['Very Positive', 'Positive', 'Slightly Positive', 'Neutral', 'Slightly Negative', 'Negative', 'Very Negative']

## Distribution of Sentiments

In [None]:
def plot_label_distribution_bar(df, title):
    """Show a bar chart of how many rows of *df* fall into each sentiment.

    Bars follow the module-level ``label_order``; *title* is the chart title.
    The figure is displayed immediately and nothing is returned.
    """
    # Work on a re-indexed copy so duplicate index labels cannot trip seaborn.
    frame = df.reset_index(drop=True)
    plt.figure(figsize=(10, 6))
    axes = sns.countplot(data=frame, x='predicted_label', order=label_order, color='skyblue')
    axes.set(title=title, xlabel='Predicted Sentiment', ylabel='Count')
    plt.show()

In [None]:
# Absolute sentiment counts, one chart per sheet.
plot_label_distribution_bar(df_combined_nd, 'Sentiment Distribution for Comments on ND (Neutral Survey Posts)')
plot_label_distribution_bar(df_combined_dn, 'Sentiment Distribution for Comments on DN (Divorce News Posts)')

## Proportion of Sentiments

In [None]:
def plot_label_proportion_bar(df, title):
    """Show a bar chart of the share of rows of *df* in each sentiment.

    Shares are computed over rows that have a ``predicted_label`` and are
    drawn in the module-level ``label_order``; *title* is the chart title.
    The figure is displayed immediately and nothing is returned.
    """
    frame = df.reset_index(drop=True)
    shares = frame['predicted_label'].value_counts(normalize=True)
    # Put the bars in display order; labels absent from the data become NaN
    # entries (drawn as empty slots).
    ordered_shares = shares.reindex(label_order)
    plt.figure(figsize=(10, 6))
    axes = ordered_shares.plot(kind='bar', color='skyblue')
    axes.set(title=title, xlabel='Predicted Sentiment', ylabel='Proportion')
    plt.show()

In [None]:
# Relative sentiment shares, one chart per sheet.
plot_label_proportion_bar(df_combined_nd, 'Sentiment Proportion for Comments on ND (Neutral Survey Posts)')
plot_label_proportion_bar(df_combined_dn, 'Sentiment Proportion for Comments on DN (Divorce News Posts)')

## Word Clouds

In [None]:
def generate_wordcloud(df, label, title):
    """Display a word cloud of the comments whose ``predicted_label`` is *label*.

    Args:
        df: DataFrame with ``predicted_label`` and ``text`` columns.
        label: Sentiment name to select.
        title: Title drawn above the word cloud.

    Returns:
        None. The figure is shown immediately. When no words are available for
        *label*, a short message is printed and nothing is drawn.
    """
    df = df.reset_index(drop=True)  # Reset index to avoid duplicate index issues
    text = " ".join(df.loc[df['predicted_label'] == label, 'text'])

    # WordCloud.generate raises ValueError when given no words. Without this
    # guard a sentiment class that has no comments (or only empty ones) would
    # abort a per-label loop part-way through.
    if not text.strip():
        print(f"No text available for word cloud: {title}")
        return

    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
    plt.figure(figsize=(10, 6))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(title)
    plt.axis('off')
    plt.show()

In [None]:
# One word cloud per sentiment class, in display order, for the ND comments...
for label in label_order:
    generate_wordcloud(df_combined_nd, label, f'Word Cloud for ND - Sentiment {label}')

# ...and the same for the DN comments.
for label in label_order:
    generate_wordcloud(df_combined_dn, label, f'Word Cloud for DN - Sentiment {label}')

## Words Used Per Category

In [None]:
def most_frequent_words(df, label, top_n=10):
    """Print the *top_n* most common words among comments labeled *label*.

    Words are the whitespace-separated tokens of the ``text`` column for rows
    whose ``predicted_label`` equals *label*. A header line is printed,
    followed by the word counts as a pandas Series. Nothing is returned.
    """
    frame = df.reset_index(drop=True)
    selected_texts = frame.loc[frame['predicted_label'] == label, 'text']
    tokens = " ".join(selected_texts).split()
    counts = pd.Series(tokens).value_counts()
    print(f'Most Frequent Words for Sentiment {label}')
    print(counts.head(top_n))

In [None]:
# Top words per sentiment class for the ND comments (default top_n).
for label in label_order:
    most_frequent_words(df_combined_nd, label)

In [None]:
# Top words per sentiment class for the DN comments (default top_n).
for label in label_order:
    most_frequent_words(df_combined_dn, label)

# Topic Modeling

In [None]:
def topic_modeling_per_sentiment(df, num_topics=5, passes=10, random_state=None):
    """Fit one LDA topic model per sentiment class and collect its topics.

    Args:
        df: DataFrame with ``predicted_label`` and ``text`` columns. Each text
            is tokenized by splitting on whitespace.
        num_topics: Number of topics to fit for every sentiment.
        passes: Number of training passes over each sentiment's corpus.
        random_state: Optional seed forwarded to ``LdaModel``; set it to get
            the same topics on every run.

    Returns:
        A dict mapping each sentiment label to the list returned by
        ``LdaModel.print_topics()``. Sentiments whose comments contain no
        tokens at all are reported on stdout and left out of the dict.
    """
    df = df.reset_index(drop=True)  # Reset index to avoid duplicate index issues
    topic_results = {}

    # Rows without a label are skipped: NaN never compares equal to itself, so
    # selecting "rows whose label == NaN" could only ever yield nothing.
    for sentiment in df['predicted_label'].dropna().unique():
        texts = df.loc[df['predicted_label'] == sentiment, 'text']
        tokenized_texts = [text.split() for text in texts]

        # A vocabulary can only be built if at least one comment has a token;
        # this is the one "nothing to model" case that can actually occur.
        if not any(tokenized_texts):
            print(f"Empty dictionary for sentiment: {sentiment}")
            continue

        dictionary = corpora.Dictionary(tokenized_texts)
        corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_texts]
        lda_model = LdaModel(
            corpus,
            num_topics=num_topics,
            id2word=dictionary,
            passes=passes,
            random_state=random_state,
        )
        topic_results[sentiment] = lda_model.print_topics()

    return topic_results

In [None]:
# Fit the per-sentiment topic models on the ND comments and print each
# sentiment's topics.
# NOTE(review): only the ND frame is modeled in the visible part of the
# notebook; confirm whether DN is handled further down.
nd_topics = topic_modeling_per_sentiment(df_combined_nd)
print("ND Data Topics by Sentiment:")
for sentiment, topics in nd_topics.items():
    print(f"\nSentiment: {sentiment}")
    for topic in topics:
        print(topic)
