# Installs and Imports

In [None]:
!pip install bertopic fuzzywuzzy emoji langdetect transformers

In [None]:
# Standard library imports
import glob
import itertools
import json
import re
import emoji
from collections import Counter
from typing import List
from langdetect import detect
from langdetect import detect_langs
from langdetect import DetectorFactory
DetectorFactory.seed = 0
import time
import logging

## Third-party imports
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import torch
import plotly.figure_factory as ff
import plotly.graph_objects as go
import scipy.sparse
from bertopic import BERTopic
from fuzzywuzzy import process
from scipy.cluster.hierarchy import linkage
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (accuracy_score, classification_report, cohen_kappa_score, f1_score, precision_score, recall_score)
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from umap import UMAP
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding, Conv1D, GlobalMaxPooling1D
from transformers import BertTokenizer, TFBertForSequenceClassification, AutoTokenizer, AutoModelForMaskedLM, pipeline

# NLTK-specific imports
from nltk.util import ngrams

In [None]:
# Checkpoint name shared by the tokenizer and the masked-language model below
model_name = 'xlm-roberta-base'  # multilingual model
# Both objects are used as module-level globals by correct_spelling()
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name)

# Drive Mounting

In [None]:
# Mount Google Drive at /content/drive; every dataset path in this notebook lives under it
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Location of the raw dataset on the mounted Drive
file_path = '/content/drive/My Drive/Research/SentimentAnalysisDivorce/Dataset/FINAL_DATASET.csv'

# Read the raw CSV into the working DataFrame
df = pd.read_csv(file_path)

In [None]:
# Report the (rows, columns) shape of the freshly loaded dataset
print(df.shape)

# Data Preprocessing

## Noise Reduction

### Removal of Duplicates

In [None]:
# Create a new DataFrame with only the specified columns
# (these four columns are the only ones used by the rest of the notebook)
selected_columns = ["profileId", "profileName", "replyToCommentId", "text"]
df_selected = df[selected_columns]

In [None]:
# Remove rows whose comment text duplicates an earlier row (first occurrence is kept).
# The result is reassigned instead of using inplace=True: df_selected comes from a
# column selection of df, and mutating such a frame in place triggers pandas'
# SettingWithCopyWarning.
df_selected = df_selected.drop_duplicates(subset='text')
# Local copy in the current working directory
df_selected.to_csv('PREPROCESSED_DATA_01.csv', index=False)

In [None]:
# Report the (rows, columns) shape of the de-duplicated selection
print(df_selected.shape)

In [None]:
# Save the DataFrame to Google Drive (without the index column)
output_file_path = '/content/drive/My Drive/Research/SentimentAnalysisDivorce/Dataset/PREPROCESSED_DATA_01.csv'
df_selected.to_csv(output_file_path, index=False)

# Confirm where the file was written
print(f"File saved to {output_file_path}")

In [None]:
# Load every CSV matching file_path into one shuffled DataFrame.
from pathlib import Path

# Columns kept from every input file
wanted_columns = ["profileId", "profileName", "replyToCommentId", "text"]

# Placeholder for rejected rows; nothing in this cell populates it
bad_lines = []

# Iterate over CSV files (file_path may be a single path or a glob pattern)
loaded_frames = []
for filepath in glob.iglob(file_path):
    try:
        # Read CSV file. Should be able to handle quotes and line breaks
        temp_df = pd.read_csv(filepath, quotechar='"', escapechar='\\', on_bad_lines='skip')
    except pd.errors.ParserError as e:
        print(f"Error reading {filepath}: {e}")
        continue
    # Derive post_id from the file name (text before the first dot). Path.name
    # honours the platform's path separator, so "/" paths on Colab work too.
    # NOTE(review): post_id is not among wanted_columns, so it is discarded by
    # the column selection below — confirm whether it should be kept.
    temp_df["post_id"] = Path(filepath).name.split(".")[0]
    loaded_frames.append(temp_df)

# Concatenate once; growing a DataFrame inside the loop copies it on every pass.
if loaded_frames:
    # reindex keeps only the wanted columns and fills any a file lacked with NaN
    df = pd.concat(loaded_frames, ignore_index=True).reindex(columns=wanted_columns)
else:
    df = pd.DataFrame(columns=wanted_columns)

# Shuffle the rows reproducibly and renumber the index
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Display the first few rows of the DataFrame
print(df.head())

In [None]:
# Remove duplicates: drop rows whose 'text' repeats an earlier row, modifying df in place
df.drop_duplicates(subset=['text'], inplace=True)

In [None]:
# Report the (rows, columns) shape after duplicate removal
print(df.shape)

In [None]:
def remove_spam(text):
    """Blank out comments that contain a known spam phrase.

    Matching is a case-insensitive substring test. Non-string values are
    returned untouched; a string containing any spam phrase becomes ''; any
    other string is returned as-is.
    """
    # Guard clause: leave NaN / non-text values alone
    if not isinstance(text, str):
        return text

    # Phrases associated with spam (all lowercase)
    spam_markers = (
        'crypto', 'cryptocurrency', 'bitcoin', 'blockchain',
        'click here', 'check my profile', 'tag for', 'follow for', 'check out',
        'lotto', 'gambling',
    )

    lowered = text.lower()
    is_spam = any(marker in lowered for marker in spam_markers)
    return '' if is_spam else text

# Remove spam comments
# (matching comments are blanked to '' rather than dropped, so the row count is unchanged)
df['text'] = df['text'].apply(remove_spam)

In [None]:
# Function to strip URLs and image/GIF placeholder words from a comment
def remove_non_text(text):
    """Return *text* with non-textual content removed and whitespace trimmed.

    A string made up entirely of non-ASCII characters is emptied; URLs
    starting with ``http`` and the standalone words ``img``, ``photo`` and
    ``gif`` are deleted. Non-ASCII characters mixed into ASCII text are left
    in place. Non-string input yields ''.
    """
    if not isinstance(text, str):
        return ''

    matcher = re.compile(r'^[^\x00-\x7F]+$|http\S+|\bimg\b|\bphoto\b|\bgif\b')
    cleaned = matcher.sub('', text)
    return cleaned.strip()

# Remove emojis, photos, and GIFs
# (non-string values become '' here; rows themselves are not dropped)
df['text'] = df['text'].apply(remove_non_text)

In [None]:
# Report the current (rows, columns) shape of df
print(df.shape)

In [None]:
# Function to detect language and flag non-English/Filipino text
def filter_language(text, threshold=0.5, languages=('en', 'tl')):
    """Return True when *text* is confidently detected as an accepted language.

    Args:
        text: The comment to inspect. Non-string or blank values are rejected.
        threshold: Probability a detected language must exceed to count.
        languages: Language codes that are accepted (English and Tagalog by
            default).

    Returns:
        True if any candidate reported by ``detect_langs`` is in *languages*
        with probability above *threshold*; False otherwise, including when
        detection is not possible.
    """
    # Nothing to detect in NaN / empty comments; skip the detector entirely.
    if not isinstance(text, str) or not text.strip():
        return False

    # Imported here so the block stays self-contained; only the detector's own
    # "cannot detect" error is treated as a negative result, so unrelated
    # failures and KeyboardInterrupt are no longer silently swallowed.
    from langdetect.lang_detect_exception import LangDetectException

    try:
        candidates = detect_langs(text)
    except LangDetectException:
        return False

    return any(
        candidate.lang in languages and candidate.prob > threshold
        for candidate in candidates
    )

# Remove non-English/Filipino text: keep only the rows for which filter_language returns True
df = df[df['text'].apply(filter_language)]

In [None]:
# Build the frame that gets saved, keeping rows that pass filter_language.
# NOTE(review): df was already filtered with the same predicate in the previous
# cell, so this second detection pass looks redundant — confirm before removing.
df_filtered = df[df['text'].apply(filter_language)]

In [None]:
# Save the filtered dataset to the specified location (index column omitted)
output_file_path = '/content/drive/My Drive/Research/SentimentAnalysisDivorce/Dataset/FILTERED_DATASET.csv'
df_filtered.to_csv(output_file_path, index=False)

# Confirm the destination and the size of what was written
print(f"Filtered dataset saved to {output_file_path}")
print(f"Filtered dataset shape: {df_filtered.shape}")

## Clean Comments

In [None]:
# Set up logging with flushing
class FlushHandler(logging.StreamHandler):
    """Stream handler that flushes its stream after every emitted record."""

    def emit(self, record):
        """Write *record* through StreamHandler, then flush explicitly."""
        super().emit(record)
        self.flush()


# basicConfig does nothing when the root logger already has handlers, which is
# common in notebook environments; force=True replaces them so the level,
# format and handler below actually take effect.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[FlushHandler()],
    force=True,
)

In [None]:
# Compiled once at definition time instead of on every call.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs (includes skin-tone modifiers)
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (regional indicators)
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002700-\U000027BF"  # dingbats
    "\U0000FE0F"             # variation selector-16 left behind by emoji
    "\U0000200D"             # zero-width joiner used in emoji sequences
    "]+",
    flags=re.UNICODE,
)


# Function to remove emojis
def remove_emojis(text):
    """Return *text* with emoji characters removed.

    Covers the main emoji blocks, including the supplemental pictographs,
    miscellaneous symbols and dingbats, plus the variation selector and
    zero-width joiner that emoji sequences leave behind. All other characters,
    including accented letters, are kept.

    Args:
        text: The value to clean.

    Returns:
        The cleaned string, or *text* unchanged when it is not a string.
    """
    if not isinstance(text, str):
        return text  # Return the original value if it's not a string
    return _EMOJI_PATTERN.sub('', text)

In [None]:
# Function to strip links, hashtag signs, '?', '!' and @mention tags
def clean_text(text):
    """Delete URLs, '#', '?', '!' and ``@name`` tags from *text*.

    Only the '#' character of a hashtag is removed; the word after it stays.
    Non-string values are handed back unchanged.
    """
    if isinstance(text, str):
        noise = re.compile(r'http\S+|www\S+|#|\?|!|@[\w_]+')
        return noise.sub('', text)
    return text  # not a string: nothing to clean

In [None]:
# Function to rewrite text using the pretrained masked-language model
def correct_spelling(text):
    """Run *text* through the global masked LM and decode its top predictions.

    The text is encoded with the global ``tokenizer`` (truncated to 512
    tokens), passed through the global ``model`` without gradients, and the
    highest-scoring token at every position is decoded back to a string.
    Non-string values are returned unchanged.

    NOTE(review): the input contains no mask tokens, so the output is simply
    the model's arg-max token per position — confirm that this is the intended
    "spelling correction". Text beyond the 512-token limit is dropped by the
    truncation.
    """
    if not isinstance(text, str):
        return text  # leave NaN / non-text values alone

    token_limit = 512  # maximum sequence length passed to the tokenizer
    input_ids = tokenizer.encode(
        text, return_tensors='pt', truncation=True, max_length=token_limit
    )

    # Inference only: no gradient tracking needed
    with torch.no_grad():
        logits = model(input_ids).logits

    best_ids = logits.argmax(dim=-1).squeeze().tolist()
    return tokenizer.decode(best_ids, skip_special_tokens=True)

In [None]:
# Function to remove a leading profile-name mention from reply comments
def remove_mentions(row, names_set, max_name_words=5):
    """Strip a leading mention of a known profile name from a reply.

    Only rows with a non-null ``replyToCommentId`` are touched. The first
    ``max_name_words`` words are compared (lower-cased) against *names_set*,
    longest candidate first, so a full name such as "john doe" is removed
    whole even when "john" alone is also a known name.

    Args:
        row: Mapping/Series with ``'text'`` and ``'replyToCommentId'`` keys.
        names_set: Lower-cased profile names to look for.
        max_name_words: Maximum number of leading words treated as a name.

    Returns:
        The text without the leading mention, or the original value when the
        row is not a reply, the text is not a string, or no name matches.
    """
    text = row['text']
    if pd.isna(row['replyToCommentId']) or not isinstance(text, str):
        return text

    words = text.split()
    # Never look past the words that actually exist; try the longest prefix first.
    for count in range(min(max_name_words, len(words)), 0, -1):
        if ' '.join(words[:count]).lower() in names_set:
            return ' '.join(words[count:])
    return text

In [None]:
# Function to process text in batches and save the results incrementally
def process_spelling_correction_in_batches(df, batch_size, output_file, names_set):
    """Clean the 'text' column batch by batch, appending each batch to a CSV.

    For every batch the leading mentions are removed with ``remove_mentions``
    and the result is passed through ``correct_spelling``. The first batch
    creates *output_file* with a header; later batches are appended. *df*
    itself is not modified.

    Args:
        df: DataFrame with 'text' and 'replyToCommentId' columns.
        batch_size: Number of rows per batch; must be positive.
        output_file: Path of the CSV to write.
        names_set: Lower-cased profile names forwarded to ``remove_mentions``.

    Raises:
        ValueError: If *batch_size* is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    total_rows = len(df)
    if total_rows == 0:
        # Nothing to process: still leave a header-only file for later steps.
        df.to_csv(output_file, index=False)
        return

    # Ceiling division: no trailing empty batch when total_rows is an exact
    # multiple of batch_size.
    num_batches = (total_rows + batch_size - 1) // batch_size
    start_time = time.time()

    for batch_num in range(num_batches):
        batch_start_time = time.time()
        start_idx = batch_num * batch_size
        # Copy so the caller's DataFrame is left untouched
        df_batch = df.iloc[start_idx:start_idx + batch_size].copy()

        df_batch['text'] = df_batch.apply(lambda row: remove_mentions(row, names_set), axis=1)
        df_batch['text'] = df_batch['text'].apply(correct_spelling)

        # First batch (re)creates the file with a header; the rest append.
        is_first = batch_num == 0
        df_batch.to_csv(output_file, mode='w' if is_first else 'a', header=is_first, index=False)

        elapsed_time = time.time() - batch_start_time
        completed = batch_num + 1
        # Estimate from the average batch time so far rather than the last batch only
        average_batch_time = (time.time() - start_time) / completed
        estimated_remaining_time = (num_batches - completed) * average_batch_time

        logging.info(f'Processed batch {batch_num + 1}/{num_batches} in {elapsed_time:.2f} seconds.')
        logging.info(f'Estimated remaining time: {estimated_remaining_time // 60:.0f} minutes and {estimated_remaining_time % 60:.0f} seconds.')

# Load the dataset
# NOTE(review): this reloads the raw FINAL_DATASET.csv (not the FILTERED_DATASET.csv
# written by the language-filter step) and rebinds the global df — confirm this is intended.
file_path = '/content/drive/My Drive/Research/SentimentAnalysisDivorce/Dataset/FINAL_DATASET.csv'
df = pd.read_csv(file_path)
logging.info(f'Loaded dataset with shape: {df.shape}')

# Apply preprocessing steps: strip emojis, strip links/#/?/!/@mentions, then lowercase
df['text'] = df['text'].apply(remove_emojis)
df['text'] = df['text'].apply(clean_text)
df['text'] = df['text'].str.lower()

# Save the intermediate result (before mention removal and spelling correction)
intermediate_file_path = '/content/drive/My Drive/Research/SentimentAnalysisDivorce/Dataset/FINAL_DATASET_INTERMEDIATE.csv'
df.to_csv(intermediate_file_path, index=False)
logging.info(f'Saved intermediate dataset with shape: {df.shape}')

In [None]:
# Create a set of profile names in lowercase for faster lookup
# (missing names are dropped; the set is what remove_mentions matches leading words against)
names_set = set(df['profileName'].str.lower().dropna().unique())

In [None]:
# Process the dataset in batches and save the results incrementally
output_file = '/content/drive/My Drive/Research/SentimentAnalysisDivorce/Dataset/FINAL_DATASET_SPELLING_CORRECTED.csv'
batch_size = 1000  # Adjust batch size as needed
# Results are written to output_file batch by batch; df itself is not updated by this call
process_spelling_correction_in_batches(df, batch_size, output_file, names_set)

logging.info("Processing complete.")

In [None]:
# shape is a tuple attribute, not a method; calling it raises TypeError
print(df.shape)