# Installs and Imports

In [None]:
import numpy as np
import pandas as pd
import os

# Drive Mounting

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Dataset Combination

In [None]:
# Folder on the mounted Google Drive that receives a copy of each output file.
drive_directory = '/content/drive/My Drive/'
# Colab working directory that is scanned for CSV files and also receives
# the locally saved outputs.
directory = '/content/'

In [None]:
def combine_csv(prefix, source_dir=None, drive_dir=None):
    """Merge every CSV whose name starts with `prefix` into one file.

    The merged table is written as ``<prefix>_combined.csv`` to both the
    source directory and the Drive directory.

    Args:
        prefix: Leading part of the file names to combine (e.g. 'DN').
        source_dir: Directory to scan and to write the local copy into.
            Defaults to the notebook-level `directory`.
        drive_dir: Directory receiving the second copy. Defaults to the
            notebook-level `drive_directory`.

    Files that pandas reports as empty are skipped with a message.
    """
    source_dir = directory if source_dir is None else source_dir
    drive_dir = drive_directory if drive_dir is None else drive_dir
    output_name = prefix + '_combined.csv'

    # Skip a previously written output file: it also starts with `prefix`,
    # so re-running would otherwise fold it back in and duplicate every row.
    # Sorting makes the row order independent of os.listdir's arbitrary order.
    files = sorted(
        file for file in os.listdir(source_dir)
        if file.startswith(prefix) and file != output_name
    )

    frames = []
    for file in files:
        try:
            frames.append(pd.read_csv(os.path.join(source_dir, file), low_memory=False))
        except pd.errors.EmptyDataError:
            print(f"Empty file: {file}. Skipping...")

    # Concatenate once instead of growing a DataFrame inside the loop;
    # pd.concat rejects an empty list, so fall back to an empty frame.
    combined_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    combined_df.to_csv(os.path.join(source_dir, output_name), index=False)
    combined_df.to_csv(os.path.join(drive_dir, output_name), index=False)

In [None]:
# File-name prefixes of the two source groups; build one combined CSV per prefix.
prefixes = ['DN', 'ND']
for prefix in prefixes:
    combine_csv(prefix)

NameError: name 'os' is not defined

In [None]:
# Combine every raw CSV in the working directory into a single dataset.
# The per-prefix '_combined.csv' outputs and this cell's own output file are
# excluded so that re-running the cell does not count rows twice.
final_name = 'FINAL_DATASET.csv'
all_files = [
    file for file in os.listdir(directory)
    if file.endswith('.csv')
    and not file.endswith('_combined.csv')
    and file != final_name
]

# Skip empty files the same way combine_csv does instead of crashing on them.
frames = []
for file in all_files:
    try:
        frames.append(pd.read_csv(directory + file, low_memory=False))
    except pd.errors.EmptyDataError:
        print(f"Empty file: {file}. Skipping...")

combined_all_df = pd.concat(frames)
combined_all_df.to_csv(directory + final_name, index=False)
combined_all_df.to_csv(drive_directory + final_name, index=False)

In [None]:
# Report the size of each per-prefix combined file written earlier.
for prefix in prefixes:
    combined_path = directory + prefix + '_combined.csv'
    combined_df = pd.read_csv(combined_path)
    print(f"{prefix} combined shape: {combined_df.shape}")

In [None]:
# Report the size of the full combined dataset. The combining cell saves it in
# the working directory as 'FINAL_DATASET.csv', so read that file name here;
# nothing in this notebook writes 'RAW_COMBINED_DATASET.csv' to that directory.
final_combined_df = pd.read_csv(directory + 'FINAL_DATASET.csv')
print(f"Final combined shape: {final_combined_df.shape}")

In [None]:
# Drive locations of the full dataset and of the two per-prefix datasets.
# Each element needs its own trailing comma: adjacent string literals without
# one are silently concatenated into a single (invalid) path.
file_paths = [
    '/content/drive/My Drive/Research/SentimentAnalysisDivorce/Dataset/RAW_COMBINED_DATASET.csv',
    '/content/drive/My Drive/Research/SentimentAnalysisDivorce/Dataset/DN_combined.csv',
    '/content/drive/My Drive/Research/SentimentAnalysisDivorce/Dataset/ND_combined.csv',
]

# Dataset Preprocessing

In [None]:
# Load the raw combined dataset from Drive as the starting point for preprocessing.
file_path = '/content/drive/My Drive/Research/SentimentAnalysisDivorce/Dataset/RAW_COMBINED_DATASET.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

## Noise Reduction

In [None]:
# Show (rows, columns) of the raw dataset before any filtering.
print(df.shape)

In [None]:
# Columns kept for the analysis; every other column of the raw data is dropped.
selected_columns = [
    "date",
    "id",
    "profileId",
    "profileName",
    "replyToCommentId",
    "text",
]
df_selected = df[selected_columns]

In [None]:
# Keep only the first row for each distinct comment text. Rebinding the result
# (rather than inplace=True) avoids mutating a frame that was derived from
# `df` by column selection, which pandas flags with SettingWithCopyWarning.
df_selected = df_selected.drop_duplicates(subset='text')

In [None]:
# Show (rows, columns) after column selection and duplicate-text removal.
print(df_selected.shape)

In [None]:
# Rows with no 'replyToCommentId' value, i.e. comments that are not replies.
has_no_parent = df_selected['replyToCommentId'].isna()
independent_comments = df_selected[has_no_parent]

In [None]:
# Load the labeled version of the preprocessed data from Drive.
labeled_file_path = '/content/drive/My Drive/Research/SentimentAnalysisDivorce/Dataset/WITH LABELS - PREPROCESSED_DATA_01.csv'
labeled_df = pd.read_csv(filepath_or_buffer=labeled_file_path)

In [None]:
# Label values that mark a row for removal; rows whose label column matches
# any of these are filtered out in the next step.
# NOTE(review): "NOT HELPFU" is presumably a truncated spelling of
# "NOT HELPFUL" that occurs in the labeled data; kept so those rows are
# removed too — confirm against the label column.
non_useful_labels = [
    "EMPTY COMMENT",
    "EMPTY COMMENT/MERE FB PAGE OR ACC MENTION/GIF/PHOTO",
    "NON-ENGLISH",
    "NON-ENGLISH/TAGALOG/TAGLISH",
    "NOT HELPFU",
    "NOT HELPFUL",
    "REMOVE (MAIN POST)",
    "REMOVE (NOT HELPFUL)",
    "SPAM COMMENT"
]

In [None]:
# Drop every row whose label (column 'Unnamed: 4') is in the non-useful list.
# NOTE(review): the original cell filtered `final_df`, which is never defined
# in this notebook; the labeled frame loaded above is presumably the intended
# input — confirm. `.copy()` gives an independent frame so later in-place
# edits do not raise SettingWithCopyWarning.
final_df_filtered = labeled_df[~labeled_df['Unnamed: 4'].isin(non_useful_labels)].copy()

In [None]:
# Remove the label column now that filtering is done. Rebinding the result
# (rather than inplace=True) avoids mutating a frame produced by boolean
# filtering, which pandas flags with SettingWithCopyWarning.
final_df_filtered = final_df_filtered.drop(columns=['Unnamed: 4'])

In [None]:
# Show (rows, columns) after removing non-useful rows and the label column.
print(final_df_filtered.shape)

In [None]:
# Count how many remaining rows fall into each value of the 'category' column.
# NOTE(review): in this notebook 'category' is only assigned in the
# "Dataset Reconstruction" section further down; unless the labeled file
# already has that column, this cell needs to run after that one — confirm.
category_counts = final_df_filtered['category'].value_counts()

print(category_counts)

## Dataset Reconstruction

In [None]:
# Load the two per-prefix combined datasets from the Drive "Combined" folder.
dn_df = pd.read_csv(filepath_or_buffer='/content/drive/My Drive/Research/SentimentAnalysisDivorce/Dataset/Combined/DN_combined.csv')
nd_df = pd.read_csv(filepath_or_buffer='/content/drive/My Drive/Research/SentimentAnalysisDivorce/Dataset/Combined/ND_combined.csv')

In [None]:
# Tag every row with the dataset it came from so the origin survives the merge.
nd_df['category'] = 'ND'
dn_df['category'] = 'DN'

In [None]:
# Stack the two tagged source datasets into one frame.
original_combined_df = pd.concat([nd_df, dn_df], ignore_index=True)

# Lookup from comment text to source category. If a text occurs more than
# once, the last occurrence wins (ND rows are placed before DN rows).
text_column = original_combined_df['text']
category_column = original_combined_df['category']
text_to_category = dict(zip(text_column, category_column))


def get_category(text):
    """Return the source category recorded for `text`, or 'Unknown' if absent."""
    try:
        return text_to_category[text]
    except KeyError:
        return 'Unknown'


# Re-attach the source category to each filtered row by matching on its text.
final_df_filtered['category'] = final_df_filtered['text'].apply(get_category)

In [None]:
# Split the filtered data back into one frame per source category.
is_nd = final_df_filtered['category'] == 'ND'
is_dn = final_df_filtered['category'] == 'DN'
nd_reconstructed = final_df_filtered[is_nd]
dn_reconstructed = final_df_filtered[is_dn]

In [None]:
# Write both reconstructed frames into one workbook, one sheet per category.
cleaned_output_path = '/content/drive/My Drive/Research/SentimentAnalysisDivorce/Dataset/CLEANED_PREPROCESSED_DATA_03.xlsx'
with pd.ExcelWriter(cleaned_output_path) as writer:
    for sheet_name, frame in (('ND', nd_reconstructed), ('DN', dn_reconstructed)):
        frame.to_excel(writer, sheet_name=sheet_name, index=False)

## Sample for Labeling

In [None]:
# Read every sheet of the cleaned workbook; sheet_name=None makes read_excel
# return a mapping of sheet name to DataFrame rather than a single frame.
cleaned_file_path = '/content/drive/My Drive/Research/SentimentAnalysisDivorce/Dataset/CLEANED_PREPROCESSED_DATA_03.xlsx'
df_cleaned = pd.read_excel(io=cleaned_file_path, sheet_name=None)

In [None]:
# Container for the sampled rows, filled per sheet by the sampling loop.
sampled_data = {}

# Share of each sheet's rows to draw for labeling.
sample_fraction = 0.5

In [None]:
# Draw a reproducible random sample (fixed random_state) from every sheet.
# The loop variables are deliberately not called `df`: that name holds the
# raw dataset loaded at the start of preprocessing, and reusing it here would
# silently overwrite it in the notebook's global namespace.
for sheet_name, sheet_df in df_cleaned.items():
    sampled_data[sheet_name] = sheet_df.sample(frac=sample_fraction, random_state=42)

In [None]:
# Write each sampled frame to its own sheet of the labeling workbook.
# The loop variables are deliberately not called `df`: that name holds the
# raw dataset loaded at the start of preprocessing, and reusing it here would
# silently overwrite it in the notebook's global namespace.
with pd.ExcelWriter('/content/drive/My Drive/Research/SentimentAnalysisDivorce/Dataset/SAMPLE_FOR_LABELING.xlsx') as writer:
    for sheet_name, sheet_df in sampled_data.items():
        sheet_df.to_excel(writer, sheet_name=sheet_name, index=False)