In [1]:
import pandas as pd
import json
import os
import glob

FUNCTIONS FOR PARSING COMMENTS (JSON TO DATAFRAME)

In [None]:
# Flatten a comment tree (comments and their nested replies) into rows with CommentIDs
def parse_comments(comments, post_id, parent_id, depth, result, parent_counter=0):
    """Append one row per comment (and, recursively, per reply) to ``result``.

    Rows are appended in pre-order: a comment is immediately followed by its
    whole reply subtree, then by its next sibling.

    Args:
        comments: Iterable of comment mappings. Each one must provide
            'author' and 'body'; a non-empty 'replies' entry is descended into.
        post_id: Value stored in the 'PostID' field of every row; also the
            prefix of the IDs generated when ``depth == 1``.
        parent_id: Value stored in the 'ParentID' field of every row produced
            by this call; also the prefix of the IDs generated when
            ``depth != 1``.
        depth: Nesting level stored in 'Depth'. 1 means directly under the post.
        result: List that receives the row dicts (mutated in place).
        parent_counter: Offset added to the 0-based position of ``depth == 1``
            comments when building their ID. Not used at other depths.

    Returns:
        None. All output goes into ``result``.
    """
    directly_under_post = depth == 1

    for position, comment in enumerate(comments):
        if directly_under_post:
            # Top-level IDs hang off the post: <post_id>_<parent_counter + position>
            comment_id = f"{post_id}_{parent_counter + position}"
        else:
            # Reply IDs extend the parent's ID and are numbered from 1
            comment_id = f"{parent_id}_{position + 1}"

        row = {
            'PostID': post_id,
            'CommentID': comment_id,
            'ParentID': parent_id,
            'Author': comment['author'],
            'Text': comment['body'],
            'Depth': depth
        }
        result.append(row)

        # A missing or empty 'replies' entry ends this branch
        replies = comment['replies'] if 'replies' in comment else None
        if replies:
            # The current comment becomes the parent of its replies
            parse_comments(replies, post_id, comment_id, depth + 1, result, 0)

# process files in each subreddit directory
def process_reddit_posts(folder_path):
    """Load every ``*.json`` post in ``folder_path`` into one flat DataFrame.

    Each file contributes one row for the post itself (Depth 0, Text taken
    from the post's 'title', Author set to the placeholder 'NONE') followed by
    the rows that ``parse_comments`` generates for its 'comments', if any.

    Posts are numbered ``post_1``, ``post_2``, ... in sorted path order.
    ``glob`` returns matches in arbitrary order, so without sorting the same
    folder could be numbered differently between runs or machines.
    NOTE(review): frames generated before this ordering was fixed may have
    numbered the posts differently — confirm before matching against old CSVs.

    NOTE(review): the post row uses CommentID ``<post_id>_0``; with its default
    ``parent_counter`` of 0, ``parse_comments`` gives the first top-level
    comment the same ID. Left unchanged here because existing data may already
    carry these IDs — confirm whether CommentID is expected to be unique.

    Args:
        folder_path: Directory containing one JSON file per post. Each file
            must hold an object with a 'title'; 'comments' is optional.

    Returns:
        pandas.DataFrame with the columns PostID, CommentID, ParentID, Author,
        Text and Depth (an empty frame if the folder has no JSON files).
    """
    data = []

    # Sorted so that post numbering is deterministic
    json_paths = sorted(glob.glob(os.path.join(folder_path, '*.json')))

    for post_counter, file_path in enumerate(json_paths, start=1):
        # Explicit UTF-8: the platform default encoding is locale dependent
        # and can fail on non-ASCII text
        with open(file_path, 'r', encoding='utf-8') as file:
            post_data = json.load(file)

        post_id = f"post_{post_counter}"  # e.g., post_1

        # Initially, add the post itself with a basic CommentID and no ParentID
        data.append({
            'PostID': post_id,
            'CommentID': f"{post_id}_0",
            'ParentID': None,
            'Author': 'NONE',  # Placeholder
            'Text': post_data['title'],
            'Depth': 0
        })

        # Process comments, starting with depth=1
        if 'comments' in post_data:
            parse_comments(post_data['comments'], post_id, f"{post_id}_0", 1, data)

    return pd.DataFrame(data)

COMBINE ALL POSTS INTO ONE DATAFRAME PER SUBREDDIT

In [None]:
folder_path = './../data/subreddits/Scientific/ScienceUncensored_data'  # Adjust the folder path to every subreddit
df = process_reddit_posts(folder_path)

# Save the dataframe to a CSV file
df.to_csv('./../data/validation-posts/seen-subreddits/ScienceUncensored_data.csv', index=False)

# Preview comes last: a notebook cell only renders its final expression,
# so a head() call in the middle of the cell shows nothing
df.head(10)

ADD AUTHOR TO DATA

In [None]:
file_path = './../data/labelled-posts/conservative_data.csv'  # Adjust the file path to labelled data location
labelled_df = pd.read_csv(file_path)
folder_path = './../data/subreddits/Political/Conservative_data' # Adjust the folder path to corresponding raw data location
raw_df = process_reddit_posts(folder_path)

# Add author & drop ParentID column
# NOTE(review): this assignment aligns on index labels, i.e. row N of the
# labelled file receives the author of row N of the freshly parsed frame.
# It presumably relies on both frames holding the same rows in the same
# order — confirm before trusting the Author column.
labelled_df['Author'] = raw_df['Author']
# errors='ignore' keeps the cell re-runnable: after the first run the saved
# file no longer has a ParentID column and a plain drop would raise KeyError
df = labelled_df.drop(columns='ParentID', errors='ignore')

csv_path = './../data/labelled-posts/conservative_data.csv' # Adjust the file path to labelled data location
# Write to csv_path (previously defined but unused), so adjusting the output
# location above actually takes effect; it currently equals file_path, so the
# input file is overwritten in place
df.to_csv(csv_path, index=False)

df.head(10)

SEPARATE LABELLED AND UNLABELLED DATA

In [None]:
# Keep only the annotated comments and write them to their own CSV file

subreddit = 'Games'  # adjust the subreddit name
file_path = f'./../data/validation-posts/seen-subreddits/{subreddit}_unlabelled_data.csv'
csv_path = f'./../data/{subreddit}_data.csv'

# A row survives only if neither its 'Combative' nor its 'Deliberative' cell is missing
df = pd.read_csv(file_path).dropna(subset=['Combative', 'Deliberative'])
df.to_csv(csv_path, index=False)

df.head(10)

In [None]:
# Keep only the comments that carry no annotation at all and write them to a CSV file

subreddit = 'Games'  # adjust the subreddit name
df = pd.read_csv(f'./../data/validation-posts/seen-subreddits/{subreddit}_unlabelled_data.csv')

# True where both label cells are missing (neither one holds a value)
has_no_labels = ~(df['Deliberative'].notna() | df['Combative'].notna())

df = (
    df.loc[has_no_labels]
    # the label columns are empty for every remaining row, so remove them
    .drop(columns=['Combative', 'Deliberative'])
    # fix the column selection and order of the output file
    [[
        'PostID', 'CommentID', 'Author', 'Text',
        'Toxicity', 'Rationality', 'Mutual Respect',
        'Emotion', 'Moderator', 'Diversity',
    ]]
)

df.to_csv(f'./../data/{subreddit}_unlabelled_data.csv', index=False)

df.head(10)

COMBINE A FOLDER OF CSV FILES INTO ONE

In [None]:
joined_files = os.path.join("./../data/validation-posts", "*.csv") # Adjust the folder path to data location
# glob returns matches in arbitrary order; sorting makes the row order of the
# compiled CSV identical on every run and every machine
joined_list = sorted(glob.glob(joined_files))
dataframes = []

for file in joined_list:
    df = pd.read_csv(file)
    # The subreddit name is the part of the file name before the first '_'
    subreddit = os.path.basename(file).split('_')[0] # Adjust symbol to split the file name
    df['Subreddit'] = subreddit
    dataframes.append(df)

# Raises ValueError if no CSV file matched (nothing to concatenate)
merged_df = pd.concat(dataframes, ignore_index=True)
column_order = ['Subreddit', 'PostID', 'CommentID', 'Author', 'Text', 'Combative', 'Deliberative', 'Toxicity', 'Rationality', 'Mutual Respect', 'Emotion', 'Moderator', 'Diversity'] # Adjust based on columns in data & desired order
# Selecting by column_order also drops any column that is not listed
df = merged_df[column_order]

csv_path = './../data/compiled-posts/validation_posts.csv' # Adjust the file path to combined data location
df.to_csv(csv_path, index=False)
df.head(10)

ADD A DATAFRAME TO A CSV

In [None]:
# load csv to df
df = pd.read_csv('./../data/validation-posts/Games_data.csv')

# add a column with the subreddit name
df['Subreddit'] = 'Games'

# reorder columns
column_order = ['Subreddit', 'PostID', 'CommentID', 'Author', 'Text', 'Toxicity', 'Rationality', 'Mutual Respect', 'Emotion', 'Moderator', 'Diversity']
df = df[column_order]

# append this dataframe to a csv file
csv_path = './../data/compiled-posts/validation_posts_unseen copy.csv' # Adjust the file path to combined data location
# Emit the header only when the target is new or empty; appending to an
# existing file must not repeat it, but a fresh file without one would be
# read back with the first data row as column names
write_header = not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0
# NOTE: mode='a' means re-running this cell appends the same rows again
df.to_csv(csv_path, mode='a', header=write_header, index=False)

REORDER CSV COLUMNS TO STANDARDISE DATA 

In [None]:
# Target column layout shared by the unlabelled data files
column_order = [
    'PostID', 'CommentID', 'Author', 'Text',
    'Toxicity', 'Rationality', 'Mutual Respect',
    'Emotion', 'Moderator', 'Diversity',
]

# Load the CSV and keep exactly these columns, in this order
df = pd.read_csv('./../data/unlabelled-posts/ScienceUncensored_unlabelled_data.csv')[column_order]
df.head(10)

In [None]:
# Write the current frame to the unlabelled-posts CSV, without the index column
df.to_csv(
    path_or_buf='./../data/unlabelled-posts/ScienceUncensored_unlabelled_data.csv',
    index=False,
)

CLEAN DATA BY:
- DROPPING 0-0 ANNOTATIONS (OPTIONAL)
- CONVERTING SCORES TO BINARY LABELS: THE HIGHER OF COMBATIVE/DELIBERATIVE BECOMES 1 AND THE OTHER 0 (TIES BECOME 0-0)

In [None]:
# Load the compiled validation posts; work on a separate deep copy so the
# loaded frame stays untouched
df = pd.read_csv('./../data/compiled-posts/validation_posts.csv')
new_df = df.copy(deep=True)

# Max function
def transform_values(row):
    """Collapse a row's 'Combative' / 'Deliberative' scores into binary labels.

    The strictly higher score becomes 1 and the other 0; equal scores become
    0-0. If either score is missing (NaN/None) both are left as they are:
    comparisons with NaN are always False, so without this guard such a row
    would silently be labelled Deliberative=1.

    Args:
        row: Mapping-like row (e.g. the Series that ``DataFrame.apply(axis=1)``
            passes in) providing 'Combative' and 'Deliberative'.

    Returns:
        A copy of ``row`` with the two fields replaced. The input is never
        modified, because pandas does not support functions that mutate the
        object handed to ``apply``.
    """
    combative = row['Combative']
    deliberative = row['Deliberative']

    result = row.copy()

    # Missing annotation: keep it missing instead of inventing a label
    if pd.isna(combative) or pd.isna(deliberative):
        return result

    # At most one comparison is True; a tie leaves both labels at 0
    result['Combative'] = int(combative > deliberative)
    result['Deliberative'] = int(deliberative > combative)
    return result

# Run the row-wise label transform over the whole frame
new_df = new_df.apply(transform_values, axis=1)

# Rows whose two labels are both 0 carry no class; remove them by index label
both_zero = (new_df['Deliberative'] == 0) & (new_df['Combative'] == 0)
new_df = new_df.drop(index=new_df[both_zero].index)

new_df.head(10)


In [None]:
# Persist the binarised frame as CSV, without the index column
new_df.to_csv(
    path_or_buf='./../data/compiled-posts/validation_posts_binary.csv',
    index=False,
)

In [None]:
df = pd.read_csv('./../data/compiled-posts/validation_posts.csv')

# Keep the rows whose two scores differ. A missing score compares as
# "not equal", so rows with a NaN in either column are kept as well.
scores_differ = df['Combative'].ne(df['Deliberative'])

# Strip both label columns from the surviving rows
filtered_df = df[scores_differ].drop(columns=['Combative', 'Deliberative'])

# Write the result to its own CSV file
output_file_path = './../data/compiled-posts/validation_data_unlabelled_binary.csv'
filtered_df.to_csv(output_file_path, index=False)