This notebook contains the code used to create the graphs for Section 5.4 (Data Analysis) of the report, which compare how the features of a community differ with subject matter.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Path to the CSV of labelled posts, relative to this notebook
file_path = './../../data/compiled-posts/normalised_labelled_data.csv'

# Columns that are averaged per subreddit and charted in the cells below
measures = [
    'Combative',
    'Deliberative',
    'Toxicity',
    'Rationality',
    'Mutual Respect',
    'Emotion',
    'Moderator',
    'Diversity',
]

# Read the full dataset into a DataFrame
data = pd.read_csv(file_path)

In [None]:
# Keep just the measure columns plus the grouping key so that the
# aggregation below only ever sees the columns we intend to average
numeric_data = data[[*measures, 'Subreddit']]

# One row per subreddit, holding the mean of every measure
subreddit_grouped_data = numeric_data.groupby('Subreddit').mean()

# Draw one bar chart per measure on a 4x2 grid of subplots
fig, axes = plt.subplots(nrows=4, ncols=2, figsize=(15, 20))

for ax, measure in zip(axes.ravel(), measures):
    subreddit_grouped_data[measure].plot.bar(ax=ax, title=measure)
    ax.set(xlabel='Subreddit', ylabel='Average Value')

# Lay out and display the finished grid once every panel has been drawn
fig.tight_layout()
plt.show()


In [None]:
# Min-max rescale each measure's per-subreddit means so that, within a
# measure, the lowest subreddit maps to 0 and the highest maps to 1.
normalized_subreddit_grouped_data = subreddit_grouped_data.copy()
for measure in measures:
    measure_means = subreddit_grouped_data[measure]
    min_val = measure_means.min()
    max_val = measure_means.max()
    value_range = max_val - min_val
    # If every subreddit has the same mean the range is 0 and the plain
    # formula would compute 0/0 (an all-NaN column and a blank chart).
    # Dividing by 1 instead maps such a constant measure to 0 everywhere.
    scale = value_range if value_range != 0 else 1
    normalized_subreddit_grouped_data[measure] = (measure_means - min_val) / scale

# Plot each normalized measure as its own bar chart and save it as a PNG
for measure in measures:
    fig, ax = plt.subplots(figsize=(10, 6))
    normalized_subreddit_grouped_data[measure].plot(kind='bar', ax=ax, title=measure)
    ax.set_xlabel('Subreddit')
    ax.set_ylabel('Value')
    fig.tight_layout()
    # Use a dedicated name here: reusing `file_path` would overwrite the
    # dataset path defined in the loading cell.
    output_path = f'{measure}.png'  # Saved in the current working directory
    fig.savefig(output_path)
    # Close each figure after saving so the loop does not accumulate open figures
    plt.close(fig)

In [None]:
# Min-max rescale all measures in one vectorised step: for every measure
# column, subtract its minimum and divide by its (max - min) spread.
measure_means = subreddit_grouped_data[measures]
lowest = measure_means.min()
highest = measure_means.max()

normalized_subreddit_grouped_data = subreddit_grouped_data.copy()
normalized_subreddit_grouped_data[measures] = (measure_means - lowest) / (highest - lowest)

# Show the rescaled measures together on a 4x2 grid, one bar chart each
fig, axes = plt.subplots(nrows=4, ncols=2, figsize=(15, 20))

for ax, measure in zip(axes.ravel(), measures):
    normalized_subreddit_grouped_data[measure].plot.bar(ax=ax, title=measure)
    ax.set(xlabel='Subreddit', ylabel='Value (0-1)')

fig.tight_layout()
plt.show()

In [None]:
# Bar charts of the (un-rescaled) per-subreddit means for each measure, with
# the y-axis annotated qualitatively: 0 -> Low, 0.5 -> Medium, 1 -> High.
# NOTE(review): the tick positions assume the source values already lie in
# the 0-1 range (the input file is named "normalised") - confirm.
fig, axes = plt.subplots(nrows=4, ncols=2, figsize=(15, 20))

for ax, measure in zip(axes.flatten(), measures):
    subreddit_grouped_data[measure].plot(kind='bar', ax=ax, title=measure)
    ax.set_xlabel('Subreddit')
    ax.set_ylabel('Value (0-1)')
    ax.set_yticks([0, 0.5, 1])
    ax.set_yticklabels(['Low', 'Medium', 'High'])

# Lay out and show the figure once, after every subplot has been drawn.
# Calling these inside the loop displayed the figure after the first panel
# and left the remaining panels drawn on an already-shown figure.
plt.tight_layout()
plt.show()



