In [None]:
import pickle
import os
import sys
import numpy as np
import pandas as pd
# Put the directory two levels above the working directory on the import path.
parent_directory = os.path.abspath(
    os.path.join(os.getcwd(), os.pardir, os.pardir)
)
sys.path.append(parent_directory)

#### Read in the Reddit Data

In [None]:
# The pickled data lives under the directory one level above the working directory.
parent_directory = os.path.abspath(os.path.join(os.getcwd(), '..'))

directory = os.path.join(parent_directory, 'data_collection/project_data/')

# NOTE(security): unpickling can execute arbitrary code; only load .pkl files
# that come from a trusted source.
# The listing is sorted because os.listdir() returns entries in arbitrary order,
# which would make the combined row order (and any positional subset taken
# from it) differ between runs.
frames = []
for filename in sorted(os.listdir(directory)):
    if filename.endswith(".pkl"):
        file_path = os.path.join(directory, filename)
        df = pd.read_pickle(file_path)

        print(f"Data from {filename}:")

        frames.append(df)

# Concatenate once instead of growing the frame inside the loop, which would
# recopy every accumulated row on each iteration. An empty directory still
# yields an empty DataFrame, as before.
full_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

print("Combined DataFrame:")
print(full_df)

In [None]:
# Keep only the first 10,000 rows (by position) for the steps that follow.
full_df = full_df.iloc[:10000]

In [None]:
from gensim.models import KeyedVectors

# Use the locally saved vectors when the file exists; otherwise load the
# "glove-twitter-200" model through gensim's downloader and save it for next time.
if not os.path.exists("wordvectors.kv"):
    import gensim.downloader as api

    word_vectors = api.load("glove-twitter-200")
    word_vectors.save("wordvectors.kv")
else:
    word_vectors = KeyedVectors.load("wordvectors.kv")

In [None]:
def compute_similarity(comment, foundation_words_vec, similarity_threshold=0.25, vectors=None):
    """Mean above-threshold cosine similarity between a comment and foundation words.

    Every in-vocabulary token of ``comment`` is compared with every vector in
    ``foundation_words_vec``; the pairs whose cosine similarity is at least
    ``similarity_threshold`` are averaged.

    Args:
        comment: Iterable of tokens to look up in the embedding.
        foundation_words_vec: Iterable of equal-length embedding vectors for
            the words of one foundation.
        similarity_threshold: Minimum cosine similarity for a pair to count.
        vectors: Mapping from token to vector that raises ``KeyError`` for
            unknown tokens. Defaults to the notebook-level ``word_vectors``.

    Returns:
        The mean of the retained similarities as a float, or 0.0 when no pair
        reaches the threshold.
    """
    if vectors is None:
        vectors = word_vectors

    foundation_list = list(foundation_words_vec)
    if not foundation_list:
        return 0.0

    # Normalise the foundation vectors once instead of once per (token, word) pair.
    foundation_matrix = np.asarray(foundation_list, dtype=float)
    foundation_norms = np.linalg.norm(foundation_matrix, axis=1)
    # Zero-norm vectors have no direction; skipping them avoids a 0/0 division.
    usable = foundation_norms > 0
    if not usable.any():
        return 0.0
    foundation_unit = foundation_matrix[usable] / foundation_norms[usable][:, None]

    similarities = []
    for word in comment:
        try:
            word_vec = vectors[word]
        except KeyError:
            # Token is not in the embedding vocabulary.
            continue

        word_vec = np.asarray(word_vec, dtype=float)
        word_norm = np.linalg.norm(word_vec)
        if word_norm == 0:
            continue

        # Cosine similarity of this token against every foundation word at once.
        sims = foundation_unit @ (word_vec / word_norm)
        similarities.extend(sims[sims >= similarity_threshold])

    if similarities:
        return float(np.mean(similarities))
    return 0.0

def classify_sentence_with_profile(sentence, moral_foundations_dict):
    """Score a tokenized sentence against every moral foundation.

    Args:
        sentence: Iterable of tokens, passed through to ``compute_similarity``.
        moral_foundations_dict: Mapping of foundation name to an iterable of
            that foundation's words.

    Returns:
        dict mapping each foundation name to the ``compute_similarity`` score
        of ``sentence`` against that foundation's in-vocabulary words.
    """
    foundation_scores = {}

    for foundation, words in moral_foundations_dict.items():
        words_vec = []
        for word in words:  # loop through moral foundation words
            try:
                words_vec.append(word_vectors[word])
            except KeyError:
                # Foundation word is not in the embedding vocabulary: skip it.
                # Only KeyError is caught so that unrelated failures still surface.
                continue
        foundation_scores[foundation] = compute_similarity(sentence, words_vec)

    return foundation_scores

In [None]:
import json

# Load the foundation -> words mapping that the classifier iterates over.
with open("expanded_moral_foundations_dictionary.json", "r", encoding="utf-8") as f:
    word_to_moral_foundation_expanded = json.load(f)

# One {foundation: score} profile per comment, in row order.
classification_profiles = [
    classify_sentence_with_profile(comment, word_to_moral_foundation_expanded)
    for comment in full_df["tokenized_body_words_norm"]
]

# Convert classification_profiles to a DataFrame
classification_df = pd.DataFrame(classification_profiles)

# Attach the scores to full_df column-wise. Score columns left over from an
# earlier run of this cell are dropped first, so re-running replaces them
# instead of appending duplicate column names.
full_df = pd.concat(
    [
        full_df.drop(columns=classification_df.columns, errors="ignore").reset_index(drop=True),
        classification_df.reset_index(drop=True),
    ],
    axis=1,
)

In [None]:
# Display full_df as this cell's output.
full_df

In [None]:
# Average the Vice and Virtue scores of each foundation into one aggregate column.
# The column names are generated from a single mapping so that every aggregate
# is paired with its own foundation (Authority_Agg must combine AuthorityVice
# with AuthorityVirtue, not HarmVirtue).
for agg_column, foundation in {
    'Harm_Care_Agg': 'Harm',
    'Authority_Agg': 'Authority',
    'Purity_Agg': 'Purity',
    'Fairness_Agg': 'Fairness',
    'Ingroup_Agg': 'Ingroup',
}.items():
    full_df[agg_column] = (full_df[f'{foundation}Vice'] + full_df[f'{foundation}Virtue']) / 2

In [None]:
# Replace the in-memory frame with the contents of experimental_data.csv;
# whatever earlier cells assigned to full_df is discarded here.
full_df = pd.read_csv('experimental_data.csv')


In [None]:
# Remove the unsuffixed foundation score columns from full_df in place.
full_df.drop(
    columns=[
        'HarmVirtue',
        'AuthorityVirtue',
        'PurityVirtue',
        'HarmVice',
        'PurityVice',
        'IngroupVice',
        'FairnessVirtue',
        'MoralityGeneral',
        'FairnessVice',
        'IngroupVirtue',
        'AuthorityVice',
    ],
    inplace=True,
)

In [None]:
# List the column labels currently in full_df.
full_df.columns

In [None]:

# Per-subreddit mean of every suffixed ('.1') foundation score column.
suffixed_score_columns = [
    'HarmVirtue.1', 'AuthorityVirtue.1', 'PurityVirtue.1', 'HarmVice.1',
    'PurityVice.1', 'IngroupVice.1', 'FairnessVirtue.1', 'MoralityGeneral.1',
    'FairnessVice.1', 'IngroupVirtue.1', 'AuthorityVice.1',
]
subreddit_groups = full_df.groupby('subreddit')
average_df = subreddit_groups[suffixed_score_columns].mean().reset_index()

# Show the summary table.
print(average_df)


In [None]:
# Label each row with its highest-scoring foundation (MoralityGeneral is not a
# candidate). The unsuffixed score columns were dropped from full_df in an
# earlier cell, so the scores are read from the '.1' columns and renamed so the
# resulting labels carry the plain foundation names.
dominant_candidates = {
    f'{name}.1': name
    for name in [
        'HarmVirtue', 'AuthorityVirtue', 'PurityVirtue', 'HarmVice', 'PurityVice',
        'IngroupVice', 'FairnessVirtue', 'FairnessVice', 'IngroupVirtue', 'AuthorityVice',
    ]
}
full_df['Dominant_Moral_Foundation'] = (
    full_df[list(dominant_candidates)]
    .rename(columns=dominant_candidates)
    .idxmax(axis=1)
)

In [None]:
# 'Dominant_Moral_Foundation' is assigned on full_df (average_df only holds the
# per-subreddit means), so display it from the frame that actually has it.
full_df['Dominant_Moral_Foundation']

In [None]:
# Average the suffixed ('.1') Vice and Virtue scores of each foundation into one
# aggregate column. The column names are generated from a single mapping so that
# every aggregate is paired with its own foundation (Authority_Agg must combine
# AuthorityVice.1 with AuthorityVirtue.1, not HarmVirtue.1).
for agg_column, foundation in {
    'Harm_Care_Agg': 'Harm',
    'Authority_Agg': 'Authority',
    'Purity_Agg': 'Purity',
    'Fairness_Agg': 'Fairness',
    'Ingroup_Agg': 'Ingroup',
}.items():
    full_df[agg_column] = (full_df[f'{foundation}Vice.1'] + full_df[f'{foundation}Virtue.1']) / 2

In [None]:
# List the column labels currently in full_df.
full_df.columns

In [None]:

# Per-subreddit mean of each aggregated foundation score.
aggregate_columns = [
    'Harm_Care_Agg',
    'Authority_Agg',
    'Purity_Agg',
    'Fairness_Agg',
    'Ingroup_Agg',
]
aggregate_means = full_df.groupby('subreddit')[aggregate_columns].mean()
average_df = aggregate_means.reset_index()

# Show the summary table.
print(average_df)

  subreddit      care  fairness   loyalty  authority    purity
0   climate  0.741985  0.784272  0.716046   0.723466  0.758646

In [None]:
# Display full_df as this cell's output.
full_df

In [None]:
# Keep only the rows whose 'body' text is longer than 20 characters.
body_length = full_df['body'].str.len()
filtered_df = full_df[body_length > 20]

# Per-subreddit mean of each foundation score on the filtered rows.
# NOTE(review): these column names are not created in any visible cell;
# confirm they exist in full_df before relying on this summary.
short_foundation_columns = ['care', 'fairness', 'loyalty', 'authority', 'purity']
average_df = (
    filtered_df
    .groupby('subreddit')[short_foundation_columns]
    .mean()
    .reset_index()
)

# Show the summary table.
print(average_df)


In [None]:
# Narrow full_df to the rows whose 'body' text is longer than 1000 characters.
is_long_body = full_df['body'].str.len() > 1000
full_df = full_df.loc[is_long_body]