#### Read in Moral Foundation Dictionary

In [1]:
#File Path to Moral Foundations Dictionary
dic_file_path = 'moral foundations dictionary.dic'

moral_foundations_dict = {} # dictionary to map moral foundation to it's numeric key
word_to_moral_foundation = {} #dictionary to of word to moral foundation

line_counter = 0 #first 12 lines of dic contain the moral foundations and their numeric keys
with open(dic_file_path, 'r', encoding='utf-8') as file:
    for line in file:
        if line.strip() and not line.startswith('%'):
            parts = line.strip().split()
            moral_found =  parts[0] 
            moral_found_code = parts[1:] #
            # Add to dictionary (note: you might need to adjust parsing logic)
            if line_counter < 12: #first 12 lines contain the moral foundations and their keys
                moral_foundations_dict[moral_found] = moral_found_code[0]
            else: #rest of lines contain words
                cats = [moral_foundations_dict[cat] for cat in moral_found_code]
                word_to_moral_foundation[moral_found] = cats
        line_counter+=1


In [2]:
moral_foundations_dict

{'01': 'HarmVirtue',
 '02': 'HarmVice',
 '03': 'FairnessVirtue',
 '04': 'FairnessVice',
 '05': 'IngroupVirtue',
 '06': 'IngroupVice',
 '07': 'AuthorityVirtue',
 '08': 'AuthorityVice',
 '09': 'PurityVirtue',
 '10': 'PurityVice',
 '11': 'MoralityGeneral'}

In [3]:
word_to_moral_foundation

{'safe*': ['HarmVirtue'],
 'peace*': ['HarmVirtue'],
 'compassion*': ['HarmVirtue'],
 'empath*': ['HarmVirtue'],
 'sympath*': ['HarmVirtue'],
 'care': ['HarmVirtue'],
 'caring': ['HarmVirtue'],
 'protect*': ['HarmVirtue'],
 'shield': ['HarmVirtue'],
 'shelter': ['HarmVirtue'],
 'amity': ['HarmVirtue'],
 'secur*': ['HarmVirtue'],
 'benefit*': ['HarmVirtue'],
 'defen*': ['HarmVirtue'],
 'guard*': ['HarmVirtue'],
 'preserve': ['HarmVirtue', 'AuthorityVirtue', 'PurityVirtue'],
 'harm*': ['HarmVice'],
 'suffer*': ['HarmVice'],
 'war': ['HarmVice'],
 'wars': ['HarmVice'],
 'warl*': ['HarmVice'],
 'warring': ['HarmVice'],
 'fight*': ['HarmVice'],
 'violen*': ['HarmVice'],
 'hurt*': ['HarmVice'],
 'kill': ['HarmVice'],
 'kills': ['HarmVice'],
 'killer*': ['HarmVice'],
 'killed': ['HarmVice'],
 'killing': ['HarmVice'],
 'endanger*': ['HarmVice'],
 'cruel*': ['HarmVice'],
 'brutal*': ['HarmVice'],
 'abuse*': ['HarmVice'],
 'damag*': ['HarmVice'],
 'ruin*': ['HarmVice', 'PurityVice'],
 'ravage': 

#### Expand Moral Foundation Dictionary with Word2Vec

Use HuggingFace Pre-trained model on tweets.

In [26]:
import os
import gensim.downloader as api

model = api.load("glove-twitter-25")

In [27]:
word_to_moral_foundation_expanded = word_to_moral_foundation.copy()  
expanded_dictionary = {}

similarity_threshold = 0.85

for word, categories in word_to_moral_foundation_expanded.items():
    if word in model.key_to_index:
        similar_words = model.most_similar(positive=[word], topn=100)
        
        # Filter based on the similarity threshold
        for similar_word, similarity_score in similar_words:
            if similarity_score >= similarity_threshold:
                expanded_dictionary[similar_word] = categories

word_to_moral_foundation_expanded.update(expanded_dictionary)

In [28]:
len(word_to_moral_foundation_expanded)

3131

In [29]:
word_to_moral_foundation_expanded

{'safe*': ['HarmVirtue'],
 'peace*': ['HarmVirtue'],
 'compassion*': ['HarmVirtue'],
 'empath*': ['HarmVirtue'],
 'sympath*': ['HarmVirtue'],
 'care': ['HarmVirtue'],
 'caring': ['HarmVirtue'],
 'protect*': ['HarmVirtue'],
 'shield': ['HarmVirtue'],
 'shelter': ['HarmVirtue'],
 'amity': ['HarmVirtue'],
 'secur*': ['HarmVirtue'],
 'benefit*': ['HarmVirtue'],
 'defen*': ['HarmVirtue'],
 'guard*': ['HarmVirtue'],
 'preserve': ['HarmVirtue', 'AuthorityVirtue', 'PurityVirtue'],
 'harm*': ['HarmVice'],
 'suffer*': ['HarmVice'],
 'war': ['HarmVice'],
 'wars': ['HarmVice'],
 'warl*': ['HarmVice'],
 'warring': ['HarmVice'],
 'fight*': ['HarmVice'],
 'violen*': ['HarmVice'],
 'hurt*': ['HarmVice'],
 'kill': ['HarmVice'],
 'kills': ['HarmVice'],
 'killer*': ['HarmVice'],
 'killed': ['HarmVice'],
 'killing': ['HarmVice'],
 'endanger*': ['HarmVice'],
 'cruel*': ['HarmVice'],
 'brutal*': ['HarmVice'],
 'abuse*': ['HarmVice'],
 'damag*': ['HarmVice'],
 'ruin*': ['HarmVice', 'PurityVice'],
 'ravage': 

#### Read in the Reddit Data

In [38]:
import pickle
!ls

with open('/Users/kathryn/Projects/Advanced ML/project/climate-conversations/project/data_collection/project_data/climateCommentsDf.pickle', 'rb') as f:
    comments_df = pickle.load(f)

with open('/Users/kathryn/Projects/Advanced ML/project/climate-conversations/project/data_collection/project_data/climateSubmissionsDf.pickle', 'rb') as f:
    submissions_df = pickle.load(f)

[34mgensim-data[m[m                      reddit_analysis.ipynb
moral foundations dictionary.dic


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  comments_df = pickle.load(f)


In [40]:
comments_df.head()


Unnamed: 0,id,subreddit,body,author,score,gilded,created_utc,parent_id,link_id,retrieved_on,controversiality,is_submitter
0,inlvurb,politics,[removed],[deleted],1,0,1662654718,x8y56x,x8y56x,1665086767,0,False
1,inlw0wi,worldnews,Needed a laugh!,serious-activity-228,3,0,1662654781,inlvhve,x8yhv5,1665086761,0,False
2,inlx8g7,politics,[deleted],[deleted],1,0,1662655233,inkxrse,x8y56x,1665086718,0,False
3,inlxwat,worldnews,Asked!,dieyoufool3,3,0,1662655481,inlszmb,x94ag0,1665086696,0,True
4,inlyusw,worldnews,[deleted],[deleted],1,0,1662655839,inlyqzt,x8yhv5,1665086662,0,False


In [59]:
comments_df['processed_body'] = comments_df['body'].str.lower()
comments_df.shape

(3966791, 13)

In [68]:
subset_comments_df = comments_df.iloc[0:1000]

In [69]:
from scipy.spatial.distance import cosine
import numpy as np

def calculate_similarity(comment_vector, foundation_vector):
    if np.all(comment_vector) and np.all(foundation_vector):
        return 1 - cosine(comment_vector, foundation_vector)
    else:
        return -1 

In [70]:
def assign_moral_foundations(comment, extended_dict):
    foundations = set()
    for word in comment.split(): 
        for key, values in extended_dict.items():
            if key.endswith('*') and word.startswith(key[:-1]):
                foundations.update(values)
            elif word == key:
                foundations.update(values)
    return list(foundations)
subset_comments_df['moral_foundations'] = subset_comments_df['processed_body'].apply(assign_moral_foundations, extended_dict=word_to_moral_foundation_expanded)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset_comments_df['moral_foundations'] = subset_comments_df['processed_body'].apply(assign_moral_foundations, extended_dict=word_to_moral_foundation_expanded)


In [72]:
display(subset_comments_df['moral_foundations'])

0                                                     []
1                                                     []
2                                                     []
3                                                     []
4                                                     []
                             ...                        
995    [AuthorityVice, HarmVirtue, PurityVirtue, Ingr...
996    [PurityVirtue, IngroupVirtue, HarmVice, Author...
997                     [IngroupVirtue, MoralityGeneral]
998     [IngroupVirtue, MoralityGeneral, FairnessVirtue]
999    [PurityVirtue, IngroupVirtue, HarmVice, Author...
Name: moral_foundations, Length: 1000, dtype: object