In [None]:
import ast
import json
import re
import string
from ast import literal_eval
from collections import Counter

import emoji
import numpy as np
import pandas as pd
import spacy
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.segmenter import Segmenter
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons



# LOAD AND PRE-PROCESS DATASETS MFTC

In [None]:
#LOAD DATASETS
# Reads every corpus into a module-level DataFrame. All paths are relative
# to the notebook's working directory.

#ALM
# Tab-separated file; the "Unnamed: 0" column is dropped right after reading
# (presumably a saved row index - confirm against the file).
alm=pd.read_csv('DATASETS/MFTC/ALM.tsv',sep='\t')
alm = alm.drop("Unnamed: 0", axis=1)

#BALTIMORE
baltimore=pd.read_csv('DATASETS/MFTC/Baltimore.tsv',sep='\t')
baltimore = baltimore.drop("Unnamed: 0", axis=1)

#BLM
# This file is parsed by hand instead of with pd.read_csv.
# NOTE(review): no explicit encoding is passed to open(), so the platform
# default is used - confirm the file decodes correctly everywhere.
with open('DATASETS/MFTC/BLM.tsv', 'r') as file:
    lines = file.readlines()

# The first line holds the column names.
column_names = lines[0].strip().split('\t')
# Every following line becomes one record. The first tab-separated field of
# each data line is skipped ([1:]); zip() stops at the shorter sequence, so a
# line with fewer fields yields a record with missing keys (NaN in the frame).
dataset = [dict(zip(column_names, line.strip().split('\t')[1:])) for line in lines[1:]]
blm = pd.DataFrame(dataset)
# String cells are parsed as JSON after swapping single quotes for double
# quotes; non-string cells (e.g. NaN) are left untouched.
# NOTE(review): the quote swap will break on values containing an apostrophe.
blm['annotations'] = blm['annotations'].apply(lambda x: json.loads(x.replace("'", "\"")) if isinstance(x, str) else x)

#DAVIDSON - hate speech and offensive language
davidson=pd.read_csv('DATASETS/MFTC/Davidson.tsv',sep='\t')
davidson = davidson.drop("Unnamed: 0", axis=1)

#ELECTION
election=pd.read_csv('DATASETS/MFTC/Election.tsv',sep='\t')
election = election.drop("Unnamed: 0", axis=1)

#SANDY
sandy=pd.read_csv('DATASETS/MFTC/Sandy.tsv',sep='\t')
sandy = sandy.drop("Unnamed: 0", axis=1)

#REDDIT
# Comma-separated file, read with the pandas defaults.
reddit=pd.read_csv('DATASETS/final_mfrc_data.csv')

In [None]:
#Fix problems reading datasets

#BLM
def index_list_blm(df):
    """Locate the rows that follow a row with missing annotations.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``annotations`` column.

    Returns
    -------
    (list of int, pandas.DataFrame)
        ``nan_index_plus1``: for every row whose ``annotations`` is missing,
        the position of the row right after it, keeping only every second
        entry (the 1st, 3rd, 5th, ...).
        ``df_new``: ``df`` without fully empty rows and with a fresh
        0..n-1 index; the positions above refer to this frame.
    """
    # Fully empty rows carry no information; re-index so "idx + 1" really is
    # the next remaining row.
    df_new = df.dropna(how='all').reset_index(drop=True)
    nan_index = df_new.index[df_new['annotations'].isna()].tolist()
    nan_index_plus1 = [idx + 1 for idx in nan_index][0::2]
    return nan_index_plus1, df_new
    
def blm_dataset(df):
    """Repair BLM rows whose ``annotations``/``label`` ended up on the next row.

    For every row with a missing ``annotations`` value, the ``id`` of the
    following row is wrapped as a single annotation by ``'annotator0'``.
    For every row with a missing ``label``, the ``text`` of the following row
    is used as label. The rows listed by ``index_list_blm`` are then dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``id``, ``text``, ``annotations`` and ``label`` columns.

    Returns
    -------
    pandas.DataFrame
        Repaired frame with the four columns above. The index is NOT reset,
        so it has gaps where rows were dropped.

    Raises
    ------
    KeyError
        If the last row has a missing ``annotations`` or ``label`` value
        (there is no following row to take the value from).
    """
    nan_index_plus1, df_new = index_list_blm(df)
    blm_cleaned = df_new.copy()

    # Snapshot id/text by row position so the patch values are read from the
    # untouched data.
    id_dict = dict(zip(blm_cleaned.index, blm_cleaned['id'].tolist()))
    text_dict = dict(zip(blm_cleaned.index, blm_cleaned['text'].tolist()))

    nan_indices_annotations = blm_cleaned.index[blm_cleaned['annotations'].isna()].tolist()
    nan_indices_label = blm_cleaned.index[blm_cleaned['label'].isna()].tolist()

    for idx in nan_indices_annotations:
        blm_cleaned.at[idx, 'annotations'] = [{'annotation': id_dict[idx + 1], 'annotator': 'annotator0'}]

    for idx in nan_indices_label:
        blm_cleaned.at[idx, 'label'] = text_dict[idx + 1]

    df_result = blm_cleaned[['id', 'text', 'annotations', 'label']]
    # Only drop positions that actually exist in the frame.
    spill_rows = df_result.index[df_result.index.isin(nan_index_plus1)]
    return df_result.drop(index=spill_rows)

#ALM
def alm_dataset(df):
    """Repair the two broken records of the ALM dataset.

    Rows 1315 and 2247 receive their ``annotations`` and ``label`` from the
    ``id`` and ``text`` fields of the row right after them (1316 and 2248),
    which are then removed. Presumably those records were split over two rows
    when the file was read - confirm against the raw TSV.

    The input frame is left untouched, so the function can be re-run on the
    raw data without corrupting it.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw ALM frame with ``id``, ``text``, ``annotations`` and ``label``
        columns and a 0..n-1 index covering position 2248.

    Returns
    -------
    pandas.DataFrame
        Repaired copy with a fresh 0..n-1 index.
    """
    fixed = df.copy()
    # (row to repair, following row that holds its displaced fields)
    for target, spill in ((1315, 1316), (2247, 2248)):
        fixed.loc[target, 'annotations'] = fixed.loc[spill, 'id']
        fixed.loc[target, 'label'] = fixed.loc[spill, 'text']
    return fixed.drop([1316, 2248]).reset_index(drop=True)


#ELECTION
def election_dataset(df):
    """Repair the broken record of the Election dataset.

    Row 394 receives its ``annotations`` and ``label`` from the ``id`` and
    ``text`` fields of row 395, which is then removed. Presumably the record
    was split over two rows when the file was read - confirm against the raw
    TSV.

    The input frame is left untouched, so the function can be re-run on the
    raw data without corrupting it.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw Election frame with ``id``, ``text``, ``annotations`` and
        ``label`` columns and a 0..n-1 index covering position 395.

    Returns
    -------
    pandas.DataFrame
        Repaired copy with a fresh 0..n-1 index.
    """
    fixed = df.copy()
    fixed.loc[394, 'annotations'] = fixed.loc[395, 'id']
    fixed.loc[394, 'label'] = fixed.loc[395, 'text']
    return fixed.drop([395]).reset_index(drop=True)

# LABELS BINARY (MPres)



In [None]:
'''Convert the categorical label to a binary int for one moral trait at a time (MPres task):
1 if the label belongs to the trait, 0 otherwise.
    Moral Values: Moral1: care/harm
                  Moral2: fairness/cheating
                  Moral3: loyalty/betrayal
                  Moral4: authority/subversion
                  Moral5: purity/degradation'''

def labels_m1(df):
    """Binarise the ``label`` column for Moral1 (care/harm).

    'care' and 'harm' become 1; every other known label, including the
    non-moral spellings 'non-moral', 'nonmoral' and 'nm', becomes 0.
    'nm' is handled here as well so that this mapper agrees with the
    polarity and multiclass mappers. Values outside the known label set are
    left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``label`` column.

    Returns
    -------
    pandas.DataFrame
        New frame; the input is not modified.
    """
    target = {'care', 'harm'}
    known_labels = ('care', 'harm', 'fairness', 'cheating',
                    'loyalty', 'betrayal', 'authority', 'subversion',
                    'purity', 'degradation', 'non-moral', 'nonmoral', 'nm')
    mapping = {name: int(name in target) for name in known_labels}
    return df.replace({'label': mapping})


def labels_m2(df):
    """Binarise the ``label`` column for Moral2 (fairness/cheating).

    'fairness' and 'cheating' become 1; every other known label, including
    the non-moral spellings 'non-moral', 'nonmoral' and 'nm', becomes 0.
    'nm' is handled here as well so that this mapper agrees with the
    polarity and multiclass mappers. Values outside the known label set are
    left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``label`` column.

    Returns
    -------
    pandas.DataFrame
        New frame; the input is not modified.
    """
    target = {'fairness', 'cheating'}
    known_labels = ('care', 'harm', 'fairness', 'cheating',
                    'loyalty', 'betrayal', 'authority', 'subversion',
                    'purity', 'degradation', 'non-moral', 'nonmoral', 'nm')
    mapping = {name: int(name in target) for name in known_labels}
    return df.replace({'label': mapping})


def labels_m3(df):
    """Binarise the ``label`` column for Moral3 (loyalty/betrayal).

    'loyalty' and 'betrayal' become 1; every other known label, including
    the non-moral spellings 'non-moral', 'nonmoral' and 'nm', becomes 0.
    'nm' is handled here as well so that this mapper agrees with the
    polarity and multiclass mappers. Values outside the known label set are
    left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``label`` column.

    Returns
    -------
    pandas.DataFrame
        New frame; the input is not modified.
    """
    target = {'loyalty', 'betrayal'}
    known_labels = ('care', 'harm', 'fairness', 'cheating',
                    'loyalty', 'betrayal', 'authority', 'subversion',
                    'purity', 'degradation', 'non-moral', 'nonmoral', 'nm')
    mapping = {name: int(name in target) for name in known_labels}
    return df.replace({'label': mapping})


def labels_m4(df):
    """Binarise the ``label`` column for Moral4 (authority/subversion).

    'authority' and 'subversion' become 1; every other known label,
    including the non-moral spellings 'non-moral', 'nonmoral' and 'nm',
    becomes 0. 'nm' is handled here as well so that this mapper agrees with
    the polarity and multiclass mappers. Values outside the known label set
    are left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``label`` column.

    Returns
    -------
    pandas.DataFrame
        New frame; the input is not modified.
    """
    target = {'authority', 'subversion'}
    known_labels = ('care', 'harm', 'fairness', 'cheating',
                    'loyalty', 'betrayal', 'authority', 'subversion',
                    'purity', 'degradation', 'non-moral', 'nonmoral', 'nm')
    mapping = {name: int(name in target) for name in known_labels}
    return df.replace({'label': mapping})

def labels_m5(df):
    """Binarise the ``label`` column for Moral5 (purity/degradation).

    'purity' and 'degradation' become 1; every other known label, including
    the non-moral spellings 'non-moral', 'nonmoral' and 'nm', becomes 0.
    'nm' is handled here as well so that this mapper agrees with the
    polarity and multiclass mappers. Values outside the known label set are
    left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``label`` column.

    Returns
    -------
    pandas.DataFrame
        New frame; the input is not modified.
    """
    target = {'purity', 'degradation'}
    known_labels = ('care', 'harm', 'fairness', 'cheating',
                    'loyalty', 'betrayal', 'authority', 'subversion',
                    'purity', 'degradation', 'non-moral', 'nonmoral', 'nm')
    mapping = {name: int(name in target) for name in known_labels}
    return df.replace({'label': mapping})


# LABELS POLARITY (MPol)

In [None]:
#Change the categorical label to an int for one moral trait at a time, distinguishing its polarity (1 = virtue, 2 = vice, 0 = any other label).

def label_mp1(df):
    """Encode ``label`` with polarity for the care/harm trait.

    'care' -> 1, 'harm' -> 2; every other listed moral or non-moral label
    -> 0. Values that are not listed are left unchanged. Returns a new
    frame.
    """
    codes = dict.fromkeys(
        ('care', 'harm', 'fairness', 'cheating', 'loyalty', 'betrayal',
         'authority', 'subversion', 'purity', 'degradation',
         'non-moral', 'nonmoral', 'nm'),
        0,
    )
    codes.update(care=1, harm=2)
    return df.replace({'label': codes})


def label_mp2(df):
    """Encode ``label`` with polarity for the fairness/cheating trait.

    'fairness' -> 1, 'cheating' -> 2; every other listed moral or non-moral
    label -> 0. Values that are not listed are left unchanged. Returns a new
    frame.
    """
    codes = dict.fromkeys(
        ('care', 'harm', 'fairness', 'cheating', 'loyalty', 'betrayal',
         'authority', 'subversion', 'purity', 'degradation',
         'non-moral', 'nonmoral', 'nm'),
        0,
    )
    codes.update(fairness=1, cheating=2)
    return df.replace({'label': codes})


def label_mp3(df):
    """Encode ``label`` with polarity for the loyalty/betrayal trait.

    'loyalty' -> 1, 'betrayal' -> 2; every other listed moral or non-moral
    label -> 0. Values that are not listed are left unchanged. Returns a new
    frame.
    """
    codes = dict.fromkeys(
        ('care', 'harm', 'fairness', 'cheating', 'loyalty', 'betrayal',
         'authority', 'subversion', 'purity', 'degradation',
         'non-moral', 'nonmoral', 'nm'),
        0,
    )
    codes.update(loyalty=1, betrayal=2)
    return df.replace({'label': codes})


def label_mp4(df):
    """Encode ``label`` with polarity for the authority/subversion trait.

    'authority' -> 1, 'subversion' -> 2; every other listed moral or
    non-moral label -> 0. Values that are not listed are left unchanged.
    Returns a new frame.
    """
    codes = dict.fromkeys(
        ('care', 'harm', 'fairness', 'cheating', 'loyalty', 'betrayal',
         'authority', 'subversion', 'purity', 'degradation',
         'non-moral', 'nonmoral', 'nm'),
        0,
    )
    codes.update(authority=1, subversion=2)
    return df.replace({'label': codes})

def label_mp5(df):
    """Encode ``label`` with polarity for the purity/degradation trait.

    'purity' -> 1, 'degradation' -> 2; every other listed moral or non-moral
    label -> 0. Values that are not listed are left unchanged. Returns a new
    frame.
    """
    codes = dict.fromkeys(
        ('care', 'harm', 'fairness', 'cheating', 'loyalty', 'betrayal',
         'authority', 'subversion', 'purity', 'degradation',
         'non-moral', 'nonmoral', 'nm'),
        0,
    )
    codes.update(purity=1, degradation=2)
    return df.replace({'label': codes})




In [None]:
def mp_label1():
    """Return the ``(id2label, label2id)`` pair for the care/harm polarity task."""
    class_names = ("NO-MORAL", "CARE", "HARM")
    id2label = dict(enumerate(class_names))
    label2id = {name: idx for idx, name in id2label.items()}
    return id2label, label2id
    
def mp_label2():
    """Return the ``(id2label, label2id)`` pair for the fairness/cheating polarity task."""
    class_names = ("NO-MORAL", "FAIRNESS", "CHEATING")
    id2label = dict(enumerate(class_names))
    label2id = {name: idx for idx, name in id2label.items()}
    return id2label, label2id
    
def mp_label3():
    """Return the ``(id2label, label2id)`` pair for the loyalty/betrayal polarity task."""
    class_names = ("NO-MORAL", "LOYALTY", "BETRAYAL")
    id2label = dict(enumerate(class_names))
    label2id = {name: idx for idx, name in id2label.items()}
    return id2label, label2id
    
def mp_label4():
    """Return the ``(id2label, label2id)`` pair for the authority/subversion polarity task."""
    class_names = ("NO-MORAL", "AUTHORITY", "SUBVERSION")
    id2label = dict(enumerate(class_names))
    label2id = {name: idx for idx, name in id2label.items()}
    return id2label, label2id
    
def mp_label5():
    """Return the ``(id2label, label2id)`` pair for the purity/degradation polarity task."""
    class_names = ("NO-MORAL", "PURITY", "DEGRADATION")
    id2label = dict(enumerate(class_names))
    label2id = {name: idx for idx, name in id2label.items()}
    return id2label, label2id


# LABELS MULTICLASS 6 (MultiPres)

In [1]:
#Change the label from categorical to int using all moral traits at once. 
def label_multiclass6(df):
    """Encode ``label`` as one of six classes (MultiPres).

    Both poles of a trait share one id: care/harm -> 1, fairness/cheating
    -> 2, loyalty/betrayal -> 3, authority/subversion -> 4,
    purity/degradation -> 5. The non-moral spellings map to 0. Values that
    are not listed are left unchanged. Returns a new frame.
    """
    trait_pairs = (
        ('care', 'harm'),
        ('fairness', 'cheating'),
        ('loyalty', 'betrayal'),
        ('authority', 'subversion'),
        ('purity', 'degradation'),
    )
    codes = {
        name: trait_id
        for trait_id, pair in enumerate(trait_pairs, start=1)
        for name in pair
    }
    for neutral in ('non-moral', 'nonmoral', 'nm'):
        codes[neutral] = 0
    return df.replace({'label': codes})


def multiclass_task_6():
    """Return the ``(id2label, label2id)`` pair for the six-class MultiPres task.

    In this task both poles of a moral trait share one class id, so each id
    is named after the whole trait (e.g. 1 -> "CARE/HARM"). A dict literal
    cannot hold two names under the same id: repeating the key silently
    keeps only the last name, which made id 1 read "HARM" even for 'care'
    texts.

    ``label2id`` still accepts every individual pole name and additionally
    the combined trait names, so ``label2id[id2label[i]] == i`` holds for
    every id.
    """
    trait_poles = (
        ("CARE", "HARM"),
        ("FAIRNESS", "CHEATING"),
        ("LOYALTY", "BETRAYAL"),
        ("AUTHORITY", "SUBVERSION"),
        ("PURITY", "DEGRADATION"),
    )
    id2label = {0: "NO-MORAL"}
    label2id = {"NO-MORAL": 0}
    for trait_id, poles in enumerate(trait_poles, start=1):
        id2label[trait_id] = "/".join(poles)
        for pole in poles:
            label2id[pole] = trait_id
    # Combined names are added last so the individual names keep their
    # original position in the dict.
    for trait_id, trait_name in id2label.items():
        label2id.setdefault(trait_name, trait_id)
    return id2label, label2id


# LABELS MULTICLASS 11 (MultiPol)

In [2]:
#Change the label from categorical to int using all moral traits at once, distinguishing the polarity (vice or virtue).
def label_multiclass_11(df):
    """Encode ``label`` as one of eleven classes (MultiPol).

    Every pole gets its own id: care 1, harm 2, fairness 3, cheating 4,
    loyalty 5, betrayal 6, authority 7, subversion 8, purity 9,
    degradation 10. The non-moral spellings map to 0. Values that are not
    listed are left unchanged. Returns a new frame.
    """
    ordered_poles = ('care', 'harm', 'fairness', 'cheating', 'loyalty',
                     'betrayal', 'authority', 'subversion', 'purity',
                     'degradation')
    codes = {name: class_id for class_id, name in enumerate(ordered_poles, start=1)}
    for neutral in ('non-moral', 'nonmoral', 'nm'):
        codes[neutral] = 0
    return df.replace({'label': codes})


def multiclass_task_11():
    """Return the ``(id2label, label2id)`` pair for the eleven-class MultiPol task."""
    class_names = ("NO-MORAL", "CARE", "HARM", "FAIRNESS", "CHEATING",
                   "LOYALTY", "BETRAYAL", "AUTHORITY", "SUBVERSION",
                   "PURITY", "DEGRADATION")
    id2label = dict(enumerate(class_names))
    label2id = {name: idx for idx, name in id2label.items()}
    return id2label, label2id


 

# CLEAN DATA 

In [None]:
#Clean and pre-process data
'''Code from paper:
Liscio, E., Dondera, A.E., Geadau, A., Jonker, C.M., Murukannaiah,
P.K., 2022a. Cross-domain classification of moral values, in: Findings
of the Association for Computational Linguistics: NAACL 2022,
Association for Computational Linguistics (ACL). pp. 2727–2745

[Source code] https://github.com/adondera/transferability-of-values/blob/master/nlp/data/cleaners.py
'''


# Pre-processor used by cleaner3: same settings as `tweet_processor` below,
# except that it also annotates hashtags, all-caps words, elongated and
# repeated characters, emphasis and censored words.
text_processor = TextPreProcessor(
    # terms that will be normalized
    # NOTE(review): 'url' is listed twice in this list.
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
               'time', 'url', 'date', 'number'],
    # terms that will be annotated
    annotate={"hashtag", "allcaps", "elongated", "repeated",
              'emphasis', 'censored'},
    fix_html=True,  # fix HTML tokens

    # corpus from which the word statistics are going to be used 
    # for word segmentation 
    segmenter="twitter",

    # corpus from which the word statistics are going to be used 
    # for spell correction
    corrector="twitter",

    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=True,  # spell correction for elongated words

    # select a tokenizer. You can use SocialTokenizer, or pass your own
    # the tokenizer, should take as input a string and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,

    # list of dictionaries, for replacing tokens extracted from the text,
    # with other expressions. You can pass more than one dictionaries.
    dicts=[emoticons]
)

# Pre-processor used by cleaner4: identical to `text_processor` except that
# nothing is annotated.
tweet_processor = TextPreProcessor(
    # terms that will be normalized
    # NOTE(review): 'url' is listed twice in this list.
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
               'time', 'url', 'date', 'number'],
    # terms that will be annotated
    # NOTE(review): `{}` is an empty dict, while `text_processor` passes a
    # set - confirm the library accepts both.
    annotate={},
    fix_html=True,  # fix HTML tokens

    # corpus from which the word statistics are going to be used 
    # for word segmentation 
    segmenter="twitter",

    # corpus from which the word statistics are going to be used 
    # for spell correction
    corrector="twitter",

    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=True,  # spell correction for elongated words

    # select a tokenizer. You can use SocialTokenizer, or pass your own
    # the tokenizer, should take as input a string and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,

    # list of dictionaries, for replacing tokens extracted from the text,
    # with other expressions. You can pass more than one dictionaries.
    dicts=[emoticons]
)

# segmenter using the word statistics from Twitter
# NOTE(review): `seg_tw` is not referenced by any of the cleaner functions in
# this section.
seg_tw = Segmenter(corpus="twitter")


# This preprocessing method was used for our experiments.
def cleaner5(tweet):
    """Clean one tweet: emojis to text, punctuation removed, then ``cleaner4``.

    Steps:
      1. emojis are replaced by their textual alias (``emoji.demojize``);
      2. the HTML-escaped ampersand ``&amp;`` is replaced by a space while it
         is still recognisable;
      3. every character in ``string.punctuation`` is removed;
      4. a left-over stand-alone ``amp`` token is replaced by a space. Only
         whole tokens are matched, so words such as "camp" or "example" are
         kept intact;
      5. the result is passed through ``cleaner4``.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        Cleaned, lower-cased text.
    """
    tweet = emoji.demojize(tweet)
    tweet = tweet.replace("&amp;", " ")
    tweet = tweet.translate(str.maketrans("", "", string.punctuation))
    tweet = re.sub(r"\bamp\b", " ", tweet)
    return cleaner4(tweet)


def cleaner4(tweet):
    """Normalise one tweet with ``tweet_processor`` plus manual corrections.

    Removes picture links, placeholder tokens, digits, markup tags,
    usernames, the retweet marker, math signs and ``http``/``https``
    remnants; expands a few abbreviations and splits the ALM/BLM hashtags
    into words.

    Placeholder tokens that are matched together with their surrounding
    spaces (" url ", " at_user ", " rt ") are replaced by a single space, so
    the words on either side are not glued together.

    Parameters
    ----------
    tweet : str
        Tweet text.

    Returns
    -------
    str
        Cleaned, lower-cased text with single spaces between tokens.
    """
    # remove pictures
    tweet = re.sub(r"pic\.twitter\.com/[A-Za-z0-9]+", "", tweet)

    # rectification for Sandy
    tweet = re.sub(" url ", " ", tweet)
    tweet = re.sub(" at_user ", " ", tweet)

    # remove numbers
    tweet = re.sub(r"[0-9]+", "", tweet)

    # deabbreviate most used abbreviations
    tweet = tweet.replace("#iuic", "#IsraelUnitedInChrist")
    tweet = tweet.replace("#tcot", "#TopConservativesOnTwitter")

    # custom preprocessor
    tweet = " ".join(tweet_processor.pre_process_doc(tweet))

    # remove tags
    tweet = re.sub(r"<[^\s]+>", "", tweet)
    tweet = tweet.replace("_", " ")

    # remove left usernames
    tweet = re.sub(r"@[^\s]+", "", tweet)

    # remove reserved words
    tweet = tweet.replace(" rt ", " ")
    tweet = re.sub(r"^rt ", "", tweet)

    # manual word corrections (applied in this order)
    corrections = (
        (" s ", " is "),
        (" al ", " all "),
        (" nt ", " not "),
        (" ppl ", " people "),
        (" m ", " am "),
        (" u ", " you "),
        (" r ", " are "),
        (" w ", " with "),
    )
    for short_form, full_form in corrections:
        tweet = tweet.replace(short_form, full_form)

    # remove math signs
    tweet = tweet.translate(str.maketrans("", "", "+=><|"))
    tweet = tweet.replace("https", "").replace("http", "")

    # manual ALM and BLM word splitting
    tweet = tweet.replace(" alllivesmatter ", " all lives matter ").replace(" alm ", " all lives matter ")
    tweet = tweet.replace(" blacklivesmatter ", " black lives matter ").replace(" blm ", " black lives matter ")

    # remove extra white spaces
    return " ".join(tweet.split()).lower()


def cleaner3(tweet):
    """Lower-case one tweet and normalise it with ``text_processor``.

    Removes a leading retweet marker and picture links, runs the annotating
    pre-processor, strips the ``<tag>`` tokens it produces and collapses
    whitespace.

    Parameters
    ----------
    tweet : str
        Tweet text.

    Returns
    -------
    str
        Cleaned, lower-cased text with single spaces between tokens.
    """
    tweet = tweet.lower()
    tweet = re.sub(r"^rt ", "", tweet)
    tweet = re.sub(r"pic\.twitter\.com/[A-Za-z0-9]+", "", tweet)
    tweet = " ".join(text_processor.pre_process_doc(tweet))
    # Raw string: "\s" in a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    tweet = re.sub(r"<[^\s]+>", "", tweet)
    # split()/join() already trims both ends, so no separate strip() is needed
    return " ".join(tweet.split()).lower()


def cleaner2(tweet):
    """Strip surrounding double quotes, then apply ``cleaner1``.

    The remaining steps (emoji removal, lower-casing, removal of the retweet
    marker, numbers, usernames, '#', URLs and extra whitespace) are exactly
    those of ``cleaner1``, so they are delegated instead of being duplicated
    here.

    Parameters
    ----------
    tweet : str
        Tweet text.

    Returns
    -------
    str
        Cleaned, lower-cased text.
    """
    return cleaner1(tweet.strip('"'))


def cleaner1(tweet):
    """Basic tweet clean-up: emojis, retweet marker, numbers, users, URLs.

    Differences from a plain substring removal, all meant to keep ordinary
    words intact:
      * the leading "rt" is only removed when it is a whole token;
      * a stand-alone number is replaced by one space, so the words around
        it are not glued together;
      * the "url" placeholder is only removed when it is a whole token.

    Parameters
    ----------
    tweet : str
        Tweet text.

    Returns
    -------
    str
        Cleaned, lower-cased text with single spaces between tokens.
    """
    tweet = remove_emojis(tweet)
    tweet = tweet.lower()
    tweet = re.sub(r"^rt\b", "", tweet)
    tweet = re.sub(r"\s[0-9]+\s", " ", tweet)

    # remove usernames
    tweet = re.sub(r"@[^\s]+", "", tweet)
    tweet = re.sub("at_user", "", tweet)

    # remove # sign
    tweet = tweet.replace("#", "").replace("_", " ")

    # remove urls
    tweet = re.sub(r"pic\.twitter\.com/[A-Za-z0-9]+", "", tweet)
    tweet = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", tweet)
    tweet = re.sub(r"\burl\b", "", tweet)

    # split()/join() trims both ends and collapses inner whitespace
    return " ".join(tweet.split())


def remove_emojis(data):
    """Delete every character that falls in the emoji/symbol ranges below.

    NOTE(review): the ranges are very broad. ``\\U000024C2-\\U0001F251`` and
    ``\\U00010000-\\U0010ffff`` together cover far more than emojis (for
    example CJK characters), so such text is removed as well.

    Parameters
    ----------
    data : str
        Input text.

    Returns
    -------
    str
        ``data`` without the matched characters.
    """
    char_ranges = (
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # flags (iOS)
        "\U00002500-\U00002BEF",  # box drawing, shapes, misc symbols, arrows
        "\U00002702-\U000027B0",  # dingbats
        "\U00002702-\U000027B0",  # (listed twice in the pattern)
        "\U000024C2-\U0001F251",
        "\U0001f926-\U0001f937",
        "\U00010000-\U0010ffff",
        "\u2640-\u2642",
        "\u2600-\u2B55",
        "\u200d",  # zero-width joiner
        "\u23cf",
        "\u23e9",
        "\u231a",
        "\ufe0f",  # variation selector-16
        "\u3030",
    )
    emoji_pattern = re.compile("[" + "".join(char_ranges) + "]+", re.UNICODE)
    return emoji_pattern.sub('', data)

def clean_data(df):
    """Run the ``text`` column through the full cleaning pipeline, in place.

    The steps are applied one after the other, each on the output of the
    previous one: ``remove_emojis``, ``cleaner1``, ``cleaner2``,
    ``cleaner3``, ``cleaner4``, ``cleaner5``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``text`` column of strings. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    pipeline = (remove_emojis, cleaner1, cleaner2, cleaner3, cleaner4, cleaner5)
    for step in pipeline:
        df['text'] = df['text'].apply(step)
    return df


# New annotations MFTD


In [None]:
'''Create annotations that distinguish vice from virtue for the MFTC datasets.
Each text is labelled with the moral value that the largest number of its
annotators agree on.'''

def label_column(row):
    """Return the ``"annotation"`` value of every annotation dict in ``row``."""
    return [entry["annotation"] for entry in row]

def annotation(row):
    """Return the value that occurs most often among the annotations in ``row``.

    Each entry of ``row`` is a string that may hold several comma-separated
    values; all values are pooled before counting. On a tie the value seen
    first wins. An empty ``row`` raises ``ValueError``.
    """
    votes = Counter()
    for entry in row:
        votes.update(entry.split(','))
    return max(votes, key=votes.get)


def new_labels_polarity(df):
    """Add the annotators' majority label and normalise ``label`` (MFTC).

    Works in place on ``df``:
      * ``annotations`` is parsed with ``literal_eval``;
      * ``label_annotators`` receives the most frequent annotation value;
      * ``label`` is mapped to the lower-case names ('hate' becomes
        'degradation', the non-moral spellings become 'non-moral');
      * the ``annotations`` column and the temporary ``labels`` column are
        dropped.

    Returns the same ``df`` object.
    """
    df['annotations'] = df['annotations'].apply(lambda raw: literal_eval(str(raw)))
    df['labels'] = df['annotations'].apply(label_column)
    df['label_annotators'] = df.labels.apply(annotation)

    pole_names = ('care', 'harm', 'fairness', 'cheating', 'loyalty',
                  'betrayal', 'authority', 'subversion', 'purity',
                  'degradation')
    canonical = {"hate": 'degradation'}
    canonical.update({name.capitalize(): name for name in pole_names})
    for spelling in ('Non-moral', 'nm', 'Non-Moral'):
        canonical[spelling] = 'non-moral'

    df.label = df.label.replace(canonical)
    df.drop(columns=['annotations', 'labels'], inplace=True)
    return df

#BLM dataset 
def process_labels(row):
    """Unpack a ``labels`` entry that holds one stringified annotation list.

    If ``row['labels']`` is a one-element list whose element parses (with
    ``ast.literal_eval``) to a list of annotation dicts, the ``'annotation'``
    values of those dicts are returned. In every other case, including any
    parsing or lookup problem, ``row['labels']`` is returned unchanged.
    """
    try:
        candidates = row['labels']
        is_singleton = isinstance(candidates, list) and len(candidates) == 1
        parsed = ast.literal_eval(candidates[0]) if is_singleton else None
        if isinstance(parsed, list):
            return [entry['annotation'] for entry in parsed]
    except (SyntaxError, ValueError, TypeError, KeyError):
        # best effort: fall through and hand back the original value
        pass
    return row['labels']


def new_labels_blm_polarity(df):
    """Add the annotators' majority label and normalise ``label`` (BLM).

    Works in place on ``df``:
      * ``annotations`` is parsed with ``literal_eval``;
      * the annotation values are collected and, where needed, unpacked by
        ``process_labels``;
      * ``label_annotators`` receives the most frequent annotation value;
      * in ``label``, 'hate' becomes 'degradation' and 'nm' becomes
        'non-moral';
      * the ``annotations`` column and the temporary ``labels`` column are
        dropped.

    Returns the same ``df`` object.
    """
    df['annotations'] = df['annotations'].apply(lambda raw: literal_eval(str(raw)))
    df['labels'] = df['annotations'].apply(label_column)
    df['labels'] = df.apply(process_labels, axis=1)
    df['label_annotators'] = df.labels.apply(annotation)

    canonical = {"hate": 'degradation', 'nm': 'non-moral'}
    df.label = df.label.replace(canonical)
    df.drop(columns=['annotations', 'labels'], inplace=True)
    return df


# New annotations MFRD

In [None]:
'''Create annotations that distinguish vice from virtue for the Reddit dataset.
Each text is labelled with the moral value that the largest number of its
annotators agree on.'''

def annotation(row):
    """Return the value that occurs most often among the annotations in ``row``.

    Each entry of ``row`` is a string that may hold several comma-separated
    values; all values are pooled before counting. On a tie the value seen
    first wins. An empty ``row`` raises ``ValueError``.

    NOTE(review): a function with this name and the same behaviour is already
    defined in the MFTC annotation cell; this definition replaces it when
    the cell runs.
    """
    votes = Counter()
    for entry in row:
        votes.update(entry.split(','))
    return max(votes, key=votes.get)

def clean_reddit_dataset(df):
    """Collapse the Reddit annotations to one majority label per text.

    Steps:
      * rows whose ``annotation`` is exactly 'Thin Morality' are removed;
      * rows are grouped by ``text``, every other column becoming a list;
      * ``label`` receives the most frequent annotation value of the group;
      * the ``subreddit``, ``bucket``, ``annotator`` and ``confidence``
        columns are dropped;
      * 'Proportionality' and 'Equality' are renamed to 'Fairness', then all
        names are mapped to their lower-case form.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw Reddit frame, one row per (text, annotator).

    Returns
    -------
    pandas.DataFrame
        New frame with one row per distinct text.
    """
    kept = df[df['annotation'] != 'Thin Morality']
    grouped = kept.groupby('text').agg(lambda values: list(values)).reset_index()
    grouped['label'] = grouped.annotation.apply(annotation)
    grouped = grouped.drop(columns=['subreddit', 'bucket', 'annotator', 'confidence'])

    # The majority vote above is taken before these two names are merged.
    grouped['label'] = grouped['label'].replace({'Proportionality': 'Fairness', 'Equality': 'Fairness'})

    pole_names = ('care', 'harm', 'fairness', 'cheating', 'loyalty',
                  'betrayal', 'authority', 'subversion', 'purity',
                  'degradation')
    canonical = {name.capitalize(): name for name in pole_names}
    for spelling in ('Non-moral', 'nm', 'Non-Moral'):
        canonical[spelling] = 'non-moral'
    grouped['label'] = grouped['label'].replace(canonical)

    return grouped


# REDDIT dataset pre-process
# Frequency of each individual annotation value among the distinct
# annotation strings of the raw Reddit data (a string may hold several
# comma-separated values). Read from `reddit`, the frame loaded above; only
# names that are actually imported (`Counter`) are used.
anot = reddit.annotation.unique()
annotation_list = [x.split(',') for x in anot]
new_list = [value for values in annotation_list for value in values]
c = Counter(new_list)

#filter, not to use thin morality
df_reddit2 = reddit[reddit['annotation'] != 'Thin Morality']
df_reddit3 = clean_reddit_dataset(df_reddit2)
# astype() returns a new Series, so the result has to be assigned back
df_reddit3['text'] = df_reddit3['text'].astype(str)
df_reddit = clean_data(df_reddit3)
# show a preview instead of dumping the whole frame
df_reddit.head()
#df_reddit.to_csv('DATASETS/REDDIT_clean.csv')
