In [2]:
import pandas as pd
import json
import random
from collections import Counter
import re

random.seed(42)

In [3]:
def divideTrainDevTest(inputlist, labellist):       ## 75:10:15 split
    """Randomly split parallel input/label lists into train, validation and test parts.

    The test part receives ``int(len(inputlist) * 0.15)`` items and the validation
    part ``int(len(inputlist) * 0.10)`` items; everything else is training data.
    Sampling uses the global ``random`` module state, so results depend on the
    seed and on every earlier call that consumed random numbers.

    Args:
        inputlist: Sequence of examples.
        labellist: Sequence of labels, indexed in parallel with ``inputlist``.

    Returns:
        ``(train_inputs, train_labels, val_inputs, val_labels, test_inputs, test_labels)``.
        Training items keep their original relative order; validation and test
        items are in the order they were sampled.
    """
    n_items = len(inputlist)

    test_idx = random.sample(range(n_items), int(n_items * 0.15))
    # Membership checks go through sets so the filters below stay linear in n_items.
    test_lookup = set(test_idx)

    new_list = [d for d in range(n_items) if d not in test_lookup]
    val_idx = random.sample(new_list, int(n_items * 0.10))
    val_lookup = set(val_idx)

    train_idx = [d for d in new_list if d not in val_lookup]

    train_inputs = [inputlist[x] for x in train_idx]
    train_labels = [labellist[x] for x in train_idx]

    val_inputs = [inputlist[x] for x in val_idx]
    val_labels = [labellist[x] for x in val_idx]

    test_inputs = [inputlist[x] for x in test_idx]
    test_labels = [labellist[x] for x in test_idx]

    return train_inputs, train_labels, val_inputs, val_labels, test_inputs, test_labels

In [4]:
def save_to_csv(inputs, labels, filepath, filename):
    """Write parallel ``inputs``/``labels`` lists to a CSV file.

    The file is written to ``filepath + filename`` (plain string concatenation,
    so ``filepath`` must already end with a path separator). It has the columns
    ``input`` and ``label`` plus the default pandas index column.
    """
    destination = filepath + filename

    frame = pd.DataFrame().assign(input=inputs, label=labels)
    frame.to_csv(destination)

    print("File saved to ", destination)

In [5]:
def processText(text):
    """Flatten ``text`` to a single ASCII-only line.

    Newlines and carriage returns become spaces, every run of non-ASCII
    characters is deleted, runs of spaces collapse to one space, and leading
    and trailing whitespace is stripped.
    """
    single_line = text.replace('\n', ' ').replace('\r', ' ')
    ascii_only = re.sub(r'[^\x00-\x7F]+', '', single_line)
    return re.sub(' +', ' ', ascii_only).strip()

## For ETHICS-justice

In [5]:
def convertEthicsJustice(input_file, output_file):
    """Copy the ``scenario``/``label`` columns of ``input_file`` into an input/label CSV.

    The CSV is written via ``save_to_csv`` to ``input_path + output_file``, where
    ``input_path`` is the module-level variable as it is set at call time.
    """
    rows = list(zip(input_file['scenario'], input_file['label']))
    inputs = [scenario for scenario, _ in rows]
    labels = [label for _, label in rows]

    save_to_csv(inputs, labels, input_path, output_file)
    
# Source directory for ETHICS-justice. convertEthicsJustice reads this global and
# writes the converted CSVs back into the same directory.
input_path = "/shared/0/Morality/OnlyFiles/ethics/justice/"
input_train_file = "justice_train.csv"
# input_dev_file = ""
input_test_file = "justice_test.csv"
input_test_file2 = "justice_test_hard.csv"      ## Adversarially selected hard test samples

# Load the raw splits (no dev split is configured for this dataset).
train_file = pd.read_csv(input_path + input_train_file)
# dev_file = ""
test_file = pd.read_csv(input_path + input_test_file)
test_file2 = pd.read_csv(input_path + input_test_file2)

# Convert each loaded split in turn.
for raw_frame, out_name in (
    (train_file, "train.csv"),
    (test_file, "test.csv"),
    (test_file2, "test_hard.csv"),
):
    convertEthicsJustice(raw_frame, out_name)

File saved to  /shared/0/Morality/OnlyFiles/ethics/justice/train.csv
File saved to  /shared/0/Morality/OnlyFiles/ethics/justice/test.csv
File saved to  /shared/0/Morality/OnlyFiles/ethics/justice/test_hard.csv


## For MoralExceptQA

In [6]:
# MoralExceptQA is read from a single JSON-lines file; the splits are created here.
input_path = "/shared/0/Morality/OnlyFiles/moralexceptqa/"
input_train_file = "MoralExceptQA.json"

with open(input_path + input_train_file, 'r') as f:
    data = f.readlines()
print(len(data))

# One JSON record per line: join the three text fields with [SEP] and label the
# example 1 when the human response exceeds 0.5, otherwise 0.
records = [json.loads(raw_line) for raw_line in data]
inputs = [
    rec['context'] + ' [SEP] ' + rec['condition'] + ' [SEP] ' + rec['scenario']
    for rec in records
]
labels = [1 if rec['human.response'] > 0.5 else 0 for rec in records]

splits = divideTrainDevTest(inputs, labels)
train_inputs, train_labels, val_inputs, val_labels, test_inputs, test_labels = splits

# The unsplit data is saved as well, ahead of the three splits.
for split_inputs, split_labels, out_name in (
    (inputs, labels, "full.csv"),
    (train_inputs, train_labels, "train.csv"),
    (val_inputs, val_labels, "val.csv"),
    (test_inputs, test_labels, "test.csv"),
):
    save_to_csv(split_inputs, split_labels, input_path, out_name)


148
File saved to  /shared/0/Morality/OnlyFiles/moralexceptqa/full.csv
File saved to  /shared/0/Morality/OnlyFiles/moralexceptqa/train.csv
File saved to  /shared/0/Morality/OnlyFiles/moralexceptqa/val.csv
File saved to  /shared/0/Morality/OnlyFiles/moralexceptqa/test.csv


## For ETHICS-Utilitarianism

In [7]:
def convertEthicsUtil(input_file, output_file):
    """Turn (scenario1, scenario2) rows into a pairwise CSV with alternating order.

    Even-numbered rows (0, 2, ...) are written swapped (``scenario2`` first) with
    label 1; odd-numbered rows keep their original order with label 0. The result
    has columns ``inputs1``, ``inputs2`` and ``labels`` and is written to
    ``input_path + output_file`` using the module-level ``input_path``.
    """
    first_column, second_column, labels = [], [], []

    scenario_pairs = zip(input_file['scenario1'], input_file['scenario2'])
    for row_number, (sc1, sc2) in enumerate(scenario_pairs):
        swapped = row_number % 2 == 0
        first_column.append(sc2 if swapped else sc1)
        second_column.append(sc1 if swapped else sc2)
        labels.append(1 if swapped else 0)

    df = pd.DataFrame({'inputs1': first_column, 'inputs2': second_column, 'labels': labels})

    destination = input_path + output_file
    df.to_csv(destination)
    print("File saved to ", destination)

# Source directory for ETHICS-utilitarianism; also the output directory, because
# convertEthicsUtil writes to input_path.
input_path = "/shared/0/Morality/OnlyFiles/ethics/utilitarianism/"
## 0 is better than 1 always
input_train_file = "util_train.csv"
input_test_file = "util_test.csv"
input_test_file2 = "util_test_hard.csv"      ## Adversarially selected hard test samples

# Column names are supplied explicitly on read.
util_columns = ['scenario1', 'scenario2']
train_file = pd.read_csv(input_path + input_train_file, names=util_columns)
test_file = pd.read_csv(input_path + input_test_file, names=util_columns)
test_file2 = pd.read_csv(input_path + input_test_file2, names=util_columns)

for raw_frame, out_name in (
    (train_file, "train.csv"),
    (test_file, "test.csv"),
    (test_file2, "test_hard.csv"),
):
    convertEthicsUtil(raw_frame, out_name)

File saved to  /shared/0/Morality/OnlyFiles/ethics/utilitarianism/train.csv
File saved to  /shared/0/Morality/OnlyFiles/ethics/utilitarianism/test.csv
File saved to  /shared/0/Morality/OnlyFiles/ethics/utilitarianism/test_hard.csv


## For Moral Stories - Action Classification

In [8]:
# Moral Stories (action classification): directory and JSONL split files.
# convertMoralStoriesAction opens `input_path + <file>`, and the converted CSVs
# are saved back into `input_path`.
input_path = "/shared/0/Morality/OnlyFiles/moralstories/action_classification/"
input_train_file = "train.jsonl"
input_val_file = "valid.jsonl"
input_test_file = "test.jsonl"

def convertMoralStoriesAction(jsonfile):
    """Build ``norm [SEP] action`` inputs from a Moral Stories JSONL file.

    The file is opened at ``input_path + jsonfile`` (module-level ``input_path``).
    Each record contributes its ``moral_action`` when that key is present and its
    ``immoral_action`` otherwise; the record's ``label`` is passed through.

    Returns:
        ``(inputs, labels)`` as two parallel lists.
    """
    with open(input_path + jsonfile, 'r') as f:
        records = [json.loads(raw_line) for raw_line in f.readlines()]

    inputs, labels = [], []
    for record in records:
        action_key = 'moral_action' if 'moral_action' in record else 'immoral_action'
        inputs.append(record['norm'] + " [SEP] " + record[action_key])
        labels.append(record['label'])
    return inputs, labels

# Convert every split first, then write the three CSVs.
train_inputs, train_labels = convertMoralStoriesAction(input_train_file)
val_inputs, val_labels = convertMoralStoriesAction(input_val_file)
test_inputs, test_labels = convertMoralStoriesAction(input_test_file)

for split_inputs, split_labels, out_name in (
    (train_inputs, train_labels, 'train.csv'),
    (val_inputs, val_labels, 'val.csv'),
    (test_inputs, test_labels, 'test.csv'),
):
    save_to_csv(split_inputs, split_labels, input_path, out_name)

File saved to  /shared/0/Morality/OnlyFiles/moralstories/action_classification/train.csv
File saved to  /shared/0/Morality/OnlyFiles/moralstories/action_classification/val.csv
File saved to  /shared/0/Morality/OnlyFiles/moralstories/action_classification/test.csv


## For DILEMMAS

In [9]:
# Scruples dilemmas: directory and JSONL split files. convertDilemmas reads these
# globals at call time; `input_train_file2` is appended only when the file being
# converted equals `input_train_file`.
input_path = "/shared/0/Morality/OnlyFiles/dilemmas/"
input_train_file = "train.scruples-dilemmas.jsonl"
input_train_file2 = "train-extra.scruples-dilemmas.jsonl"       ## Data used to qualify workers
input_val_file = "dev.scruples-dilemmas.jsonl"
input_test_file = "test.scruples-dilemmas.jsonl"

def _readDilemmaFile(jsonfile):
    """Parse one dilemmas JSONL file located under the module-level ``input_path``.

    Returns three parallel lists: the description of each record's first action,
    the description of its second action, and its ``gold_label``.
    """
    first_actions, second_actions, gold_labels = [], [], []
    with open(input_path + jsonfile, 'r') as f:
        for line in f:
            data = json.loads(line)
            first_actions.append(data['actions'][0]['description'])
            second_actions.append(data['actions'][1]['description'])
            gold_labels.append(data['gold_label'])
    return first_actions, second_actions, gold_labels


def convertDilemmas(jsonfile, output_file):
    """Convert a scruples-dilemmas JSONL split into a pairwise CSV.

    The CSV has columns ``inputs1`` / ``inputs2`` (the two action descriptions)
    and ``labels`` (the record's ``gold_label``; per the original inline note the
    task is to select which action is the bad one — TODO confirm against the
    dataset documentation). It is written to ``input_path + output_file``.

    When ``jsonfile`` equals the module-level ``input_train_file``, the records of
    ``input_train_file2`` are appended after the regular training records.

    Args:
        jsonfile: File name (relative to ``input_path``) of the split to convert.
        output_file: File name (relative to ``input_path``) of the CSV to write.
    """
    inputs, inputs2, labels = _readDilemmaFile(jsonfile)

    # Only the training split is extended with the extra file.
    if jsonfile == input_train_file:
        extra_inputs, extra_inputs2, extra_labels = _readDilemmaFile(input_train_file2)
        inputs.extend(extra_inputs)
        inputs2.extend(extra_inputs2)
        labels.extend(extra_labels)

    df = pd.DataFrame()
    df['inputs1'] = inputs
    df['inputs2'] = inputs2
    df['labels'] = labels

    df.to_csv(input_path + output_file)
    print("File saved to ", input_path + output_file)

# Convert the three splits; the training call also pulls in the extra file.
for source_name, target_name in (
    (input_train_file, "train.csv"),
    (input_val_file, "val.csv"),
    (input_test_file, "test.csv"),
):
    convertDilemmas(source_name, target_name)

File saved to  /shared/0/Morality/OnlyFiles/dilemmas/train.csv
File saved to  /shared/0/Morality/OnlyFiles/dilemmas/val.csv
File saved to  /shared/0/Morality/OnlyFiles/dilemmas/test.csv


## For ETHICS Commonsense

In [10]:
# ETHICS-commonsense: directory and raw CSV names. convertEthicsCommonsense
# writes its output into `input_path` as well.
input_path = "/shared/0/Morality/OnlyFiles/ethics/commonsense/"
input_train_file = "cm_train.csv"
input_test_file = "cm_test.csv"
input_test_file2 = "cm_test_hard.csv"      ## Adversarially selected hard test samples

def convertEthicsCommonsense(input_file, output_file):
    """Copy the ``input``/``label`` columns of ``input_file`` into an input/label CSV.

    Output goes through ``save_to_csv`` to ``input_path + output_file``, using the
    module-level ``input_path`` as set at call time.
    """
    rows = list(zip(input_file['input'], input_file['label']))
    inputs = [example for example, _ in rows]
    labels = [label for _, label in rows]

    save_to_csv(inputs, labels, input_path, output_file)

# Load the raw splits, then convert them one after another.
train_file = pd.read_csv(input_path + input_train_file)
test_file = pd.read_csv(input_path + input_test_file)
test_file2 = pd.read_csv(input_path + input_test_file2)

for raw_frame, out_name in (
    (train_file, "train.csv"),
    (test_file, "test.csv"),
    (test_file2, "test_hard.csv"),
):
    convertEthicsCommonsense(raw_frame, out_name)

File saved to  /shared/0/Morality/OnlyFiles/ethics/commonsense/train.csv
File saved to  /shared/0/Morality/OnlyFiles/ethics/commonsense/test.csv
File saved to  /shared/0/Morality/OnlyFiles/ethics/commonsense/test_hard.csv


## For ETHICS Deontology

In [11]:
# ETHICS-deontology: directory and raw CSV names. convertEthicsDeontology writes
# its output into `input_path` as well.
input_path = "/shared/0/Morality/OnlyFiles/ethics/deontology/"
input_train_file = "deontology_train.csv"
input_test_file = "deontology_test.csv"
input_test_file2 = "deontology_test_hard.csv"      ## Adversarially selected hard test samples

def convertEthicsDeontology(input_file, output_file):
    """Join ``scenario`` and ``excuse`` with `` [SEP] `` and save them with ``label``.

    Output goes through ``save_to_csv`` to ``input_path + output_file``, using the
    module-level ``input_path`` as set at call time.
    """
    rows = list(zip(input_file['scenario'], input_file['excuse'], input_file['label']))
    inputs = [scenario + ' [SEP] ' + excuse for scenario, excuse, _ in rows]
    labels = [label for _, _, label in rows]

    save_to_csv(inputs, labels, input_path, output_file)

# Load the raw splits, then convert them one after another.
train_file = pd.read_csv(input_path + input_train_file)
test_file = pd.read_csv(input_path + input_test_file)
test_file2 = pd.read_csv(input_path + input_test_file2)

for raw_frame, out_name in (
    (train_file, "train.csv"),
    (test_file, "test.csv"),
    (test_file2, "test_hard.csv"),
):
    convertEthicsDeontology(raw_frame, out_name)

File saved to  /shared/0/Morality/OnlyFiles/ethics/deontology/train.csv
File saved to  /shared/0/Morality/OnlyFiles/ethics/deontology/test.csv
File saved to  /shared/0/Morality/OnlyFiles/ethics/deontology/test_hard.csv


## For Moral Stories - Consequence Classification

In [12]:
# Moral Stories (consequence classification): directory and JSONL split files.
# convertMoralStoriesConsequence opens `input_path + <file>`, and the converted
# CSVs are saved back into `input_path`.
input_path = "/shared/0/Morality/OnlyFiles/moralstories/consequence_classification/"
input_train_file = "train-2.jsonl"
input_val_file = "valid-2.jsonl"
input_test_file = "test-2.jsonl"

def convertMoralStoriesConsequence(jsonfile):
    """Build ``norm [SEP] action [SEP] consequence`` inputs from a Moral Stories file.

    The file is opened at ``input_path + jsonfile`` (module-level ``input_path``).
    Records that contain ``moral_action`` use the ``moral_*`` action/consequence
    pair; all other records use the ``immoral_*`` pair. The record's ``label`` is
    passed through unchanged.

    Returns:
        ``(inputs, labels)`` as two parallel lists.
    """
    with open(input_path + jsonfile, 'r') as f:
        records = [json.loads(raw_line) for raw_line in f.readlines()]

    inputs, labels = [], []
    for record in records:
        if 'moral_action' in record:
            action_key, consequence_key = 'moral_action', "moral_consequence"
        else:
            action_key, consequence_key = 'immoral_action', "immoral_consequence"
        parts = (record['norm'], record[action_key], record[consequence_key])
        inputs.append(parts[0] + " [SEP] " + parts[1] + " [SEP] " + parts[2])
        labels.append(record['label'])
    return inputs, labels

# Convert every split first, then write the three CSVs.
train_inputs, train_labels = convertMoralStoriesConsequence(input_train_file)
val_inputs, val_labels = convertMoralStoriesConsequence(input_val_file)
test_inputs, test_labels = convertMoralStoriesConsequence(input_test_file)

for split_inputs, split_labels, out_name in (
    (train_inputs, train_labels, 'train.csv'),
    (val_inputs, val_labels, 'val.csv'),
    (test_inputs, test_labels, 'test.csv'),
):
    save_to_csv(split_inputs, split_labels, input_path, out_name)

File saved to  /shared/0/Morality/OnlyFiles/moralstories/consequence_classification/train.csv
File saved to  /shared/0/Morality/OnlyFiles/moralstories/consequence_classification/val.csv
File saved to  /shared/0/Morality/OnlyFiles/moralstories/consequence_classification/test.csv


## For ANECDOTES

In [13]:
# Scruples anecdotes: directory and JSONL split files. convertAnecdotes opens
# `input_path + <file>`, and the converted CSVs are saved back into `input_path`.
input_path = "/shared/0/Morality/OnlyFiles/anecdotes/"
input_train_file = "train.scruples-anecdotes.jsonl"
input_val_file = "dev.scruples-anecdotes.jsonl"
input_test_file = "test.scruples-anecdotes.jsonl"

def convertAnecdotes(jsonfile, output_file):
    """Read a scruples-anecdotes JSONL file and return cleaned texts with labels.

    The file is opened at ``input_path + jsonfile`` (module-level ``input_path``).
    Each record's ``text`` is normalised with ``processText``; its ``label`` is
    passed through unchanged.

    Args:
        jsonfile: File name of the split, relative to ``input_path``.
        output_file: Not used by this function; kept so existing calls still work.

    Returns:
        ``(inputs, labels)`` as two parallel lists.
    """
    with open(input_path + jsonfile, 'r') as f:
        raw_lines = f.readlines()

    inputs, labels = [], []
    for raw_line in raw_lines:
        record = json.loads(raw_line)
        inputs.append(processText(record['text']))
        labels.append(record['label'])

    return inputs, labels

# convertAnecdotes only returns the data; the CSVs are written by save_to_csv below.
train_inputs, train_labels = convertAnecdotes(input_train_file, "train.csv")
val_inputs, val_labels = convertAnecdotes(input_val_file, "val.csv")
test_inputs, test_labels = convertAnecdotes(input_test_file, "test.csv")

for split_inputs, split_labels, out_name in (
    (train_inputs, train_labels, "train.csv"),
    (val_inputs, val_labels, "val.csv"),
    (test_inputs, test_labels, "test.csv"),
):
    save_to_csv(split_inputs, split_labels, input_path, out_name)

File saved to  /shared/0/Morality/OnlyFiles/anecdotes/train.csv
File saved to  /shared/0/Morality/OnlyFiles/anecdotes/val.csv
File saved to  /shared/0/Morality/OnlyFiles/anecdotes/test.csv


## For ETHICS Virtue

In [14]:
# ETHICS-virtue: directory and raw CSV names. convertEthicsVirtue writes its
# output into `input_path` as well.
input_path = "/shared/0/Morality/OnlyFiles/ethics/virtue/"
input_train_file = "virtue_train.csv"
input_test_file = "virtue_test.csv"
input_test_file2 = "virtue_test_hard.csv"      ## Adversarially selected hard test samples

def convertEthicsVirtue(input_file, output_file):
    """Copy the ``scenario``/``label`` columns of ``input_file`` into an input/label CSV.

    Output goes through ``save_to_csv`` to ``input_path + output_file``, using the
    module-level ``input_path`` as set at call time.
    """
    rows = list(zip(input_file['scenario'], input_file['label']))
    inputs = [scenario for scenario, _ in rows]
    labels = [label for _, label in rows]

    save_to_csv(inputs, labels, input_path, output_file)

# Load the raw splits, then convert them one after another.
train_file = pd.read_csv(input_path + input_train_file)
test_file = pd.read_csv(input_path + input_test_file)
test_file2 = pd.read_csv(input_path + input_test_file2)

for raw_frame, out_name in (
    (train_file, "train.csv"),
    (test_file, "test.csv"),
    (test_file2, "test_hard.csv"),
):
    convertEthicsVirtue(raw_frame, out_name)

File saved to  /shared/0/Morality/OnlyFiles/ethics/virtue/train.csv
File saved to  /shared/0/Morality/OnlyFiles/ethics/virtue/test.csv
File saved to  /shared/0/Morality/OnlyFiles/ethics/virtue/test_hard.csv


## For STORAL - Concept Understanding

In [15]:
def convertStoralMocpt(input_file, output_file):
    """Convert a STORAL concept-understanding JSONL file into a five-candidate CSV.

    Every record yields five ``story [SEP] moralK`` strings (K = 1..5), stored in
    the columns ``inputs2`` .. ``inputs6`` (the numbering starts at 2), plus the
    record's ``label`` in ``labels``. Input and output both live under the
    module-level ``input_path``.
    """
    moral_keys = ('moral1', 'moral2', 'moral3', 'moral4', 'moral5')
    column_names = ('inputs2', 'inputs3', 'inputs4', 'inputs5', 'inputs6')
    candidate_columns = [[] for _ in moral_keys]
    labels = []

    with open(input_path + input_file, 'r') as f:
        jsonlines = f.readlines()

    for line in jsonlines:
        data = json.loads(line)
        for column, moral_key in zip(candidate_columns, moral_keys):
            column.append(data['story'] + " [SEP] " + data[moral_key])
        labels.append(data['label'])

    df = pd.DataFrame()
    for column_name, column in zip(column_names, candidate_columns):
        df[column_name] = column
    df['labels'] = labels

    destination = input_path + output_file
    df.to_csv(destination)
    print("File saves to ", destination)

# convertStoralMocpt reads the module-level input_path, so the path must be
# re-pointed before each language is converted.
storal_outputs = ("train.csv", "val.csv", "test.csv")

## English
input_path = "/shared/0/Morality/OnlyFiles/storal/mocpt_data/storal_en/"
input_train_file = "storal_en_label_train.jsonl"
input_val_file = "storal_en_label_valid.jsonl"
input_test_file = "storal_en_label_test.jsonl"

for source_name, target_name in zip((input_train_file, input_val_file, input_test_file), storal_outputs):
    convertStoralMocpt(source_name, target_name)

## Chinese

input_path = "/shared/0/Morality/OnlyFiles/storal/mocpt_data/storal_zh/"
input_train_file = "storal_zh_label_train.jsonl"
input_val_file = "storal_zh_label_valid.jsonl"
input_test_file = "storal_zh_label_test.jsonl"

for source_name, target_name in zip((input_train_file, input_val_file, input_test_file), storal_outputs):
    convertStoralMocpt(source_name, target_name)

File saves to  /shared/0/Morality/OnlyFiles/storal/mocpt_data/storal_en/train.csv
File saves to  /shared/0/Morality/OnlyFiles/storal/mocpt_data/storal_en/val.csv
File saves to  /shared/0/Morality/OnlyFiles/storal/mocpt_data/storal_en/test.csv
File saves to  /shared/0/Morality/OnlyFiles/storal/mocpt_data/storal_zh/train.csv
File saves to  /shared/0/Morality/OnlyFiles/storal/mocpt_data/storal_zh/val.csv
File saves to  /shared/0/Morality/OnlyFiles/storal/mocpt_data/storal_zh/test.csv


## For STORAL - Preference Alignment

In [16]:
def convertStoralMocpt(input_file, output_file):
    """Convert a STORAL preference-alignment JSONL file into a two-candidate CSV.

    Every record yields ``story [SEP] moral1`` (column ``inputs2``) and
    ``story [SEP] moral2`` (column ``inputs3``) plus the record's ``label`` in
    ``labels``. Input and output both live under the module-level ``input_path``.

    NOTE(review): this definition reuses the name of the five-candidate
    concept-understanding converter and replaces it once this cell has run.
    Consider giving it a distinct name together with its call sites.
    """
    first_candidates, second_candidates, labels = [], [], []

    with open(input_path + input_file, 'r') as f:
        jsonlines = f.readlines()

    for line in jsonlines:
        data = json.loads(line)
        story = data['story']
        first_candidates.append(story + " [SEP] " + data['moral1'])
        second_candidates.append(story + " [SEP] " + data['moral2'])
        labels.append(data['label'])

    df = pd.DataFrame()
    df['inputs2'] = first_candidates
    df['inputs3'] = second_candidates
    df['labels'] = labels

    destination = input_path + output_file
    df.to_csv(destination)
    print("File saves to ", destination)

# The converter reads the module-level input_path, so the path must be re-pointed
# before each language is converted.
storal_outputs = ("train.csv", "val.csv", "test.csv")

## English
input_path = "/shared/0/Morality/OnlyFiles/storal/mopref_data/storal_en/"
input_train_file = "storal_en_label_train.jsonl"
input_val_file = "storal_en_label_valid.jsonl"
input_test_file = "storal_en_label_test.jsonl"

for source_name, target_name in zip((input_train_file, input_val_file, input_test_file), storal_outputs):
    convertStoralMocpt(source_name, target_name)

## Chinese

input_path = "/shared/0/Morality/OnlyFiles/storal/mopref_data/storal_zh/"
input_train_file = "storal_zh_label_train.jsonl"
input_val_file = "storal_zh_label_valid.jsonl"
input_test_file = "storal_zh_label_test.jsonl"

for source_name, target_name in zip((input_train_file, input_val_file, input_test_file), storal_outputs):
    convertStoralMocpt(source_name, target_name)

File saves to  /shared/0/Morality/OnlyFiles/storal/mopref_data/storal_en/train.csv
File saves to  /shared/0/Morality/OnlyFiles/storal/mopref_data/storal_en/val.csv
File saves to  /shared/0/Morality/OnlyFiles/storal/mopref_data/storal_en/test.csv
File saves to  /shared/0/Morality/OnlyFiles/storal/mopref_data/storal_zh/train.csv
File saves to  /shared/0/Morality/OnlyFiles/storal/mopref_data/storal_zh/val.csv
File saves to  /shared/0/Morality/OnlyFiles/storal/mopref_data/storal_zh/test.csv


## For Moral Foundation Reddit Corpus

In [17]:
## Finding majority to get final labels
#
# Rows are scanned in file order and consecutive rows with the same text are
# treated as one annotated item (assumes all annotations of a text are adjacent
# in the CSV — TODO confirm). Each item keeps its own majority label.
#
# Fixes over the previous version of this cell:
#   * a group's majority label is attached to that group's text (it used to be
#     attached to the text of the following group);
#   * the last group in the file is emitted too;
#   * no global named `max` is created, so the builtin stays usable.
# Label counts recorded from earlier runs should be regenerated.

def _majority_label(annotations, threshold=0.5):
    """Return the most frequent annotation if it reaches ``threshold`` of all votes.

    Returns ``None`` when there are no votes or the top annotation falls short.
    Ties for the top count resolve to the annotation that was seen first.
    """
    if not annotations:
        return None
    top_label, top_count = Counter(annotations).most_common(1)[0]
    if top_count >= threshold * len(annotations):
        return top_label
    return None


mfrc_raw = pd.read_csv("/shared/0/Morality/OnlyFiles/moralfoundationredditcorpus/final_mfrc_data.csv")

# Collect (text, [annotation, ...]) groups; an annotation cell may hold several
# comma-separated labels, each of which counts as one vote.
groups = []
for row_text, row_annotation in zip(mfrc_raw['text'], mfrc_raw['annotation']):
    if not groups or row_text != groups[-1][0]:
        groups.append((row_text, []))
    groups[-1][1].extend(row_annotation.split(","))

text = []
labels = []
for group_text, group_annotations in groups:
    lab = _majority_label(group_annotations)
    if lab:
        text.append(processText(group_text))
        labels.append(lab)

df = pd.DataFrame()
df['text'] = text
df['labels'] = labels

df.to_csv("/shared/0/Morality/OnlyFiles/moralfoundationredditcorpus/mfrc_data_majority.csv")

# Counter({'Non-Moral': 13751,
#          'Thin Morality': 3246,
#          'Care': 1892,
#          'Equality': 1093,
#          'Authority': 948,
#          'Proportionality': 896,
#          'Loyalty': 540,
#          'Purity': 358})

In [22]:
## Preparing data

# Split the majority-labelled MFRC file (written by the previous cell) 75:10:15.
input_path = "/shared/0/Morality/OnlyFiles/moralfoundationredditcorpus/"
input_train_file = "mfrc_data_majority.csv"

data = pd.read_csv(input_path + input_train_file)
inputs = data['text'].tolist()
labels = data['labels'].tolist()

splits = divideTrainDevTest(inputs, labels)
train_inputs, train_labels, val_inputs, val_labels, test_inputs, test_labels = splits

for split_inputs, split_labels, out_name in (
    (train_inputs, train_labels, "train.csv"),
    (val_inputs, val_labels, "val.csv"),
    (test_inputs, test_labels, "test.csv"),
):
    save_to_csv(split_inputs, split_labels, input_path, out_name)

File saved to  /shared/0/Morality/OnlyFiles/moralfoundationredditcorpus/train.csv
File saved to  /shared/0/Morality/OnlyFiles/moralfoundationredditcorpus/val.csv
File saved to  /shared/0/Morality/OnlyFiles/moralfoundationredditcorpus/test.csv


## For Moral Foundations Twitter Corpus

In [23]:
## Finding tweet texts and majority

with open("/shared/0/Morality/OnlyFiles/moralfoundationstwittercorpus/MFTC_V4_text.json","r") as f:
    data = json.load(f)

tweets_id, tweets_text, labels = [], [], []

for corpus in data:
    for tweet in corpus['Tweets']:
        tweets_id.append(tweet['tweet_id'])

        # Every annotation may carry several comma-separated labels; each counts
        # as one vote. The most frequent label wins (first seen wins a tie); no
        # minimum share of the votes is required here.
        votes = []
        for annotation in tweet['annotations']:
            votes.extend(annotation['annotation'].split(","))
        labels.append(Counter(votes).most_common(1)[0][0])

        tweets_text.append(tweet['tweet_text'])

df = pd.DataFrame({'tweet_id': tweets_id, 'tweets': tweets_text, 'labels': labels})

df.to_csv("/shared/0/Morality/OnlyFiles/moralfoundationstwittercorpus/MFTC_majority.csv")

# Counter({'non-moral': 15422,
#          'harm': 3558,
#          'cheating': 3025,
#          'loyalty': 2258,
#          'fairness': 2255,
#          'care': 2161,
#          'subversion': 1764,
#          'authority': 1375,
#          'betrayal': 1279,
#          'degradation': 1193,
#          'purity': 697})


In [26]:
## Preparing data

# Split the majority-labelled MFTC file (written by the previous cell) 75:10:15.
input_path = "/shared/0/Morality/OnlyFiles/moralfoundationstwittercorpus/"
input_train_file = "MFTC_majority.csv"

data = pd.read_csv(input_path + input_train_file)
inputs = data['tweets'].tolist()
labels = data['labels'].tolist()

splits = divideTrainDevTest(inputs, labels)
train_inputs, train_labels, val_inputs, val_labels, test_inputs, test_labels = splits

for split_inputs, split_labels, out_name in (
    (train_inputs, train_labels, "train.csv"),
    (val_inputs, val_labels, "val.csv"),
    (test_inputs, test_labels, "test.csv"),
):
    save_to_csv(split_inputs, split_labels, input_path, out_name)

File saved to  /shared/0/Morality/OnlyFiles/moralfoundationstwittercorpus/train.csv
File saved to  /shared/0/Morality/OnlyFiles/moralfoundationstwittercorpus/val.csv
File saved to  /shared/0/Morality/OnlyFiles/moralfoundationstwittercorpus/test.csv


## For Moral Integrity Corpus

In [8]:
# Moral Integrity Corpus: the CSV already carries a `split` column, so rows are
# routed to train / test / val instead of being re-sampled.
input_path = "/shared/0/Morality/OnlyFiles/moralintegritycorpus/"
input_train_file = "MIC.csv"


def _normaliseMoralLabel(raw_label):
    """Turn a ``"a | b"`` label string into ``"a,b"``.

    Anything that is not a string (for example a missing cell that pandas reads
    as NaN) becomes the literal string ``"None"``. This replaces the previous
    bare ``except:`` with an explicit check.
    """
    if not isinstance(raw_label, str):
        return "None"
    return ",".join([part.strip() for part in raw_label.split("|")])


train_inputs, train_labels, val_inputs, val_labels, test_inputs, test_labels = [], [], [], [], [], []

data = pd.read_csv(input_path + input_train_file)

inputs = data['QA'].tolist()
labels = data['moral'].tolist()
splits = data['split'].tolist()

# Any split value other than 'train' or 'test' is routed to the validation set.
split_buckets = {
    'train': (train_inputs, train_labels),
    'test': (test_inputs, test_labels),
}
for inp, lab, spl in zip(inputs, labels, splits):
    bucket_inputs, bucket_labels = split_buckets.get(spl, (val_inputs, val_labels))
    bucket_inputs.append(inp)
    bucket_labels.append(_normaliseMoralLabel(lab))

save_to_csv(train_inputs, train_labels, input_path, "train.csv")
save_to_csv(val_inputs, val_labels, input_path, "val.csv")
save_to_csv(test_inputs, test_labels, input_path, "test.csv")

File saved to  /shared/0/Morality/OnlyFiles/moralintegritycorpus/train.csv
File saved to  /shared/0/Morality/OnlyFiles/moralintegritycorpus/val.csv
File saved to  /shared/0/Morality/OnlyFiles/moralintegritycorpus/test.csv


## For MoralConvita

In [30]:
## Finding tweet texts and labels

data = pd.read_csv("/shared/0/Morality/OnlyFiles/moralconvita/moralConvITA_disaggregated.csv")
tweets_text, morals = [], []

# Each source row produces two examples: the first tweet on its own, labelled
# with first_pair_label, and the tweet plus its reply, labelled with reply_label.
for i in range(len(data)):
    first_tweet = data['text'][i]
    tweets_text.extend([first_tweet, first_tweet + " [SEP] " + data['text_reply'][i]])
    morals.extend([data['first_pair_label'][i], data['reply_label'][i]])

df = pd.DataFrame({'tweets': tweets_text, 'morals': morals})

df.to_csv("/shared/0/Morality/OnlyFiles/moralconvita/MoralConvITA_text.csv")


In [31]:
## Preparing data

# Split the flattened MoralConvITA file (written by the previous cell) 75:10:15.
input_path = "/shared/0/Morality/OnlyFiles/moralconvita/"
input_train_file = "MoralConvITA_text.csv"

data = pd.read_csv(input_path + input_train_file)
inputs = data['tweets'].tolist()
labels = data['morals'].tolist()

splits = divideTrainDevTest(inputs, labels)
train_inputs, train_labels, val_inputs, val_labels, test_inputs, test_labels = splits

for split_inputs, split_labels, out_name in (
    (train_inputs, train_labels, "train.csv"),
    (val_inputs, val_labels, "val.csv"),
    (test_inputs, test_labels, "test.csv"),
):
    save_to_csv(split_inputs, split_labels, input_path, out_name)

File saved to  /shared/0/Morality/OnlyFiles/moralconvita/train.csv
File saved to  /shared/0/Morality/OnlyFiles/moralconvita/val.csv
File saved to  /shared/0/Morality/OnlyFiles/moralconvita/test.csv


## For Story Commonsense

In [32]:
# Story Commonsense: directory and per-split CSVs. convertStoryCommonsense reads
# `input_path + <file>`, and the converted CSVs are saved into `input_path`.
# NOTE(review): the dev and test paths contain an "emotion/" sub-directory but the
# training path does not — confirm this matches the dataset layout.
input_path = "/shared/0/Morality/OnlyFiles/storycommonsense/"
input_train_file = "training/allcharlinepairs.csv"
input_val_file = "dev/emotion/allcharlinepairs.csv"
input_test_file = "test/emotion/allcharlinepairs.csv"

def convertStoryCommonsense(inputfile):
    """Build ``context [SEP] sentence [SEP] char`` inputs with ``emotion`` labels.

    The CSV is read from ``input_path + inputfile`` (module-level ``input_path``).
    ``context`` is passed through ``str()`` before concatenation; ``sentence`` and
    ``char`` are concatenated as they are.

    Returns:
        ``(inputs, labels)`` as two parallel lists.
    """
    data = pd.read_csv(input_path + inputfile)
    row_ids = range(len(data))

    inputs = [
        str(data['context'][i]) + " [SEP] " + data['sentence'][i] + " [SEP] " + data['char'][i]
        for i in row_ids
    ]
    labels = [data['emotion'][i] for i in row_ids]

    return inputs, labels

# Convert every split first, then write the three CSVs.
train_inputs, train_labels = convertStoryCommonsense(input_train_file)
val_inputs, val_labels = convertStoryCommonsense(input_val_file)
test_inputs, test_labels = convertStoryCommonsense(input_test_file)

for split_inputs, split_labels, out_name in (
    (train_inputs, train_labels, "train.csv"),
    (val_inputs, val_labels, "val.csv"),
    (test_inputs, test_labels, "test.csv"),
):
    save_to_csv(split_inputs, split_labels, input_path, out_name)

File saved to  /shared/0/Morality/OnlyFiles/storycommonsense/train.csv
File saved to  /shared/0/Morality/OnlyFiles/storycommonsense/val.csv
File saved to  /shared/0/Morality/OnlyFiles/storycommonsense/test.csv
