In [2]:
import pandas as pd
import json
import gzip
from glob import glob
from tqdm.auto import tqdm
from collections import defaultdict
import re

In [5]:
files = glob('/shared/4/projects/reddit-morals/data/RS*')
len(files)

172

In [6]:
df = defaultdict(list)
for fname in tqdm(files):
    with gzip.open(fname, 'rt') as f:
        try:
            for line in f:
                try:
                    data = json.loads(line)
                except json.JSONDecodeError:
                    continue
                if 'selftext' not in data:
                    continue

                title = data['title']
                body = data['selftext']
                question = title + '\n' + body
                if len(question.split()) < 50:
                    continue

                df['subreddit'].append(data['subreddit'])
                df['id'].append(data['id'])
                df['title'].append(data['title'])
                df['question'].append(question)
        except Exception as e:
            print(e)
df = pd.DataFrame(df)
print(len(df))

  0%|          | 0/172 [00:00<?, ?it/s]

2060960


In [7]:
len(df)

2060960

In [8]:
df.sample(5)

Unnamed: 0,subreddit,id,title,question
1494842,AskReddit,nirs6,"Apartment Managers/Staff, what do you look for...","Apartment Managers/Staff, what do you look for..."
32069,AskReddit,mr2g8,Girlfriend help?! What Do I Do?!,Girlfriend help?! What Do I Do?!\nOk so my gir...
1162044,AmItheAsshole,cihpqp,AITA for asking to use my stepbrothers room wh...,AITA for asking to use my stepbrothers room wh...
484425,AskReddit,v7nal,Am I being selfish?,Am I being selfish?\nMy parents promised me th...
1635191,AskReddit,1bu7e7,Would living out on my own be a good idea? (Is...,Would living out on my own be a good idea? (Is...


In [9]:
def clean(s):
    s = re.sub(r'\[[^]]+\]', '', s)
    s = s.replace('AITA', "Am I an asshole")
    s = s.replace('WIBTA', "Would I be an asshole")
    return s
df['cleaned_question'] = df['question'].apply(clean)

In [10]:
def is_valid(row):
    if row['subreddit'] == 'AskReddit':
        return False
    s = row['cleaned_question']
    if 'EDIT:' in s:
        return False
    s = s.lower()
    if 'reddit' in s:
        return False
    if 'http' in s:
        return False
    if re.search(r'r/[a-z0-9]+', s):
        return False
    return True
df['valid'] = df.apply(is_valid, axis=1)

In [11]:
for s in df[df['valid']]['cleaned_question'].sample(5):
    print(s)
    print()

Am I an asshole for missing a friend's last comedy show
I am a sophmore in college and a have a friend who is a member of a comedy group on campus, who preforms about three to four times a school year. I have been to several of her shows, but unless I show up 45 minutes early, all the seats are filled in the theatre she preforms in and the group I come with and I have to stand in the back. The show itself is about an hour, so in total its about a 1:45 hour time dedication to be seated comfortably usually during a friday or saturday night. 
This friend is most likely transferring to a different college soon so she dubbed her show last night "her last show." I said I would go and so did my friends until later that night, when everyone I was going to go with bailed. The general consensus was the show took a long time,  and just wasn't funny enough to justify seeing. They also mentioned how our friend (who was preforming) was just memorizing her set hours before the show and it wasn't a ve

In [12]:
df2 = df[df['valid']].copy()
del df2['valid']
df2.head()

Unnamed: 0,subreddit,id,title,question,cleaned_question
87053,AmItheAsshole,4czypu,AITA for reporting a seller to Etsy?,AITA for reporting a seller to Etsy?\nEarly la...,Am I an asshole for reporting a seller to Etsy...
87055,AmItheAsshole,4d3evq,AITA...pissed that my boyfriend has had my car...,AITA...pissed that my boyfriend has had my car...,Am I an asshole...pissed that my boyfriend has...
87056,AmItheAsshole,4d4npn,AITA for setting up Tasker to auto-text my gir...,AITA for setting up Tasker to auto-text my gir...,Am I an asshole for setting up Tasker to auto-...
87059,AmItheAsshole,4dcnuy,AITA for not wanting spectators present during...,AITA for not wanting spectators present during...,Am I an asshole for not wanting spectators pre...
87066,AmItheAsshole,4diwg3,AITA for always tipping 15%?,AITA for always tipping 15%?\nI live in Canada...,Am I an asshole for always tipping 15%?\nI liv...


In [13]:
df2.to_csv('/shared/4/projects/reddit-morals/data/moral-questions.csv', index=False)

In [14]:
len(df2)

399560

In [16]:
        instruction = 'The following text is a question to a general audience:\n%s\n' \
        + '\nInstructions:\n' \
        + ' Determine whether the question can be summarized or reframed as a general moral dilemma where as person has to choose between two mutually-exclusive options.' \
        + ' If the question can be reframed as a moral dilemma, generate an answer in JSON format with the following information:\n' \
        + ' * A short summary of the question as a moral dilemma in fewer than five sentences. The summary should highlight the general moral dilemma that the author faces. This summary should have the JSON key "summary".\n' \
        + ' * A one sentence description of the first action that could be taken based on the dilemma, with the JSON key "option1".\n' \
        + ' * A one sentence description of the second action that could be taken based on the dilemma, with the JSON key "option2".\n' \
        + 'The two actions should describe distinct choices that can be taken based on the dilemma. Options should reflect different values or morals. Actions should be mutually exclusive and not overlap in their description.\n' \
        + 'If the question cannot be reframed as a moral dilemma, generate a JSON object with the key "summary" with the empty string "" for that key.\n' \
        + ' Do not explain your answer. Generate the string "Answer: " before the JSON string.'