In [2]:
import json
import numpy as np
import pdb
import re
from anytree import NodeMixin, RenderTree, PreOrderIter

# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform's locale default (which is not UTF-8 everywhere).
with open('data_samples/reddit_female.json', encoding='utf-8') as json_file:
    data = json.load(json_file)


In [3]:
# NodeMixin class: extends any python class to a tree node

class WNode(NodeMixin):
    """Tree node that stores a payload and a numeric weight.

    Attributes:
        foo: payload kept on the node (this notebook passes title/comment text).
        weight: value kept on the node (this notebook passes the score).
        parent: parent node, or None for a root.
    """

    def __init__(self, foo, weight, parent=None):
        super().__init__()
        self.foo = foo
        # Assigning `parent` is what links the node into a tree.
        self.parent = parent
        self.weight = weight

    def _post_detach(self, parent):
        # anytree hook, called after this node has been detached from
        # `parent`: a detached node no longer carries any weight.
        self.weight = 0


In [4]:
# Recursive function to populate deeper reply levels

def NextLevel(replies, node_reply):
    """Attach every nested reply of `replies` below `node_reply`, recursively.

    Args:
        replies: mapping with a 'replies' sequence; each entry is itself a
            mapping with 'body', 'score' and 'replies' keys.
        node_reply: the WNode that represents `replies` in the tree.

    Children are created in descending 'score' order; entries with equal
    scores keep their original relative order.
    """
    children = replies['replies']
    if len(children) == 0:
        return

    ranked = sorted(children, key=lambda child: child['score'], reverse=True)
    for child in ranked:
        child_node = WNode(child['body'], parent=node_reply, weight=child['score'])
        # Descend one level: the child becomes the parent of its own replies.
        NextLevel(child, child_node)


In [5]:
# Building ordered tree given a submission

def BuildTree(submission):
    """Build the score-ordered conversation tree of one submission.

    Args:
        submission: mapping with 'title', 'score' and 'comments' keys; each
            comment is a mapping with 'body', 'score' and 'replies' keys.

    Returns:
        The root WNode (the submission title). Its children are the comments
        in descending score order, and below each comment the replies are
        attached, level by level, in descending score order as well.
    """
    title = submission['title']
    root = WNode(title, submission['score'])

    # Depth 0: top-level comments, best score first.
    ranked_comments = sorted(
        submission['comments'],
        key=lambda comment: comment['score'],
        reverse=True,
    )

    for comment in ranked_comments:
        comment_node = WNode(comment['body'], parent=root, weight=comment['score'])
        # Depth 1 and deeper: NextLevel sorts the replies of `comment`,
        # creates their nodes under `comment_node` and recurses.
        NextLevel(comment, comment_node)

    return root


In [6]:
# Visualising the unfiltered sorted tree of the last submission.
# Only one tree is rendered, so only that tree is built (building one per
# submission and discarding all but the last produced the same output).
submission = data[-1]
r = BuildTree(submission)

for pre, _, node in RenderTree(r):
    # Show the first 20 characters of each node's text and its score.
    print("%s%s (%s)" % (pre, node.foo[0:20], node.weight or 0))


Is 2020 the year of  (303)
├── > "house dresses," i (284)
│   ├── Oh, I didn't mean to (58)
│   │   └── I agree that the sty (39)
│   │       └── Yeah I used to feel  (14)
│   └── How do you find the  (3)
│       └── I got it in a clothi (1)
├── I read it slightly o (89)
│   ├── I actually went thro (15)
│   │   ├── [deleted] (34)
│   │   │   └── Yes, you're absolute (15)
│   │   │       └── [deleted] (23)
│   │   │           ├── Eff heels. Burn them (11)
│   │   │           ├── I definitely think t (1)
│   │   │           └── As a very tall woman (-2)
│   │   │               └── [deleted] (3)
│   │   │                   └── I'm in my 30s too an (2)
│   │   │                       └── Super short girl her (2)
│   │   └── Oh definitely not, b (1)
│   └── Link to your dress?  (1)
│       └── It 

In [7]:
# Build episode corresponding to N best-ranked conversations

def BuildBestEpisodes(r, N):
    """Turn the N best-scoring root-to-leaf paths of a tree into episodes.

    Each leaf defines one conversation: the nodes on the path from the root
    to that leaf. A conversation's score is the mean of the node weights on
    its path. The N highest-scoring conversations are kept.

    Within a conversation, nodes at even positions (0, 2, ...) become texts
    and nodes at odd positions become the matching labels; an empty label is
    appended when the conversation has an odd number of nodes.

    Args:
        r: root node of the tree.
        N: maximum number of conversations to keep.

    Returns:
        (texts, labels, dones): three flat lists of equal length covering all
        kept conversations back to back. `dones` is True on the last turn of
        each conversation and False elsewhere.
    """
    leaves = list(PreOrderIter(r, filter_=lambda node: node.is_leaf))

    # Score of a leaf = (sum of weights on its path) / (length of the path).
    mean_scores = []
    for leaf in leaves:
        branch = leaf.path
        mean_scores.append(sum(member.weight for member in branch) / len(branch))

    # Conversations ordered by score, best first (ties keep traversal order).
    scored_paths = zip(mean_scores, (leaf.path for leaf in leaves))
    ranked_paths = [
        path for _, path in sorted(scored_paths, key=lambda pair: pair[0], reverse=True)
    ]

    submission_episode_texts = []
    submission_episode_labels = []
    submission_episode_dones = []

    keep = min(N, len(ranked_paths))
    for path in ranked_paths[0:keep]:
        utterances = [member.foo for member in path]

        turn_texts = utterances[::2]
        turn_labels = utterances[1::2]
        if len(turn_texts) > len(turn_labels):
            # Odd number of utterances: the last text has no answer.
            turn_labels.append('')

        turn_dones = [False] * len(turn_texts)
        turn_dones[-1] = True

        submission_episode_texts.extend(turn_texts)
        submission_episode_labels.extend(turn_labels)
        submission_episode_dones.extend(turn_dones)

    return submission_episode_texts, submission_episode_labels, submission_episode_dones


In [8]:
# Filtering Tree: trimming branches at invalid nodes

# Matches http:// and https:// URLs. Written as a raw string so that the
# backslashes reach the regex engine as-is instead of depending on Python
# passing unrecognised string escapes through (which triggers a warning).
_URL_PATTERN = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\), ]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)


def _is_invalid_node(node):
    """Return True when the node's text or score disqualifies it."""
    text = node.foo
    return (
        text == '[deleted]'
        or len(text) > 2048
        or len(text) < 5          # also guards text[0] below against ''
        or node.weight < 0
        or _URL_PATTERN.search(text) is not None
        or not text[0].isascii()
    )


def FilterTree(r):
    """Trim the tree in place at every invalid node and return it.

    A node is invalid when its text is '[deleted]', longer than 2048 or
    shorter than 5 characters, contains a URL, starts with a non-ASCII
    character, or when its weight is negative. An invalid node loses all of
    its children, its text becomes '[INVALID]' and its weight becomes 0.
    A node without a parent (the root) is never tested nor modified.

    Args:
        r: node to start from; the tree is modified in place.

    Returns:
        The same object `r`.
    """
    pending = [r]
    while pending:
        node = pending.pop()
        if node.parent is not None and _is_invalid_node(node):
            # Cut the whole branch: descendants of an invalid node are
            # dropped without being examined.
            node.children = ()
            node.foo = '[INVALID]'
            node.weight = 0
        else:
            # Reversed so that children are popped in their original order.
            pending.extend(reversed(node.children))

    return r

In [15]:
# Building episode files

N = 10  # Number of best-ranked episodes taken for each submission

C = 0  # counter kept from debugging; nothing in this cell updates it

# One entry per submission; each entry is the flat list returned by
# BuildBestEpisodes for that submission.
episode_texts, episode_labels, episode_dones = [], [], []

for submission in data:
    # Tree of the submission, then the same tree trimmed at invalid nodes.
    r = BuildTree(submission)
    r_filtered = FilterTree(r)

    # Turns of the N best-ranked conversations of this submission.
    (submission_episode_texts,
     submission_episode_labels,
     submission_episode_dones) = BuildBestEpisodes(r_filtered, N)

    episode_texts.append(submission_episode_texts)
    episode_labels.append(submission_episode_labels)
    episode_dones.append(submission_episode_dones)

print('Texts for first episode:')
episode_texts[0]

Texts for first episode:


['How would you store sentimental clothing?',
 "Thank you for sharing. I have some of my mom's clothes and I want to preserve it in good condition.",
 'How would you store sentimental clothing?',
 'How would you store sentimental clothing?',
 'How would you store sentimental clothing?',
 'How would you store sentimental clothing?',
 'How would you store sentimental clothing?',
 'How would you store sentimental clothing?',
 'How would you store sentimental clothing?',
 'How would you store sentimental clothing?',
 'I’m getting married this year so I’m curious. How? Isn’t plastic bad for storing wedding dresses (and basically most materials)? Can you vacuum seal in other material bags?',
 'I got my wedding dress cleaned and preserved at the cleaners. Why shouldn’t I have done that?',
 'How would you store sentimental clothing?',
 'I’m getting married this year so I’m curious. How? Isn’t plastic bad for storing wedding dresses (and basically most materials)? Can you vacuum s

In [16]:
# Visualising Filtered Tree

# NOTE(review): `r` is not defined in this cell; it is assumed to still hold
# the tree assigned in the last iteration of the episode-building loop.
# FilterTree returns the very object it was given, so `r_filtered` and `r`
# name the same tree.
r_filtered = FilterTree(r)
for pre, _, node in RenderTree(r_filtered):
    # First 20 characters of each node's text, followed by its weight.
    print("%s%s (%s)" % (pre, node.foo[0:20], node.weight))


Is 2020 the year of  (303)
├── > "house dresses," i (284)
│   ├── Oh, I didn't mean to (58)
│   │   └── I agree that the sty (39)
│   │       └── Yeah I used to feel  (14)
│   └── How do you find the  (3)
│       └── I got it in a clothi (1)
├── I read it slightly o (89)
│   ├── I actually went thro (15)
│   │   ├── [INVALID] (0)
│   │   └── Oh definitely not, b (1)
│   └── Link to your dress?  (1)
│       └── [INVALID] (0)
├── [INVALID] (0)
├── [INVALID] (0)
├── This style was trend (46)
├── [INVALID] (0)
├── These kinds of dress (26)
├── I recently bought an (11)
│   └── Same, I read this ar (1)
├── [INVALID] (0)
├── I’m a fan of the muu (10)
├── I am currently IN a  (7)
├── I think the “oversiz (7)
├── It goes back a lot f (7)
├── i love an all cotton (6)
├── I have a few what I  (5)
├── I 

In [17]:
# Formatting episode files into ParlAI format
import pandas as pd

# Flatten the per-submission lists into one list of turns each.
flat_texts = [item for sublist in episode_texts for item in sublist]
flat_labels = [item for sublist in episode_labels for item in sublist]
flat_dones = [item for sublist in episode_dones for item in sublist]

# One row per turn, built in a single step.
rows = pd.DataFrame({
    'texts': flat_texts,
    'labels': flat_labels,
    'dones': flat_dones,
})


def _single_line_field(value):
    """Replace line breaks and tabs with spaces.

    The output format puts one turn per line and separates fields with tabs,
    so neither may appear inside a field value.
    """
    return value.replace('\r', ' ').replace('\n', ' ').replace('\t', ' ')


# Explicit UTF-8 so that writing does not depend on the platform's locale
# default (comments may contain characters it cannot encode).
with open('FH_output_parlai.txt', 'w', encoding='utf-8') as f:
    for text, label, done in zip(flat_texts, flat_labels, flat_dones):
        t = ("text:{}\tlabels:{}").format(
            _single_line_field(text), _single_line_field(label)
        )
        if done:
            # Last turn of a conversation.
            t = t + '\tepisode_done:True\n'
        else:
            t = t + '\n'
        f.write("%s" % t)


print('Keeping ' + str(N) + ' highest ranked episodes for each submission...')
print('Number of episodes:')
print(flat_dones.count(True))
        

Keeping 10 highest ranked episodes for each submission...
Number of episodes:
6042
