In [41]:
import json
import numpy as np
import pdb
import re
from anytree import NodeMixin, RenderTree, PreOrderIter

# Name of the Reddit dump to process; it selects the input file below.
DataSet = 'male'
path = 'data_samples/reddit_' + DataSet + '.json'
# Decode explicitly as UTF-8: the comments contain emoji and typographic
# quotes, and the platform default encoding is not UTF-8 everywhere.
with open(path, encoding='utf-8') as json_file:
    data = json.load(json_file)


In [42]:
# NodeMixin class: extends any python class to a tree node

class WNode(NodeMixin):
    """Tree node holding a piece of text (``foo``) and a numeric ``weight``.

    Args:
        foo: payload stored on the node.
        weight: value stored on the node.
        parent: optional parent node; assigning it links this node into
            the parent's tree.
    """

    def __init__(self, foo, weight, parent=None):
        super().__init__()
        self.foo = foo
        self.parent = parent
        self.weight = weight

    def _post_detach(self, parent):
        # anytree hook: a node cut off from its parent loses its weight.
        self.weight = 0


In [43]:
# Recursive function to populate deeper reply levels

def NextLevel(replies, node_reply):
    """Recursively attach the replies nested under ``replies`` to ``node_reply``.

    Args:
        replies: mapping with a ``'replies'`` collection; every item is a
            mapping with ``'body'``, ``'score'`` and its own ``'replies'``.
        node_reply: node already created for ``replies``; it becomes the
            parent of the nodes created here.

    Returns:
        None. The tree is extended through the ``parent`` links.
    """
    children = replies['replies']
    if len(children) > 0:
        # Highest score first; the sort is stable, so equal scores keep
        # their original relative order.
        ranked = sorted(children, key=lambda child: child['score'], reverse=True)
        for child in ranked:
            child_node = WNode(child['body'], parent=node_reply, weight=child['score'])
            # Descend into this reply's own replies.
            NextLevel(child, child_node)


In [44]:
# Building ordered tree given a submission

def BuildTree(submission):
    """Build the score-ordered conversation tree of one submission.

    Args:
        submission: mapping with ``'title'``, ``'score'`` and ``'comments'``;
            every comment is a mapping with ``'body'``, ``'score'`` and
            ``'replies'``.

    Returns:
        The root node: its text is the title and its weight the submission
        score. At every level, siblings are attached from highest to
        lowest score.
    """
    root = WNode(submission['title'], submission['score'])

    # Depth 0: top-level comments, best score first (stable for ties).
    top_level = sorted(submission['comments'], key=lambda comment: comment['score'], reverse=True)
    for comment in top_level:
        comment_node = WNode(comment['body'], parent=root, weight=comment['score'])
        # Depth 1 and below: NextLevel performs the same "sort by score,
        # attach, recurse" step on this comment's replies.
        NextLevel(comment, comment_node)

    return root


In [45]:
# Visualising the unfiltered sorted tree 
# Only one tree is rendered below, and it is the tree of the last submission,
# so build just that one instead of building (and discarding) a tree for
# every submission in the data set.
r = BuildTree(data[-1])

# One line per node: tree-drawing prefix, first 20 characters of the text,
# then the node weight (shown as 0 when the weight is falsy).
for pre, _, node in RenderTree(r):
    print("%s%s (%s)" % (pre, node.foo[0:20], node.weight or 0))


WAYWT - October 19 (28)
├── [from the fit battle (63)
│   ├── this color scheme is (9)
│   │   └── thanks mate 🙏 (1)
│   ├── Great colour scheme! (2)
│   │   └── Thanks! Yeah, they’r (1)
│   └── Really like the colo (1)
│       └── Thanks! The colour's (3)
│           └── Thanks! I have a sim (1)
│               └── I can’t recommend of (5)
│                   └── Makes sense, I wear  (2)
├── [Today](https://imgu (46)
│   ├── that coat's great (8)
│   ├── where the fuck are t (5)
│   │   └── Can't remember the l (6)
│   └── Do you have a link f (1)
│       └── They don’t make this (3)
├── [oops all autumn](ht (41)
│   ├── The zipper removal l (3)
│   ├── That mask/sweater co (1)
│   └── Where did you get th (1)
│       └── Got it from a vintag (2)
├── [deleted] (34)
│   ├── Fits #1 and #4 with  (7)
│   ├── nothing but winners (6)
│   ├── 3 is really really g (4)
│   ├── 3, 4, and 6 are my f (2)
│   └── all beautiful as alw (2)
├── [Fit battle outtake] (32)
│   ├── Another +1 for that  

In [46]:
# Build episode corresponding to N best-ranked conversations

def BuildBestEpisodes(r, N):
    """Convert the N best-scoring root-to-leaf paths of a tree into dialogue turns.

    Each leaf path is scored with the mean ``weight`` of its nodes and the
    paths are ranked from best to worst (ties keep pre-order). Along a kept
    path, the nodes at even positions (root first) become "texts" and the
    nodes at odd positions become the matching "labels".

    Args:
        r: root node of the tree; nodes expose ``foo``, ``weight``, ``path``
            and ``is_leaf``.
        N: maximum number of paths to keep.

    Returns:
        A tuple ``(texts, labels, dones)`` of three flat lists with the same
        length. ``dones[i]`` is True on the last turn of each path.
    """
    # Score every leaf path: (sum of weights) / (number of nodes on the path).
    scored_paths = []
    for leaf in PreOrderIter(r, filter_=lambda node: node.is_leaf):
        path = leaf.path
        mean_weight = sum(step.weight for step in path) / len(path)
        scored_paths.append((mean_weight, path))

    # Best conversations first; the sort is stable for equal scores.
    scored_paths.sort(key=lambda scored: scored[0], reverse=True)
    keep = min(N, len(scored_paths))

    all_texts, all_labels, all_dones = [], [], []
    for _, path in scored_paths[0:keep]:
        utterances = [step.foo for step in path]
        prompts = utterances[::2]
        answers = utterances[1::2]

        # A path with an odd number of nodes ends on a prompt without an
        # answer: pad with an empty label so both lists stay aligned.
        if len(prompts) > len(answers):
            answers.append('')

        # Only the final turn of a path closes the episode.
        finished = [False] * len(prompts)
        finished[-1] = True

        all_texts.extend(prompts)
        all_labels.extend(answers)
        all_dones.extend(finished)

    return all_texts, all_labels, all_dones


In [47]:
# Filtering Tree: trimming branches at invalid nodes

# Matches http:// and https:// URLs. Written as a raw string so the escaped
# parentheses are not reported as invalid escape sequences by Python.
_URL_PATTERN = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\), ]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)
# Accepted text length, in characters (both bounds inclusive).
_MIN_TEXT_CHARS = 5
_MAX_TEXT_CHARS = 2048


def _is_invalid(node):
    """Return True when ``node`` breaks at least one filtering rule."""
    text = node.foo
    return (
        text == '[deleted]'
        or len(text) > _MAX_TEXT_CHARS
        or len(text) < _MIN_TEXT_CHARS
        or node.weight < 0
        or _URL_PATTERN.search(text) is not None
        # Only the first character is inspected; indexing is safe because
        # an empty text is already rejected by the minimum-length rule.
        or not text[0].isascii()
    )


def FilterTree(r):
    """Trim the tree rooted at ``r`` at every invalid node.

    A non-root node is invalid when its text is ``'[deleted]'``, is longer
    than 2048 or shorter than 5 characters, contains a URL, starts with a
    non-ASCII character, or when its weight is negative. An invalid node
    loses all its children, its text becomes ``'[INVALID]'`` and its weight
    becomes 0. The root is never filtered.

    Args:
        r: root node of the tree. The tree is modified IN PLACE.

    Returns:
        The same root object ``r`` (not a copy).
    """
    # Children are cleared while the walk is in progress, which keeps the
    # walk from descending into a branch that has just been cut.
    for _, _, node in RenderTree(r):
        if node.parent is not None and _is_invalid(node):
            node.children = ()
            node.foo = '[INVALID]'
            node.weight = 0

    return r

In [48]:
# Building episode files

# One inner list per submission is appended to each of these.
episode_texts = []
episode_labels = []
episode_dones = []

N = 10  # Number of best-ranked episodes taken for each submission

for submission in data:
    # Build tree for each submission
    r = BuildTree(submission)
    # Filtering Tree
    r_filtered = FilterTree(r)
    # Extract N best-ranked episodes
    submission_episode_texts, submission_episode_labels, submission_episode_dones = BuildBestEpisodes(r_filtered, N)

    episode_texts.append(submission_episode_texts)
    episode_labels.append(submission_episode_labels)
    episode_dones.append(submission_episode_dones)

# episode_texts[0] holds the texts of every episode kept for the first
# submission, not of a single episode.
print('Texts for first episode:')
episode_texts[0]

Texts for first episode:


['Creating your own clothes: Advice?',
 'Creating your own clothes: Advice?',
 'Creating your own clothes: Advice?',
 'Creating your own clothes: Advice?']

In [49]:
# Visualising Filtered Tree

# Filter the current tree, then print one line per node: drawing prefix,
# first 20 characters of the text, and the node weight.
r_filtered = FilterTree(r)
for row in RenderTree(r_filtered):
    print("%s%s (%s)" % (row.pre, row.node.foo[0:20], row.node.weight))


WAYWT - October 19 (28)
├── [INVALID] (0)
├── [INVALID] (0)
├── [INVALID] (0)
├── [INVALID] (0)
├── [INVALID] (0)
├── [INVALID] (0)
├── [INVALID] (0)
├── [INVALID] (0)
├── [INVALID] (0)
├── [INVALID] (0)
├── [INVALID] (0)
├── [INVALID] (0)
├── [INVALID] (0)
├── [INVALID] (0)
├── [INVALID] (0)
├── [INVALID] (0)
├── [INVALID] (0)
├── [INVALID] (0)
├── [INVALID] (0)
├── [INVALID] (0)
├── [INVALID] (0)
├── [INVALID] (0)
├── [INVALID] (0)
├── [INVALID] (0)
├── [INVALID] (0)
├── [INVALID] (0)
├── [INVALID] (0)
├── [INVALID] (0)
├── [INVALID] (0)
├── [INVALID] (0)
├── [INVALID] (0)
├── [INVALID] (0)
├── [INVALID] (0)
├── [INVALID] (0)
├── [INVALID] (0)
├── [INVALID] (0)
├── [INVALID] (0)
├── [INVALID] (0)
├── [INVALID] (0)
├── [INVALID] (0)
├── [INVALID] (0)
└── [INVALID] (0)


In [50]:
# Formatting episode files into ParlAI format
import pandas as pd

# Flatten the per-submission lists into one entry per dialogue turn.
flat_texts = [item for sublist in episode_texts for item in sublist]
flat_labels = [item for sublist in episode_labels for item in sublist]
flat_dones = [item for sublist in episode_dones for item in sublist]

# Tabular view of the turns; building it also checks that the three lists
# have the same length (pandas raises ValueError otherwise).
rows = pd.DataFrame({'texts': flat_texts, 'labels': flat_labels, 'dones': flat_dones})

# Each turn is written as ONE line of tab-separated fields, so any newline,
# carriage return or tab inside a comment is replaced by a space.
# NOTE(review): ParlAI may also treat '|' specially inside labels; confirm
# whether it needs escaping.
_FIELD_BREAKS = str.maketrans({'\n': ' ', '\r': ' ', '\t': ' '})

NameFile = 'FH_output_parlai_' + DataSet + '.txt'
# Explicit UTF-8: the comments contain emoji and other non-ASCII characters
# that the platform default encoding may be unable to write.
with open(NameFile, 'w', encoding='utf-8') as f:
    for text, label, done in zip(flat_texts, flat_labels, flat_dones):
        t = "text:{}\tlabels:{}".format(text.translate(_FIELD_BREAKS), label.translate(_FIELD_BREAKS))
        if done:
            # Last turn of a conversation closes the episode.
            t = t + '\tepisode_done:True'
        f.write(t + '\n')


print('Keeping ' + str(N) + ' highest ranked episodes for each submission...')
print('Number of episodes:')
print(flat_dones.count(True))
        

Keeping 10 highest ranked episodes for each submission...
Number of episodes:
7048
