In [None]:
import seaborn as sns
import pandas
import numpy as np

In [None]:
# Load every CSV input into its own DataFrame.
# train_set_df and comments_score_df are the two frames merged per post by
# consolidate_comments_with_posts(); posts_traintest_df feeds the post-size and
# posts-per-author statistics. test_set_df and posts_test_df are only inspected
# in the cells visible here.
# NOTE(review): column schemas are not shown in this notebook - confirm against the files.
train_set_df = pandas.read_csv("data/training data/labeled_training_set.csv")
test_set_df = pandas.read_csv("data/test data/unlabeled_test_set.csv")
posts_traintest_df = pandas.read_csv("data/unlabeled data/posts_trainingandtest_fullsent.csv")
posts_test_df = pandas.read_csv("data/unlabeled data/posts_unlabeled_fullsent.csv")
comments_score_df = pandas.read_csv("data/unlabeled data/comments_score_unlabeled.csv")

In [None]:
# Names of the label columns tracked for every comment row, in the order
# used to index per-label containers elsewhere in the notebook.
labels = [
    "Emotional_disclosure",
    "Information_disclosure",
    "Support",
    "General_support",
    "Info_support",
    "Emo_support",
]

In [None]:
# Only the last expression of a cell is displayed, so the column list has to
# be printed explicitly; otherwise its value is silently discarded.
print(train_set_df.columns.values.tolist())
train_set_df.head()

In [None]:
# Column names of the test-set frame as a plain Python list.
test_set_df.columns.tolist()

In [None]:
# Only the last expression of a cell is displayed, so the column list has to
# be printed explicitly; otherwise its value is silently discarded.
print(posts_traintest_df.columns.values.tolist())
posts_traintest_df.head()

In [None]:
# Column names of the unlabeled-posts frame as a plain Python list.
posts_test_df.columns.tolist()

In [None]:
# Rename the "V1" column to "full_text". Re-running the cell is harmless:
# rename() ignores a mapping key that is no longer present.
comments_score_df = comments_score_df.rename(columns={"V1": "full_text"})
comments_score_df.columns.tolist()

In [None]:
def consolidate_comments_with_posts(labeled_df=None, unlabeled_df=None, label_names=None):
    """Group comment rows by post id and order each group by sentence id.

    Args:
        labeled_df: frame whose rows carry ``id``, ``sentenceid``,
            ``wordcount``, ``author``, ``created_utc`` and (optionally) the
            label columns. Defaults to the notebook-level ``train_set_df``.
        unlabeled_df: frame with the same non-label columns. Defaults to the
            notebook-level ``comments_score_df``.
        label_names: label columns to collect. Defaults to the notebook-level
            ``labels``.

    Returns:
        ``(post_comments, total_comments_labeled, total_comments_unlabeled)``.
        ``post_comments`` maps post id to a dict holding index-aligned lists
        (``sentenceids``, ``authors``, ``timestamp``, ``wordcounts`` and one
        list per label, with -1 where a row has no such column) plus
        ``nlabeled``, the number of rows that have a ``Support`` column.
        The two totals are the row counts of the two input frames.
    """
    # Fall back to the notebook globals so existing zero-argument calls keep working.
    if labeled_df is None:
        labeled_df = train_set_df
    if unlabeled_df is None:
        unlabeled_df = comments_score_df
    if label_names is None:
        label_names = labels

    post_comments = {}

    def add_new_key(pid):
        # Fresh, independent containers for a post seen for the first time.
        entry = {
            "sentenceids": [], "nlabeled": 0,
            "authors": [],
            "timestamp": [],
            "wordcounts": []}
        for label in label_names:
            entry[label] = []
        post_comments[pid] = entry

    def add_comment_with_labels(pid, comment):
        entry = post_comments[pid]
        entry["sentenceids"].append(int(comment["sentenceid"]))
        entry["wordcounts"].append(comment["wordcount"])
        entry["authors"].append(comment["author"])
        entry["timestamp"].append(comment["created_utc"])
        for label in label_names:
            # -1 marks "this row has no value for that label column".
            entry[label].append(comment[label] if label in comment else -1)
        if "Support" in comment:
            entry["nlabeled"] += 1

    def add_all_rows(frame):
        # Append every row of `frame` to its post and return the row count.
        n_rows = 0
        for _, comment in frame.iterrows():
            pid = comment["id"]
            if pid not in post_comments:
                add_new_key(pid)
            add_comment_with_labels(pid, comment)
            n_rows += 1
        return n_rows

    total_comments_labeled = add_all_rows(labeled_df)
    total_comments_unlabeled = add_all_rows(unlabeled_df)

    for entry in post_comments.values():
        # A stable sort keeps insertion order for duplicate sentence ids, so
        # the result is deterministic (labeled rows stay ahead of unlabeled ones).
        order = np.argsort(entry["sentenceids"], kind="stable")
        for key, column in entry.items():
            if key == "nlabeled":
                continue
            entry[key] = [column[pos] for pos in order]

    return post_comments, total_comments_labeled, total_comments_unlabeled

In [None]:
# Build the per-post comment index together with the number of rows taken
# from the labeled frame and from the unlabeled frame.
post_comments, t_labeled, t_unlabeled = consolidate_comments_with_posts()

In [None]:
# Spot-check the consolidated entry of one post id (raises KeyError if that id is absent).
post_comments['91px39']

In [None]:
# Number of distinct posts, then the labeled and unlabeled row counts.
print(len(post_comments), t_labeled, t_unlabeled)

In [None]:
# Total number of comment rows collected over all posts.
num_comments_per = sum(len(entry["authors"]) for entry in post_comments.values())
# Average rows per post. The divisor is the actual number of posts instead of
# a hard-coded count, so the figure stays right when the input data changes
# (max(..., 1) avoids a ZeroDivisionError on an empty index).
print (num_comments_per / max(len(post_comments), 1))

In [None]:
# Walk every post's rows (already ordered by sentence id) and treat each run
# of consecutive rows sharing author AND timestamp as one grouped comment.
# A group is "completely labeled" when every row has Support != -1,
# "partially labeled" when at least one row has, and "unlabeled" otherwise.
# users_comments counts the groups closed per author.
n_complete_labeled_grouped_comments = 0
n_partially_labeled_grouped_comments = 0
n_unlabeled_grouped_comments = 0
users_comments = {}

for post_id in post_comments:
    thread = post_comments[post_id]
    is_completely_labeled_group = True
    is_partially_labeled_group = False
    prev_author = thread["authors"][0]
    prev_timestamp = thread["timestamp"][0]

    for i, author in enumerate(thread["authors"]):
        timestamp = thread["timestamp"][i]
        if prev_author != author or prev_timestamp != timestamp:
            # A new group starts here: account for the one that just ended.
            if is_partially_labeled_group:
                n_partially_labeled_grouped_comments += 1
            else:
                n_unlabeled_grouped_comments += 1
            if is_completely_labeled_group:
                n_complete_labeled_grouped_comments += 1
            users_comments[prev_author] = users_comments.get(prev_author, 0) + 1

            # Reset the flags and remember the new group's identity.
            is_partially_labeled_group = False
            is_completely_labeled_group = True
            prev_author = author
            prev_timestamp = timestamp

        if thread["Support"][i] != -1:
            is_partially_labeled_group = True
        else:
            is_completely_labeled_group = False

    # Account for the last group of the post; `author` still holds the
    # author of the final row.
    if is_partially_labeled_group:
        n_partially_labeled_grouped_comments += 1
    else:
        n_unlabeled_grouped_comments += 1

    if is_completely_labeled_group:
        n_complete_labeled_grouped_comments += 1

    users_comments[author] = users_comments.get(author, 0) + 1

# Completely labeled groups were also counted as partially labeled above.
n_partially_labeled_grouped_comments -= n_complete_labeled_grouped_comments

print(n_complete_labeled_grouped_comments, n_partially_labeled_grouped_comments, n_unlabeled_grouped_comments)
print(n_complete_labeled_grouped_comments + n_partially_labeled_grouped_comments + n_unlabeled_grouped_comments)

How long are posts? Distribution of post lengths

In [None]:
# Word count of every post body.
# - A missing body (read_csv yields NaN, a float, for empty fields) counts as
#   0 words instead of raising AttributeError on .split.
# - split() with no argument splits on any whitespace run, matching the word
#   count used for crawled comments, and does not count empty strings for
#   repeated spaces the way split(" ") does.
post_size = [
    len(selftext.split()) if isinstance(selftext, str) else 0
    for selftext in posts_traintest_df["selftext"]
]

In [None]:
# Standard deviation of the post sizes. np.std already is the square root of
# the variance, so no further square root is applied.
np.std(post_size)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Histogram of the values in post_size, using 90 bins.
plt.hist(post_size, bins=90, color='blue', edgecolor='black')

# Title and axis labels so the figure can be read on its own.
plt.title('Histogram of Post sizes')
plt.xlabel('post size')
plt.ylabel('number of posts')
plt.show()

In [None]:
# Number of posts written by each author.
users_posts = {}
for _, post in posts_traintest_df.iterrows():
    author = post["author"]
    users_posts[author] = users_posts.get(author, 0) + 1

# Every row adds exactly one to some counter, so the sum is the row count.
total_posts = sum(users_posts.values())
maxposts = max(users_posts.values(), default=0)

# Average posts per author and the highest per-author count.
print(total_posts / len(users_posts), maxposts)

In [None]:
# Per-author post counts, in the dict's insertion order.
list_users_posts = list(users_posts.values())

# Histogram of the per-author post counts, using 30 bins.
plt.hist(list_users_posts, bins=30, color='blue', edgecolor='black')

plt.title('Histogram of posts')
plt.xlabel('number of posts')
plt.ylabel('authors')
plt.show()

In [None]:
# Standard deviation of the per-author post counts. np.std already is the
# square root of the variance, so no further square root is applied.
np.std(list_users_posts)

In [None]:
# Per-author grouped-comment counts, in the dict's insertion order.
list_users_comments = list(users_comments.values())

# Histogram of the per-author comment counts, using 30 bins.
plt.hist(list_users_comments, bins=30, color='blue', edgecolor='black')

plt.title('Histogram of comments')
plt.xlabel('number of comments')
plt.ylabel('authors')
plt.show()

In [None]:
# Standard deviation of the per-author comment counts. np.std already is the
# square root of the variance, so no further square root is applied.
np.std(list_users_comments)

In [None]:
# Largest per-author count stored in list_users_comments.
np.max(list_users_comments)

In [None]:
# import urllib3
# import json
# from bs4 import BeautifulSoup
# import requests
import pandas as pd

# Empty DataFrame placeholder.
# NOTE(review): not referenced by any of the cells visible here - confirm it is still needed.
dataset=pd.DataFrame()

## Comment tree download from PRAW 

In [None]:
import os

import praw

# Credentials come from the environment so that no secret is stored in the
# notebook (or in its version history). A missing variable raises KeyError
# immediately instead of failing later during the crawl.
# NOTE(review): the client secret and account password that used to be
# hard-coded in this cell should be treated as leaked and rotated.
_reddit_username = os.environ["REDDIT_USERNAME"]
reddit = praw.Reddit(client_id=os.environ["REDDIT_CLIENT_ID"],
                     client_secret=os.environ["REDDIT_CLIENT_SECRET"],
                     password=os.environ["REDDIT_PASSWORD"],
                     user_agent=os.environ.get(
                         "REDDIT_USER_AGENT",
                         'ChangeMeClient/0.1 by ' + _reddit_username),
                     username=_reddit_username)

In [None]:
# Counters and a length accumulator, all starting empty.
# NOTE(review): none of them is updated in the cells visible here.
fully_labeled = partially_labeled = fully_unlabeled = 0
comm_lengths = []

In [None]:
from tqdm import tqdm

In [None]:
# Comment-level columns: every crawled comment appends one value to each list.
commentid, treeid, authors = [], [], []
created_utc, score, wordcount = [], [], []
full_text, parent, postid = [], [], []
# One independent list per label (six in total).
values = [[] for _ in range(6)]

# Tree-level columns: every top-level comment tree appends one value to each list.
treeid_t, depth_t, length_t = [], [], []

In [None]:
comment_idx = 0
tree_idx = 1000000

def _get_comment_idx():
    global comment_idx
    comment_idx += 1
    return comment_idx

def _get_tree_idx():
    global tree_idx
    tree_idx += 1
    return tree_idx

In [None]:
def _build_timestamp_labels(post_entry, label_names):
    """Aggregate per-row labels into one label per comment timestamp.

    Args:
        post_entry: one value of ``post_comments``; its ``'timestamp'`` list
            and each per-label list are index-aligned.
        label_names: label keys to aggregate.

    Returns:
        A list with one independent dict per label, mapping timestamp to
        -1 (no labeled row), 0 (only zero-valued rows) or 1 (at least one
        labeled row with a non-zero value).
    """
    timestamps = post_entry['timestamp']
    per_label = []
    for label in label_names:
        # A fresh dict per label. `[{}] * 6` would repeat ONE dict six times,
        # so every label would overwrite the others and only the last label's
        # values would survive.
        ts2value = {ts: -1 for ts in timestamps}
        for ts, value in zip(timestamps, post_entry[label]):
            if value == -1:
                continue
            # max() keeps the strongest evidence seen so far: 1 beats 0 beats
            # -1. Two zero-valued rows for one timestamp therefore stay 0
            # rather than being promoted to 1.
            observed = 0 if value == 0 else 1
            ts2value[ts] = max(ts2value[ts], observed)
        per_label.append(ts2value)
    return per_label


keys = [key for key in post_comments.keys()]
# total= lets tqdm show real progress; wrapping enumerate() hides the length.
for ind, submission_id in tqdm(enumerate(keys), total=len(keys)):
    # Skip ids that cannot be real submission ids: non-strings (e.g. NaN from
    # an empty CSV field) and ids containing 'E+'.
    # NOTE(review): 'E+' presumably marks ids mangled into scientific notation
    # by a spreadsheet export - confirm.
    if not isinstance(submission_id, str) or 'E+' in submission_id:
        continue

    labeled_timestamps = post_comments[submission_id]['timestamp']
    num_labeled_comments = len(labeled_timestamps)

    # timestamps2label[i][ts] is the aggregated value of labels[i] for the
    # comment created at timestamp ts.
    timestamps2label = _build_timestamp_labels(post_comments[submission_id], labels)

    submission = reddit.submission(id=submission_id)
    # limit=0 drops the "load more comments" placeholders without fetching
    # them, so only comments already present in the response are traversed.
    submission.comments.replace_more(limit=0)

    def parse_comment(comment, tree_id, parent_id, depth = 1, length = 1):
        """Record `comment` and, recursively, all of its replies.

        Appends one value per comment to the comment-level lists and returns
        ``(max_depth, length)``: the deepest level reached below (and
        including) this comment and the number of comments in its subtree.
        """
        newcomment_idx = _get_comment_idx()

        ts = comment.created_utc

        # Collect the comment-level columns.
        commentid.append(newcomment_idx)
        treeid.append(tree_id)
        # NOTE(review): this stores whatever PRAW exposes as `author` (an
        # object, not necessarily a plain name string) - confirm downstream use.
        authors.append(comment.author)
        created_utc.append(comment.created_utc)
        score.append(comment.score)
        wordcount.append(len(comment.body.split()))
        full_text.append(comment.body)
        parent.append(parent_id)
        postid.append(submission_id)
        for i, label in enumerate(labels):
            # Comments matched by creation timestamp get their aggregated
            # label; everything else is marked unlabeled (-1).
            if ts in timestamps2label[i]:
                values[i].append(timestamps2label[i][ts])
            else:
                values[i].append(-1)

        max_depth = depth

        for reply in comment.replies:
            d, l = parse_comment(reply, tree_id, newcomment_idx, depth + 1)
            max_depth = max(max_depth, d)
            length += l

        return max_depth, length

    for comment in submission.comments:
        if comment.score <= -100:
            # don't process highly negative voted comments (the whole tree
            # below such a top-level comment is skipped with it)
            continue

        treeidx = _get_tree_idx()

        # Top-level comments have no parent comment, encoded as -1.
        depth, length = parse_comment(comment, treeidx, -1)

        treeid_t.append(treeidx)
        depth_t.append(depth)
        length_t.append(length)