# Convert snippets to conversations

## first pass:

 - [x] get posts + comments
 - [x] append toplevel posts and comments
 - [x] groupby root_id
 - [x] sort by root_id, parent_id, time?
 - [x] make a single row per post + comments with display text: f"@{post_author}:{post_text} \n @{reply_author}: {reply_text} etc.", cut off once the cumulative word count reaches the limit (currently 150 words).
 - [x] Save on local_artifacts
 - [x] show in prodigy
 
## second pass?

 Find a better way to annotate specific parts of the text? spancat? create custom 

In [None]:
# IPython magics: load the autoreload extension and switch it to mode 2.
%load_ext autoreload
%autoreload 2

In [None]:
import datetime

import pandas as pd
import tentaclio

from phoenix.common import artifacts, run_params, utils

In [None]:
# Notebook-wide setup using the helpers from phoenix.common.utils.
# NOTE(review): what exactly these configure (display options, log handlers)
# is defined in phoenix.common.utils and is not visible from this notebook.
utils.setup_notebook_output()
utils.setup_notebook_logging()

In [None]:
# Input and output artifact locations, all below the local artifacts URL.
posts_df_path = f"{artifacts.urls.get_local()}/prodigy/reddit_posts_2019_5.csv"
comments_df_path = f"{artifacts.urls.get_local()}/prodigy/reddit_comments_2019_5.csv"
output_path = f"{artifacts.urls.get_local()}/prodigy/reddit_conversations.csv"


def _read_csv(url):
    """Open ``url`` through tentaclio in text-read mode and parse it as CSV."""
    with tentaclio.open(url, "r") as handle:
        return pd.read_csv(handle)


posts_df = _read_csv(posts_df_path)
comments_df = _read_csv(comments_df_path)

In [None]:
# Inspect the posts frame as loaded from CSV.
posts_df

In [None]:
# A post is the root of its own conversation, so its root_id mirrors its id.
posts_df = posts_df.assign(root_id=posts_df["id"])

In [None]:
# Inspect the posts frame again, now that it carries a root_id column.
posts_df

In [None]:
# Inspect the comments frame as loaded from CSV.
comments_df

In [None]:
# Parse created_utc into pandas datetimes on both frames so that replies can
# later be ordered chronologically.
# NOTE(review): if the CSVs store created_utc as epoch seconds rather than
# date strings, pd.to_datetime would need unit="s" — confirm against the data.
for frame in (posts_df, comments_df):
    frame["created_utc"] = pd.to_datetime(frame["created_utc"])

In [None]:
# Stack posts on top of comments. reset_index() gives every row a unique
# label and keeps the previous labels in an "index" column.
stacked_df = pd.concat([posts_df, comments_df])
all_text_df = stacked_df.reset_index()

In [None]:
# Inspect the combined posts + comments frame.
all_text_df

In [None]:
# Drop every post/comment written by one of the moderation bots.
bot_authors = ["AutoModerator", "PoliticsModeratorBot"]
written_by_bot = all_text_df["author"].isin(bot_authors)
all_text_df = all_text_df[~written_by_bot]

In [None]:
# Rows without a parent_id / root_id fall back to their own id, after which
# both columns are cast to int.
for id_column in ("parent_id", "root_id"):
    filled_ids = all_text_df[id_column].fillna(all_text_df["id"])
    all_text_df[id_column] = filled_ids.astype(int)

In [None]:
# Keep only rows whose root post is itself present in the data.
root_is_present = all_text_df["root_id"].isin(all_text_df["id"])
all_text_df = all_text_df.loc[root_is_present]

In [None]:
# A row is a root post when it is its own root; the flag is used as a sort key
# further down.
all_text_df["is_root"] = all_text_df["id"].eq(all_text_df["root_id"])

In [None]:
# Posts and replies share the same "<author>: <text>" body; only the leading
# wording differs.
author_and_text = all_text_df["author"] + ": \n\t" + all_text_df["text"] + " \n"
reply_wording = "Reply by @" + author_and_text
post_wording = "Post by @" + author_and_text

# Non-root rows keep the reply wording, root rows take the post wording.
all_text_df["display_text"] = reply_wording.where(~all_text_df["is_root"], post_wording)

In [None]:
# Word count per row = number of \w+ matches in the display text.
word_matches = all_text_df["display_text"].str.findall(r"(\w+)")
all_text_df["word_count"] = word_matches.str.len()

In [None]:
# Peek at the display text of the first row (by position).
all_text_df["display_text"].iloc[0]

In [None]:
# One group per conversation, i.e. per root post.
grouped_df = all_text_df.groupby(by="root_id")

In [None]:
# Number of texts (root post + replies) per conversation, as a flat frame with
# the columns root_id and num_texts.
texts_per_root = grouped_df.size()
conversation_sizes_df = texts_per_root.reset_index(name="num_texts")

In [None]:
# Inspect the number of texts per conversation (one row per root_id).
conversation_sizes_df

In [None]:
# Plot how many conversations exist for each size, ignoring conversations with
# 200 or more texts.
is_below_200 = conversation_sizes_df["num_texts"] < 200
below_200_df = conversation_sizes_df[is_below_200]
below_200_df.groupby("num_texts").size().plot()

In [None]:
# Same distribution as a table: number of conversations per conversation size,
# ignoring conversations with 200 or more texts.
is_below_200 = conversation_sizes_df["num_texts"] < 200
below_200_df = conversation_sizes_df[is_below_200]
below_200_df.groupby("num_texts").size()

In [None]:
# Attach num_texts to every row and keep only conversations that contain more
# than one text (i.e. a post with at least one reply).
# The join key is spelled out so the merge cannot silently start joining on
# any other column the two frames happen to share; the inner join drops the
# rows whose root_id was filtered out of the sizes frame.
multi_text_sizes_df = conversation_sizes_df[conversation_sizes_df["num_texts"] > 1]
conversations_df = pd.merge(all_text_df, multi_text_sizes_df, on="root_id", how="inner")

In [None]:
## Get the cumulative sum of words to cut off extraneous words for display

In [None]:
## First-pass approximation of thread order: within each conversation the root
## post comes first, followed by the replies in order of creation time.
## This does not follow the reply tree (parent_id); true thread order would
## need some kind of graph sort, which sort_values alone cannot express.
sort_columns = ["root_id", "is_root", "created_utc"]
sort_ascending = [True, False, True]  # is_root descending puts the root post first
conversations_df = conversations_df.sort_values(by=sort_columns, ascending=sort_ascending)

In [None]:
# Running total of words within each conversation, following the current row
# order.
words_by_root = conversations_df.groupby("root_id")["word_count"]
conversations_df["cum_word_count"] = words_by_root.cumsum()

In [None]:
## Keep only the rows whose running word total is still below 150.
## NOTE(review): a conversation whose root post alone reaches 150 words loses
## all of its rows here — confirm that this is intended.
within_word_limit = conversations_df["cum_word_count"] < 150
conversations_df = conversations_df.loc[within_word_limit]

In [None]:
# Collapse each conversation into a single row by joining its display texts
# with newlines; the result is indexed by root_id.
display_text_by_root = conversations_df.groupby("root_id")["display_text"]
conversations_display_df = display_text_by_root.agg("\n".join).to_frame()

In [None]:
# Rename the joined display text column to "text".
column_names = {"display_text": "text"}
conversations_display_df = conversations_display_df.rename(columns=column_names)

In [None]:
# Write the conversations (root_id index + text column) to the output
# artifact as CSV.
with tentaclio.open(output_path, "w") as output_handle:
    conversations_display_df.to_csv(path_or_buf=output_handle)

In [None]:
# Inspect the aggregated conversations (one row per root_id, joined text in "text").
conversations_display_df

In [None]:
# Word count of each joined conversation text (number of \w+ matches).
# This column is added after the CSV has been written, so it is not part of
# the saved artifact.
conversation_word_matches = conversations_display_df["text"].str.findall(r"(\w+)")
conversations_display_df["word_count"] = conversation_word_matches.str.len()

In [None]:
# Inspect the conversations again, now with the word_count column.
conversations_display_df