# Convert snippets to conversations

## first pass:

 - [x] get posts + comments
 - [x] append toplevel posts and comments
 - [x] groupby root_id
 - [x] sort by root_id, parent_id, time?
 - [x] make a single row per post + comments with display text: f"@{post_author}:{post_text} \n @{reply_author}: {reply_text} etc." up to x00 words.
 - [x] Save on local_artifacts
 - [x] show in prodigy
 
## second pass?

 Find a better way to annotate specific parts of the text? spancat? Create a custom [TODO: finish this note]

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import datetime

import pandas as pd
import tentaclio

from phoenix.common import artifacts, run_params, utils

In [None]:
utils.setup_notebook_output()
utils.setup_notebook_logging()

In [None]:
# Files
# Every input and output lives under the "prodigy" folder of the local artifacts URL.
# Reddit inputs (posts and comments, CSV).
posts_df_path = f"{artifacts.urls.get_local()}/prodigy/reddit_posts_2019_5.csv"
comments_df_path = f"{artifacts.urls.get_local()}/prodigy/reddit_comments_2019_5.csv"
# Twitter inputs (parquet). Only twitter_dmaps_df_path is read further down in this notebook.
twitter_df_path = f"{artifacts.urls.get_local()}/prodigy/2022-6.parquet"
twitter_dmaps_df_path = f"{artifacts.urls.get_local()}/prodigy/tweets_final_dmaps.parquet"
# Outputs. Only twitter_output_path is written further down in this notebook.
output_path = f"{artifacts.urls.get_local()}/prodigy/reddit_conversations.csv"
twitter_output_path = f"{artifacts.urls.get_local()}/prodigy/twitter_2022-6_conversations.csv"

# Read the Reddit posts CSV into a dataframe.
with tentaclio.open(posts_df_path, "r") as fb:
    posts_df = pd.read_csv(fb)
    
# Read the Reddit comments CSV into a dataframe.
with tentaclio.open(comments_df_path, "r") as fb:
    comments_df = pd.read_csv(fb)

In [None]:
# Load the tweets parquet (tweets_final_dmaps.parquet) through the project artifacts helper.
df = artifacts.dataframes.get(twitter_dmaps_df_path).dataframe

In [None]:
# Inspect the rows posted by three specific accounts.
df[df["user_screen_name"].isin(["khaberni", "AddustourNews", "alwakeelnews"])]

In [None]:
# Number of rows per account, largest first.
df.groupby("user_screen_name").size().sort_values(ascending=False)

In [None]:
# This is horrible: a column that contains NAs comes out of the parquet file as float.
# To give both id columns the same textual form, id_str is cast to float first and only
# then to string; in_reply_to_status_id_str is cast straight to string.
# NOTE(review): float64 cannot represent every 64-bit integer exactly - confirm that distinct
# ids cannot collapse to the same value after this cast.
df["id_str"] = df["id_str"].astype(float).astype("string")
df["in_reply_to_status_id_str"] = df["in_reply_to_status_id_str"].astype("string")

In [None]:
df

In [None]:
df_replied_to_statuses = df[~df["in_reply_to_status_id"].isna()].groupby("in_reply_to_status_id").count().sort_values("object_id", ascending=False).reset_index()

In [None]:
df_replied_to_statuses

In [None]:
df_replied_to_statuses = df_replied_to_statuses.rename({"object_id": "num_replies_in_dataset"}, axis=1)

In [None]:
df_tweets_with_replies = df_replied_to_statuses[["in_reply_to_status_id","num_replies_in_dataset"]].merge(df, left_on="in_reply_to_status_id", right_on="id")

In [None]:
df_tweets_with_replies[~df_tweets_with_replies["in_reply_to_status_id_y"].isna()]

In [None]:
import numpy as np

In [None]:
def add_root_ids(df: pd.DataFrame) -> pd.DataFrame:
    """Traverse parent and child posts and add the root and parent ids to all posts.

    Args:
        df: one row per post, with an `id_str` column and a reply-to column. The reply-to
            column is `in_reply_to_status_id` when present, otherwise
            `in_reply_to_status_id_str`; a missing value marks a post that starts a thread.

    Returns:
        A new dataframe (the input is not modified) without duplicated `id_str` rows and
        with three extra columns: `parent_id` (the reply-to id, or the post's own id for
        thread starters), `is_root` and `root_id` (filled in by
        `traverse_parents_to_get_root_id`).

    Raises:
        KeyError: if neither reply-to column is present.
    """
    # Keep using the column this function has always read, but accept the "_str" variant too:
    # the hand-built test frame in this notebook only carries `in_reply_to_status_id_str`.
    reply_to_candidates = ("in_reply_to_status_id", "in_reply_to_status_id_str")
    reply_to_col = next((col for col in reply_to_candidates if col in df.columns), None)
    if reply_to_col is None:
        raise KeyError(f"add_root_ids needs one of the columns {reply_to_candidates}")

    tweets_df_with_root_ids = df[~df["id_str"].duplicated()].copy()

    # Object dtype from the start: the column receives string ids below, and writing strings
    # into a float NaN column is an upcasting assignment that pandas has deprecated.
    tweets_df_with_root_ids["root_id"] = pd.Series(
        np.nan, index=tweets_df_with_root_ids.index, dtype=object
    )

    # A post without a parent is its own parent; that is how roots are recognised.
    id_as_string = tweets_df_with_root_ids["id_str"].astype("string")
    tweets_df_with_root_ids["parent_id"] = (
        tweets_df_with_root_ids[reply_to_col].astype("string").fillna(id_as_string)
    )
    tweets_df_with_root_ids["is_root"] = (
        tweets_df_with_root_ids["id_str"] == tweets_df_with_root_ids["parent_id"]
    )

    # Roots already know their root id: themselves.
    is_root = tweets_df_with_root_ids["is_root"]
    tweets_df_with_root_ids.loc[is_root, "root_id"] = tweets_df_with_root_ids.loc[is_root, "id_str"]

    return traverse_parents_to_get_root_id(tweets_df_with_root_ids)
        

In [None]:
def traverse_parents_to_get_root_id(df: pd.DataFrame, recursion_iteration: int = 0) -> pd.DataFrame:
    """
    Traverse parents of posts to get the eventual root id.

    This is a hacky way to do graph traversal in pandas, and probably should use networkx or
    something instead. Every pass self-joins the frame to move each post one hop further up
    its reply chain, and passes repeat until no `root_id` is missing.

    Args:
        df: posts with `id_str`, `parent_id` and `root_id` columns. A root has
            `parent_id == id_str`; a post that still needs resolving has a missing `root_id`.
            When `recursion_iteration` is not 0 the frame must also carry the
            `recursive_parent_id` column (the ancestor reached so far).
        recursion_iteration: 0 starts the walk from `parent_id`; any other value resumes it
            from `recursive_parent_id`.

    Returns:
        A new dataframe (the input is not modified; the index is the one produced by
        `DataFrame.merge`) in which `root_id` is the `id_str` of the post's root, or the
        string "orphaned" when some ancestor is not present in `df`. The helper column
        `recursive_parent_id` is removed.

    Raises:
        ValueError: if posts are still unresolved after more hops than there are rows, which
            can only happen when the reply chain contains a cycle.
    """
    resolved_df = df.copy()
    # root_id receives strings below; make sure an all-NaN numeric column can hold them.
    if pd.api.types.is_numeric_dtype(resolved_df["root_id"]):
        resolved_df["root_id"] = resolved_df["root_id"].astype(object)

    # A reply chain cannot be longer than the number of rows, so this bounds the loop.
    max_hops = len(resolved_df)
    hops_taken = 0

    while True:
        join_key = "parent_id" if recursion_iteration == 0 else "recursive_parent_id"

        # Lookup table: for each post, its parent and its (possibly still missing) root.
        ids_df = resolved_df[["parent_id", "root_id", "id_str"]].rename(
            columns={
                "parent_id": "parent_parent_id",
                "root_id": "parent_root_id",
                "id_str": "parent_id_str",
            }
        )
        joined_df = resolved_df.merge(
            ids_df, left_on=join_key, right_on="parent_id_str", how="left", indicator=True
        )

        # The ancestor we tried to look up is not in the data: the thread has no known root.
        ancestor_missing = joined_df["_merge"] == "left_only"
        joined_df.loc[ancestor_missing, "root_id"] = "orphaned"
        joined_df.loc[ancestor_missing, "parent_parent_id"] = "orphaned"

        # Move one hop up. An ancestor that is its own parent is the root of the thread.
        joined_df["recursive_parent_id"] = joined_df["parent_parent_id"]
        root_found_mask = joined_df["recursive_parent_id"] == joined_df["parent_id_str"]
        joined_df.loc[root_found_mask, "root_id"] = joined_df.loc[root_found_mask, "parent_id_str"]

        resolved_df = joined_df.drop(
            columns=["parent_parent_id", "parent_root_id", "parent_id_str", "_merge"]
        )

        if not resolved_df["root_id"].isna().any():
            break

        hops_taken += 1
        if hops_taken > max_hops:
            raise ValueError(
                "Could not resolve a root for every post: the reply chain contains a cycle."
            )
        recursion_iteration += 1

    return resolved_df.drop(columns="recursive_parent_id", errors="ignore")

In [None]:
df = add_root_ids(df)

In [None]:
df

In [None]:
df[~df["is_root"]].groupby("root_id").count()

In [None]:
df[df["root_id"].isna()]

In [None]:
# Toy reply threads for trying out add_root_ids, one row per post.
# "a" and "b" have no parent; "c1" replies to "c", which is not part of the data.
test_df = pd.DataFrame(
    {
        "id_str": ["a", "a1", "a2", "aa1", "aa2", "b", "b1", "bb1", "bbb1", "c1"],
        "in_reply_to_status_id_str": [np.nan, "a", "a", "a1", "a2", np.nan, "b", "b1", "bb1", "c"],
    }
)

In [None]:
test_df

In [None]:
# Smoke-test add_root_ids on the hand-built toy threads; the result is inspected in the next cell.
output_test_df = add_root_ids(test_df)

In [None]:
output_test_df

In [None]:
posts_df["created_utc"] = pd.to_datetime(posts_df["created_utc"])
comments_df["created_utc"] = pd.to_datetime(comments_df["created_utc"])

In [None]:
all_text_df = pd.concat([posts_df, comments_df]).reset_index()

In [None]:
all_text_df

In [None]:
# remove mod posts/comments
all_text_df = all_text_df[all_text_df["author"] !="AutoModerator"]
all_text_df = all_text_df[all_text_df["author"] !="PoliticsModeratorBot"]

In [None]:
# Fill na for parent and root ids for posts
all_text_df["parent_id"] = all_text_df["parent_id"].fillna(all_text_df["id"]).astype(int) 
all_text_df["root_id"] = all_text_df["root_id"].fillna(all_text_df["id"]).astype(int) 

In [None]:
# only show comments that have a root post
all_text_df = all_text_df[all_text_df["root_id"].isin(all_text_df["id"])]

In [None]:
# remove orphaned posts
df = df[df["root_id"]!= "orphaned"]

In [None]:
df["display_text"] = "Reply by @" + df["user_name"] + ": \n\t" + df["text"] + " \n"

df.loc[df["is_root"], "display_text"] = "Post by @" + df["user_name"] + ": \n\t" + df["text"] + " \n"

In [None]:
df["word_count"] = df["display_text"].str.findall(r"(\w+)").str.len()

In [None]:
df.iloc[0]["display_text"]

In [None]:
grouped_df = df.groupby("root_id")

In [None]:
conversation_sizes_df = grouped_df.size().to_frame("num_texts").reset_index()

In [None]:
conversation_sizes_df

In [None]:
conversation_sizes_df[conversation_sizes_df["num_texts"] < 200].groupby("num_texts").size().plot()

In [None]:
conversation_sizes_df[conversation_sizes_df["num_texts"] < 200].groupby("num_texts").size()

In [None]:
conversations_df = pd.merge(df, conversation_sizes_df[conversation_sizes_df["num_texts"]>1])

In [None]:
## Get cumulative sum of words to cut off extreneous words for display

In [None]:
## This is a first pass attempt to get threads: 
## the sort would not order based on threads, but on create time. there should be some way to 
## achieve this using some kind of graph sort, but it can't be done just with sort_values in pandas
conversations_df = conversations_df.sort_values(["root_id", "is_root", "created_at"], ascending=[True, False, True])

In [None]:
conversations_df["cum_word_count"] = conversations_df[["root_id", "word_count"]].groupby("root_id").cumsum()

In [None]:
## Cut off less than 150 words 
conversations_df = conversations_df[conversations_df["cum_word_count"] < 150]

In [None]:
conversations_display_df = conversations_df.groupby("root_id").agg({"display_text": "\n".join})

In [None]:
conversations_display_df = conversations_display_df.rename(columns={"display_text": "text"})

In [None]:
with tentaclio.open(twitter_output_path, "w") as fb:
    conversations_display_df.to_csv(fb)

In [None]:
conversations_display_df

In [None]:
conversations_display_df["word_count"] = conversations_display_df["text"].str.findall(r"(\w+)").str.len()

In [None]:
conversations_display_df