# Interactions

In [2]:
import sys
import glob
import numpy as np
import pandas as pd
from collections import defaultdict

sys.path.append('/home/kalkiek/projects/reddit-political-affiliation/')

from src.data.date_helper import read_submissions
from src.features.interactions.political_comment import PoliticalComment

### Quick Clean Up

I forgot to  remove 't1' prefix from some of the comment IDs

We also need to remove any bot accounts

In [3]:
def grab_bot_accounts():
    fname = '/shared/0/projects/prosocial/known-bots.tsv'
    bots = []

    with open(fname, 'rt') as f:
        lines = f.readlines()

        for line in lines:
            bots.append(line.split('\t')[1])

    print("Known bots: %d" % len(bots))
    return bots

bots = grab_bot_accounts()
bots.extend('[deleted]')

Known bots: 393


###  Can't run this until the script is done running! We'll run it in a few days

In [6]:
# def remove_id_prefix_and_bots(in_file):
#     comments = 0
#     with open(in_file, 'r') as f:
#         for line in f:
#             comment_id, parent_id, username, subreddit, created, politics, text = line.split('\t')
#             if username not in bots:
#                 if comment_id[:2] == 't1':
#                     comment_id = comment_id[3:]
#                 if parent_id[:2] == 't1':
#                     parent_id = parent_id[3:]

#                 comments += 1
#                 political_comment = PoliticalComment(comment_id, parent_id, username, subreddit, created, politics, text)
#                 yield political_comment

#     print("Total number of political comments: {}".format(comments))

# in_file = '/shared/0/projects/reddit-political-affiliation/data/interactions/all_comments.tsv'
# out_file = '/shared/0/projects/reddit-political-affiliation/data/interactions/all_comments_filtered.tsv'

# with open(out_file, 'w') as out_f:
#     for comment in filtered_comments(in_file):
#         out_f.write(comment.to_tsv_row())

Total number of political comments: 113921968


## Reading In The Data

Sample of how to read the data in

In [32]:
in_file = '/shared/0/projects/reddit-political-affiliation/data/interactions/all_comments_filtered.tsv'

def read_in_comments(in_file, count=-1):
    comments = []
    with open(in_file, 'r') as f:
        for line in f:
            line = line.strip()
            try:
                comment_id, parent_id, username, subreddit, created, politics, text = line.split('\t')
                political_comment = PoliticalComment(comment_id, parent_id, username, subreddit, created, politics, text)
                comments.append(political_comment.to_dict())
                if count > 0 and len(comments) >= count:
                    return comments
            except Exception:
                pass
            
    print("Total number of political comments: {}".format(len(comments)))
    return comments


        
comments = read_in_comments(in_file, count=500000)
df_comments = pd.DataFrame(comments)
print(len(df_comments))
df_comments.head(10)

500000


Unnamed: 0,comment_id,parent_id,username,subreddit,created,politics,text
0,eypc9zv,eyp9duh,CueDramaticMusic,tumblr,1567296002,Democrat,I don’t know whether to read the last sentence...
1,eypca5l,eypby8b,ninethirtyone,news,1567296005,Democrat,You responded to my comment to make a point yo...
2,eypca5t,t3_cy2nlj,lostapwbm,SquaredCircle,1567296006,Democrat,After the fantastic energy of the crowds for T...
3,eypca8l,eypbz6d,OlliesFreeOxen,news,1567296007,Unknown,Run away when you lose. Typical. You have your...
4,eypcaac,eyp5fjl,Based_Putin,news,1567296008,Unknown,What does this even mean?
5,eypcabv,eyonlbn,needler14,ABoringDystopia,1567296009,Democrat,Right? I'll eat McDonald's here and there but ...
6,eypcac1,t3_cy09os,LimpWibbler_,funny,1567296009,Democrat,I absolutely love these kids. Funny or not.
7,eypcacv,t3_cy2yrm,DonnyGoat,NYYankees,1567296010,Democrat,Whose match are they at?
8,eypcae0,eyp66ij,rhunter99,news,1567296010,Unknown,Neil is that you?
9,eypcaeh,eypavwe,Firtydilthythrowaway,Music,1567296011,Unknown,This place is lost; it's merely a public opini...


# Bucket about dyadic affiliation composition (e.g,. Liberal+Liberal)

In [33]:
rep_df = df_comments[df_comments['politics'] == 'Republican']
dem_df = df_comments[df_comments['politics'] == 'Democrat']
unknown_df = df_comments[df_comments['politics'] == 'Unknown']

rep_users = set(rep_df['username'].tolist())
dem_users = set(dem_df['username'].tolist())
unknown_users = set(unknown_df['username'].tolist())

print("Number of rep users: {}".format(len(rep_users)))
print("Number of dem users: {}".format(len(dem_users)))
print("Number of unknown users: {}".format(len(unknown_users)))
print()
print("Number of rep comments: {}".format(len(rep_df)))
print("Number of dem comments: {}".format(len(dem_df)))
print("Number of unknown comments: {}".format(len(unknown_df)))

Number of rep users: 6703
Number of dem users: 14609
Number of unknown users: 76177

Number of rep comments: 134341
Number of dem comments: 263726
Number of unknown comments: 101933


In [34]:
def get_interactions(from_party, to_party):
    from_comment_ids = set(df_comments[df_comments['politics'] == from_party]['comment_id'].tolist())
    to_comment_ids = set(df_comments[df_comments['politics'] == to_party]['comment_id'].tolist())
    interactions = df_comments[(df_comments['comment_id'].isin(from_comment_ids) & df_comments['parent_id'].isin(to_comment_ids))]
    return interactions

dem_to_dem = get_interactions('Democrat', 'Democrat')
rep_to_rep = get_interactions('Republican', 'Republican')
dem_to_rep = get_interactions('Democrat', 'Republican')
rep_to_dem = get_interactions('Republican', 'Democrat')
dem_to_unknown = get_interactions('Democrat', 'Unknown')
rep_to_unknown = get_interactions('Republican', 'Democrat')

print("Dem to dem interactions: {}".format(len(dem_to_dem)))
print("Rep to rep interactions: {}".format(len(rep_to_rep)))
print("Dem to rep interactions: {}".format(len(dem_to_rep)))
print("Rep to dem interactions: {}".format(len(rep_to_dem)))
print("Dem to unknown interactions: {}".format(len(dem_to_unknown)))
print("Rep to dem interactions: {}".format(len(rep_to_unknown)))

Dem to dem interactions: 6019
Rep to rep interactions: 1421
Dem to rep interactions: 2310
Rep to dem interactions: 2288
Dem to unknown interactions: 13974
Rep to dem interactions: 2288


# Comments in same subreddit made at a similar time point