In [1]:
import json
import sys
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns

In [10]:
def filter_post(filename):
    '''
    Extracts post data from json file, keeping desired columns
    '''
    raw_json = pd.read_json(path_or_buf=filename+".jsonl", lines=True, encoding="utf-8-sig")
    selected_data = ['selftext', 'created_utc', 'ups', 'subreddit', 'link_flair_text','title']
        #selftext       - main body text
        #created_utc    - post creation time
        #ups            - number of upvotes
        #subreddit      - subreddit
        #link_flair_text - flair info
        #tile           - post title
    clean_json = raw_json[selected_data]
    clean_json = clean_json.rename(columns = {'link_flair:text': 'flair'})
    return clean_json

In [2]:
def filter_comment(filename):
    '''
    Extracts comment data from json file, keeping desired columns
    '''
    raw_json = pd.read_json(path_or_buf=filename+".jsonl", lines=True, encoding="utf-8-sig")
    selected_data = ['body', 'created_utc', 'ups', 'subreddit']
        #body           - main body text
        #created_utc    - post creation time
        #ups            - number of upvotes
        #subreddit      - subreddit

    clean_json = raw_json[selected_data]
    clean_json = clean_json.rename(columns = {'body': 'text'})
    return clean_json

In [6]:
df = filter_comment("./data/Republican_comments")

In [8]:
df.head(50)

Unnamed: 0,text,created_utc,ups,subreddit
0,"stop changing the subject, i‘m not defending t...",1590980062,-2,Republican
1,Sup. Im from Philly. The protesters don't give...,1590983173,8,Republican
2,https://en.wikipedia.org/wiki/Party_divisions_...,1591015660,6,Republican
3,"Your definitely not wrong, but I wouldn’t say ...",1591016577,8,Republican
4,I mean yes to this post but our President also...,1591018089,-2,Republican
5,I am over seventy and sometimes have senior mo...,1591038865,1,Republican
6,"Look at our president, inciting people to viol...",1591061748,-2,Republican
7,None of those messages are perpetuating violen...,1591066993,0,Republican
8,This is the state of anarchy in the epicenter ...,1591097778,4,Republican
9,You mean like everything has been Obamas fault...,1591112281,5,Republican


Preprocessing:

For comments - 
- Get rid of short comments
- Get rid of empty comments
- Get rid of removed comments
- Get rid of comments written by bots

For posts -
- Get rid of posts with just a link


In [15]:
#Extracting all data source files
dem_post = filter_post('./data/democrats_posts')
dem_comment = filter_comment('./data/democrats_comments')
rep_post = filter_post('./data/Republican_posts')
rep_comment = filter_comment('./data/Republican_comments')
# poldis_post = filter_post('../DATA/r_PoliticalDiscussion_posts')
# poldis_comment = filter_comment('../DATA/r_PoliticalDiscussion_comments')

#Combine all posts and all comments together
# all_post = pd.concat([dem_post, rep_post, poldis_post], ignore_index = True)
# all_comment = pd.concat([dem_comment, rep_comment, poldis_comment], ignore_index = True)

all_post = pd.concat([dem_post, rep_post], ignore_index = True)
all_comment = pd.concat([dem_comment, rep_comment], ignore_index = True)

#Remove any missing comments, clear the selftext of posts if removed
all_comment = all_comment[all_comment['text'] != '[removed]']
all_post.loc[all_post['selftext'] == '[removed]', 'selftext'] = ''

#Remove any comments that are bots
all_comment = all_comment[~all_comment['text'].str.contains('I am a bot', case=False, na=False)]

#Get rid of any comments that are less than 5 words
all_comment = all_comment[all_comment['text'].str.split().str.len() >= 5]

#Combine "Title" and "selftext" fields of post to form "text" which is contains all textual content
all_post['text'] = all_post['title'] + ' ' + all_post['selftext']

#drop any remaining missing
all_comment = all_comment.dropna(subset=['text'])
all_post = all_post.dropna(subset=['text'])

#Writing to cleaned CSV files
# all_post.to_csv("../DATA/cleaned_posts.csv", index = False, encoding="utf-8-sig")
# all_comment.to_csv("../DATA/cleaned_comments.csv", index = False, encoding="utf-8-sig")

In [16]:
all_post.head(150)

Unnamed: 0,selftext,created_utc,ups,subreddit,link_flair_text,title,text
0,,1590994340,1,democrats,,"106,000 dead 40 mill unemployed Cities on fire...","106,000 dead 40 mill unemployed Cities on fire..."
1,,1591010811,1,democrats,See Note,Why aren't Democrats calling for the president...,Why aren't Democrats calling for the president...
2,What am I missing?,1591011651,32,democrats,,Why aren't Democrats calling for the president...,Why aren't Democrats calling for the president...
3,,1591012889,49,democrats,,"'Mr President, don't go hide': China goads US ...","'Mr President, don't go hide': China goads US ..."
4,,1591025916,6,democrats,article,"As Protests and Violence Spill Over, Trump Shr...","As Protests and Violence Spill Over, Trump Shr..."
...,...,...,...,...,...,...,...
145,,1592413494,9,Republican,,The first vice president of color was a Republ...,The first vice president of color was a Republ...
146,,1592497958,1,Republican,,All the President’s Lies About the Coronavirus...,All the President’s Lies About the Coronavirus...
147,,1592519227,1,Republican,,"Biden Got Failing Grades, He's Disaster And Am...","Biden Got Failing Grades, He's Disaster And Am..."
148,,1592600267,7,Republican,,Joe Biden could still save Trump by tapping Ka...,Joe Biden could still save Trump by tapping Ka...


In [14]:
all_comment.head(150)

Unnamed: 0,text,created_utc,ups,subreddit
0,"Very true, but the problem is that even at tha...",1590975768,1,democrats
1,You’re full of crap.\n\nYou present no facts w...,1590984758,2,democrats
2,"Politically speaking, there is absolutely no w...",1590995791,29,democrats
3,This is what happens when the president of the...,1591008451,1,democrats
4,Silly comment. No matter how bad a president h...,1591013669,1,democrats
...,...,...,...,...
145,You know what didn't happen in Lafayette park?...,1591313587,1,Republican
146,these people make NO sense. Especially when Tr...,1591319806,4,Republican
147,I think the 2020 outlook is muddy at best. I d...,1591370702,2,Republican
148,> Let's not forget that it is now a historica...,1591372855,2,Republican
