In [1]:
import json
import sys
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
def filter_post(filename):
    '''
    Extracts post data from json file, keeping desired columns
    '''
    raw_json = pd.read_json(path_or_buf=filename+".jsonl", lines=True, encoding="utf-8-sig")
    selected_data = ['selftext', 'created_utc', 'ups', 'subreddit', 'link_flair_text','title']
        #selftext       - main body text
        #created_utc    - post creation time
        #ups            - number of upvotes
        #subreddit      - subreddit
        #link_flair_text - flair info
        #tile           - post title
    clean_json = raw_json[selected_data]
    clean_json = clean_json.rename(columns = {'link_flair:text': 'flair'})
    return clean_json

In [3]:
def filter_comment(filename):
    '''
    Extracts comment data from json file, keeping desired columns
    '''
    raw_json = pd.read_json(path_or_buf=filename+".jsonl", lines=True, encoding="utf-8-sig")
    selected_data = ['body', 'created_utc', 'ups', 'subreddit']
        #body           - main body text
        #created_utc    - post creation time
        #ups            - number of upvotes
        #subreddit      - subreddit

    clean_json = raw_json[selected_data]
    clean_json = clean_json.rename(columns = {'body': 'text'})
    return clean_json

In [4]:
df = filter_comment("./data/Republican_comments_2024")

In [5]:
df.head(5)

Unnamed: 0,text,created_utc,ups,subreddit
0,To all the DEMO-RATS that made this possible. ...,1717204164,1,Republican
1,> I think Comey completely screwed Hillary wit...,1717204886,-2,Republican
2,"Well, for one thing we know Biden spent decade...",1717207701,0,Republican
3,I don't think anyone who gets to the level of ...,1717212507,1,Republican
4,The only sham here is that here is that the Re...,1717217202,10,Republican


Preprocessing:

For comments - 
- Get rid of short comments
- Get rid of empty comments
- Get rid of removed comments
- Get rid of comments written by bots

For posts -
- Get rid of posts with just a link


In [22]:
#Extracting all data source files 2020 data
dem_post = filter_post('./data/democrats_posts')
dem_comment = filter_comment('./data/democrats_comments')
rep_post = filter_post('./data/Republican_posts')
rep_comment = filter_comment('./data/Republican_comments')
# poldis_post = filter_post('../DATA/r_PoliticalDiscussion_posts')
# poldis_comment = filter_comment('../DATA/r_PoliticalDiscussion_comments')

#Combine all posts and all comments together
# all_post = pd.concat([dem_post, rep_post, poldis_post], ignore_index = True)
# all_comment = pd.concat([dem_comment, rep_comment, poldis_comment], ignore_index = True)

all_post = pd.concat([dem_post, rep_post], ignore_index = True)
all_comment = pd.concat([dem_comment, rep_comment], ignore_index = True)

#Remove any missing comments, clear the selftext of posts if removed
all_comment = all_comment[all_comment['text'] != '[removed]']
all_post.loc[all_post['selftext'] == '[removed]', 'selftext'] = ''

#Remove any comments that are bots
all_comment = all_comment[~all_comment['text'].str.contains('I am a bot', case=False, na=False)]

#Get rid of any comments that are less than 5 words
all_comment = all_comment[all_comment['text'].str.split().str.len() >= 5]

#Combine "Title" and "selftext" fields of post to form "text" which is contains all textual content
all_post['text'] = all_post['title'] + ' ' + all_post['selftext']

#drop any remaining missing
all_comment = all_comment.dropna(subset=['text'])
all_post = all_post.dropna(subset=['text'])

#Writing to cleaned CSV files
all_post.to_csv("./data/cleaned_posts.csv", index = False, encoding="utf-8-sig")
all_comment.to_csv("./data/cleaned_comments.csv", index = False, encoding="utf-8-sig")

In [16]:
all_post.head(150)

Unnamed: 0,selftext,created_utc,ups,subreddit,link_flair_text,title,text
0,,1590994340,1,democrats,,"106,000 dead 40 mill unemployed Cities on fire...","106,000 dead 40 mill unemployed Cities on fire..."
1,,1591010811,1,democrats,See Note,Why aren't Democrats calling for the president...,Why aren't Democrats calling for the president...
2,What am I missing?,1591011651,32,democrats,,Why aren't Democrats calling for the president...,Why aren't Democrats calling for the president...
3,,1591012889,49,democrats,,"'Mr President, don't go hide': China goads US ...","'Mr President, don't go hide': China goads US ..."
4,,1591025916,6,democrats,article,"As Protests and Violence Spill Over, Trump Shr...","As Protests and Violence Spill Over, Trump Shr..."
...,...,...,...,...,...,...,...
145,,1592413494,9,Republican,,The first vice president of color was a Republ...,The first vice president of color was a Republ...
146,,1592497958,1,Republican,,All the President’s Lies About the Coronavirus...,All the President’s Lies About the Coronavirus...
147,,1592519227,1,Republican,,"Biden Got Failing Grades, He's Disaster And Am...","Biden Got Failing Grades, He's Disaster And Am..."
148,,1592600267,7,Republican,,Joe Biden could still save Trump by tapping Ka...,Joe Biden could still save Trump by tapping Ka...


In [14]:
all_comment.head(150)

Unnamed: 0,text,created_utc,ups,subreddit
0,"Very true, but the problem is that even at tha...",1590975768,1,democrats
1,You’re full of crap.\n\nYou present no facts w...,1590984758,2,democrats
2,"Politically speaking, there is absolutely no w...",1590995791,29,democrats
3,This is what happens when the president of the...,1591008451,1,democrats
4,Silly comment. No matter how bad a president h...,1591013669,1,democrats
...,...,...,...,...
145,You know what didn't happen in Lafayette park?...,1591313587,1,Republican
146,these people make NO sense. Especially when Tr...,1591319806,4,Republican
147,I think the 2020 outlook is muddy at best. I d...,1591370702,2,Republican
148,> Let's not forget that it is now a historica...,1591372855,2,Republican


Repeat for 2024 data

In [None]:
#Extracting all data source files 2020 data
dem_post = filter_post('./data/democrats_posts_2024')
dem_comment = filter_comment('./data/democrats_comments_2024')
rep_post = filter_post('./data/Republican_posts_2024')
rep_comment = filter_comment('./data/Republican_comments_2024')
# poldis_post = filter_post('../DATA/r_PoliticalDiscussion_posts')
# poldis_comment = filter_comment('../DATA/r_PoliticalDiscussion_comments')

#Combine all posts and all comments together
# all_post = pd.concat([dem_post, rep_post, poldis_post], ignore_index = True)
# all_comment = pd.concat([dem_comment, rep_comment, poldis_comment], ignore_index = True)

all_post = pd.concat([dem_post, rep_post], ignore_index = True)
all_comment = pd.concat([dem_comment, rep_comment], ignore_index = True)

#Remove any missing comments, clear the selftext of posts if removed
all_comment = all_comment[all_comment['text'] != '[removed]']
all_post.loc[all_post['selftext'] == '[removed]', 'selftext'] = ''

#Remove any comments that are bots
all_comment = all_comment[~all_comment['text'].str.contains('I am a bot', case=False, na=False)]

#Get rid of any comments that are less than 5 words
all_comment = all_comment[all_comment['text'].str.split().str.len() >= 5]

#Combine "Title" and "selftext" fields of post to form "text" which is contains all textual content
all_post['text'] = all_post['title'] + ' ' + all_post['selftext']

#drop any remaining missing
all_comment = all_comment.dropna(subset=['text'])
all_post = all_post.dropna(subset=['text'])

#Writing to cleaned CSV files
all_post.to_csv("./data/cleaned_posts_2024.csv", index = False, encoding="utf-8-sig")
all_comment.to_csv("./data/cleaned_comments_2024.csv", index = False, encoding="utf-8-sig")

### Using Large Data (not limited to 100)

#### 2020 Election Cycle

In [None]:
#Extracting all data source files 2020 data
dem_post = filter_post('./large-data/r_democrats_posts_2020')
dem_comment = filter_comment('./large-data/r_democrats_comments_2020')
rep_post = filter_post('./large-data/r_Republican_posts_2020')
rep_comment = filter_comment('./large-data/r_Republican_comments_2020')

In [9]:
rep_post.head()

Unnamed: 0,selftext,created_utc,ups,subreddit,link_flair_text,title
0,[removed],1590970225,1,Republican,,Why should trump be re-elected?
1,,1590970317,58,Republican,,Two lawyers hit with federal charges for throw...
2,[deleted],1590970535,1,Republican,,What happens when you apologize to rioters?
3,,1590970624,5,Republican,,What happens when you apologize to rioters?
4,,1590972138,1,Republican,,A new constitutional right idea I have that co...


In [10]:
# Limit jsonl files to only those wth the keyword president
dem_post = dem_post[dem_post['selftext'].str.contains("president", case=False, na=False)]
dem_comment = dem_comment[dem_comment['text'].str.contains("president", case=False, na=False)]

rep_post = rep_post[rep_post['selftext'].str.contains("president", case=False, na=False)]
rep_comment = rep_comment[rep_comment['text'].str.contains("president", case=False, na=False)]

In [11]:
#Combine all posts and all comments together

all_post = pd.concat([dem_post, rep_post], ignore_index = True)
all_comment = pd.concat([dem_comment, rep_comment], ignore_index = True)

#Remove any missing comments, clear the selftext of posts if removed
all_comment = all_comment[all_comment['text'] != '[removed]']
all_post.loc[all_post['selftext'] == '[removed]', 'selftext'] = ''

#Remove any comments that are bots
all_comment = all_comment[~all_comment['text'].str.contains('I am a bot', case=False, na=False)]

#Get rid of any comments that are less than 5 words
all_comment = all_comment[all_comment['text'].str.split().str.len() >= 5]

#Combine "Title" and "selftext" fields of post to form "text" which is contains all textual content
all_post['text'] = all_post['title'] + ' ' + all_post['selftext']

#drop any remaining missing
all_comment = all_comment.dropna(subset=['text'])
all_post = all_post.dropna(subset=['text'])

#Writing to cleaned CSV files
all_post.to_csv("./large-data/r_cleaned_posts_2020.csv", index = False, encoding="utf-8-sig")
all_comment.to_csv("./large-data/r_cleaned_comments_2020.csv", index = False, encoding="utf-8-sig")

In [13]:
all_comment.shape

(24177, 4)

In [14]:
all_post.shape

(1492, 7)

In [16]:
#Extracting all data source files 2020 data
dem_post = filter_post('./large-data/r_democrats_posts_2024_update')
dem_comment = filter_comment('./large-data/r_democrats_comments_2024_update')
rep_post = filter_post('./large-data/r_Republican_posts_2024_update')
rep_comment = filter_comment('./large-data/r_Republican_comments_2024_update')

# Limit jsonl files to only those wth the keyword president
dem_post = dem_post[dem_post['selftext'].str.contains("president", case=False, na=False)]
dem_comment = dem_comment[dem_comment['text'].str.contains("president", case=False, na=False)]

rep_post = rep_post[rep_post['selftext'].str.contains("president", case=False, na=False)]
rep_comment = rep_comment[rep_comment['text'].str.contains("president", case=False, na=False)]

#Combine all posts and all comments together

all_post = pd.concat([dem_post, rep_post], ignore_index = True)
all_comment = pd.concat([dem_comment, rep_comment], ignore_index = True)

#Remove any missing comments, clear the selftext of posts if removed
all_comment = all_comment[all_comment['text'] != '[removed]']
all_post.loc[all_post['selftext'] == '[removed]', 'selftext'] = ''

#Remove any comments that are bots
all_comment = all_comment[~all_comment['text'].str.contains('I am a bot', case=False, na=False)]

#Get rid of any comments that are less than 5 words
all_comment = all_comment[all_comment['text'].str.split().str.len() >= 5]

#Combine "Title" and "selftext" fields of post to form "text" which is contains all textual content
all_post['text'] = all_post['title'] + ' ' + all_post['selftext']

#drop any remaining missing
all_comment = all_comment.dropna(subset=['text'])
all_post = all_post.dropna(subset=['text'])

#Writing to cleaned CSV files
all_post.to_csv("./large-data/r_cleaned_posts_2024.csv", index = False, encoding="utf-8-sig")
all_comment.to_csv("./large-data/r_cleaned_comments_2024.csv", index = False, encoding="utf-8-sig")

print(f"2024 Comments df: {all_comment.shape}")
print(f"2024 Posts df: {all_post.shape}")

2024 Comments df: (12325, 4)
2024 Posts df: (539, 7)
