# Reddit EDA

## Import dependencies

In [1]:
import pandas as pd
import re
import string
from textblob import TextBlob

import pickle # just in case

## Set file locations

In [2]:
# Both input files sit in the same folder, so the folder is written once.
reddit_data_dir = '../00_data/reddit_data'

# Input file: cleaned comment data
cleaned_reddit_comments = reddit_data_dir + '/reddit_comments_cleaned.csv'

# Input file: preprocessed comment text
cleaned_comment_text = reddit_data_dir + '/reddit_comment_text_cleaned.csv'


## Read in the data

In [3]:
# Load the preprocessed comment text into a DataFrame.
df = pd.read_csv(filepath_or_buffer=cleaned_comment_text)

In [4]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,comment_id,comment,clean_text
0,gpcqm3a,My super conservative Southern Baptist MIL is ...,my super conservative southern baptist mil is ...
1,gonzgvg,Had a nurse claim the vaccine wasn’t real and ...,had a nurse claim the vaccine wasn’t real and ...
2,gobfxvn,"Not an interaction with an antivaxxer, but jus...",not an interaction with an antivaxxer but just...
3,gobey4f,My grandfather is a Trump-supporting conspirac...,my grandfather is a trumpsupporting conspiracy...
4,go92rh9,"My anti vaxx, anti mask aunt keeps posting on ...",my anti vaxx anti mask aunt keeps posting on o...


In [5]:
# Print dtypes and non-null counts per column to spot missing values.
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1911 entries, 0 to 1910
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   comment_id  1911 non-null   object
 1   comment     1911 non-null   object
 2   clean_text  1910 non-null   object
dtypes: object(3)
memory usage: 44.9+ KB


In [6]:
# Replace missing clean_text with a single space so that every value handed to
# TextBlob in the sentiment step is a string. The fill is limited to clean_text
# on purpose: a missing comment_id or comment should stay visible as NaN rather
# than be silently turned into ' '.
df['clean_text'] = df['clean_text'].fillna(' ')

In [7]:
# Re-check the non-null counts after the fill.
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1911 entries, 0 to 1910
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   comment_id  1911 non-null   object
 1   comment     1911 non-null   object
 2   clean_text  1911 non-null   object
dtypes: object(3)
memory usage: 44.9+ KB


## Sentiment analysis (by comment)

In [8]:
def pol(x):
    """Return the TextBlob polarity score of the text ``x``."""
    return TextBlob(x).sentiment.polarity


def sub(x):
    """Return the TextBlob subjectivity score of the text ``x``."""
    return TextBlob(x).sentiment.subjectivity


# Score every comment once and read both values from the same result.
# Applying pol and sub separately would build a TextBlob and run the sentiment
# analysis twice per comment.
sentiments = [TextBlob(text).sentiment for text in df['clean_text']]

df['polarity'] = [s.polarity for s in sentiments]
df['subjectivity'] = [s.subjectivity for s in sentiments]
df

Unnamed: 0,comment_id,comment,clean_text,polarity,subjectivity
0,gpcqm3a,My super conservative Southern Baptist MIL is ...,my super conservative southern baptist mil is ...,0.029167,0.359167
1,gonzgvg,Had a nurse claim the vaccine wasn’t real and ...,had a nurse claim the vaccine wasn’t real and ...,-0.156111,0.451111
2,gobfxvn,"Not an interaction with an antivaxxer, but jus...",not an interaction with an antivaxxer but just...,0.223864,0.486258
3,gobey4f,My grandfather is a Trump-supporting conspirac...,my grandfather is a trumpsupporting conspiracy...,-0.419643,0.616071
4,go92rh9,"My anti vaxx, anti mask aunt keeps posting on ...",my anti vaxx anti mask aunt keeps posting on o...,0.083333,0.538889
...,...,...,...,...,...
1906,gpdfjo6,My county has about 25% more vaccinated person...,my county has about 25 more vaccinated persons...,0.500000,0.500000
1907,gphl9yr,..,,0.000000,0.000000
1908,gpjzg18,So this location is trying to vaccine about 4k...,so this location is trying to vaccine about 4k...,0.000000,0.000000
1909,gpfbdj4,If you’re not at risk get in line,if you’re not at risk get in line,0.000000,0.000000


In [9]:
### Merge sentiment scores back onto the full comment dataset

In [10]:
# Remove the 'comment' column from df in place.
df.drop(columns=['comment'], inplace=True)

In [11]:
# Load the cleaned comment data into a DataFrame.
new_df = pd.read_csv(filepath_or_buffer=cleaned_reddit_comments)

In [12]:
# Print dtypes and non-null counts per column of the loaded comment data.
new_df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1911 entries, 0 to 1910
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   post_id             1911 non-null   object 
 1   comment_id          1911 non-null   object 
 2   author              1911 non-null   object 
 3   comment             1911 non-null   object 
 4   created_utc         1911 non-null   float64
 5   downs               1911 non-null   float64
 6   ups                 1911 non-null   float64
 7   reply               1911 non-null   object 
 8   comment_replied_id  214 non-null    object 
 9   comment_date        1911 non-null   object 
dtypes: float64(3), object(7)
memory usage: 149.4+ KB


In [13]:
# Left-join the columns of df onto new_df by comment_id, keeping every row of
# new_df. validate='many_to_one' makes pandas raise a MergeError if a
# comment_id occurs more than once in df, instead of silently duplicating rows.
new_df = new_df.merge(df, how='left', on='comment_id', validate='many_to_one')

In [14]:
# Re-inspect columns and non-null counts after the merge.
new_df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1911 entries, 0 to 1910
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   post_id             1911 non-null   object 
 1   comment_id          1911 non-null   object 
 2   author              1911 non-null   object 
 3   comment             1911 non-null   object 
 4   created_utc         1911 non-null   float64
 5   downs               1911 non-null   float64
 6   ups                 1911 non-null   float64
 7   reply               1911 non-null   object 
 8   comment_replied_id  214 non-null    object 
 9   comment_date        1911 non-null   object 
 10  clean_text          1911 non-null   object 
 11  polarity            1911 non-null   float64
 12  subjectivity        1911 non-null   float64
dtypes: float64(5), object(8)
memory usage: 209.0+ KB


In [15]:
# Preview the first five rows of the merged frame.
new_df.head(n=5)

Unnamed: 0,post_id,comment_id,author,comment,created_utc,downs,ups,reply,comment_replied_id,comment_date,clean_text,polarity,subjectivity
0,ldd0e6,gpcqm3a,t2_16jhc2,My super conservative Southern Baptist MIL is ...,1614669000.0,0.0,1.0,N,,3/2/21 1:02,my super conservative southern baptist mil is ...,0.029167,0.359167
1,ldd0e6,gonzgvg,t2_644zai51,Had a nurse claim the vaccine wasn’t real and ...,1614253000.0,0.0,6.0,N,,2/25/21 5:35,had a nurse claim the vaccine wasn’t real and ...,-0.156111,0.451111
2,ldd0e6,gobfxvn,t2_hh35kpp,"Not an interaction with an antivaxxer, but jus...",1614007000.0,0.0,6.0,N,,2/22/21 9:19,not an interaction with an antivaxxer but just...,0.223864,0.486258
3,ldd0e6,gobey4f,t2_7gf4ejvg,My grandfather is a Trump-supporting conspirac...,1614006000.0,0.0,11.0,N,,2/22/21 9:05,my grandfather is a trumpsupporting conspiracy...,-0.419643,0.616071
4,ldd0e6,go92rh9,t2_h24rn3v,"My anti vaxx, anti mask aunt keeps posting on ...",1613962000.0,0.0,8.0,N,,2/21/21 20:41,my anti vaxx anti mask aunt keeps posting on o...,0.083333,0.538889
