# Reddit Data Cleaning

## Import dependencies

In [1]:
import pandas as pd

import pickle # just in case

## Set file locations

In [2]:
# Input: raw Reddit post text.
reddit_posts_file = '../00_data/reddit_data/reddit_posts.csv'

# Input: raw Reddit comment text (the file this notebook cleans).
reddit_comments_file = '../00_data/reddit_data/reddit_comments.csv'

# Output: cleaned Reddit comment text.
cleaned_comments = '../00_data/reddit_data/reddit_comments_cleaned.csv'

## Read in raw data

In [3]:
# Load the raw comments CSV (path held in `reddit_comments_file`) into a DataFrame.
df = pd.read_csv(reddit_comments_file)

In [4]:
# Preview the first few rows of the loaded comments.
df.head()

Unnamed: 0,post_id,comment_id,author,comment,created_utc,downs,ups,reply,comment_replied_id,comment_date
0,ldd0e6,gpcqm3a,t2_16jhc2,My super conservative Southern Baptist MIL is ...,1614669000.0,0.0,1.0,N,,3/2/21 1:02
1,ldd0e6,gonzgvg,t2_644zai51,Had a nurse claim the vaccine wasn’t real and ...,1614253000.0,0.0,6.0,N,,2/25/21 5:35
2,ldd0e6,gobfxvn,t2_hh35kpp,"Not an interaction with an antivaxxer, but jus...",1614007000.0,0.0,6.0,N,,2/22/21 9:19
3,ldd0e6,gobey4f,t2_7gf4ejvg,My grandfather is a Trump-supporting conspirac...,1614006000.0,0.0,11.0,N,,2/22/21 9:05
4,ldd0e6,go92rh9,t2_h24rn3v,"My anti vaxx, anti mask aunt keeps posting on ...",1613962000.0,0.0,8.0,N,,2/21/21 20:41


In [5]:
# Summarize column dtypes and non-null counts to spot columns with missing values.
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1969 entries, 0 to 1968
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   post_id             1969 non-null   object 
 1   comment_id          1969 non-null   object 
 2   author              1911 non-null   object 
 3   comment             1954 non-null   object 
 4   created_utc         1954 non-null   float64
 5   downs               1954 non-null   float64
 6   ups                 1954 non-null   float64
 7   reply               1969 non-null   object 
 8   comment_replied_id  223 non-null    object 
 9   comment_date        1954 non-null   object 
dtypes: float64(3), object(7)
memory usage: 154.0+ KB


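Observations:  
1. Several columns have fewer non-null values than the 1,969 rows: 'author' has 1,911, and 'comment', 'created_utc', 'downs', 'ups', and 'comment_date' each have 1,954. ('comment_replied_id' has only 223, presumably because it is only filled in for replies.) Need to take a closer look at the rows with missing values.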

In [6]:
# Inspect the rows whose comment text is missing.
missing_comment = df['comment'].isna()
df.loc[missing_comment]

Unnamed: 0,post_id,comment_id,author,comment,created_utc,downs,ups,reply,comment_replied_id,comment_date
85,krxabz,gidacaw,,,,,,N,,
89,krxabz,giq2l9n,,,,,,Y,gikupi7,
258,lt7x2e,gp2chvl,,,,,,N,,
400,lwi5iq,gpjq0gw,,,,,,N,,
401,lwi5iq,gpja9h0,,,,,,Y,gpj4o6d,
859,lwwxnl,gpkrirp,,,,,,Y,gpkgroj,
897,lwd352,gpj46bk,,,,,,N,,
898,lwd352,gpgv9h3,,,,,,Y,gpgo9cb,
1203,lwsxik,gpki745,,,,,,N,,
1325,bnzmmj,eropqfb,,,,,,N,,


Those don't look very useful. There's certainly no text for analysis.

In [7]:
# Inspect the rows whose creation timestamp is missing.
missing_created = df['created_utc'].isna()
df.loc[missing_created]

Unnamed: 0,post_id,comment_id,author,comment,created_utc,downs,ups,reply,comment_replied_id,comment_date
85,krxabz,gidacaw,,,,,,N,,
89,krxabz,giq2l9n,,,,,,Y,gikupi7,
258,lt7x2e,gp2chvl,,,,,,N,,
400,lwi5iq,gpjq0gw,,,,,,N,,
401,lwi5iq,gpja9h0,,,,,,Y,gpj4o6d,
859,lwwxnl,gpkrirp,,,,,,Y,gpkgroj,
897,lwd352,gpj46bk,,,,,,N,,
898,lwd352,gpgv9h3,,,,,,Y,gpgo9cb,
1203,lwsxik,gpki745,,,,,,N,,
1325,bnzmmj,eropqfb,,,,,,N,,


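Those are also low value: the rows shown are the same ones that are missing their comment text, so there is nothing in them to analyze.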

In [8]:
# Inspect the rows whose author is missing.
missing_author = df['author'].isna()
df.loc[missing_author]

Unnamed: 0,post_id,comment_id,author,comment,created_utc,downs,ups,reply,comment_replied_id,comment_date
10,ldd0e6,gmy8zo6,,[deleted],1613089000.0,0.0,5.0,N,,2/11/21 18:09
39,krxabz,gn989ko,,[removed],1613269000.0,0.0,-5.0,N,,2/13/21 20:08
85,krxabz,gidacaw,,,,,,N,,
89,krxabz,giq2l9n,,,,,,Y,gikupi7,
258,lt7x2e,gp2chvl,,,,,,N,,
400,lwi5iq,gpjq0gw,,,,,,N,,
401,lwi5iq,gpja9h0,,,,,,Y,gpj4o6d,
434,lwt85y,gpj9ukc,,[removed],1614810000.0,0.0,1.0,N,,2021-03-03 16:15:10
444,lwn9dl,gpju2kv,,[removed],1614819000.0,0.0,1.0,N,,2021-03-03 18:46:56
463,lw4vvg,gpfjed0,,[removed],1614732000.0,0.0,2.0,N,,2021-03-02 18:48:06


In [9]:
# Inspect the rows whose text is only a '[removed]' or '[deleted]' placeholder.
df[df['comment'].isin(['[removed]', '[deleted]'])]

Unnamed: 0,post_id,comment_id,author,comment,created_utc,downs,ups,reply,comment_replied_id,comment_date
10,ldd0e6,gmy8zo6,,[deleted],1613089000.0,0.0,5.0,N,,2/11/21 18:09
39,krxabz,gn989ko,,[removed],1613269000.0,0.0,-5.0,N,,2/13/21 20:08
434,lwt85y,gpj9ukc,,[removed],1614810000.0,0.0,1.0,N,,2021-03-03 16:15:10
444,lwn9dl,gpju2kv,,[removed],1614819000.0,0.0,1.0,N,,2021-03-03 18:46:56
463,lw4vvg,gpfjed0,,[removed],1614732000.0,0.0,2.0,N,,2021-03-02 18:48:06
466,lw4vvg,gpi9nn1,,[removed],1614781000.0,0.0,1.0,N,,2021-03-03 08:11:34
468,lw4vvg,gpft02h,,[removed],1614737000.0,0.0,-1.0,N,,2021-03-02 19:57:45
478,lwc482,gphrlf6,,[removed],1614770000.0,0.0,1.0,N,,2021-03-03 05:07:47
491,lvwkc9,gpf1cwf,,[removed],1614724000.0,0.0,6.0,N,,2021-03-02 16:29:55
494,lvwkc9,gpg166f,,[removed],1614740000.0,0.0,0.0,N,,2021-03-02 20:56:15


In [10]:
# Discard comments that carry no usable text or metadata. A row is dropped when
# any of the following holds:
#   * the comment text, author, or creation timestamp is missing, or
#   * the comment text is exactly the placeholder '[removed]' or '[deleted]'.
# The conditions overlap (the rows without comment text inspected earlier also
# lack an author), but each is kept explicit so the intent stays readable.
required_columns = ['comment', 'author', 'created_utc']
placeholder_texts = ['[removed]', '[deleted]']

is_unusable = (
    df[required_columns].isnull().any(axis=1)
    | df['comment'].isin(placeholder_texts)
)

# Filter with a boolean mask and rebind `df` instead of calling
# `drop(<index labels>, inplace=True)`: the mask selects rows positionally, so
# it stays correct even if index labels are not unique, and it avoids in-place
# mutation. The original index labels are kept (no reset), and `.copy()` makes
# the result an independent frame.
df = df[~is_unusable].copy()

In [11]:
# Re-check the non-null counts to confirm the rows with missing values are gone.
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1911 entries, 0 to 1968
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   post_id             1911 non-null   object 
 1   comment_id          1911 non-null   object 
 2   author              1911 non-null   object 
 3   comment             1911 non-null   object 
 4   created_utc         1911 non-null   float64
 5   downs               1911 non-null   float64
 6   ups                 1911 non-null   float64
 7   reply               1911 non-null   object 
 8   comment_replied_id  214 non-null    object 
 9   comment_date        1911 non-null   object 
dtypes: float64(3), object(7)
memory usage: 164.2+ KB


## Check for duplicates

In [12]:
# Count fully duplicated rows; any `True` entries would be repeats of an earlier row.
df.duplicated().value_counts()

False    1911
dtype: int64

## Export cleaned data

In [13]:
# Save the cleaned comments as CSV, leaving the DataFrame index out of the file.
df.to_csv(path_or_buf=cleaned_comments, index=False)