# Reddit Cleaning 2

## Import dependencies

In [1]:
import pandas as pd
import re
import string

import pickle # just in case

## Set file locations

In [2]:
# File for cleaned comment data (input file)
cleaned_reddit_comments = '../00_data/reddit_data/reddit_comments_cleaned.csv'

# File for preprocessed comment text (output file)
cleaned_reddit_comment_text = '../00_data/reddit_data/reddit_comment_text_cleaned.csv'

## Read in the data

In [3]:
df = pd.read_csv(cleaned_reddit_comments)

In [4]:
df.head()

Unnamed: 0,post_id,comment_id,author,comment,created_utc,downs,ups,reply,comment_replied_id,comment_date
0,ldd0e6,gpcqm3a,t2_16jhc2,My super conservative Southern Baptist MIL is ...,1614669000.0,0.0,1.0,N,,3/2/21 1:02
1,ldd0e6,gonzgvg,t2_644zai51,Had a nurse claim the vaccine wasn’t real and ...,1614253000.0,0.0,6.0,N,,2/25/21 5:35
2,ldd0e6,gobfxvn,t2_hh35kpp,"Not an interaction with an antivaxxer, but jus...",1614007000.0,0.0,6.0,N,,2/22/21 9:19
3,ldd0e6,gobey4f,t2_7gf4ejvg,My grandfather is a Trump-supporting conspirac...,1614006000.0,0.0,11.0,N,,2/22/21 9:05
4,ldd0e6,go92rh9,t2_h24rn3v,"My anti vaxx, anti mask aunt keeps posting on ...",1613962000.0,0.0,8.0,N,,2/21/21 20:41


In [5]:
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1911 entries, 0 to 1910
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   post_id             1911 non-null   object 
 1   comment_id          1911 non-null   object 
 2   author              1911 non-null   object 
 3   comment             1911 non-null   object 
 4   created_utc         1911 non-null   float64
 5   downs               1911 non-null   float64
 6   ups                 1911 non-null   float64
 7   reply               1911 non-null   object 
 8   comment_replied_id  214 non-null    object 
 9   comment_date        1911 non-null   object 
dtypes: float64(3), object(7)
memory usage: 149.4+ KB


## Preprocess comment text for analysis

In [6]:
# Create a new dataframe. Use 'comment_id' for any future merges.
text_df = df[['comment_id', 'comment']].copy()
text_df.set_index('comment_id')

Unnamed: 0_level_0,comment
comment_id,Unnamed: 1_level_1
gpcqm3a,My super conservative Southern Baptist MIL is ...
gonzgvg,Had a nurse claim the vaccine wasn’t real and ...
gobfxvn,"Not an interaction with an antivaxxer, but jus..."
gobey4f,My grandfather is a Trump-supporting conspirac...
go92rh9,"My anti vaxx, anti mask aunt keeps posting on ..."
...,...
gpdfjo6,My county has about 25% more vaccinated person...
gphl9yr,..
gpjzg18,So this location is trying to vaccine about 4k...
gpfbdj4,If you’re not at risk get in line


In [7]:
# Define a little cleaner function
# I would really like to get some review on the regex here.

def clean_text_round1(text):
    '''Make text lowercase, remove punctuation, excess whitespace (in that order).'''
    # make text lowercase
    text = text.lower()
    # remove punctuation
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # remove multiple whitespace, and convert all whitespace to space (' ').
    text = " ".join(text.split())
    
    return text

In [8]:
text_df['clean_text'] = text_df['comment'].apply(lambda x: clean_text_round1(x))

In [9]:
text_df

Unnamed: 0,comment_id,comment,clean_text
0,gpcqm3a,My super conservative Southern Baptist MIL is ...,my super conservative southern baptist mil is ...
1,gonzgvg,Had a nurse claim the vaccine wasn’t real and ...,had a nurse claim the vaccine wasn’t real and ...
2,gobfxvn,"Not an interaction with an antivaxxer, but jus...",not an interaction with an antivaxxer but just...
3,gobey4f,My grandfather is a Trump-supporting conspirac...,my grandfather is a trumpsupporting conspiracy...
4,go92rh9,"My anti vaxx, anti mask aunt keeps posting on ...",my anti vaxx anti mask aunt keeps posting on o...
...,...,...,...
1906,gpdfjo6,My county has about 25% more vaccinated person...,my county has about 25 more vaccinated persons...
1907,gphl9yr,..,
1908,gpjzg18,So this location is trying to vaccine about 4k...,so this location is trying to vaccine about 4k...
1909,gpfbdj4,If you’re not at risk get in line,if you’re not at risk get in line


In [10]:
text_df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1911 entries, 0 to 1910
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   comment_id  1911 non-null   object
 1   comment     1911 non-null   object
 2   clean_text  1911 non-null   object
dtypes: object(3)
memory usage: 44.9+ KB


In [11]:
text_df.to_csv(cleaned_reddit_comment_text, index = False)