In [1]:
import numpy as np
import pandas as pd

In [2]:
commentsm = pd.read_csv('askmen_cleaned')
commentsw = pd.read_csv('askwomen_cleaned')

In [3]:
commentsm.shape

(33390, 10)

In [4]:
commentsw.shape

(38405, 10)

In [6]:
comments = pd.concat([commentsm, commentsw])

In [7]:
len(comments)

71795

In [8]:
comments.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,body,created_utc,id,link_id,parent_id,score,subreddit
0,0,0,0,I was 23 but I went with my ~29 year old cowor...,1545243578,ec4lbi4,t3_a7oy9v,t3_a7oy9v,1,AskMen
1,1,1,1,"Portland, OR.\r\r\r\n\r\r\r\nThe city itself i...",1545243546,ec4la0n,t3_a7mkui,t3_a7mkui,1,AskMen
2,2,2,2,"nope. ""the cats goodbye"" watch how a c...",1545243536,ec4l9lm,t3_a7fe60,t3_a7fe60,1,AskMen
3,3,3,3,Drunk as fuck me during an unintended one nigh...,1545243524,ec4l90i,t3_a79zu9,t3_a79zu9,1,AskMen
4,4,4,4,There was this one time when I went over one o...,1545243449,ec4l5g6,t3_a7kmvc,t3_a7kmvc,1,AskMen


In [10]:
comments.drop(['Unnamed: 0','Unnamed: 0.1','Unnamed: 0.1.1'],axis=1,inplace=True)

In [14]:
comments = comments.reset_index()

In [15]:
comments.drop('index',axis=1,inplace=True)

I ended up realizing that because we're only using first tier comments, link_id is actually redundant with parent_id.  So I decided to remove it.

In [31]:
comments.drop('link_id',axis=1,inplace=True)

In [34]:
comments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71795 entries, 0 to 71794
Data columns (total 6 columns):
body           71795 non-null object
created_utc    71795 non-null int64
id             71795 non-null object
parent_id      71795 non-null object
score          71795 non-null int64
subreddit      71795 non-null object
dtypes: int64(2), object(4)
memory usage: 3.3+ MB


In [35]:
comments.isnull().sum()

body           0
created_utc    0
id             0
parent_id      0
score          0
subreddit      0
dtype: int64

In [56]:
len(comments['body'].map(lambda x: x if '\r' in x else 0).unique())

23261

So you can see from the above that 23000+ comments have a \r somewhere, this will interfere with our attempt to calculate wordlength and may affect our vectorizations.  It would be best to remove these and replace them with one empty space.  This can be accomplished with regex.

In [62]:
import re

In [143]:
def replace_linebreaks_w_space(x):
    return re.sub('([\r\n]+)',' ',x)    

There are also a large number of multiple spaces in some of the comments.  If we want to get accurate word length calculations splitting by ' ', we need to reduce these to single spaces.  Again, regex gets the call.

In [141]:
def replace_space_w_space(x):
    return re.sub('([ ]{2,})',' ',x)

In [145]:
comments['body'] = comments['body'].map(replace_linebreaks_w_space)

In [146]:
comments['body'] = comments['body'].map(replace_space_w_space)

NOW we can do a proper word length count based on a split with the space.

In [148]:
comments['word length'] = comments['body'].map(lambda x: len(x.strip().split(' ')))

In [149]:
comments.head()

Unnamed: 0,body,created_utc,id,parent_id,score,subreddit,word length
0,I was 23 but I went with my ~29 year old cowor...,1545243578,ec4lbi4,t3_a7oy9v,1,AskMen,23
1,"Portland, OR. The city itself is now unafforda...",1545243546,ec4la0n,t3_a7mkui,1,AskMen,36
2,"nope. ""the cats goodbye"" watch how a cat says ...",1545243536,ec4l9lm,t3_a7fe60,1,AskMen,28
3,Drunk as fuck me during an unintended one nigh...,1545243524,ec4l90i,t3_a79zu9,1,AskMen,16
4,There was this one time when I went over one o...,1545243449,ec4l5g6,t3_a7kmvc,1,AskMen,192


In [150]:
comments.to_csv('final_dataset_cleaned', index=False)

Here, I wanted to create a version of the dataset with comments 3 words and shorter removed, as it's difficult to guess much from comments this short, and there are plenty of comments for the model.

In [156]:
comments_fourplus = comments[comments['word length']>=4]

In [157]:
len(comments_fourplus)

67354

In [158]:
comments_fourplus.to_csv('final_dataset_cleaned_fourplus')