In [1]:
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
import nltk
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import WhitespaceTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD

import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\16313\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
raw = pd.read_csv('../jsons/combined_clean.csv')

Now that the dataset has been created, the actual text must be pre-processed before modeling.

There are many things that would be benficial to remove from the dataset, such a common or overly specific words, or bits of punction/formatting. There are also many regular or automated types of posts that will only add noise to the model.

Searching the dataset for the string 'CMV', provides an example. 'CMV' stands for 'Change My View,' and is frequently used on reddit discussion threads.

In [4]:
raw[raw.text.str.contains("CMV")].head()

Unnamed: 0,label,text,is_comment,score
60008,1,none I had to study La regle du jeu (Rules of ...,True,1.0
69451,0,"Much like /r/changemyview, this /r/Flicks CMV ...",False,8.0
69470,0,"Much like /r/changemyview, this /r/Flicks CMV ...",False,21.0
69489,0,"Much like /r/changemyview, this /r/Flicks CMV ...",False,28.0
69506,0,"Much like /r/changemyview, this /r/Flicks CMV ...",False,20.0


In [5]:
raw[raw.text.str.contains("Fancy")].head()

Unnamed: 0,label,text,is_comment,score
5,1,#[Be Fun and Fancy Free!](https://www.youtube....,False,3.0
35,1,#[Be Fun and Fancy Free!](https://www.youtube....,False,7.0
53,1,#[Be Fun and Fancy Free!](https://www.youtube....,False,11.0
76,1,#[Be Fun and Fancy Free!](https://www.youtube....,False,6.0
127,1,#[Be Fun and Fancy Free!](https://www.youtube....,False,3.0


These searches reveal both automated/recurring posts, as well as reddit subforum names that are probably better off being removed.

First, I will remove all URLs from the dataset, as well as all numbers and references to a particular subreddit (such as 'r/TrueFilm, or anything with 'r/___'.

In [6]:
raw['text'] = raw['text'].replace(r'http\S+', '', regex=True).replace(r'www\S+', '', regex=True)
raw['text'] = raw['text'].replace(r'\s[\/]?r\/[^\s]+', '', regex=True)
raw['text'] = raw['text'].apply(lambda x: x.replace('films','film'))
raw['text'] = raw['text'].apply(lambda x: x.replace('movies','movie'))
raw['text'] = raw['text'].apply(lambda x: re.sub(r'\d+', '', x))

In [7]:
raw[raw.text.str.contains("CMV")].head()

Unnamed: 0,label,text,is_comment,score
60008,1,none I had to study La regle du jeu (Rules of ...,True,1.0
69451,0,Much like this CMV thread is for you to post v...,False,8.0
69470,0,Much like this CMV thread is for you to post v...,False,21.0
69489,0,Much like this CMV thread is for you to post v...,False,28.0
69506,0,Much like this CMV thread is for you to post v...,False,20.0


Next, I will remove certain rows that contain text indicating it is an automated or recurring post.

In [8]:
raw = raw[~raw.text.str.contains('Fancy Free')]
raw = raw[~raw.text.str.contains('TrueFilm frontpage')]
raw = raw[~raw.text.str.contains("I'm a bot")]
raw = raw[~raw.text.str.contains("General Discussion thread")]
raw = raw[~raw.text.str.contains("blahblah")]
raw = raw[~raw.text.str.contains("beep boop")]
raw = raw[~raw.text.str.contains("thread is for you to post")]

In [9]:
raw[raw.text.str.contains("CMV")].head()

Unnamed: 0,label,text,is_comment,score
60008,1,none I had to study La regle du jeu (Rules of ...,True,1.0
73966,0,none Why do so many people *love* Baby Driver...,False,13.0
74875,0,none I think your criticisms of the film are ...,False,25.0
75629,0,none CMV - Taxi Driver is incredibly overrate...,False,0.0
75651,0,none CMV - Cinematics and the way a story is ...,False,9.0


I also want to remove 'stopwords,' which are extremely common (or noisy) words that will likely just add noise to the model.

Note that much like the data science process in general, these pre-processing steps are highly iterative and non-linear. It is likely that new stop words or noisy posts will be discovered during EDA or after analyzing the results of a model, which will necessitate further pre-processing.

Although the NLTK package contains a built-in stopwords package, I will also be appending my own custom stopwords to the list.

In [10]:
stopword_list = nltk.corpus.stopwords.words('english')
stopword_list.append('none')
stopword_list.append('No_Post')
stopword_list.append("'s")
stopword_list.append('[removed]')
stopword_list.append('[deleted]')
stopword_list.append('removed')
stopword_list.append('deleted')
stopword_list.append('gt')
stopword_list.append('ve')
stopword_list.append('xb')
stopword_list.append('amp')
stopword_list.append('cmv')
stopword_list.append('subreddit')
stopword_list.append('oh')
stopword_list.append('lol')
stopword_list.append('reddit')
stopword_list.append('haha')
stopword_list.append('yesterday')
stopword_list.append('crafted')
stopword_list.append('week')
stopword_list.append('truefilm')
stopword_list.append('TrueFilm')
stopword_list.append('flicks')
stopword_list.append('rule')
stopword_list.append('remember')
stopword_list.append('thread')
stopword_list.append('blah')
stopword_list.append('this')
stopword_list.append('kind')
stopword_list.append('general discussion')
stopword_list.append('CMV')
#stopword_list.append('movie')
#stopword_list.append('film')

In [11]:
def remove_stopwords(text):
    tokenizer = RegexpTokenizer("[\w']+")
    tokens = tokenizer.tokenize(text)
    filtered_tokens = [token for token in tokens if token not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text

In [12]:
raw['text'] = raw['text'].apply(lambda x: remove_stopwords(x))

I also want to remove any rows that do not contain any posts.

In [13]:
def text_finder(text):
    if len(text) == 0:
        return('notext')
    else:
        return('text')

In [14]:
raw['has_text'] = raw['text'].apply(lambda x: text_finder(x))

In [15]:
raw_clean = raw.loc[raw['has_text'] == 'text']

In [16]:
raw_clean['text'] = raw_clean['text'].apply(lambda x: remove_stopwords(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [17]:
raw_clean.head()

Unnamed: 0,label,text,is_comment,score,has_text
1,1,I watched On The Waterfront last night absolut...,False,33.0,text
4,1,I seem pretty good grasp what's happening film...,False,0.0,text
6,1,People talk Iranian cinema serves political pu...,False,0.0,text
7,1,This post inspired Kelly Reichardt's Certain W...,False,0.0,text
8,1,I recently watched Tarantino flick I like movi...,False,46.0,text


In [18]:
raw_clean[['label','text']].to_csv('../jsons/pre_processed.csv',index=False)

In [19]:
pd.Series(stopword_list).to_csv('../jsons/stopwords.csv',index=False)