# Text Pre-processing

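This notebook prepares the combined subreddit posts for later analysis. It loads the combined posts CSV, inspects it and drops rows with missing values, strips punctuation and lowercases the text, tokenizes it into sentences and words, removes English stop words plus a custom list of condition-related terms, drops the shortest submissions, applies the Porter stemmer, and saves the result to a new CSV file.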

##### Import libraries

In [93]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

import re

import nltk
from nltk import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

## Load data

In [94]:
# Load the combined subreddit posts CSV into a DataFrame
posts = pd.read_csv(filepath_or_buffer='../data/subreddit_posts_combined.csv')

## Look at the data

In [95]:
# Preview the first five rows
posts.head(n=5)

Unnamed: 0,author,created_utc,subreddit,text,timeframe
0,sub30605,1499390694,bulimia,Chest Pains? : Anyone else experience chest pa...,pre-covid
1,sub27274,1499060654,bulimia,"Dying to eat, eating to die: study on shifting...",pre-covid
2,sub6055,1499029087,bulimia,Without purging.... : What is the quickest way...,pre-covid
3,sub40365,1498978259,bulimia,Bulimia and Melancholy Feelings : I haven't pu...,pre-covid
4,sub49857,1498814187,bulimia,I'm relapsing : Fuck. I'm so upset at myself r...,pre-covid


#### Check Shape

In [96]:
# (rows, columns) of the frame as loaded
posts.shape

(85401, 5)

#### Check data types

In [97]:
# dtype pandas inferred for each column
posts.dtypes

author         object
created_utc    object
subreddit      object
text           object
timeframe      object
dtype: object

#### Check Null Values

In [98]:
# Number of missing values in each column
posts.isna().sum()

author         0
created_utc    0
subreddit      1
text           1
timeframe      2
dtype: int64

In [99]:
# Show (up to ten of) the rows whose 'text' value is missing
posts[posts['text'].isna()].head(10)

Unnamed: 0,author,created_utc,subreddit,text,timeframe
27249,Pray for me.,pre-covid,,,


In [100]:
# Drop every row that has a missing value in any column.
# Rebinding instead of inplace=True keeps the step explicit and avoids
# mutating a frame that earlier cells have already displayed.
posts = posts.dropna()

In [101]:
# (rows, columns) after dropping rows with missing values
posts.shape

(85399, 5)

#### Count number of posts per subreddit

In [102]:
# Number of posts in each subreddit, most frequent first
posts.loc[:, 'subreddit'].value_counts()

bulimia            9777
AnorexiaNervosa    9733
schizophrenia      9549
bipolar            9493
Anxiety            9455
mentalhealth       9450
BPD                9345
depression         9318
autism             9279
Name: subreddit, dtype: int64

## Preprocess Text

### Clean text

Remove punctuation (every character that is not a letter, digit, underscore, or whitespace) and convert the text to lowercase. Numbers and whitespace are kept.

In [104]:
# Strip punctuation: delete every run of characters that is neither a word
# character (letter, digit, underscore) nor whitespace.
# regex=True is passed explicitly because pandas >= 2.0 treats the pattern as
# a literal string by default, which would leave the text unchanged.
posts['text'] = posts['text'].str.replace(r'[^\w\s\d]+', '', regex=True)

In [108]:
# Lowercase the cleaned text so later token comparisons are case-insensitive
posts['text'] = posts['text'].str.lower() 

### Tokenize sentences and words

In [109]:
# Split each submission into a list of sentences (overwrites 'text' with lists).
# NOTE(review): punctuation has already been stripped from 'text' by this point,
# so sent_tokenize probably finds few sentence boundaries - confirm whether
# sentence splitting should happen before the cleaning step.
posts['text'] = posts['text'].astype(str).map(sent_tokenize)

In [110]:
# Each 'text' entry is a list of sentence strings (output of sent_tokenize).
# Tokenize every sentence and flatten into one token list per submission.
# Casting the list to str first would tokenize its repr, leaking brackets and
# quote characters into the tokens.
posts['words'] = [
    [token for sentence in sentences for token in word_tokenize(sentence)]
    for sentences in posts['text']
]

### Remove stop words

In [111]:
# NLTK's English stop words, as a mutable list that can be extended later
stop = list(stopwords.words('english'))

In [112]:
# Extra stop words: names of the conditions / subreddits themselves.
# All entries are lowercase because the text is lowercased before tokenizing,
# so an uppercase entry such as 'BPD' could never match a token.
sw_list = {'anorexia', 'anorexianervosa', 'anorexic', 'anorexics', 'ana',
           'bulimia', 'bulimic', 'bullimia', 'bullimic', 'depressed',
           'depression', 'bipolar', 'schizophrenia', 'schizophrenic', 'bpd',
           'borderline', 'anxiety', 'autistic', 'autism'}

In [113]:
# Append the custom terms to the stop-word list (in place)
stop += list(sw_list)

In [114]:
# Build a set once so each membership test is a hash lookup instead of a scan
# through the whole stop-word list for every token.
stop_lookup = set(stop)

# Keep only the tokens that are not stop words
posts['words'] = posts['words'].apply(
    lambda tokens: [token for token in tokens if token not in stop_lookup]
)

### Check for empty cells

In [168]:
# Replace cells that are exactly the empty string with NaN, in place.
# NOTE(review): 'text' and 'words' hold lists by this point (see the tokenize
# cells), so an empty list is not matched by '' - presumably why no NaN values
# show up in the next cell; confirm.
posts.replace('', np.nan, inplace=True) 

In [169]:
# Re-count missing values per column after the replacement
posts.isna().sum()

author         0
created_utc    0
subreddit      0
text           0
timeframe      0
words          0
word_stems     0
dtype: int64

Empty cells need to be dropped, but the `replace` call above does not turn them into NaN. Instead, we look at the shortest submissions and drop those.

In [143]:
# Length of each 'text' entry. 'text' holds the sentence list produced by
# sent_tokenize at this point, so this is the number of sentences per
# submission rather than a character count.
posts['text_length'] = posts['text'].apply(len)

In [155]:
# Let pandas display up to 500 rows when inspecting the shortest submissions
pd.set_option('display.max_rows', 500)

# Index labels of the 61 rows with the smallest text_length.
# NOTE(review): 61 looks like a cutoff chosen by eye from the sorted output -
# confirm it still matches the data before re-running.
to_drop = posts.sort_values('text_length').head(61).index

# Remove those rows from the frame
posts = posts.drop(index=to_drop)

Drop the `text_length` helper column created above.

In [192]:
# The helper column has served its purpose; remove it
posts = posts.drop('text_length', axis=1)

### Normalize 

Using Porter Stemmer

In [158]:
# Instantiate NLTK's Porter stemmer once so it can be reused for every token
stemmer = PorterStemmer()

In [159]:
# Cast the 'words' column to str: each row's token list becomes its string
# representation, stored in the separate variable `text`
text = posts['words'].astype(str)

In [160]:
# Stem every token of every submission. The stemmer works on single words, so
# it is applied token by token to the lists in 'words'; passing a whole
# stringified list to stem() would treat it as one word and leave the
# individual tokens unstemmed.
posts['word_stems'] = [
    [stemmer.stem(token) for token in tokens]
    for tokens in posts['words']
]

## Save to csv

In [193]:
# Write the preprocessed posts to disk without the index column
posts.to_csv(path_or_buf=r'../data/posts-preprocessed.csv', index=False)