# Text Pre-processing

Add text here explaining this notebook and what it does

##### Import libraries

In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

import re
import string

import nltk
from nltk import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

## Load data

In [2]:
# Read in combined data frames csv file
posts = pd.read_csv('../data/subreddit_posts_combined.csv') 

## Look at the data

In [3]:
posts.head()

Unnamed: 0,author,created_utc,subreddit,text,timeframe
0,sub30605,1499390694,bulimia,Chest Pains? : Anyone else experience chest pa...,pre-covid
1,sub27274,1499060654,bulimia,"Dying to eat, eating to die: study on shifting...",pre-covid
2,sub6055,1499029087,bulimia,Without purging.... : What is the quickest way...,pre-covid
3,sub40365,1498978259,bulimia,Bulimia and Melancholy Feelings : I haven't pu...,pre-covid
4,sub49857,1498814187,bulimia,I'm relapsing : Fuck. I'm so upset at myself r...,pre-covid


#### Check Shape

In [4]:
posts.shape

(85401, 5)

#### Check data types

In [5]:
posts.dtypes

author         object
created_utc    object
subreddit      object
text           object
timeframe      object
dtype: object

#### Check Null Values

In [6]:
posts.isnull().sum()

author         0
created_utc    0
subreddit      1
text           1
timeframe      2
dtype: int64

In [7]:
(posts.loc[posts['text'].isnull() == True]).head(10)

Unnamed: 0,author,created_utc,subreddit,text,timeframe
27249,Pray for me.,pre-covid,,,


In [8]:
posts.dropna(inplace=True)

In [9]:
posts.shape

(85399, 5)

#### Count number of posts per subreddit

In [10]:
posts['subreddit'].value_counts()

bulimia            9777
AnorexiaNervosa    9733
schizophrenia      9549
bipolar            9493
Anxiety            9455
mentalhealth       9450
BPD                9345
depression         9318
autism             9279
Name: subreddit, dtype: int64

## Preprocess Text

### Normalization

In [12]:
# For removing stopwords
stop_words = set(stopwords.words('english'))  

new_stopwords = ['anorexia', 'nervosa', 'anorexic', 'bulimic', 'bulimia', 
                 'anxiety', 'autistic', 'autism', 'bpd', 'borderline',
                 'schizophrenia', 'schizophrenic', 'schizo']

stop_words.update(new_stopwords)
new_stopwords_list = set(stop_words)



# To only use words within the NLTK corpus, and remove nonsense words
words = set(nltk.corpus.words.words())

https://towardsdatascience.com/cleaning-text-data-with-python-b69b47b97b76

In [13]:
def text_preproc(x):
    x = x.lower()
    x = x.encode('ascii', 'ignore').decode("utf-8")
    x = re.sub(r'https*\S+', ' ', x)
    x = re.sub(r'@\S+', ' ', x)
    x = re.sub(r'#\S+', ' ', x)
    x = re.sub(r'\'\w+', '', x)
    x = re.sub('[%s]' % re.escape(string.punctuation), ' ', x)
    x = re.sub(r'\w*\d+\w*', '', x)
    x = re.sub(r'\s{2,}', ' ', x)
    x = " ".join(w for w in nltk.wordpunct_tokenize(x) if w.lower() \
                 not in new_stopwords_list)
    x = " ".join(w for w in nltk.wordpunct_tokenize(x) if w.lower() \
                 in words or not w.isalpha())
    return x

In [14]:
posts['text_clean'] = posts.text.apply(text_preproc)

In [15]:
posts.head()

Unnamed: 0,author,created_utc,subreddit,text,timeframe,text_clean
0,sub30605,1499390694,bulimia,Chest Pains? : Anyone else experience chest pa...,pre-covid,chest anyone else experience chest purging kno...
1,sub27274,1499060654,bulimia,"Dying to eat, eating to die: study on shifting...",pre-covid,dying eat eating die study shifting coping men...
2,sub6055,1499029087,bulimia,Without purging.... : What is the quickest way...,pre-covid,without purging way lose weight without exercise
3,sub40365,1498978259,bulimia,Bulimia and Melancholy Feelings : I haven't pu...,pre-covid,melancholy little month since slowly losing ha...
4,sub49857,1498814187,bulimia,I'm relapsing : Fuck. I'm so upset at myself r...,pre-covid,relapsing upset right twice week good tired cy...


### Tokenize sentences and words

In [16]:
# tokenize sentences
posts['sent_tokens'] = [sent_tokenize(row) for row in posts['text_clean'].astype(str)] 

In [17]:
# tokenize words
posts['word_tokens'] = [word_tokenize(row) for row in posts['text_clean'].astype(str)]

### Check for empty cells

In [18]:
posts.replace('', np.nan, inplace=True) 

In [19]:
posts.isnull().sum()

author           0
created_utc      0
subreddit        0
text             0
timeframe        0
text_clean     503
sent_tokens      0
word_tokens      0
dtype: int64

In [20]:
posts.dropna(inplace=True)

### Porter Stemmer

Using Porter Stemmer

In [21]:
# Create an instance of Porter Stemmer
stemmer = PorterStemmer()

In [22]:
text = posts['word_tokens'].astype(str)

In [23]:
posts['word_tokens'] = [stemmer.stem(word) for word in text]

### More cleaning

### Check dataframe filtered by submission Lengths

In [24]:
posts.head()

Unnamed: 0,author,created_utc,subreddit,text,timeframe,text_clean,sent_tokens,word_tokens
0,sub30605,1499390694,bulimia,Chest Pains? : Anyone else experience chest pa...,pre-covid,chest anyone else experience chest purging kno...,[chest anyone else experience chest purging kn...,"['chest', 'anyone', 'else', 'experience', 'che..."
1,sub27274,1499060654,bulimia,"Dying to eat, eating to die: study on shifting...",pre-covid,dying eat eating die study shifting coping men...,[dying eat eating die study shifting coping me...,"['dying', 'eat', 'eating', 'die', 'study', 'sh..."
2,sub6055,1499029087,bulimia,Without purging.... : What is the quickest way...,pre-covid,without purging way lose weight without exercise,[without purging way lose weight without exerc...,"['without', 'purging', 'way', 'lose', 'weight'..."
3,sub40365,1498978259,bulimia,Bulimia and Melancholy Feelings : I haven't pu...,pre-covid,melancholy little month since slowly losing ha...,[melancholy little month since slowly losing h...,"['melancholy', 'little', 'month', 'since', 'sl..."
4,sub49857,1498814187,bulimia,I'm relapsing : Fuck. I'm so upset at myself r...,pre-covid,relapsing upset right twice week good tired cy...,[relapsing upset right twice week good tired c...,"['relapsing', 'upset', 'right', 'twice', 'week..."


In [25]:
# Create a new column with submission length
posts['text_length'] = posts['sent_tokens'].map(len)

In [26]:
# Create a new column with number of words per submission
posts['stems_word_count'] = posts['word_tokens'].map(len)

In [27]:
# Show the shortest submissions
posts.sort_values(by='text_length', ascending=True)[['sent_tokens']].head()

Unnamed: 0,sent_tokens
0,[chest anyone else experience chest purging kn...
56994,[sleep keep getting hear disrupt sleep also ge...
56993,[wallpaper]
56992,[tales rainbow machine]
56991,[possible put live finland future foster home ...


In [28]:
# Show the shortest submissions
posts.sort_values(by='stems_word_count', ascending=True)[['word_tokens']].head()

Unnamed: 0,word_tokens
10175,['r']
17794,['r']
9987,['w']
29718,['q']
55499,['q']


In [29]:
# Show the longest submissions
posts.sort_values(by='stems_word_count', ascending=False)[['word_tokens']].head(10)

Unnamed: 0,word_tokens
20780,"['reciprocal', 'suffering', 'time', 'please', ..."
48207,"['rather', 'fast', 'recovery', 'turn', 'old', ..."
84879,"['really', 'long', 'sad', 'story', 'begin', 't..."
82321,"['depressed', 'long', 'remember', 'really', 'k..."
18665,"['think', 'may', 'please', 'help', 'many', 're..."
62675,"['little', 'almost', 'nothing', 'little', 'alm..."
65625,"['experience', 'deal', 'worth', 'long', 'read'..."
85244,"['extremely', 'long', 'post', 'felt', 'like', ..."
34511,"['job', 'girl', 'taking', 'toll', 'ill', 'star..."
18169,"['long', 'detailed', 'put', 'paranoia', 'see',..."


In [30]:
# Drop any unecessary rows before saving the dataframe to a clean csv file
posts.drop(columns=['text','text_length','stems_word_count'], inplace=True)

## Save to csv

In [31]:
posts.to_csv(r'../data/posts-preprocessed.csv', index=False)