# Text Pre-processing

Add text here explaining this notebook and what it does

##### Import libraries

In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

import re
import string

import nltk
from nltk import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

## Load data

In [2]:
# Read in combined data frames csv file
posts = pd.read_csv('../data/subreddit_posts_combined.csv') 

## Look at the data

In [3]:
posts.head()

Unnamed: 0,author,created_utc,subreddit,text,timeframe
0,sub30605,1499390694,bulimia,Chest Pains? : Anyone else experience chest pa...,pre-covid
1,sub27274,1499060654,bulimia,"Dying to eat, eating to die: study on shifting...",pre-covid
2,sub6055,1499029087,bulimia,Without purging.... : What is the quickest way...,pre-covid
3,sub40365,1498978259,bulimia,Bulimia and Melancholy Feelings : I haven't pu...,pre-covid
4,sub49857,1498814187,bulimia,I'm relapsing : Fuck. I'm so upset at myself r...,pre-covid


#### Check Shape

In [4]:
posts.shape

(85401, 5)

#### Check data types

In [5]:
posts.dtypes

author         object
created_utc    object
subreddit      object
text           object
timeframe      object
dtype: object

#### Check Null Values

In [6]:
posts.isnull().sum()

author         0
created_utc    0
subreddit      1
text           1
timeframe      2
dtype: int64

In [7]:
(posts.loc[posts['text'].isnull() == True]).head(10)

Unnamed: 0,author,created_utc,subreddit,text,timeframe
27249,Pray for me.,pre-covid,,,


In [8]:
posts.dropna(inplace=True)

In [9]:
posts.shape

(85399, 5)

#### Count number of posts per subreddit

In [10]:
posts['subreddit'].value_counts()

bulimia            9777
AnorexiaNervosa    9733
schizophrenia      9549
bipolar            9493
Anxiety            9455
mentalhealth       9450
BPD                9345
depression         9318
autism             9279
Name: subreddit, dtype: int64

## Preprocess Text

### Normalization

https://towardsdatascience.com/cleaning-text-data-with-python-b69b47b97b76

In [11]:
stop_words = stopwords.words("english")

In [12]:
def text_preproc(x):
    x = x.lower()
    x = ' '.join([word for word in x.split(' ') if word not in stop_words])
    x = x.encode('ascii', 'ignore').decode("utf-8")
    x = re.sub(r'https*\S+', ' ', x)
    x = re.sub(r'@\S+', ' ', x)
    x = re.sub(r'#\S+', ' ', x)
    x = re.sub(r'\'\w+', '', x)
    x = re.sub('[%s]' % re.escape(string.punctuation), ' ', x)
    x = re.sub(r'\w*\d+\w*', '', x)
    x = re.sub(r'\s{2,}', ' ', x)
    return x

In [13]:
posts['text_clean'] = posts.text.apply(text_preproc)

In [14]:
posts.head()

Unnamed: 0,author,created_utc,subreddit,text,timeframe,text_clean
0,sub30605,1499390694,bulimia,Chest Pains? : Anyone else experience chest pa...,pre-covid,chest pains anyone else experience chest pains...
1,sub27274,1499060654,bulimia,"Dying to eat, eating to die: study on shifting...",pre-covid,dying eat eating die study shifting coping mec...
2,sub6055,1499029087,bulimia,Without purging.... : What is the quickest way...,pre-covid,without purging quickest way lose weight witho...
3,sub40365,1498978259,bulimia,Bulimia and Melancholy Feelings : I haven't pu...,pre-covid,bulimia melancholy feelings purged little mont...
4,sub49857,1498814187,bulimia,I'm relapsing : Fuck. I'm so upset at myself r...,pre-covid,i relapsing fuck i upset right now i purged tw...


### Tokenize sentences and words

In [15]:
# tokenize sentences
posts['sent_tokens'] = [sent_tokenize(row) for row in posts['text_clean'].astype(str)] 

In [16]:
# tokenize words
posts['word_tokens'] = [word_tokenize(row) for row in posts['text_clean'].astype(str)]

### Check for empty cells

In [20]:
posts.replace('', np.nan, inplace=True) 

In [21]:
posts.isnull().sum()

author         0
created_utc    0
subreddit      0
text           0
timeframe      0
text_clean     0
sent_tokens    0
word_tokens    0
text_length    0
dtype: int64

Need to drop empty cells, but unable to replace with nan... will look at low word count cells and then drop

In [36]:
# Create a new column with submission length
posts['text_length'] = posts['word_tokens'].map(len)

In [45]:
# Show the shortest submissions
pd.set_option('display.max_rows', 500)

to_drop = posts.sort_values(by='text_length', ascending=True)[['word_tokens']].head(115).index

posts = posts.drop(to_drop)

Drop the 'text length' column created above

In [46]:
posts = posts.drop(columns = 'text_length')

### Porter Stemmer

Using Porter Stemmer

In [47]:
# Create an instance of Porter Stemmer
stemmer = PorterStemmer()

In [48]:
text = posts['word_tokens'].astype(str)

In [49]:
posts['word_tokens'] = [stemmer.stem(word) for word in text]

### More cleaning

### Check dataframe filtered by submission Lengths

In [56]:
posts.head()

Unnamed: 0,author,created_utc,subreddit,text,timeframe,text_clean,sent_tokens,word_tokens
0,sub30605,1499390694,bulimia,Chest Pains? : Anyone else experience chest pa...,pre-covid,chest pains anyone else experience chest pains...,[chest pains anyone else experience chest pain...,"['chest', 'pains', 'anyone', 'else', 'experien..."
1,sub27274,1499060654,bulimia,"Dying to eat, eating to die: study on shifting...",pre-covid,dying eat eating die study shifting coping mec...,[dying eat eating die study shifting coping me...,"['dying', 'eat', 'eating', 'die', 'study', 'sh..."
2,sub6055,1499029087,bulimia,Without purging.... : What is the quickest way...,pre-covid,without purging quickest way lose weight witho...,[without purging quickest way lose weight with...,"['without', 'purging', 'quickest', 'way', 'los..."
3,sub40365,1498978259,bulimia,Bulimia and Melancholy Feelings : I haven't pu...,pre-covid,bulimia melancholy feelings purged little mont...,[bulimia melancholy feelings purged little mon...,"['bulimia', 'melancholy', 'feelings', 'purged'..."
4,sub49857,1498814187,bulimia,I'm relapsing : Fuck. I'm so upset at myself r...,pre-covid,i relapsing fuck i upset right now i purged tw...,[i relapsing fuck i upset right now i purged t...,"['i', 'relapsing', 'fuck', 'i', 'upset', 'righ..."


In [57]:
# Create a new column with submission length
posts['text_length'] = posts['sent_tokens'].map(len)

In [58]:
# Create a new column with number of words per submission
posts['stems_word_count'] = posts['word_tokens'].map(len)

In [60]:
# Show the shortest submissions
posts.sort_values(by='text_length', ascending=True)[['sent_tokens']].head()

Unnamed: 0,sent_tokens
0,[chest pains anyone else experience chest pain...
56960,[guys study meds im fricken sleepy meds lowest...
56959,[schizophrenia school dropped high school got ...
56958,[might lose apartment landlord sold apartment ...
56957,[someone help me back really bad weed trip did...


In [69]:
# Show the shortest submissions
posts.sort_values(by='stems_word_count', ascending=True)[['word_tokens']].head(10)

Unnamed: 0,word_tokens
10706,['u']
59451,['d']
48634,['i']
13876,['ha']
7127,['hi']
66100,['me']
17465,['mc']
16528,['lt']
24938,['me']
16332,['mc']


In [65]:
# Show the longest submissions
posts.sort_values(by='stems_word_count', ascending=False)[['word_tokens']].head(16)

Unnamed: 0,word_tokens
20780,"['reciprocal', 'suffering', 'if', 'time', 'ple..."
18665,"['think', 'may', 'schizophrenia', 'please', 'h..."
48207,"['rather', 'fast', 'recovery', 'amp', 'i', 'tu..."
82321,"['i', 'depressed', 'long', 'remember', 'really..."
84879,"['really', 'long', 'sad', 'story', 'begin', 'i..."
85244,"['extremely', 'long', 'post', 'felt', 'like', ..."
65625,"['experience', 'bpd', 'deal', 'it', 'worth', '..."
62675,"['little', 'almost', 'nothing', 'little', 'alm..."
34511,"['job', 'feelings', 'girl', 'taking', 'toll', ..."
39545,"['right', 'feel', 'way', 'expect', 'anyone', '..."


In [72]:
# Drop any unecessary rows before saving the dataframe to a clean csv file
posts.drop(columns=['text','text_length','stems_word_count'], inplace=True)

## Save to csv

In [73]:
posts.to_csv(r'../data/posts-preprocessed.csv', index=False)