# Text Pre-processing

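This notebook prepares the combined subreddit posts for text analysis. It loads the combined posts CSV, converts the `created_utc` timestamps to datetimes, runs a few sanity checks on the data, and then cleans the text (removing punctuation and lower-casing), tokenizes it, removes stop words, and stems the remaining words. The pre-processed posts are saved to a new CSV file.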

##### Import libraries

In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

import datetime

# Pre-processing imports
import nltk
from nltk import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

## Load data

In [2]:
# Read the combined subreddit posts CSV (path is relative to this notebook)
# into the `posts` DataFrame that every later cell works on.
posts = pd.read_csv('../data/subreddit_posts_combined.csv') 

#### Convert 'created_utc' column to datetime, drop the original column, and sort by datetime

In [3]:
# 'created_utc' is treated as Unix epoch seconds. pd.to_datetime(unit='s')
# converts the whole column in one vectorized call and yields UTC wall-clock
# times, so the result does not depend on the time zone of the machine running
# the notebook (datetime.fromtimestamp converts to *local* time).
# The new 'datetime' column replaces 'created_utc', and rows are ordered
# chronologically.
posts = (
    posts
    .assign(datetime=pd.to_datetime(posts['created_utc'], unit='s'))
    .drop(columns='created_utc')
    .sort_values(by='datetime')
)

## Look at the data

In [4]:
# Preview the first rows of the date-sorted frame.
posts.head()

Unnamed: 0,author,subreddit,timeframe,text,datetime
853,sub17967,bulimia,pre-covid,How can I stop hating myself : I have been on ...,2017-12-02 16:36:16
852,sub10311,bulimia,pre-covid,"New Guy Here, 1 Month On it (16M) : Hi guys, j...",2017-12-05 19:45:25
851,sub5587,bulimia,pre-covid,So I just vomited blood. What can I eat while ...,2017-12-06 16:58:16
850,sub32498,bulimia,pre-covid,Recovery is expensive .. during recovery : Hi....,2017-12-07 14:07:27
849,sub35262,bulimia,pre-covid,Anyone Relate? --Wanting Validation For Small ...,2017-12-08 00:49:23


#### Check Shape

In [5]:
# (number of rows, number of columns)
posts.shape

(47857, 5)

#### Check data types

In [6]:
# Data type of each column; 'datetime' should be a datetime64 column after the conversion above.
posts.dtypes

author               object
subreddit            object
timeframe            object
text                 object
datetime     datetime64[ns]
dtype: object

#### Check Null Values

In [7]:
# Number of missing values in each column.
posts.isnull().sum()

author       0
subreddit    0
timeframe    0
text         0
datetime     0
dtype: int64

#### Count number of posts per timeframe

In [8]:
# Number of posts for each distinct 'timeframe' value.
posts['timeframe'].value_counts()

pre-covid     24783
post-covid    23074
Name: timeframe, dtype: int64

#### Count number of posts per subreddit

In [9]:
# Number of posts for each distinct 'subreddit' value.
posts['subreddit'].value_counts()

BPD                6005
Anxiety            5908
mentalhealth       5879
bulimia            5718
depression         5638
AnorexiaNervosa    5596
schizophrenia      4932
bipolar            4526
autism             3655
Name: subreddit, dtype: int64

## Preprocess Text

### Clean text
Remove punctuation and make lower case

In [10]:
# Strip every run of characters that is neither a word character nor whitespace.
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a literal
# string by default, which would leave all punctuation in place.
posts['text'] = posts['text'].str.replace(r'[^\w\s]+', '', regex=True)

In [11]:
# Lower-case every post with the vectorized string accessor.
posts['text'] = posts['text'].str.lower()

### Tokenize sentences and words

In [12]:
# Replace each post's text with the list of sentences returned by sent_tokenize.
# NOTE(review): punctuation has already been stripped from 'text' in the
# cleaning cell above, so the sentence tokenizer may find no boundaries and
# return a single-item list per post — confirm this ordering is intended.
posts['text'] = posts['text'].map(sent_tokenize)

In [13]:
# Tokenize every sentence of a post into words and flatten the result into one
# token list per post. The sentence lists are iterated directly instead of being
# cast with .astype(str): the cast produces the list's repr, so its brackets,
# quotes and commas would end up as tokens.
posts['words'] = [
    [token for sentence in sentences for token in word_tokenize(sentence)]
    for sentences in posts['text']
]

### Remove stop words

In [14]:
# NLTK's English stop-word list; the custom terms defined below are added to it.
stop = stopwords.words('english')

In [15]:
# Subreddit / diagnosis terms removed in addition to the generic English stop
# words. Entries must be lower case: the text is lower-cased before the
# stop-word filter runs, so an upper-case entry can never match a token.
sw_list = {'anorexia', 'anorexianervosa', 'anorexic', 'anorexics', 'ana',
           'bulimia', 'bulimic', 'bullimia', 'bullimic', 'depressed',
           'depression', 'bipolar', 'schizophrenia', 'schizophrenic', 'bpd',
           'borderline', 'anxiety', 'autistic', 'autism'}

In [16]:
# Add the custom terms to the stop-word list. Words already present are skipped
# so that re-running this cell does not append duplicates.
stop.extend([word for word in sw_list if word not in stop])

In [17]:
# Build the lookup set once: set membership is O(1), whereas testing each token
# against the `stop` list scans the whole list for every token of every post.
# Entries are lower-cased so they can match the lower-cased tokens.
stop_set = {word.lower() for word in stop}
posts['words'] = posts['words'].apply(
    lambda tokens: [token for token in tokens if token not in stop_set]
)

### Normalize 

Reduce each word to its stem using NLTK's Porter stemmer.

In [18]:
# Create the NLTK PorterStemmer instance used by the stemming cell below.
stemmer = PorterStemmer()

In [19]:
# Cast the 'words' column to str: each row becomes a single string
# rather than a list of tokens.
text = posts['words'].astype(str)

In [20]:
# Stem every token of every post, keeping one list of stems per post.
# PorterStemmer.stem works on a single word, so the token lists in
# posts['words'] are iterated directly; the stringified `text` series would
# hand the stemmer one long string (the repr of the whole list) per post.
posts['word_stems'] = [
    [stemmer.stem(token) for token in tokens]
    for tokens in posts['words']
]

## Save to csv

In [21]:
# Write the pre-processed frame to CSV without the index column.
# NOTE(review): 'text' and 'words' hold Python lists at this point; CSV will
# presumably store them as their string form, so downstream readers would need
# to parse them back into lists — confirm.
posts.to_csv(r'../data/posts-preprocessed.csv', index = False)