## Text Processing 

### Load in the cleaned data 

In [1]:
import pandas as pd


In [7]:
df = pd.read_csv('../clean_data/comments_mbti.csv', usecols=['MBTI','comments'], dtype={'MBTI': str, 'comments': str}, index_col=None)

In [8]:
df.shape

(6436958, 2)

In [4]:
df.isna().sum()

Unnamed: 0    1179648
MBTI          1684186
comments      1684186
dtype: int64

In [34]:
# remove the extra index column 
df = df.drop(['Unnamed: 0'], axis=1)

In [35]:
# convert all comments text to lower case 
df['comments'] = df['comments'].str.lower()

In [36]:
df.dtypes

MBTI        object
comments    object
dtype: object

In [37]:
# convert the df to strings
df['comments'] = df['comments'].astype(str).str.replace('/',' ')

In [41]:
df.isna().sum()

MBTI        1684186
comments          0
dtype: int64

### Stemming and Lemmatization  

#### Punctuations

In [14]:
import nltk 
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

In [15]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Doylism/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [16]:
import string
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [17]:
stop_words = stopwords.words('english')
stemmer = SnowballStemmer('english')

In [18]:
# write a function to preprocess text 
def preprocess(text, stem=False):
    tokens=[]
    for token in text.split():
        stemmed_word = stemmer.stem(token)
        for punctuation in string.punctuation:
            stemmed_word = stemmed_word.replace(punctuation, '')
        if token not in stop_words:
            if stem:
                tokens.append(stemmed_word)
            else:
                tokens.append(token)
    return ' '.join(tokens)

In [39]:
# subset a small amount of data to test the function 
df_test = df.sample(n=100, random_state=1)

In [20]:
df_test['comments'] = df_test['comments'].apply(lambda x: preprocess(x))

AttributeError: 'float' object has no attribute 'split'

In [12]:
# write a function to remove punctuation 
def punctuation(text):
    clean = ''.join([t for t in text if t not in string.punctuation])
    return clean