# Data preprocessing

## Import packages

In [8]:
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.tokenize import regexp_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import Counter

## Read in data

In [2]:
# Only the first n_rows rows of the CSV are loaded; the first CSV column
# becomes the DataFrame index.
n_rows = 100
complaints_df = pd.read_csv(
    '~/documents/data/consumer_complaints/consumer_complaints_clean.csv',
    index_col=0,
    nrows=n_rows,
)

## Make lower case

Words with and without capital letters are considered to be different words by the nltk package. I can simplify the problem by making all letters lower case.

In [3]:
# Lower-case every string cell in every column of complaints_df.
# Non-string cells (for example the float NaN that read_csv produces for a
# missing value) are passed through unchanged instead of raising
# AttributeError on .lower().
for col in complaints_df.columns:
    complaints_df[col] = [
        element.lower() if isinstance(element, str) else element
        for element in complaints_df[col]
    ]

## Tokenize, stem and remove stop words

Here I tokenize each of the complaints by splitting them into a list of separate words. I throw away punctuation at this point. This will make the remaining steps easier.

I also stem each word to its root to save space, speed up the following analysis, and minimise any kind of overfitting to non-meaningful words.

Stop words are commonly occurring words that take up space and add little meaning. Here I remove the stop words to reduce the size of the problem even further. "xxxx" is a string that replaces words in the consumer complaints that were redacted for confidentiality reasons. This string occurs frequently, but because its meaning is obscured it adds no value, so I add it to the list of stop words.

In [4]:
# NLTK's English stop-word list, extended with "xxxx" (the placeholder that
# replaces redacted words in the complaint narratives).
stop_words = stopwords.words('english') + ["xxxx"]

In [9]:
# Tokenize each complaint narrative, drop stop words, and stem what remains.
# The result (one list of stemmed tokens per row) is stored in the
# 'Consumer complaint narrative' column.

# \w+ keeps runs of word characters, so punctuation and whitespace are dropped.
pattern = r"\w+"
# NOTE(review): num_top_words is not referenced in this cell; kept in case a
# later cell reads it.
num_top_words = 50

ps = PorterStemmer()

# Build the lookup set once: membership tests on a set are O(1), whereas
# scanning the stop_words list is repeated for every token otherwise.
stop_word_set = set(stop_words)

complaints_list = []
# Iterate over the rows actually present rather than range(n_rows), so a CSV
# with fewer than n_rows rows no longer raises IndexError.
# NOTE(review): positional column 2 is presumably 'Consumer complaint
# narrative' (the column written below) -- confirm against the CSV header.
for narrative in complaints_df.iloc[:, 2]:
    complaint = regexp_tokenize(narrative, pattern)
    # Stop words are filtered on the raw token, before stemming.
    complaints_list.append(
        [ps.stem(word) for word in complaint if word not in stop_word_set]
    )

complaints_df['Consumer complaint narrative'] = complaints_list

# The token lists now live in the DataFrame; drop the extra reference.
del complaints_list

In [None]:
#complaint = [word[0] for word in Counter(complaint).most_common(5)]