# Bag-of-words interpretation model

## Import packages

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

## Read in data

In [11]:
# Load the cleaned consumer-complaints CSV (path is under the user's home
# directory) and keep the number of rows for later reference.
complaints_df = pd.read_csv(
    '~/documents/data/consumer_complaints/consumer_complaints_clean.csv'
)
n_rows = len(complaints_df)

## Split data

Split the data into a training set (90%) and a test set (10%), using a fixed random seed so the split is reproducible.

In [12]:
# Features are the free-text complaint narratives; labels are the 'Issue'
# column. 10% of the rows are held out, with a fixed seed for the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    complaints_df['Consumer complaint narrative'],
    complaints_df['Issue'],
    test_size=0.1,
    random_state=42,
)

## CountVectorizer

Instantiate a `CountVectorizer`, which will be used to convert the complaints into a matrix of token counts. It is configured to lowercase the text and remove English stop words.

In [15]:
# Bag-of-words vectorizer: lowercases the text and drops English stop words.
count_vectorizer = CountVectorizer(
    lowercase=True,
    stop_words='english',
)

Learn the vocabulary dictionary from the training data and return a complaint-term matrix.

In [16]:
# Build the vocabulary from the training narratives and encode them as
# token counts in a single call.
count_train = count_vectorizer.fit_transform(raw_documents=X_train.values)

In [31]:
# Convert the training count matrix to a dense array so it can be displayed.
# NOTE(review): toarray() materializes the entire matrix in memory; if the
# corpus or vocabulary is large, inspecting count_train.shape or a small
# slice (e.g. count_train[:5].toarray()) is much cheaper — confirm the size.
count_train.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

Transform the test-set complaints into a document-term matrix, using the vocabulary learned from the training data.

In [19]:
# Encode the held-out narratives with the already-fitted vectorizer
# (transform only, so the vocabulary is not re-learned here).
count_test = count_vectorizer.transform(raw_documents=X_test.values)

In [30]:
# Convert the test count matrix to a dense array so it can be displayed.
# NOTE(review): toarray() materializes the entire matrix in memory; if the
# matrix is large, inspecting count_test.shape or a small slice
# (e.g. count_test[:5].toarray()) is much cheaper — confirm the size.
count_test.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [5, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [8, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])