# Bag-of-words interpretation model

## Import packages

In [36]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

## Read in data

In [11]:
complaints_df = pd.read_csv('~/documents/data/consumer_complaints/consumer_complaints_pre-processed.csv')
n_rows = complaints_df.shape[0]

## Split data

Split data into a training set and a testing set.

In [12]:
X_train, X_test, y_train, y_test = train_test_split(complaints_df['Consumer complaint narrative'], \
                                                    complaints_df['Issue'],\
                                                    test_size = 0.1,\
                                                   random_state = 42)

## CountVectorizer

Instantiate the CountVectorizer object. This will convert the complaints to a matrix of token counts.

In [15]:
count_vectorizer = CountVectorizer(stop_words = 'english', lowercase = True)

Learn the vocabulary dictionary from the training data and return a complaint-term matrix.

In [16]:
count_train = count_vectorizer.fit_transform(X_train.values)

In [31]:
count_train.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

Transform the test-data complaints to document-term matrix.

In [19]:
count_test = count_vectorizer.transform(X_test.values)

In [30]:
count_test.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [5, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [8, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [38]:
len(count_vectorizer.get_feature_names())

62392

Convert the count matrix to a pandas DataFrame.

In [50]:
count_df = pd.DataFrame(count_train.A, columns = count_vectorizer.get_feature_names())

In [64]:
count_df.head(10)

Unnamed: 0,00,000,0000,00000,0001,000a,000dollars,000if,000ii,000ins,...,zombie,zone,zoned,zones,zoning,zoo,zoom,zooms,ztuff,zwicker
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## TfidfVectorizer

Instantiate the TfidfVectorizer object which converts the complaints data to a matrix of TF-IDF features.

In [56]:
max_df = 0.7
tfidf_vectorizer = TfidfVectorizer(stop_words = 'english', lowercase = True, max_df = max_df)

Learn the vocabulary dictionary from the training data and return a complaint-term matrix.

In [57]:
tfidf_train = tfidf_vectorizer.fit_transform(X_train.values)

In [58]:
tfidf_train.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

Transform the test-data complaints to document-term matrix.

In [59]:
tfidf_test = tfidf_vectorizer.transform(X_test.values)

In [60]:
tfidf_test.toarray()

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.13401421, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.17205907, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

Convert the tf-idf matrix to a pandas DataFrame.

In [61]:
tfidf_df = pd.DataFrame(tfidf_train.A, columns = tfidf_vectorizer.get_feature_names())

In [63]:
tfidf_df.head(10)

Unnamed: 0,00,000,0000,00000,0001,000a,000dollars,000if,000ii,000ins,...,zombie,zone,zoned,zones,zoning,zoo,zoom,zooms,ztuff,zwicker
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.054017,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.049919,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
