# Bag-of-words interpretation model

## Import packages

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

## Read in data

In [3]:
complaints_df = pd.read_csv('~/documents/data/consumer_complaints/consumer_complaints_pre-processed.csv')
n_rows = complaints_df.shape[0]

In [4]:
print(complaints_df.isna().sum(), '\n')

Unnamed: 0                      0
Product                         0
Issue                           0
Consumer complaint narrative    2
Company                         0
dtype: int64 



Remove the two NaN rows which arose after pre-processing.

In [5]:
complaints_df = complaints_df[complaints_df['Consumer complaint narrative'].notnull()]

## Split data

Split data into a training set and a testing set.

In [6]:
X_train, X_test, y_train, y_test = train_test_split(complaints_df['Consumer complaint narrative'], \
                                                    complaints_df['Issue'],\
                                                    test_size = 0.1,\
                                                   random_state = 42)

## CountVectorizer

Instantiate the CountVectorizer object. This will convert the complaints to a matrix of token counts.

In [7]:
count_vectorizer = CountVectorizer(stop_words = 'english', lowercase = True)

Learn the vocabulary dictionary from the training data and return a complaint-term matrix.

In [8]:
count_train = count_vectorizer.fit_transform(X_train.values)

In [9]:
count_train.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

Transform the test-data complaints to document-term matrix.

In [10]:
count_test = count_vectorizer.transform(X_test.values)

In [11]:
count_test.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [12]:
len(count_vectorizer.get_feature_names())

53261

Convert the count matrix to a pandas DataFrame.

In [13]:
count_df = pd.DataFrame(count_train.A, columns = count_vectorizer.get_feature_names())

In [14]:
count_df.head(10)

Unnamed: 0,aa,aaa,aaaaan,aaaaargh,aaadvantage,aaaked,aaarm,aac,aacceptance,aaccount,...,zipcode,zipper,zombie,zone,zoned,zoning,zoo,zoom,ztuff,zwicker
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## TfidfVectorizer

Instantiate the TfidfVectorizer object which converts the complaints data to a matrix of TF-IDF features.

In [15]:
max_df = 0.7
tfidf_vectorizer = TfidfVectorizer(stop_words = 'english', lowercase = True, max_df = max_df)

Learn the vocabulary dictionary from the training data and return a complaint-term matrix.

In [16]:
tfidf_train = tfidf_vectorizer.fit_transform(X_train.values)

In [17]:
tfidf_train.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

Transform the test-data complaints to document-term matrix.

In [18]:
tfidf_test = tfidf_vectorizer.transform(X_test.values)

In [19]:
tfidf_test.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

Convert the tf-idf matrix to a pandas DataFrame.

In [20]:
tfidf_df = pd.DataFrame(tfidf_train.A, columns = tfidf_vectorizer.get_feature_names())

In [21]:
tfidf_df.head(10)

Unnamed: 0,aa,aaa,aaaaan,aaaaargh,aaadvantage,aaaked,aaarm,aac,aacceptance,aaccount,...,zipcode,zipper,zombie,zone,zoned,zoning,zoo,zoom,ztuff,zwicker
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Multinomial naive Bayes classifier

### Using the count matrix

Initialise the classifier.

In [56]:
nb_classifier = MultinomialNB(alpha = 0.01)

Fit the classifier to the training data.

In [57]:
nb_classifier.fit(count_train, y_train)

MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)

Use the classifier to predict the issue for the test data.

In [58]:
y_pred = nb_classifier.predict(count_test)

Determine the accuracy of the predictions.

In [59]:
metrics.accuracy_score(y_test, y_pred)

0.5157100228190276

In [60]:
len(complaints_df['Issue'].unique().tolist())

90

The accuracy score is 53%, which is not great, but not terrible considering there are 90 different issues that a complaint could relate to, and we have used a relatively crude bag-of-words methodology.

### Using the tf-idf matrix

Initialise the classifier.

In [61]:
nb_classifier = MultinomialNB(alpha = 0.05)

Fit the classifier to the training data.

In [62]:
nb_classifier.fit(tfidf_train, y_train)

MultinomialNB(alpha=0.05, class_prior=None, fit_prior=True)

Use the classifier to predict the issue for the test data.

In [49]:
y_pred = nb_classifier.predict(tfidf_test)

Determine the accuracy of the predictions.

In [50]:
metrics.accuracy_score(y_test, y_pred)

0.4882394242583816

The performance is slightly worse when using the tf-idf matrix.