In [17]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from sklearn import utils
from sklearn.model_selection import train_test_split
import gensim
from sklearn.linear_model import LogisticRegression
from gensim.models.doc2vec import TaggedDocument
import re
import seaborn as sns
import matplotlib.pyplot as plt


from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from sklearn.metrics import accuracy_score, f1_score

### Get documents

In [6]:
df = pd.read_csv('Consumer_Complaints.csv')
df = df[['Consumer complaint narrative','Product']]
df = df[pd.notnull(df['Consumer complaint narrative'])]
df.rename(columns = {'Consumer complaint narrative':'narrative'}, inplace = True)
df.head(10)

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,narrative,Product
36888,"AFTER REVIEWING MY CREDIT REPORT, I HAVE IDENT...","Credit reporting, credit repair services, or o..."
36926,"AFTER REVIEWING MY CREDIT REPORT, I HAVE IDENT...","Credit reporting, credit repair services, or o..."
44061,XXXX and Transunion are reporting incorrectly ...,"Credit reporting, credit repair services, or o..."
46638,I asked them to verify my debt and they have n...,Debt collection
47097,Hello. My name is XXXX XXXX. I recently Check ...,"Credit reporting, credit repair services, or o..."
47115,I applied for an XXXX account last week. I den...,"Credit reporting, credit repair services, or o..."
47314,I have a bunch of fraudulent information on my...,"Credit reporting, credit repair services, or o..."
47319,Navient does not allow access to a monthly sta...,Student loan
47337,XX/XX/2019 XX/XX/2019 XX/XX/2019 I have these ...,"Credit reporting, credit repair services, or o..."
47446,Hi my name is XXXX. I recently was at home doi...,"Credit reporting, credit repair services, or o..."


### Clean Documents 

In [7]:
def cleanText(text):
    text = BeautifulSoup(text, "lxml").text
    text = re.sub(r'\|\|\|', r' ', text) 
    text = re.sub(r'http\S+', r'<URL>', text)
    text = text.lower()
    text = text.replace('x', '')
    return text
df['narrative'] = df['narrative'].apply(cleanText)

In [9]:
train, test = train_test_split(df, test_size=0.3, random_state=42)

def tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            if len(word) < 2:
                continue
            tokens.append(word.lower())
    return tokens

train_tagged = train.apply(
    lambda r: TaggedDocument(words=tokenize_text(r['narrative']), tags=[r.Product]), axis=1)
test_tagged = test.apply(
    lambda r: TaggedDocument(words=tokenize_text(r['narrative']), tags=[r.Product]), axis=1)

In [10]:
model_dbow = Doc2Vec(dm=0, vector_size=100, negative=5, hs=0, min_count=2, sample = 0)
model_dbow.build_vocab([x for x in tqdm(train_tagged.values)])

100%|██████████| 274648/274648 [00:00<00:00, 2267462.68it/s]


In [11]:
for epoch in range(30):
    model_dbow.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha
    print(epoch)

100%|██████████| 274648/274648 [00:00<00:00, 2816900.08it/s]
100%|██████████| 274648/274648 [00:00<00:00, 3008632.39it/s]

0



100%|██████████| 274648/274648 [00:00<00:00, 3220402.24it/s]

1



100%|██████████| 274648/274648 [00:00<00:00, 2891763.70it/s]

2



100%|██████████| 274648/274648 [00:00<00:00, 3218773.54it/s]

3



100%|██████████| 274648/274648 [00:00<00:00, 3172404.58it/s]

4



100%|██████████| 274648/274648 [00:00<00:00, 3102597.46it/s]

5



100%|██████████| 274648/274648 [00:00<00:00, 3046973.82it/s]

6



100%|██████████| 274648/274648 [00:00<00:00, 1873542.88it/s]

7



100%|██████████| 274648/274648 [00:00<00:00, 3070682.17it/s]

8



100%|██████████| 274648/274648 [00:00<00:00, 3090909.40it/s]

9



100%|██████████| 274648/274648 [00:00<00:00, 2614448.10it/s]

10



100%|██████████| 274648/274648 [00:00<00:00, 3237317.10it/s]

11



100%|██████████| 274648/274648 [00:00<00:00, 3185317.15it/s]

12



100%|██████████| 274648/274648 [00:00<00:00, 3394279.03it/s]

13



100%|██████████| 274648/274648 [00:00<00:00, 3337893.76it/s]

14



100%|██████████| 274648/274648 [00:00<00:00, 3257493.52it/s]

15



100%|██████████| 274648/274648 [00:00<00:00, 3288346.79it/s]

16



100%|██████████| 274648/274648 [00:00<00:00, 3131705.63it/s]

17



100%|██████████| 274648/274648 [00:00<00:00, 3216993.74it/s]

18



100%|██████████| 274648/274648 [00:00<00:00, 3281022.64it/s]

19



100%|██████████| 274648/274648 [00:00<00:00, 2515574.88it/s]

20



100%|██████████| 274648/274648 [00:00<00:00, 3052059.70it/s]

21



100%|██████████| 274648/274648 [00:00<00:00, 3262087.32it/s]

22



100%|██████████| 274648/274648 [00:00<00:00, 2997975.78it/s]

23



100%|██████████| 274648/274648 [00:00<00:00, 3308729.12it/s]

24



100%|██████████| 274648/274648 [00:00<00:00, 3274494.11it/s]

25



100%|██████████| 274648/274648 [00:00<00:00, 3336946.19it/s]

26



100%|██████████| 274648/274648 [00:00<00:00, 3197855.82it/s]

27



100%|██████████| 274648/274648 [00:00<00:00, 3282546.59it/s]

28





29


In [14]:
def vec_for_learning(model, tagged_docs):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in tqdm(sents)])
    return targets, regressors

In [15]:
y_train, X_train = vec_for_learning(model_dbow, train_tagged)
y_test, X_test = vec_for_learning(model_dbow, test_tagged)
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

100%|██████████| 274648/274648 [15:20<00:00, 298.50it/s]
100%|██████████| 117707/117707 [06:32<00:00, 299.51it/s]


NameError: name 'accuracy_score' is not defined

In [18]:
print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))

Testing accuracy 0.6687707612971191
Testing F1 score: 0.6470871755332511
