In [2]:
import numpy as np
import pandas as pd
import torch

In [3]:
import random

# Seed every random number generator used in this notebook with one shared
# value so that the sampling below is repeatable.
seed_val = 42

for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(seed_val)

In [4]:
# Load the raw complaints export from the working directory, then show its
# (rows, columns) shape and the first five rows.
# NOTE(review): the saved output echoes this read_csv line the way pandas does
# when it emits a warning -- presumably a mixed-dtype DtypeWarning; confirm and
# consider passing explicit dtypes if so.
data = pd.read_csv("consumer_complaints.csv")

print(data.shape)
data.head()

(555957, 18)


  data = pd.read_csv("consumer_complaints.csv")


Unnamed: 0,date_received,product,sub_product,issue,sub_issue,consumer_complaint_narrative,company_public_response,company,state,zipcode,tags,consumer_consent_provided,submitted_via,date_sent_to_company,company_response_to_consumer,timely_response,consumer_disputed?,complaint_id
0,08/30/2013,Mortgage,Other mortgage,"Loan modification,collection,foreclosure",,,,U.S. Bancorp,CA,95993,,,Referral,09/03/2013,Closed with explanation,Yes,Yes,511074
1,08/30/2013,Mortgage,Other mortgage,"Loan servicing, payments, escrow account",,,,Wells Fargo & Company,CA,91104,,,Referral,09/03/2013,Closed with explanation,Yes,Yes,511080
2,08/30/2013,Credit reporting,,Incorrect information on credit report,Account status,,,Wells Fargo & Company,NY,11764,,,Postal mail,09/18/2013,Closed with explanation,Yes,No,510473
3,08/30/2013,Student loan,Non-federal student loan,Repaying your loan,Repaying your loan,,,"Navient Solutions, Inc.",MD,21402,,,Email,08/30/2013,Closed with explanation,Yes,Yes,510326
4,08/30/2013,Debt collection,Credit card,False statements or representation,Attempted to collect wrong amount,,,Resurgent Capital Services L.P.,GA,30106,,,Web,08/30/2013,Closed with explanation,Yes,Yes,511067


In [5]:
# Discard rows that have no narrative text or no product label; both are
# required for the classification data built below.
required_columns = ["consumer_complaint_narrative", "product"]
data = data.dropna(subset=required_columns)
data.head()

Unnamed: 0,date_received,product,sub_product,issue,sub_issue,consumer_complaint_narrative,company_public_response,company,state,zipcode,tags,consumer_consent_provided,submitted_via,date_sent_to_company,company_response_to_consumer,timely_response,consumer_disputed?,complaint_id
190126,03/19/2015,Debt collection,"Other (i.e. phone, health club, etc.)",Cont'd attempts collect debt not owed,Debt was paid,XXXX has claimed I owe them {$27.00} for XXXX ...,,"Diversified Consultants, Inc.",NY,121XX,Older American,Consent provided,Web,03/19/2015,Closed with explanation,Yes,No,1290516
190135,03/19/2015,Consumer Loan,Vehicle loan,Managing the loan or lease,,Due to inconsistencies in the amount owed that...,,M&T Bank Corporation,VA,221XX,Servicemember,Consent provided,Web,03/19/2015,Closed with explanation,Yes,No,1290492
190155,03/19/2015,Mortgage,Conventional fixed mortgage,"Loan modification,collection,foreclosure",,In XX/XX/XXXX my wages that I earned at my job...,,Wells Fargo & Company,CA,946XX,,Consent provided,Web,03/19/2015,Closed with explanation,Yes,Yes,1290524
190207,03/19/2015,Mortgage,Conventional fixed mortgage,"Loan servicing, payments, escrow account",,I have an open and current mortgage with Chase...,,JPMorgan Chase & Co.,CA,900XX,Older American,Consent provided,Web,03/19/2015,Closed with explanation,Yes,Yes,1290253
190208,03/19/2015,Mortgage,Conventional fixed mortgage,Credit decision / Underwriting,,XXXX was submitted XX/XX/XXXX. At the time I s...,,Rushmore Loan Management Services LLC,CA,956XX,Older American,Consent provided,Web,03/19/2015,Closed with explanation,Yes,Yes,1292137


In [6]:
# Keep only narratives with at least 250 whitespace-separated tokens, and
# narrow the frame to the label and text columns.
narrative_lengths = data["consumer_complaint_narrative"].str.split().str.len()
is_long_enough = narrative_lengths >= 250
data = data.loc[is_long_enough, ["product", "consumer_complaint_narrative"]]

print(data.shape)
data.head()

(17142, 2)


Unnamed: 0,product,consumer_complaint_narrative
190155,Mortgage,In XX/XX/XXXX my wages that I earned at my job...
190208,Mortgage,XXXX was submitted XX/XX/XXXX. At the time I s...
190251,Mortgage,I spoke to XXXX of green tree representatives ...
190263,Credit card,i opened XXXX Bank of America credit cards 15-...
190264,Consumer Loan,I applied for a loan with XXXX XXXX and had pu...


In [7]:
# Number of remaining complaints per product, most frequent first.
data["product"].value_counts()

Mortgage                   6042
Debt collection            2832
Credit card                2293
Bank account or service    1930
Credit reporting           1753
Consumer Loan              1066
Student loan                734
Money transfers             201
Prepaid card                160
Payday loan                 100
Other financial service      31
Name: product, dtype: int64

In [8]:
# Exclude the "Other financial service" product (only 31 rows in the counts above).
data = data[data["product"] != "Other financial service"]

In [9]:
# Build the label vocabulary once, in order of first appearance in `data`,
# and derive both lookup directions (name -> id and id -> name) from it.
product_names = data["product"].unique()
num_labels = len(product_names)
print("num labels: ", num_labels)
id2genre = dict(enumerate(product_names))
genre2id = {genre: i for i, genre in id2genre.items()}

num labels:  10


In [10]:
# Attach the integer class id for each row's product (KeyError on an unknown product).
data["genre_id"] = [genre2id[product] for product in data["product"]]

In [11]:
import re

def clean_text(text):
    """Normalize one complaint narrative.

    The input is converted with ``str`` and lowercased, then, in order:
    square-bracketed spans and URLs are dropped, every character other than
    ``a-z``, ``0-9`` and ``. , ? !`` is replaced by a space, tokens that
    contain a digit are removed, runs of two or more ``x`` are removed, and
    runs of whitespace are collapsed to a single space.

    Args:
        text: Raw narrative. Any object is accepted; it is passed through ``str``.

    Returns:
        The cleaned string. Leading and trailing spaces are not stripped.
    """
    text = str(text).lower()
    # Brackets and links must be handled before the character filter below:
    # that filter erases '[', ']', ':' and '/', after which these two patterns
    # could never match.
    text = re.sub(r'\[.*?\]', ' ', text)  # remove any text in square brackets
    text = re.sub(r'https?://\S+|www\.\S+', ' ', text)  # remove any links present
    text = re.sub(r'[^a-z0-9.,?!]', ' ', text)  # keep letters, digits and . , ? ! (newlines become spaces too)
    text = re.sub(r'\w*\d\w*', '', text)  # remove the words containing numbers
    # NOTE(review): this also removes "xx" inside ordinary words
    # (e.g. "exxon" -> "eon"); kept as in the original.
    text = re.sub(r'x{2,}', '', text)  # remove multiple x's
    # Collapse whitespace last so the removals above leave no runs of spaces.
    text = re.sub(r'\s+', ' ', text)

    return text

In [12]:
data['consumer_complaint_narrative'] = data['consumer_complaint_narrative'].apply(clean_text)
data.head()

Unnamed: 0,product,consumer_complaint_narrative,genre_id
190155,Mortgage,in my wages that i earned at my job decreas...,0
190208,Mortgage,was submitted . at the time i submitted thi...,0
190251,Mortgage,i spoke to of green tree representatives on ...,0
190263,Credit card,i opened bank of america credit cards years a...,1
190264,Consumer Loan,i applied for a loan with and had purchased ...,2


In [13]:
# Split the cleaned data into one frame per product.
# Rows are selected by product name rather than by a hard-coded integer id:
# the ids in `genre2id` are assigned in order of first appearance in `data`,
# so literal ids would silently select different products if the input rows
# were ordered differently.
mortgage = data[data["product"] == "Mortgage"]
credit_card = data[data["product"] == "Credit card"]
consumer_loan = data[data["product"] == "Consumer Loan"]
debt_collection = data[data["product"] == "Debt collection"]
credit_reporting = data[data["product"] == "Credit reporting"]
student_loan = data[data["product"] == "Student loan"]
bank_account = data[data["product"] == "Bank account or service"]
money_transfers = data[data["product"] == "Money transfers"]
payday_loan = data[data["product"] == "Payday loan"]
prepaid_card = data[data["product"] == "Prepaid card"]

In [14]:
from sklearn.utils import resample

def _take_sample(frame, n_samples):
    """Draw `n_samples` rows from `frame` without replacement, using random_state=42."""
    return resample(frame, replace=False, n_samples=n_samples, random_state=42)


# Per-product training sample sizes: 600 for the larger classes, fewer for the
# classes that have less data available.
mortgage_downsample = _take_sample(mortgage, 600)
credit_card_downsample = _take_sample(credit_card, 600)
consumer_loan_downsample = _take_sample(consumer_loan, 600)
debt_collection_downsample = _take_sample(debt_collection, 600)
credit_reporting_downsample = _take_sample(credit_reporting, 600)
student_loan_downsample = _take_sample(student_loan, 500)
bank_account_downsample = _take_sample(bank_account, 600)
money_transfers_downsample = _take_sample(money_transfers, 160)
payday_loan_downsample = _take_sample(payday_loan, 80)
prepaid_card_downsample = _take_sample(prepaid_card, 130)

In [15]:
# Stack the per-product samples, in this order, to form the training set.
downsampled_frames = [
    mortgage_downsample,
    credit_card_downsample,
    consumer_loan_downsample,
    debt_collection_downsample,
    credit_reporting_downsample,
    student_loan_downsample,
    bank_account_downsample,
    money_transfers_downsample,
    payday_loan_downsample,
    prepaid_card_downsample,
]
train = pd.concat(downsampled_frames)

In [16]:
# Per-product row counts in the training set, to check the sample sizes.
train["product"].value_counts()

Mortgage                   600
Credit card                600
Consumer Loan              600
Debt collection            600
Credit reporting           600
Bank account or service    600
Student loan               500
Money transfers            160
Prepaid card               130
Payday loan                 80
Name: product, dtype: int64

In [17]:
# Every row whose index label was not drawn into `train` becomes test data.
in_train = data.index.isin(train.index)
test = data.loc[~in_train]

In [18]:
# Per-product row counts in the test set (the rows left over after sampling `train`).
test["product"].value_counts()

Mortgage                   5442
Debt collection            2232
Credit card                1693
Bank account or service    1330
Credit reporting           1153
Consumer Loan               466
Student loan                234
Money transfers              41
Prepaid card                 30
Payday loan                  20
Name: product, dtype: int64

In [19]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu")

if not use_gpu:
    print('No GPU available, using the CPU instead.')
else:
    # Report how many GPUs are visible and the name of device 0.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce GTX 1070


In [20]:
# Display the assembled training frame (pandas truncates the middle rows).
train

Unnamed: 0,product,consumer_complaint_narrative,genre_id
289175,Mortgage,we have been trying to get our home out of for...,0
255512,Mortgage,carrington mortgage took over servicing my loa...,0
504921,Mortgage,this is a dispute to case number . i received ...,0
297108,Mortgage,"is to any ? dear .... my name is , with ...",0
312177,Mortgage,", i took out a mortgage . less than a month ...",0
...,...,...,...
224279,Prepaid card,i signed up for pay power visa card to get a ...,9
245873,Prepaid card,i purchased goggle . gift cards and when i we...,9
297169,Prepaid card,"on , i was online trying to file for identity...",9
519629,Prepaid card,on i noticed some suspicious activity on my ...,9


In [21]:
# Give both splits the same generic column names.
column_names = {'product': 'label', 'consumer_complaint_narrative': 'text', 'genre_id': 'label_id'}
train, test = (split.rename(columns=column_names) for split in (train, test))

In [22]:
import os

# Write both splits to disk as CSV.
# The output directory is created first so this cell also works on a fresh
# checkout where `data/` does not exist yet (to_csv does not create it).
os.makedirs('data', exist_ok=True)
# index=False: the original row labels are not written to the files.
train.to_csv('data/ds2_train.csv', index=False)
test.to_csv('data/ds2_test.csv', index=False)

In [25]:
# Reload the training split from the CSV written above and preview it.
# The file was written with index=False, so the reloaded frame gets a fresh
# 0-based index.
train = pd.read_csv("data/ds2_train.csv")
train.head()

Unnamed: 0,label,text,label_id
0,Mortgage,we have been trying to get our home out of for...,0
1,Mortgage,carrington mortgage took over servicing my loa...,0
2,Mortgage,this is a dispute to case number . i received ...,0
3,Mortgage,"is to any ? dear .... my name is , with ...",0
4,Mortgage,", i took out a mortgage . less than a month ...",0
