In [1]:
import numpy as np
import pandas as pd
import torch

In [2]:
import random

# Seed every random number generator used in this notebook (Python, NumPy,
# PyTorch CPU and PyTorch CUDA) with one shared value for reproducibility.
seed_val = 42

for seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
):
    seed_fn(seed_val)

In [4]:
# Load the raw complaints export.
# NOTE(review): the output saved with this cell echoes the read_csv line,
# which is how Python reports a warning raised at that call -- presumably
# pandas' mixed-dtype DtypeWarning (confirm). low_memory=False makes pandas
# parse each column in a single pass so dtype inference is consistent.
data = pd.read_csv("consumer_complaints.csv", low_memory=False)

print(data.shape)
data.head()

(555957, 18)


  data = pd.read_csv("consumer_complaints.csv")


Unnamed: 0,date_received,product,sub_product,issue,sub_issue,consumer_complaint_narrative,company_public_response,company,state,zipcode,tags,consumer_consent_provided,submitted_via,date_sent_to_company,company_response_to_consumer,timely_response,consumer_disputed?,complaint_id
0,08/30/2013,Mortgage,Other mortgage,"Loan modification,collection,foreclosure",,,,U.S. Bancorp,CA,95993,,,Referral,09/03/2013,Closed with explanation,Yes,Yes,511074
1,08/30/2013,Mortgage,Other mortgage,"Loan servicing, payments, escrow account",,,,Wells Fargo & Company,CA,91104,,,Referral,09/03/2013,Closed with explanation,Yes,Yes,511080
2,08/30/2013,Credit reporting,,Incorrect information on credit report,Account status,,,Wells Fargo & Company,NY,11764,,,Postal mail,09/18/2013,Closed with explanation,Yes,No,510473
3,08/30/2013,Student loan,Non-federal student loan,Repaying your loan,Repaying your loan,,,"Navient Solutions, Inc.",MD,21402,,,Email,08/30/2013,Closed with explanation,Yes,Yes,510326
4,08/30/2013,Debt collection,Credit card,False statements or representation,Attempted to collect wrong amount,,,Resurgent Capital Services L.P.,GA,30106,,,Web,08/30/2013,Closed with explanation,Yes,Yes,511067


In [None]:
# Discard complaints that lack either the narrative text or the product.
required_columns = ["consumer_complaint_narrative", "product"]
data = data.dropna(subset=required_columns)
data.head()

In [None]:
# Count whitespace-separated tokens per narrative, then keep only the
# complaints with at least 250 tokens and the two columns used from here on.
data['word_count'] = data['consumer_complaint_narrative'].str.split().str.len()
is_long_enough = data['word_count'] >= 250
data = data.loc[is_long_enough, ["product", "consumer_complaint_narrative"]]

print(data.shape)
data.head()

In [None]:
# Show how many of the remaining complaints fall under each product.
data["product"].value_counts()

In [None]:
# Exclude the "Other financial service" product from the dataset.
data = data[data["product"] != "Other financial service"]

In [None]:
# Distinct product names, in the order they first appear in the frame.
product_names = data["product"].unique()

num_labels = len(product_names)
print("num labels: ", num_labels)

# Two-way lookup between a product name and its integer id.
genre2id = {name: idx for idx, name in enumerate(product_names)}
id2genre = dict(enumerate(product_names))

In [None]:
# Attach the integer label for each row's product.
# assign() returns a new frame, which avoids pandas' SettingWithCopyWarning
# when `data` is the result of an earlier boolean filter; Series.map does the
# dictionary lookup directly instead of calling a Python lambda per row.
data = data.assign(genre_id=data["product"].map(genre2id))

In [None]:
import re

def clean_text(text):
    """Normalise a complaint narrative into lowercase, lightly punctuated text.

    Steps, in order:
      1. lowercase the input (non-strings are converted with ``str``);
      2. drop any text enclosed in square brackets;
      3. drop links (``http://``, ``https://`` or ``www.`` prefixes);
      4. replace every character other than a-z, 0-9 and ``. , ? !`` with a
         space (this also turns newlines into spaces);
      5. drop words that contain a digit;
      6. drop runs of two or more ``x`` (presumably redaction placeholders
         such as ``XXXX`` -- confirm against the dataset);
      7. collapse runs of whitespace into a single space.

    The bracket and link steps must run before step 4: once brackets, ``:``
    and ``/`` have been replaced by spaces those patterns can no longer match.
    Whitespace is collapsed last so earlier removals do not leave double spaces.

    Args:
        text: The raw narrative; any object accepted by ``str``.

    Returns:
        The cleaned string. Leading/trailing single spaces are not stripped.
    """
    text = str(text).lower()
    text = re.sub(r'\[.*?\]', ' ', text)  # remove any text in square brackets
    text = re.sub(r'https?://\S+|www\.\S+', ' ', text)  # remove any links present
    text = re.sub(r'[^a-z0-9.,?!]', ' ', text)  # keep letters, digits, basic punctuation
    text = re.sub(r'\w*\d\w*', '', text)  # remove the words containing numbers
    text = re.sub(r'x{2,}', '', text)  # remove multiple x's
    text = re.sub(r'\s+', ' ', text)  # collapse extra spaces (must be last)

    return text

In [None]:
# Run every narrative through clean_text and store the result in place of
# the raw text.
raw_narratives = data['consumer_complaint_narrative']
data['consumer_complaint_narrative'] = raw_narratives.map(clean_text)
data.head()

In [None]:
# Split the data into one frame per label id (0 through 9).
# NOTE(review): label ids come from the order in which products first appear
# in data["product"], so the variable names below assume that order --
# confirm against genre2id before relying on them.
(
    mortgage,
    credit_card,
    consumer_loan,
    debt_collection,
    credit_reporting,
    student_loan,
    bank_account,
    money_transfers,
    payday_loan,
    prepaid_card,
) = (data[data["genre_id"] == label_id] for label_id in range(10))

In [None]:
from sklearn.utils import resample

def _downsample(frame, n_samples):
    """Draw `n_samples` rows from `frame` without replacement (fixed seed 42)."""
    return resample(frame,
                    replace=False,
                    n_samples=n_samples,
                    random_state=42)


# Per-class sample sizes are kept exactly as originally chosen.
mortgage_downsample = _downsample(mortgage, 600)
credit_card_downsample = _downsample(credit_card, 600)
consumer_loan_downsample = _downsample(consumer_loan, 600)
debt_collection_downsample = _downsample(debt_collection, 600)
credit_reporting_downsample = _downsample(credit_reporting, 600)
student_loan_downsample = _downsample(student_loan, 500)
bank_account_downsample = _downsample(bank_account, 600)
money_transfers_downsample = _downsample(money_transfers, 160)
payday_loan_downsample = _downsample(payday_loan, 80)
prepaid_card_downsample = _downsample(prepaid_card, 130)

In [None]:
# Stack the per-class samples into a single training frame.
downsampled_frames = [
    mortgage_downsample,
    credit_card_downsample,
    consumer_loan_downsample,
    debt_collection_downsample,
    credit_reporting_downsample,
    student_loan_downsample,
    bank_account_downsample,
    money_transfers_downsample,
    payday_loan_downsample,
    prepaid_card_downsample,
]
train = pd.concat(downsampled_frames)

In [None]:
# Show the number of rows per product in the downsampled training frame.
train["product"].value_counts()

In [None]:
# Everything that was not sampled into `train` becomes the test set.
test = data.drop(index=train.index)

In [None]:
# Show the number of rows per product among the rows left out of `train`.
test["product"].value_counts()

In [None]:
# Pick the compute device: the GPU when CUDA is usable, otherwise the CPU.
gpu_available = torch.cuda.is_available()
device = torch.device("cuda" if gpu_available else "cpu")

if not gpu_available:
    print('No GPU available, using the CPU instead.')
else:
    # Report how many GPUs are visible and the name of device 0.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))