In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file in it.
kaggle_inputs = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in kaggle_inputs:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session



/kaggle/input/complaint-cfpb/complaints.csv


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as scp
%matplotlib inline
plt.rcParams['figure.figsize'] = 8,4
import warnings
warnings.filterwarnings('ignore')

from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import plotly.graph_objs as go

import plotly.express as px

In [3]:
# Load the CFPB complaints CSV (the file listed in the first cell's output) into a DataFrame.
df = pd.read_csv('/kaggle/input/complaint-cfpb/complaints.csv')

In [4]:
df.shape

(3190837, 18)

In [5]:
df.columns

Index(['Date received', 'Product', 'Sub-product', 'Issue', 'Sub-issue',
       'Consumer complaint narrative', 'Company public response', 'Company',
       'State', 'ZIP code', 'Tags', 'Consumer consent provided?',
       'Submitted via', 'Date sent to company', 'Company response to consumer',
       'Timely response?', 'Consumer disputed?', 'Complaint ID'],
      dtype='object')

In [6]:
# Clean column headers up: rename the raw CFPB headers to space-free identifiers.
# A name-based mapping (instead of positional assignment to df.columns) does not
# depend on column order and is a no-op if this cell is executed a second time.
column_renames = {
    'Date received': 'DateReceived',
    'Sub-product': 'SubProduct',
    'Sub-issue': 'SubIssue',
    'Consumer complaint narrative': 'ConsumerComplaintNarrative',
    'Company public response': 'CompanyPublicResponse',
    'ZIP code': 'ZIPCode',
    'Consumer consent provided?': 'ConsumerConsentProvided?',
    'Submitted via': 'SubmittedVia',
    'Date sent to company': 'DateSentToCompany',
    'Company response to consumer': 'CompanyResponseToConsumer',
    'Timely response?': 'TimelyResponse',
    'Consumer disputed?': 'ConsumerDisputed',
    'Complaint ID': 'ComplaintID',
}
df = df.rename(columns=column_renames)

# Columns removed before the analysis.
drop_columns = ['Tags',
                'CompanyPublicResponse',
                'ConsumerConsentProvided?',
                'DateSentToCompany',
                'ComplaintID']

# The axis is selected by keyword: the positional form `df.drop(labels, 1)` was
# deprecated in pandas 1.3 and removed in pandas 2.0. errors='ignore' lets the
# cell be re-run after the columns have already been dropped.
df = df.drop(columns=drop_columns, errors='ignore')

df.dtypes

DateReceived                  object
Product                       object
SubProduct                    object
Issue                         object
SubIssue                      object
ConsumerComplaintNarrative    object
Company                       object
State                         object
ZIPCode                       object
SubmittedVia                  object
CompanyResponseToConsumer     object
TimelyResponse                object
ConsumerDisputed              object
dtype: object

In [8]:
# Numeric encoding of the outcome labels: 1 / 0 flags, with 'In progress'
# turned into a missing value because it has no outcome yet.
# NOTE(review): 'Closed with explanation' is encoded as 1, so it is counted
# together with the relief outcomes in the summary that follows — confirm intended.
replacements = {'No': 0,
               'Yes': 1,
               'Untimely response' : 0,
               'Closed without relief': 0,
               'Closed' : 0,
               'Closed with explanation': 1,
               'Closed with non-monetary relief' : 1,
               'In progress' : np.nan,
               'Closed with relief': 1,
               'Closed with monetary relief' : 1}

# Apply the mapping only to the columns it describes. Replacing across the whole
# frame scans every column (including the free-text narratives) and would also
# rewrite any unrelated cell whose entire value happens to be e.g. 'Yes' or 'Closed'.
# Assigning the result back (rather than inplace=True) keeps the step explicit.
outcome_columns = ['CompanyResponseToConsumer', 'TimelyResponse', 'ConsumerDisputed']
df[outcome_columns] = df[outcome_columns].replace(replacements)

In [9]:
# Percentage of all rows in df whose encoded company response equals 1 and 0 respectively.
# Rows with a missing response stay in the denominator, so the two figures need not sum to 100.
total_of_complaints = len(df)
response_codes = df['CompanyResponseToConsumer']
total_of_closed_with_relief = ((response_codes == 1).sum() / total_of_complaints) * 100
total_of_closed_without_relief = ((response_codes == 0).sum() / total_of_complaints) * 100
print("Closed with relief:", round(total_of_closed_with_relief, 2), "%")
print("Closed without relief:", round(total_of_closed_without_relief, 2), "%")

Closed with relief: 96.32 %
Closed without relief: 1.41 %


In [10]:
# Count how often each encoded value occurs in the three outcome columns;
# value_counts is applied column by column, giving one column of counts per variable.
categorical_variables = ['CompanyResponseToConsumer', 'TimelyResponse', 'ConsumerDisputed']
df[categorical_variables].apply(pd.Series.value_counts)

Unnamed: 0,CompanyResponseToConsumer,TimelyResponse,ConsumerDisputed
0.0,44876,50666,620062
1.0,3073395,3140171,148378


In [11]:
# Keep only the complaints that have a consumer narrative; the displayed shape
# gives the number of rows that remain.
cat_variables = ['ConsumerComplaintNarrative']
dfCCN = df[df['ConsumerComplaintNarrative'].notna()]
dfCCN[cat_variables].shape

(1150291, 1)

In [12]:
# Number of leading narrative rows used as the working sample.
SAMPLE_SIZE = 400

# .copy() makes each sample an independent frame. New columns are assigned to
# dfCCN_sample later on; doing that on a bare slice of dfCCN is a chained
# assignment (SettingWithCopyWarning) whose effect on the parent is undefined.
# .iloc makes it explicit that the slice is positional, not label-based.
dfCCN_sample = dfCCN.iloc[:SAMPLE_SIZE].copy()
dfCCN_sample_trans = dfCCN.iloc[:SAMPLE_SIZE].copy()
dfCCN_sample[cat_variables].shape

(400, 1)

In [13]:
# Character count of each narrative, stored as a new column on the sample frame.
dfCCN_sample['Complaint_Length'] = dfCCN_sample['ConsumerComplaintNarrative'].str.len()
# Quick check of the object type returned when the new column is selected.
type(dfCCN_sample['Complaint_Length'])

pandas.core.series.Series

In [14]:
maxClm = dfCCN_sample['Complaint_Length'].max()
maxClm

9415

In [15]:
# Index label of the row whose narrative is the longest in the sample.
narrative_lengths = dfCCN_sample["ConsumerComplaintNarrative"].str.len()
idx = narrative_lengths.idxmax()
print(idx)
# The matching narrative can be looked up by that label,
# e.g. dfCCN_sample.loc[idx, "ConsumerComplaintNarrative"].

3630


# Extract Topics from Consumer Complaint Text (CFPB)

# Various models from Hugging Face were tried
### The model used here is a fine-tuned version of yiyanghkust/finbert-tone, trained on the Twitter Financial News Topic dataset. It assigns each tweet a financial topic from a set of 20 topics.

Link to the model: https://huggingface.co/nickmuchi/finbert-tone-finetuned-finance-topic-classification?text=Amazon+to+buy+primary+health+care+provider+One+Medical+for+roughly+%243.9+billion  

In [16]:
from transformers import AutoModelForSequenceClassification, TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import expit

    
# Hugging Face Hub id of the checkpoint used for topic classification.
MODEL = "nickmuchi/finbert-tone-finetuned-finance-topic-classification"
# Tokenizer loaded from the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

# PT: load the PyTorch sequence-classification model for the checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
# id2label maps each output index to its topic name; printed so the label set is visible.
class_mapping = model.config.id2label
print(class_mapping)

Downloading:   0%|          | 0.00/435 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/221k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/695k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.76k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/419M [00:00<?, ?B/s]

{0: 'Analyst Update', 1: 'Fed | Central Banks', 2: 'Company | Product News', 3: 'Treasuries | Corporate Debt', 4: 'Dividend', 5: 'Earnings', 6: 'Energy | Oil', 7: 'Financials', 8: 'Currencies', 9: 'General News | Opinion', 10: 'Gold | Metals | Materials', 11: 'IPO', 12: 'Legal | Regulation', 13: 'M&A | Investments', 14: 'Macro', 15: 'Markets', 16: 'Politics', 17: 'Personnel Change', 18: 'Stock Commentary', 19: 'Stock Movement'}


In [41]:
# Worked example of the scoring steps on a single sentence:
# tokenize -> logits -> element-wise sigmoid -> keep classes scoring at least 0.5.
text = 'Bank robbery cames in news paper'
tokens = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
output = model(**tokens)

# First element of the model output, first (only) item of the batch.
scores1 = output[0][0].detach().numpy()
scores2 = expit(scores1)
predictions = (scores2 >= 0.5).astype(int)

# Topic name -> score for every class flagged in `predictions`.
topic_dict = {
    class_mapping[class_id]: scores2[class_id]
    for class_id, is_selected in enumerate(predictions)
    if is_selected
}
# Highest score first.
sorted_dict = sorted(topic_dict.items(), key=lambda item: item[1], reverse=True)
topic_reverse = dict(sorted_dict)
topic_reverse

{'General News | Opinion': 0.9808168,
 'Fed | Central Banks': 0.97377723,
 'Treasuries | Corporate Debt': 0.8787653,
 'Politics': 0.80446726,
 'Legal | Regulation': 0.7749341,
 'Macro': 0.5970086,
 'Personnel Change': 0.5013461}

In [38]:
scores2

array([0.37896517, 0.97377723, 0.3192092 , 0.8787653 , 0.18298371,
       0.1547073 , 0.36147752, 0.2958977 , 0.4855542 , 0.9808168 ,
       0.25211367, 0.27280265, 0.7749341 , 0.36058185, 0.5970086 ,
       0.21026123, 0.80446726, 0.5013461 , 0.25636077, 0.17709276],
      dtype=float32)

In [42]:
def Get_Topic(text, threshold=0.5):
    """Score one text against the classifier's topics and return those that pass.

    The text is tokenized (truncated to at most 512 tokens) and run through the
    module-level ``model``; every logit is passed through a sigmoid on its own
    and topics whose score reaches ``threshold`` are kept.

    NOTE(review): thresholding independent sigmoids treats the checkpoint as a
    multi-label model, which is why several topics can be returned for one
    text. The checkpoint looks like a single-label classifier — confirm that
    this is intended rather than a softmax / arg-max over the logits.

    Args:
        text: The text to classify. Non-string values (e.g. None or NaN from a
            DataFrame column) and blank strings yield an empty result.
        threshold: Minimum sigmoid score for a topic to be kept. Defaults to
            0.5, the value that used to be hard-coded.

    Returns:
        dict: topic name -> score, ordered from highest to lowest score;
        empty when nothing passes the threshold or there is no text.
    """
    # Nothing to classify: report "no topics" instead of failing in the tokenizer.
    if not isinstance(text, str) or not text.strip():
        return {}

    # Local import: the notebook's import cells do not import torch directly.
    import torch

    tokens = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    # Inference only — no_grad avoids building an autograd graph on every call,
    # which matters when this function is mapped over many narratives.
    with torch.no_grad():
        output = model(**tokens)
    # First element of the model output, first (only) item of the batch.
    logits = output[0][0].detach().cpu().numpy()
    scores = expit(logits)
    predictions = (scores >= threshold) * 1
    # Mapping indices to topic names and sorting is shared with get_topic.
    return get_topic(predictions, scores)
    
def get_topic(predictions, score, mapping=None):
    """Turn per-class flags and scores into a topic -> score dict, best first.

    Args:
        predictions: Sequence of truthy/falsy flags, one per class index.
        score: Sequence of scores indexed the same way as ``predictions``.
        mapping: Optional class index -> topic name lookup. When omitted, the
            module-level ``class_mapping`` is used, as before.

    Returns:
        dict: topic name -> score for every flagged class, ordered from the
        highest score to the lowest. Empty when no class is flagged.
    """
    if mapping is None:
        # Default to the label map taken from the loaded model's config.
        mapping = class_mapping
    selected = {
        mapping[class_id]: score[class_id]
        for class_id, is_selected in enumerate(predictions)
        if is_selected
    }
    ranked = sorted(selected.items(), key=lambda item: item[1], reverse=True)
    return dict(ranked)

In [55]:
# Narrative of the longest complaint in the sample. `idx` is the label found
# with idxmax() earlier in the notebook; using it instead of the literal 3630
# keeps this cell correct when the sample (and therefore the label) changes.
sample_cfpbtext = dfCCN_sample.loc[idx, "ConsumerComplaintNarrative"]


In [22]:
print(type(dfCCN_sample))

<class 'pandas.core.frame.DataFrame'>


In [54]:
Topic = Get_Topic(sample_cfpbtext)
print(Topic)
print(type(Topic))

{'General News | Opinion': 0.9808168, 'Fed | Central Banks': 0.97377723, 'Treasuries | Corporate Debt': 0.8787653, 'Politics': 0.80446726, 'Legal | Regulation': 0.7749341, 'Macro': 0.5970086, 'Personnel Change': 0.5013461}
<class 'dict'>


In [52]:
# Store, for every narrative in the sample, the topic dict returned by Get_Topic.
dfCCN_sample['Topics_CCN'] = dfCCN_sample["ConsumerComplaintNarrative"].map(Get_Topic)

In [49]:
dfCCN_sample.head(2)

Unnamed: 0,DateReceived,Product,SubProduct,Issue,SubIssue,ConsumerComplaintNarrative,Company,State,ZIPCode,SubmittedVia,CompanyResponseToConsumer,TimelyResponse,ConsumerDisputed,Complaint_Length,Topics_CCN
0,2022-12-18,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Account information incorrect,still showing an account that should have been...,Experian Information Solutions Inc.,FL,33685.0,Web,1.0,1,,439,"{'Company | Product News': 0.9698917, 'General..."
1,2022-11-07,"Credit reporting, credit repair services, or o...",Credit reporting,Problem with a credit reporting company's inve...,Their investigation did not fix an error on yo...,I discovered that some of the information on m...,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",NY,10469.0,Web,1.0,1,,669,"{'Stock Commentary': 0.99155504, 'General News..."


In [50]:
dfCCN_sample['Topics_CCN'].iloc[300]

{'General News | Opinion': 0.99881613,
 'Legal | Regulation': 0.95282215,
 'Company | Product News': 0.88493216,
 'Stock Commentary': 0.8159385,
 'Macro': 0.6576548,
 'Politics': 0.55249286}

In [51]:
dfCCN_sample['Topics_CCN'].iloc[1]

{'Stock Commentary': 0.99155504,
 'General News | Opinion': 0.9556083,
 'Macro': 0.9392888,
 'Company | Product News': 0.9125292,
 'Treasuries | Corporate Debt': 0.6779366,
 'Fed | Central Banks': 0.57674825,
 'Energy | Oil': 0.5468057}