In [20]:
# importing all necessary modules
import pandas as pd
import seaborn as sns
import numpy as np

In [2]:
# Read the zipped complaints CSV into a pandas DataFrame.
# The path is relative to the notebook's working directory.
file_location = '../data/complaints.csv.zip'
full_df = pd.read_csv(filepath_or_buffer=file_location)

In [3]:
# Preview the first five rows of the raw data
full_df.head(n=5)

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,2019-09-24,Debt collection,I do not know,Attempts to collect debt not owed,Debt is not yours,transworld systems inc. \nis trying to collect...,,TRANSWORLD SYSTEMS INC,FL,335XX,,Consent provided,Web,2019-09-24,Closed with explanation,Yes,,3384392
1,2019-09-19,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Information belongs to someone else,,Company has responded to the consumer and the ...,Experian Information Solutions Inc.,PA,15206,,Consent not provided,Web,2019-09-20,Closed with non-monetary relief,Yes,,3379500
2,2019-11-08,Debt collection,I do not know,Communication tactics,Frequent or repeated calls,"Over the past 2 weeks, I have been receiving e...",,"Diversified Consultants, Inc.",NC,275XX,,Consent provided,Web,2019-11-08,Closed with explanation,Yes,,3433198
3,2019-09-15,Debt collection,Other debt,Attempts to collect debt not owed,Debt was result of identity theft,Pioneer has committed several federal violatio...,,Pioneer Capital Solutions Inc,CA,925XX,,Consent provided,Web,2019-09-15,Closed with explanation,Yes,,3374555
4,2020-10-23,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Information belongs to someone else,,Company has responded to the consumer and the ...,Experian Information Solutions Inc.,FL,334XX,,Other,Web,2020-10-23,Closed with explanation,Yes,,3915909


In [4]:
# Shape of the raw data as a (number of rows, number of columns) tuple
full_df.shape

(1839405, 18)

In [5]:
# Display the labels of all columns in the raw frame
full_df.columns

Index(['Date received', 'Product', 'Sub-product', 'Issue', 'Sub-issue',
       'Consumer complaint narrative', 'Company public response', 'Company',
       'State', 'ZIP code', 'Tags', 'Consumer consent provided?',
       'Submitted via', 'Date sent to company', 'Company response to consumer',
       'Timely response?', 'Consumer disputed?', 'Complaint ID'],
      dtype='object')

In [6]:
# Print a per-column summary of the raw frame (column names and dtypes)
# together with its memory usage
full_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1839405 entries, 0 to 1839404
Data columns (total 18 columns):
 #   Column                        Dtype 
---  ------                        ----- 
 0   Date received                 object
 1   Product                       object
 2   Sub-product                   object
 3   Issue                         object
 4   Sub-issue                     object
 5   Consumer complaint narrative  object
 6   Company public response       object
 7   Company                       object
 8   State                         object
 9   ZIP code                      object
 10  Tags                          object
 11  Consumer consent provided?    object
 12  Submitted via                 object
 13  Date sent to company          object
 14  Company response to consumer  object
 15  Timely response?              object
 16  Consumer disputed?            object
 17  Complaint ID                  int64 
dtypes: int64(1), object(17)
memory usage: 252.

In [7]:
# Columns of the raw complaints data that are kept for further processing.
necessary_columns = [
    'Product',
    'Sub-product',
    'Issue',
    'Sub-issue',
    'Consumer complaint narrative',
    'State',
    'Tags',
    'Submitted via',
    'Timely response?',
]

In [8]:
# Create the working dataframe from the necessary columns only.
# .copy() makes `df` an independent object: renaming or assigning columns on it
# later does not touch full_df and does not trigger pandas' SettingWithCopyWarning.
df = full_df[necessary_columns].copy()

In [9]:
# Rename the selected columns to short working names.
# An explicit old -> new mapping is used instead of assigning a positional list,
# so a column can never receive the wrong name if the selection order changes.
# Re-running this cell is a no-op: rename() ignores keys that are no longer present.
df = df.rename(columns={
    'Product': 'label',
    'Sub-product': 'sub_label',
    'Issue': 'issue',
    'Sub-issue': 'sub-issue',
    'Consumer complaint narrative': 'full_text',
    'State': 'state',
    'Tags': 'tags',
    'Submitted via': 'submitted_via',
    'Timely response?': 'response',
})

In [10]:
# Preview the first five rows of the working frame
df.head(n=5)

Unnamed: 0,label,sub_label,issue,sub-issue,full_text,state,tags,submitted_via,response
0,Debt collection,I do not know,Attempts to collect debt not owed,Debt is not yours,transworld systems inc. \nis trying to collect...,FL,,Web,Yes
1,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Information belongs to someone else,,PA,,Web,Yes
2,Debt collection,I do not know,Communication tactics,Frequent or repeated calls,"Over the past 2 weeks, I have been receiving e...",NC,,Web,Yes
3,Debt collection,Other debt,Attempts to collect debt not owed,Debt was result of identity theft,Pioneer has committed several federal violatio...,CA,,Web,Yes
4,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Information belongs to someone else,,FL,,Web,Yes


In [11]:
# Count the missing (null) entries in every column of the working frame
df.isnull().sum(axis=0)

label                  0
sub_label         235165
issue                  0
sub-issue         589910
full_text        1214191
state              34551
tags             1598292
submitted_via          0
response               0
dtype: int64

In [12]:
# Frequency of each distinct value in the label column, most common first
df['label'].value_counts(dropna=True).to_frame()

Unnamed: 0,label
"Credit reporting, credit repair services, or other personal consumer reports",553660
Debt collection,321220
Mortgage,314233
Credit reporting,140432
Credit card or prepaid card,94142
Credit card,89190
Bank account or service,86206
Checking or savings account,76192
Student loan,59729
Consumer Loan,31604


In [13]:
# Lift pandas' limit on the number of displayed rows: by default the notebook
# truncates long outputs, and the value counts below should be shown in full.
pd.set_option("display.max_rows", None)

In [14]:
# Frequency of each distinct value in the sub_label column, most common first
# (rows with a missing sub_label are not counted)
df['sub_label'].value_counts(dropna=True).to_frame()

Unnamed: 0,sub_label
Credit reporting,546054
Checking account,117190
Other mortgage,86635
General-purpose credit card or charge card,71028
Conventional fixed mortgage,70613
I do not know,67782
Conventional home mortgage,51861
Other debt,50883
"Other (i.e. phone, health club, etc.)",44543
Credit card debt,38730


In [15]:
# Frequency of each distinct value in the tags column
# (rows with a missing tag are not counted)
df['tags'].value_counts(dropna=True).to_frame()

Unnamed: 0,tags
Servicemember,126278
Older American,94847
"Older American, Servicemember",19988


In [16]:
# Frequency of each submission channel in the submitted_via column
df['submitted_via'].value_counts(dropna=True).to_frame()

Unnamed: 0,submitted_via
Web,1429188
Referral,202707
Phone,105169
Postal mail,78639
Fax,23279
Email,423


In [17]:
# Frequency of each value (Yes / No) in the response column
df['response'].value_counts(dropna=True).to_frame()

Unnamed: 0,response
Yes,1800501
No,38904


In [None]:
# Retrieve the complaints submitted via the web and count the missing values
# in each column (e.g. how many of them have no full_text).
# .isnull() must come before .sum(): summing the filtered frame directly would
# try to add the text values together instead of counting missing entries.
df[df['submitted_via'] == 'Web'].isnull().sum()