### Consumer complaints dataset . It can be downloaded from https://catalog.data.gov/dataset/consumer-complaint-database

In [2]:
import pandas as pd
consumer_complaints_df = pd.read_csv("Consumer_Complaints.csv")
consumer_complaints_df.head()

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,03/12/2014,Mortgage,Other mortgage,"Loan modification,collection,foreclosure",,,,M&T BANK CORPORATION,MI,48382,,,Referral,03/17/2014,Closed with explanation,Yes,No,759217
1,10/01/2016,Credit reporting,,Incorrect information on credit report,Account status,I have outdated information on my credit repor...,Company has responded to the consumer and the ...,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",AL,352XX,,Consent provided,Web,10/05/2016,Closed with explanation,Yes,No,2141773
2,10/17/2016,Consumer Loan,Vehicle loan,Managing the loan or lease,,I purchased a new car on XXXX XXXX. The car de...,,"CITIZENS FINANCIAL GROUP, INC.",PA,177XX,Older American,Consent provided,Web,10/20/2016,Closed with explanation,Yes,No,2163100
3,06/08/2014,Credit card,,Bankruptcy,,,,AMERICAN EXPRESS COMPANY,ID,83854,Older American,,Web,06/10/2014,Closed with explanation,Yes,Yes,885638
4,09/13/2014,Debt collection,Credit card,Communication tactics,Frequent or repeated calls,,,"CITIBANK, N.A.",VA,23233,,,Web,09/13/2014,Closed with explanation,Yes,Yes,1027760


In [3]:
df = pd.read_json('./data/dfExoplanetsNASAdetected100rand_v2.json', orient = 'table')
consumer_complaints_df = df[['sent', 'label']]
consumer_complaints_df.columns=['Consumer complaint narrative',"Product"]

### Printing list of unique classes

In [4]:
consumer_complaints_df['Product'].unique()

array(['discovery', 'None', ''], dtype=object)

### Removing rows with null value for 'Consumer complaint narrative'

In [5]:
consumer_complaints_filtered_df = consumer_complaints_df[pd.notnull(consumer_complaints_df['Consumer complaint narrative'])]
consumer_complaints_filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100 entries, 0 to 99
Data columns (total 2 columns):
Consumer complaint narrative    100 non-null object
Product                         100 non-null object
dtypes: object(2)
memory usage: 2.3+ KB


### Visualizing distribution of classes

In [6]:
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(10,6))
df = consumer_complaints_filtered_df[['Product','Consumer complaint narrative']]
df.groupby('Product').count().plot.bar(ylim=0)
plt.show()

<Figure size 1000x600 with 0 Axes>

<Figure size 640x480 with 1 Axes>

### Preprocessing data

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder

labels = df['Product']
text = df['Consumer complaint narrative']

X_train, X_test, y_train, y_test = train_test_split(text, labels, random_state=0, test_size=0.3)

count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
tf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_transformed = tf_transformer.transform(X_train_counts)

X_test_counts = count_vect.transform(X_test)
X_test_transformed = tf_transformer.transform(X_test_counts)

labels = LabelEncoder()
y_train_labels_fit = labels.fit(y_train)
y_train_lables_trf = labels.transform(y_train)

print(labels.classes_)

['None' 'discovery']


### Training a LinearSVC classifier and using CalibratedClassifierCV to get probabilities for each predicted class

In [8]:
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV

linear_svc = LinearSVC()
clf = linear_svc.fit(X_train_transformed,y_train_lables_trf)

calibrated_svc = CalibratedClassifierCV(base_estimator=linear_svc,
                                        cv="prefit")

calibrated_svc.fit(X_train_transformed,y_train_lables_trf)
predicted = calibrated_svc.predict(X_test_transformed)
    
to_predict = ["I have outdated information on my credit report that I have previously disputed that has yet to be removed this information is more then seven years old and does not meet credit reporting requirements"]
p_count = count_vect.transform(to_predict)
p_tfidf = tf_transformer.transform(p_count)
print('Average accuracy on test set={}'.format(np.mean(predicted == labels.transform(y_test))))
print('Predicted probabilities of demo input string are')
print(calibrated_svc.predict_proba(p_tfidf))

ValueError: y contains previously unseen labels: ''

### Printing predicted probability of demo input

In [35]:
pd.DataFrame(calibrated_svc.predict_proba(p_tfidf)*100, columns=labels.classes_)

Unnamed: 0,Bank account or service,Checking or savings account,Consumer Loan,Credit card,Credit card or prepaid card,Credit reporting,"Credit reporting, credit repair services, or other personal consumer reports",Debt collection,"Money transfer, virtual currency, or money service",Money transfers,Mortgage,Other financial service,Payday loan,"Payday loan, title loan, or personal loan",Prepaid card,Student loan,Vehicle loan or lease,Virtual currency
0,0.04661,0.000761,0.242386,0.083987,0.096338,76.720032,20.738274,1.732941,3.9e-05,0.000377,0.240907,3.802342e-08,0.001168,0.001433,0.000694,0.089579,0.003782,0.000692
