In [75]:
!pip install "jupyterlab>=3" "ipywidgets>=7.6"
!pip install plotly==5.10.0
!pip install altair



In [76]:
import pandas as pd
import sklearn
import altair as alt
import plotly.express as px
import sklearn
import pickle
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
# gensim's topic-coherence scorer (imported here; not called in this cell).
from gensim.models.coherencemodel import CoherenceModel

# Make sure the NLTK stop-word corpus is available locally, then keep the
# English stop words as a set.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Turn off Altair's max-rows check on chart data.
alt.data_transformers.disable_max_rows()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/DanielYoon/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


DataTransformerRegistry.enable('default')

In [None]:
# Load the complaints export (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer='complaints.csv')


In [None]:
# Display the (rows, columns) dimensions of the loaded frame.
df.shape


In [None]:
# Display the free-text complaint narrative column.
df.loc[:, 'Consumer complaint narrative']

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Parse the 'Date received' text (format year/month/day) into a datetime column.
received_text = df['Date received']
df['Date_received_dt'] = pd.to_datetime(received_text, format='%Y/%m/%d')


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Histogram of complaint received dates using 24 bins.
# sns.histplot returns a matplotlib Axes (not a Figure), so it is named `ax`
# and labelled directly instead of through the pyplot state machine.
ax = sns.histplot(data=df.Date_received_dt, kde=False, bins=24)
ax.set_xlabel('Date')

# plt.show() renders all open figures; it does not accept a figure/axes
# argument (its only parameter is the keyword `block`).
plt.show()




In [None]:
import plotly.graph_objects as go

# Interactive histogram of complaints by received date.
fig = px.histogram(
    df,
    x="Date_received_dt",
    labels={"Date_received_dt": "Date"},
)

# Rectangle covering the full plotting area (paper coordinates 0..1),
# drawn as a 1px black outline.
plot_frame = go.layout.Shape(
    type='rect',
    xref='paper',
    yref='paper',
    x0=0,
    y0=0,
    x1=1,
    y1=1,
    line={'width': 1, 'color': 'black'},
)

# Transparent plot background plus the outline.
fig.update_layout(plot_bgcolor='rgba(0,0,0,0)', shapes=[plot_frame])

fig.show()

In [None]:
# Complaint counts per received date, split by the 'Timely response?' value.
P = (
    df.groupby(['Date_received_dt', 'Timely response?'])['Complaint ID']
      .count()
      .reset_index()
)

# Keep only the first 7800 rows of the grouped result.
# NOTE(review): 7800 is an unexplained cut-off — confirm what it is meant to
# exclude, or replace it with an explicit date filter.
# .copy() makes P an independent frame, so adding columns to it later does not
# raise SettingWithCopyWarning.
P = P.iloc[:7800].copy()

# Legacy alias kept because the original cell also bound `P2` to this frame.
P2 = P

In [None]:
# Percentage of each date's complaints that falls into each 'Timely response?' row.
# transform('sum') returns the per-date total aligned to P's own index, so the
# division is row-wise. The previous groupby(...).apply(lambda ...) form
# returns a (group key, row) MultiIndex on pandas >= 2.0 and can no longer be
# assigned back as a column.
daily_total = P.groupby('Date_received_dt')['Complaint ID'].transform('sum')

# assign() builds a new frame, so this also works when P is a slice of another
# frame and the cell can be re-run safely.
P = P.assign(Percentage=P['Complaint ID'] * 100 / daily_total)


In [None]:
# Keep only the rows whose 'Timely response?' value is 'Yes'.
is_timely = P['Timely response?'].eq('Yes')
P_yes = P.loc[is_timely]


In [None]:
import plotly.graph_objects as go

# Line chart of the timely-response percentage per received date.
fig = px.line(
    P_yes,
    x='Date_received_dt',
    y="Percentage",
    labels={"Date_received_dt": "Date"},
)

# Light-gray 1px outline around the full plotting area (paper coordinates 0..1).
plot_frame = go.layout.Shape(
    type='rect',
    xref='paper',
    yref='paper',
    x0=0,
    y0=0,
    x1=1,
    y1=1,
    line={'width': 1, 'color': 'lightgray'},
)

# Transparent plot background plus the outline.
fig.update_layout(plot_bgcolor='rgba(0,0,0,0)', shapes=[plot_frame])

# Identical light-gray grid lines on both axes.
grid_style = dict(showgrid=True, gridwidth=1, gridcolor='lightgray')
fig.update_xaxes(**grid_style)
fig.update_yaxes(**grid_style)

fig.show()



# Line chart of the monthly timely-response rate for the five companies with the highest complaint volume

In [None]:
# Rank companies by number of complaints and keep the five largest.
top5 = (
    df.groupby('Company')['Complaint ID']
      .count()
      .reset_index(name='count')
      .sort_values(['count'], ascending=False)
      .head(5)
)
top5_company = top5.Company.to_list()

# Restrict the complaints to those five companies.
# Fix: this previously filtered on `top10_company`, a name that is never
# defined, so the cell raised NameError on a fresh kernel.
top5_df = df[df['Company'].isin(top5_company)]

In [None]:
# Aggregate to year-month level: complaint counts per month, company and
# 'Timely response?' value.
year_month = top5_df['Date_received_dt'].dt.to_period('M').rename('Year-Month')
P2 = (
    top5_df.groupby([year_month, 'Company', 'Timely response?'])['Complaint ID']
           .count()
           .reset_index(name='count')
)

# Percentage of each (month, company) total that each row represents.
# transform('sum') keeps P2's index, so the result can be assigned as a column.
# The earlier groupby(...).apply(lambda ...) form returns a MultiIndex on
# pandas >= 2.0 and fails on assignment.
monthly_total = P2.groupby(['Year-Month', 'Company'])['count'].transform('sum')
P2['Percentage'] = P2['count'] * 100 / monthly_total

In [None]:
# Retain just the rows whose 'Timely response?' value is 'Yes'.
timely_rows = P2['Timely response?'].eq('Yes')
P2_yes = P2.loc[timely_rows]

In [None]:
import plotly.graph_objects as go

# Convert the monthly periods to timestamps for the x axis.
month_axis = P2_yes['Year-Month'].dt.to_timestamp()

# One line per company showing its monthly timely-response percentage.
fig = px.line(
    P2_yes,
    x=month_axis,
    y="Percentage",
    color='Company',
    labels={"x": "Date"},
)

# Light-gray 1px outline around the full plotting area (paper coordinates 0..1).
plot_frame = go.layout.Shape(
    type='rect',
    xref='paper',
    yref='paper',
    x0=0,
    y0=0,
    x1=1,
    y1=1,
    line={'width': 1, 'color': 'lightgray'},
)

# Transparent plot background plus the outline.
fig.update_layout(plot_bgcolor='rgba(0,0,0,0)', shapes=[plot_frame])

# Identical light-gray grid lines on both axes.
grid_style = dict(showgrid=True, gridwidth=1, gridcolor='lightgray')
fig.update_xaxes(**grid_style)
fig.update_yaxes(**grid_style)

fig.show()


# Scatter plot of the monthly timely-response rate for the five most common issues

In [None]:
# Count complaints per issue and order the issues from most to least frequent.
issue_counts = df.groupby('Issue')['Complaint ID'].count().reset_index(name='count')
issue_ranking = issue_counts.sort_values(['count'], ascending=False)

# Names of the five most frequent issues.
top5_issue = issue_ranking.head(5).Issue.to_list()

# Complaints that belong to one of those five issues.
in_top5_issue = df['Issue'].isin(top5_issue)
top5_df_issue = df[in_top5_issue]

In [None]:
# Complaint counts per year-month, issue and 'Timely response?' value.
year_month = top5_df_issue['Date_received_dt'].dt.to_period('M').rename('Year-Month')
P3 = (
    top5_df_issue.groupby([year_month, 'Issue', 'Timely response?'])['Complaint ID']
                 .count()
                 .reset_index(name='count')
)

# Percentage of each (month, issue) total that each row represents.
# transform('sum') keeps P3's index, so the result can be assigned as a column.
# The earlier groupby(...).apply(lambda ...) form returns a MultiIndex on
# pandas >= 2.0 and fails on assignment.
monthly_total = P3.groupby(['Year-Month', 'Issue'])['count'].transform('sum')
P3['Percentage'] = P3['count'] * 100 / monthly_total

In [None]:
# Retain just the rows whose 'Timely response?' value is 'Yes'.
timely_rows = P3['Timely response?'].eq('Yes')
P3_yes = P3.loc[timely_rows]

In [None]:
import plotly.graph_objects as go

# Convert the monthly periods to timestamps for the x axis.
month_axis = P3_yes['Year-Month'].dt.to_timestamp()

# One marker colour per issue showing its monthly timely-response percentage.
fig = px.scatter(
    P3_yes,
    x=month_axis,
    y="Percentage",
    color='Issue',
    labels={"x": "Date"},
)

# Light-gray 1px outline around the full plotting area (paper coordinates 0..1).
plot_frame = go.layout.Shape(
    type='rect',
    xref='paper',
    yref='paper',
    x0=0,
    y0=0,
    x1=1,
    y1=1,
    line={'width': 1, 'color': 'lightgray'},
)

# Transparent plot background plus the outline.
fig.update_layout(plot_bgcolor='rgba(0,0,0,0)', shapes=[plot_frame])

# Identical light-gray grid lines on both axes.
grid_style = dict(showgrid=True, gridwidth=1, gridcolor='lightgray')
fig.update_xaxes(**grid_style)
fig.update_yaxes(**grid_style)

fig.show()




# How customers submit complaints to the company

In [None]:
# Distribution of complaint received dates, coloured by submission channel.
# sns.displot is figure-level and returns a FacetGrid (not a Figure), so the
# x label is set through the grid rather than the pyplot state machine.
grid = sns.displot(data=df, x="Date_received_dt", hue='Submitted via')
grid.set_xlabels('Date')

# plt.show() renders all open figures; it does not take a figure argument.
plt.show()

# How companies respond to customers

In [None]:
# Kernel-density view of complaint received dates, coloured by how the company
# responded to the consumer.
grid = sns.displot(
    data=df,
    x="Date_received_dt",
    hue='Company response to consumer',
    kind='kde',
)
grid.set_xlabels('Date')

# Previously called plt.show(fig) with a `fig` left over from an earlier cell;
# plt.show() needs no argument and renders the figure created above.
plt.show()

In [None]:
# NOTE(review): `dfd` is not defined in any cell visible here, so unless it is
# created elsewhere this cell raises NameError on Restart & Run All.
# Presumably a typo for `df` or a leftover scratch cell — confirm and fix or delete.
dfd