In [6]:
import pandas as pd
import numpy as np
import glob
import numpy as np

# from plotly.graph_objs import *
# import plotly as px

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


In [7]:
# Load the sample purchase data; the first CSV column is used as the row index.
df = pd.read_csv('data/sample.csv', index_col=0)

In [8]:
# Preview the first rows to check that the columns loaded as expected.
df.head()

Unnamed: 0,product,customer_id,amount
0,outlxwfhyu,customer1,0
1,ukjakcfikb,customer2,0
2,fstxfaulvj,customer3,15
3,wcpurpgyce,customer4,178
4,oyhjphvfbb,customer5,157


In [9]:
# Optional clean-up (currently disabled): drop rows whose amount is zero or negative
# df = df.drop(df[df.amount <= 0].index)

In [10]:
def pareto_analysis_one(df, col, percentile):
    """
    Return the percentage of rows that make up the top `percentile` percent
    of the total of `col`.

    Rows are ranked by `col` in descending order and counted for as long as
    their cumulative share of the column total does not exceed `percentile`.
    The frame passed in is only read, never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per item (e.g. one row per product).
    col : str
        Name of the numeric column to rank and accumulate.
    percentile : float
        Cumulative-share cut-off on a 0-100 scale (e.g. 80.0).

    Returns
    -------
    float
        Counted rows as a percentage of all rows, rounded to 2 decimals.
        0.0 when `df` has no rows or when the column total is zero.
    """
    # total # of rows (products) in the frame
    total_products = len(df.index)
    if total_products == 0:
        # Nothing to rank; the original code raised ZeroDivisionError here.
        return 0.0

    # rank the amounts from largest to smallest
    ranked = df[col].sort_values(ascending=False)

    total_amount = ranked.sum()
    if total_amount == 0:
        # No amount to share out, so no row can contribute to the percentile.
        return 0.0

    # cumulative share of the total, on a 0-100 scale
    cum_perc_amount = 100 * ranked.cumsum() / total_amount

    # `<=` so that a row landing exactly on the cut-off is counted; this is
    # the same boundary rule as the `cumulative_perc <= 80.0` split used for
    # the chart data later in the notebook.
    num_products = int((cum_perc_amount <= percentile).sum())

    # now calculate the KPI
    kpi = (num_products / total_products) * 100

    return round(kpi, 2)

# Example run: percentage of products that fall inside the top 80% of cumulative amount
pareto_analysis_one(df, 'amount', 80.00)

# Recorded result: 20.45, i.e. about 80% of the amount spent comes from roughly 20% of the products

20.45

### Make Plotly pareto chart
Code adapted from [this newbedev.com article](https://newbedev.com/how-to-overlay-two-plots-in-same-figure-in-plotly-create-pareto-chart-in-plotly).

In [11]:
# NOTE: only the last expression of a cell is rendered, so the first two
# results below are computed but never displayed.
df.describe()
# NOTE(review): this note originally said "1098 entries", but a later cell prints len(df) == 1100 - confirm.

df['product'].value_counts() # occurrences per product (original note: 1098 unique products - TODO confirm)
df['customer_id'].value_counts() # occurrences per customer; the displayed result has Length: 562, i.e. 562 unique customers

customer412    18
customer123     8
customer23      7
customer105     7
customer47      7
               ..
customer136     1
customer111     1
customer645     1
customer294     1
customer684     1
Name: customer_id, Length: 562, dtype: int64

In [12]:
# customer_id is not used by the product-level analysis that follows.
# errors='ignore' keeps this cell re-runnable: without it a second run raises
# KeyError because the column has already been dropped.
df = df.drop(columns='customer_id', errors='ignore')

In [13]:
# Rank products by amount (largest first), then attach the running total and
# the running share of the grand total (0-100 scale).
ranked = df.sort_values(by=['amount'], ascending=False)
amount_running_total = ranked['amount'].cumsum()
ranked['cumulative_sum'] = amount_running_total
ranked['cumulative_perc'] = 100 * amount_running_total / ranked['amount'].sum()
df = ranked
df.head(200)

Unnamed: 0,product,amount,cumulative_sum,cumulative_perc
1192,teptlezwgy,170577,170577,4.111711
1325,pweflqandk,147300,317877,7.662336
1140,mkbtkziftf,106332,424209,10.225439
69,ptctwlpjbg,91496,515705,12.430925
820,mwvqtdpxik,84535,600240,14.468617
...,...,...,...,...
892,svyeloswfa,4385,3202904,77.205106
78,oswtzbraym,4341,3207245,77.309744
1313,satiorpbqm,4297,3211542,77.413322
644,ejektlxmsr,4212,3215754,77.514852


In [14]:
# Build the table behind the Pareto chart from two slices of df:
#   top_80    - rows whose cumulative_perc is at most 80%
#   bottom_20 - every remaining row, collapsed below into a single 'other' row
# A boolean mask puts each row in exactly one slice (the previous
# drop-by-index-label approach could misplace rows with NaN or duplicate labels).
within_80 = df['cumulative_perc'] <= 80.0
top_80 = df[within_80]
bottom_20 = df[~within_80]

# sanity check: the split must neither lose nor duplicate rows
print(len(top_80), len(bottom_20), len(df))
assert len(top_80) + len(bottom_20) == len(df)


# collapse bottom_20 into one row:
# product | amount
# other   | <sum of the remaining amounts>

total_amount = bottom_20['amount'].sum()
other_row = pd.DataFrame([{'product': 'other', 'amount': total_amount}])

top_80 = top_80[['product', 'amount']]

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement.
temp_df = pd.concat([top_80, other_row], ignore_index=True)

temp_df

225 875 1100


Unnamed: 0,product,amount
0,teptlezwgy,170577
1,pweflqandk,147300
2,mkbtkziftf,106332
3,ptctwlpjbg,91496
4,mwvqtdpxik,84535
...,...,...
221,zamghxyvuz,3711
222,ldbbjdniot,3710
223,amwobstgaq,3704
224,rzjepepehw,3688


In [15]:
# Recompute the running total and running share (0-100 scale) on the
# collapsed table, so the chart's line includes the final 'other' row.
collapsed_running_total = temp_df['amount'].cumsum()
temp_df['cumulative_sum'] = collapsed_running_total
temp_df['cumulative_perc'] = 100 * collapsed_running_total / temp_df['amount'].sum()
temp_df.head()

Unnamed: 0,product,amount,cumulative_sum,cumulative_perc
0,teptlezwgy,170577,170577,4.111711
1,pweflqandk,147300,317877,7.662336
2,mkbtkziftf,106332,424209,10.225439
3,ptctwlpjbg,91496,515705,12.430925
4,mwvqtdpxik,84535,600240,14.468617


In [22]:
# Persist the chart data (index included) and load it straight back, so the
# plotting cells below work from the saved file.
plotly_csv = 'data/plotly_data.csv'
temp_df.to_csv(plotly_csv, index=True)
temp_df = pd.read_csv(plotly_csv, index_col=0)

In [23]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Columns that drive the chart, and its heading.
cat = 'product'
num = 'amount'
title = 'This is a Pareto chart in Plotly'

# Bars: amount per product, drawn against the primary y axis.
trace1 = go.Bar(
    x=temp_df[cat],
    y=temp_df[num],
    name=num,
    marker={'color': 'rgb(34,163,192)'},
)

# Line: cumulative percentage, drawn against the secondary y axis.
trace2 = go.Scatter(
    x=temp_df[cat],
    y=temp_df['cumulative_perc'],
    name='Cumulative Percentage',
    yaxis='y2',
)

# One subplot with a secondary y axis so both traces share the x axis.
fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(trace1)
fig.add_trace(trace2, secondary_y=True)
fig.update_layout(
    height=600,
    width=800,
    title=title,
    xaxis={'tickangle': -90},
)

# A bare `fig` as the cell's last expression renders the chart inline.
fig

In [24]:
import dash
from dash import dcc, html

# Minimal Dash app whose page contains only the Pareto figure.
app = dash.Dash()
pareto_graph = dcc.Graph(figure=fig)
app.layout = html.Div(children=[pareto_graph])

# Turn off the reloader when running inside Jupyter.
app.run_server(debug=True, use_reloader=False)


Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: on
