In [6]:
import io

import pandas as pd

# Shopify rollup dataset.
# Goal: plot the % of daily active accounts (revenue > $0) that are fraudulent.
# Store status flow: open > fraud > closed, or open > closed.

# mock up data: one row per store per day, with that day's status and revenue
s = """store_id,date,status,revenue
1,2019-01-01,open,123
1,2019-01-02,fraud,15
1,2019-01-03,fraud,0
2,2019-01-01,open,100
2,2019-01-02,open,234
2,2019-01-03,open,0
3,2019-01-01,fraud,12
3,2019-01-02,closed,0
3,2019-01-03,closed,0
4,2019-01-03,open,50"""
# Use the stdlib in-memory text buffer; the old pd.compat.StringIO shim is not
# available in current pandas releases, so the cell would fail on a fresh kernel.
df = pd.read_csv(io.StringIO(s))


In [10]:

# option 1: count active stores per (date, status), then divide each count by
# that date's total so every day sums to 1, and keep only the fraud share.
(
    df[df["revenue"] > 0]  # "active" means revenue > 0 on that day
    .groupby(["date", "status"])
    .agg({"store_id": "count"})
    .rename(columns={"store_id": "stores"})
    # Round-trip through unstack/stack so every date has a row for every
    # status, with 0 where no store had that status.
    .unstack(fill_value=0).stack()  # https://stackoverflow.com/a/49128246
    # transform("sum") broadcasts each date's total back onto the same
    # (date, status) index, so the division is vectorised and index-aligned
    # instead of running a Python lambda (and float() on a Series) per group.
    .pipe(lambda counts: counts / counts.groupby(level="date").transform("sum"))
    .query("status == 'fraud'")
)


Unnamed: 0_level_0,Unnamed: 1_level_0,stores
date,status,Unnamed: 2_level_1
2019-01-01,fraud,0.333333
2019-01-02,fraud,0.5
2019-01-03,fraud,0.0


In [8]:
# option 2, scales better
# Active-store counts per (date, status); statuses absent on a given date are
# filled in with 0 so every date carries the same set of status rows.
daily_by_status = (
    df.loc[df["revenue"] > 0]
    .groupby(["date", "status"])["store_id"]
    .count()
    .to_frame("stores")
    .unstack(fill_value=0)
    .stack()  # https://stackoverflow.com/a/49128246
)

# Total number of active (revenue > 0) stores per date, across all statuses.
daily_total = (
    df.loc[df["revenue"] > 0]
    .groupby("date")["store_id"]
    .count()
    .to_frame("stores")
)

# Divide each (date, status) count by the matching date's total, broadcasting
# on the "date" index level, then keep only the fraud rows.
(
    daily_by_status
    .div(daily_total, level="date")
    .query("status == 'fraud'")
)


Unnamed: 0_level_0,Unnamed: 1_level_0,stores
date,status,Unnamed: 2_level_1
2019-01-01,fraud,0.333333
2019-01-02,fraud,0.5
2019-01-03,fraud,0.0
