In [1]:
import numpy as np
import pickle
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

plt.style.use("ggplot")

import json

In [2]:
# Load the 'base_billings' sheet of the billings input workbook.
billings_workbook = '../data/Data_2020_P06/all_billings_inputs_08_20.xlsx'
df = pd.read_excel(billings_workbook, sheet_name='base_billings')

In [3]:
# Row count of the freshly loaded frame.
n_loaded_rows = df.shape[0]
print('df length: ', n_loaded_rows)

df length:  294044


In [4]:
# Short aliases for the verbose column headers of the workbook export.
column_aliases = {
    "Contrct Duration in Months": "duration",
    "Document Currency": "curr",
    "Enterprise BU Desc": "BU",
    "Invoice Fiscal Year Period Desc": "period",
    "POB Type": "POB_type",
    "Product Config Type": "config",
    "Rev Rec Category": "rev_req_type",
    "Rule For Bill Date": "rebill_rule",
    "Sales Document Type": "sales_doc",
    "Sales Type": "sales_type",
    "Subscription Term": "sub_term",
    "Completed Sales ( DC )": "DC_amount",
    "Completed Sales": "US_amount",
}

# index=str also converts every row label to a string; kept as in the
# original call.
df.rename(index=str, columns=column_aliases, inplace=True)


In [5]:
# Preview the first three rows under the renamed column headers.
df.head(3)

Unnamed: 0,duration,curr,BU,period,POB_type,config,rev_req_type,rebill_rule,sales_doc,sales_type,sub_term,DC_amount,US_amount
0,0,AUD,Experience Cloud,2017-01,,OCONS,,,ZCC,PRO-SVC-INV,0,375.0,278.01
1,0,AUD,Experience Cloud,2017-01,,ONORE,,,ZCC,RECOGNIZED,0,0.0,45.75
2,0,AUD,Experience Cloud,2017-01,,ONORE,D,YA,ZCC,DEFERRED,0,-684738.0,-512177.81


In [6]:
# A currency is kept only when it has more than this many billing rows.
MIN_ROWS_PER_CURRENCY = 20
# Currencies removed regardless of how many rows they have.
ALWAYS_REMOVED_CURRENCIES = ["TRY"]

vc = df["curr"].value_counts()
keep_these = vc.values > MIN_ROWS_PER_CURRENCY
keep_curr = vc[keep_these]
list_keepers = keep_curr.index
remove_these = vc[vc.values <= MIN_ROWS_PER_CURRENCY].index

# model_dict records which currencies were excluded; delete_curr mirrors it.
model_dict = {"curr_removed": list(remove_these)}
delete_curr = list(remove_these)

for forced_code in ALWAYS_REMOVED_CURRENCIES:
    if forced_code not in model_dict["curr_removed"]:
        model_dict["curr_removed"].append(forced_code)
        delete_curr.append(forced_code)
    # errors="ignore": the currency may be missing from this extract, or may
    # already have fallen below the row cutoff, in which case there is nothing
    # to drop and no KeyError should be raised.
    list_keepers = list_keepers.drop(forced_code, errors="ignore")

# Keep only the rows billed in a retained currency.
df = df[df["curr"].isin(list_keepers)]

In [7]:
# Report the row count before and after discarding rows whose document-currency
# amount is exactly zero.
rows_before_filter = len(df)
print(rows_before_filter)
nonzero_amount = df["DC_amount"].ne(0)
df = df.loc[nonzero_amount]
print(len(df))

293832
286744


In [8]:
# Show the currencies recorded as removed from the data set.
model_dict

{'curr_removed': ['BHD',
  'JOD',
  'EGP',
  'OMR',
  'LBP',
  'BMD',
  'AED',
  'MXP',
  'TRY']}

In [9]:
# Row counts per POB type; dropna=False keeps the missing values as their own bucket.
df['POB_type'].value_counts(dropna=False)

NaN      192219
RR        73256
IR-NA      7424
IR         4819
BNDL       3604
CR         2956
RR-NA      2308
CR-NA       102
LFB          56
Name: POB_type, dtype: int64

## Grouping by POB Type

The new 606 has everything grouped by sales type, and the POB type is blank for many rows.
We need to separate the rows into groups based on the POB type.


In [25]:
# POB type codes, bucketed by the group each one is assigned to below.
list_IR = ['IR', 'IR-NA', 'LFB']
list_service = ['CR', 'CR-NA']
list_deferred = ['RR', 'RR-NA']
list_hybrid = ['BNDL']

# Every recognised POB type code, in bucket order.
list_all = [*list_IR, *list_service, *list_deferred, *list_hybrid]
print(list_all)

['IR', 'IR-NA', 'LFB', 'CR', 'CR-NA', 'RR', 'RR-NA', 'BNDL']


In [33]:
# Split the billings into independent copies, one per POB type bucket.
pob_type = df["POB_type"]
rec = df.loc[pob_type.isin(list_IR)].copy()
svc = df.loc[pob_type.isin(list_service)].copy()
dfr = df.loc[pob_type.isin(list_deferred)].copy()
hyb = df.loc[pob_type.isin(list_hybrid)].copy()
# Everything whose POB type is not one of the listed codes (including missing values).
blank = df.loc[~pob_type.isin(list_all)].copy()

In [34]:
# Row count of each POB bucket, followed by the size of the full frame.
for bucket_label, bucket_frame in (
    ('rec', rec),
    ('svc', svc),
    ('dfr', dfr),
    ('hyb', hyb),
    ('blank', blank),
    ('ALL', df),
):
    print(bucket_label, len(bucket_frame))

rec 12299
svc 3058
dfr 75564
hyb 3604
blank 192219
ALL 286744


In [35]:
# The bucket sizes added together, for comparison with the 'ALL' count above.
sum(len(part) for part in (rec, svc, dfr, blank, hyb))

286744

In [36]:
# Total the recognized billings per currency / BU / period.
# numeric_only=True pins the result to the numeric columns; without it, recent
# pandas versions also try to "sum" the text columns instead of dropping them.
gb_rec = rec.groupby(["curr", "BU", "period"], as_index=False).sum(numeric_only=True)

In [37]:
# Remove the summed duration and sub_term columns, leaving only the amount columns.
gb_rec.drop(columns=["duration", "sub_term"], inplace=True)

In [38]:
# Display the aggregated recognized billings.
gb_rec

Unnamed: 0,curr,BU,period,DC_amount,US_amount
0,ARS,Creative,2019-07,6786.00,155.12
1,ARS,Creative,2019-08,16472.00,390.02
2,ARS,Creative,2019-09,19205.00,405.51
3,ARS,Creative,2019-10,30382.19,532.13
4,ARS,Creative,2019-11,59049.00,1023.67
...,...,...,...,...,...
1398,USD,Print & Publishing,2020-02,6806243.20,6806243.20
1399,USD,Print & Publishing,2020-03,6369245.97,6369245.97
1400,USD,Print & Publishing,2020-04,684593.15,684593.15
1401,USD,Print & Publishing,2020-05,1656955.97,1656955.97


In [39]:
# Total the service billings per currency / BU / period.
# numeric_only=True pins the result to the numeric columns; without it, recent
# pandas versions also try to "sum" the text columns instead of dropping them.
gb_svc = svc.groupby(["curr", "BU", "period"], as_index=False).sum(numeric_only=True)

In [40]:
# Remove the summed sub_term and duration columns, then display what is left.
gb_svc.drop(columns=["sub_term", "duration"], inplace=True)

gb_svc

Unnamed: 0,curr,BU,period,DC_amount,US_amount
0,AUD,Creative,2017-10,8544.00,6837.59
1,AUD,Creative,2019-01,8472.55,6135.45
2,AUD,Creative,2019-02,3635.13,2587.61
3,AUD,Creative,2019-03,10437.33,7531.36
4,AUD,Creative,2019-04,6914.89,4896.03
...,...,...,...,...,...
576,USD,Print & Publishing,2019-12,70834.00,70834.00
577,USD,Print & Publishing,2020-01,35000.00,35000.00
578,USD,Print & Publishing,2020-02,33333.00,33333.00
579,USD,Print & Publishing,2020-04,33333.00,33333.00


In [41]:
# First four rows of the aggregated service billings.
gb_svc.head(4)

Unnamed: 0,curr,BU,period,DC_amount,US_amount
0,AUD,Creative,2017-10,8544.0,6837.59
1,AUD,Creative,2019-01,8472.55,6135.45
2,AUD,Creative,2019-02,3635.13,2587.61
3,AUD,Creative,2019-03,10437.33,7531.36


# Deferred billings
## Type B: service-based

In [42]:
# Deferred rows whose rev rec category is "B", as an independent copy.
is_type_b = dfr["rev_req_type"].eq("B")
dfr_b = dfr.loc[is_type_b].copy()

In [43]:
# Size of the deferred bucket, then of its type B subset, one per line.
print(len(dfr), len(dfr_b), sep="\n")

75564
0


In [44]:
# Inspect the first ten deferred rows.
dfr.head(10)

Unnamed: 0,duration,curr,BU,period,POB_type,config,rev_req_type,rebill_rule,sales_doc,sales_type,sub_term,DC_amount,US_amount
15,0,EUR,Experience Cloud,2017-01,RR,ONORE,D,YA,ZCC,DEFERRED,0,117600.0,123853.97
47,1,EUR,Experience Cloud,2017-01,RR,ONORE,D,YA,ZCC,DEFERRED,0,399.0,426.24
48,1,JPY,Experience Cloud,2017-01,RR,ONORE,D,YA,ZCC,DEFERRED,0,72772.0,637.48
55,1,USD,Experience Cloud,2017-01,RR,ONORE,D,Y1,ZCC,DEFERRED,0,1232.5,1232.5
56,1,USD,Experience Cloud,2017-01,RR,ONORE,D,YA,ZCC,DEFERRED,0,18094.74,18094.74
57,1,USD,Experience Cloud,2017-01,RR,ONORE,D,YQ,ZCC,DEFERRED,0,4064.52,4064.52
60,2,EUR,Experience Cloud,2017-01,RR,ONORE,D,YA,ZCC,DEFERRED,0,19485.9,20724.03
65,2,USD,Experience Cloud,2017-01,RR,ONORE,D,YA,ZCC,DEFERRED,0,13850.0,13850.0
69,3,CAD,Experience Cloud,2017-01,RR,ONORE,D,YQ,ZCC,DEFERRED,0,78810.0,58634.64
74,3,EUR,Experience Cloud,2017-01,RR,ONORE,D,YA,ZCC,DEFERRED,0,79251.79,84287.45


In [45]:
# Row counts per rev rec category among the deferred rows, missing values included.
dfr.rev_req_type.value_counts(dropna=False)

D      51470
A      23504
NaN      590
Name: rev_req_type, dtype: int64

## Type A Deferred billings

### First sort by config type

### Then use sub term

In [None]:
# Type A deferred billings, as an independent copy of the matching rows.
dfr_a = dfr[dfr["rev_req_type"] == "A"].copy()
print("length of dfr_a", len(dfr_a))
# duration is removed up front so it is not summed by the aggregation below.
dfr_a.drop(labels='duration', axis=1, inplace=True)
# One row per currency / BU / period / config / sub_term.
# numeric_only=True pins the result to the numeric columns; without it, recent
# pandas versions also try to "sum" the remaining text columns.
# NOTE(review): groupby leaves out rows whose config or sub_term is NaN by
# default, so they never reach gb_a. Confirm that is intended.
gb_a = dfr_a.groupby(
    ["curr", "BU", "period", "config", 'sub_term'], as_index=False
).sum(numeric_only=True)


In [None]:
# Number of aggregated type A rows.
len(gb_a)

In [None]:
# First four aggregated type A rows.
gb_a.head(4)


In [None]:
# Row counts per config value in the aggregated type A frame.
gb_a.config.value_counts(dropna=False)

In [None]:
# Row counts per sub_term value in the aggregated type A frame.
gb_a.sub_term.value_counts(dropna=False)

In [None]:
# Config values that are kept for type A billings; rows with any other
# config are set aside as "bad config" in the cell that uses this list.
config_type_keepers = ['MTHLY', '1Y', '2Y', '3Y']


In [None]:
# Partition the type A aggregate by whether its config is one of the kept values.
has_kept_config = gb_a["config"].isin(config_type_keepers)
gb_a_keepers = gb_a.loc[has_kept_config].copy()
a_bad_config = gb_a.loc[~has_kept_config].copy()

In [None]:
# Sizes of the type A aggregate and of its two partitions.
for size_label, sized_frame in (
    ('len gb_a', gb_a),
    ('gb_a_keepers', gb_a_keepers),
    ('len a_blank_config', a_bad_config),
):
    print(size_label, len(sized_frame))


In [None]:
# Row counts per config value among the kept type A rows.
gb_a_keepers['config'].value_counts(dropna=False)

In [None]:
# Row counts per sub_term value among the kept type A rows.
gb_a_keepers['sub_term'].value_counts(dropna=False)

In [None]:
# First ten kept type A rows.
gb_a_keepers.head(10)

In [None]:
# USD-equivalent total of the type A rows set aside for their config value.
bad_config_usd_total = a_bad_config["US_amount"].sum()
print('Total USD Equivalent Billings of Type A with bad configs', bad_config_usd_total)

## For the type A billings that we can use, we need to split them apart by sub_term and config
If `sub_term` == 0, then we use the config period. (Assuming these are PUP?)

Otherwise, 12 = annual, 24 = two years, 36 = 3 years

Old code has these as gb_a_1M, gb_a_1Y, gb_a_2Y, gb_a_3Y

Here we will pull out the types from the gb_a_keepers dataframe into a new dataframe and then delete these old records from the gb_a_keepers dataframe

In [None]:
print('start length', len(gb_a_keepers))

# Monthly bucket: rows with a MTHLY config or a sub_term of 1.
# NOTE(review): a MTHLY row is taken here whatever its sub_term is, whereas the
# notes above say config is only used when sub_term == 0. Confirm which is meant.
is_monthly = gb_a_keepers['config'].eq('MTHLY') | gb_a_keepers['sub_term'].eq(1)
df_1M = gb_a_keepers.loc[is_monthly].copy()
index_1M = df_1M.index

# Take the extracted rows out of the pool so later buckets cannot reuse them.
gb_a_keepers.drop(index=index_1M, inplace=True)

print(df_1M.head(10))
print('len df_1M', len(df_1M))
print(len(gb_a_keepers))


In [None]:
# Rows still unassigned in the type A pool.
print(len(gb_a_keepers))

In [None]:
# Row counts per config value in the monthly bucket (missing values included).
df_1M['config'].value_counts(dropna=False)

In [None]:
# First ten rows remaining in the pool after the monthly extraction.
gb_a_keepers.head(10)

##### Dealing with the 1 year rebillings

In [None]:
print('start length', len(gb_a_keepers))

# Annual bucket: sub_term of 12, or sub_term of 0 combined with a 1Y config.
term_is_12 = gb_a_keepers['sub_term'].eq(12)
term_is_0_config_1y = gb_a_keepers['sub_term'].eq(0) & gb_a_keepers['config'].eq('1Y')
df_1Y = gb_a_keepers.loc[term_is_12 | term_is_0_config_1y].copy()
index_1Y = df_1Y.index

# Take the extracted rows out of the pool so later buckets cannot reuse them.
gb_a_keepers.drop(index=index_1Y, inplace=True)

print(df_1Y.head(10))
print('len df_1Y', len(df_1Y))
print(len(gb_a_keepers))

In [None]:
# Two- and three-year buckets, selected by config alone from what is left.
# NOTE(review): rows still in the pool with any other config (for example a 1Y
# config whose sub_term is not 0 or 12) land in neither bucket. Confirm that
# nothing of value is left behind.
remaining_config = gb_a_keepers['config']
df_2Y = gb_a_keepers.loc[remaining_config.eq('2Y')].copy()
df_3Y = gb_a_keepers.loc[remaining_config.eq('3Y')].copy()

print('length gb_a_keepers', len(gb_a_keepers))
print('length df_2Y', len(df_2Y))
print('df_3Y', len(df_3Y))

In [None]:
# Exploratory check: which sub_term values occur among the remaining 3Y-config rows.
test2 = gb_a_keepers[gb_a_keepers['config']=='3Y']
test2.sub_term.value_counts()

# Type D Billings



In [None]:
# Deferred rows whose rev rec category is "D", as an independent copy.
is_type_d = dfr["rev_req_type"].eq("D")
dfr_d = dfr.loc[is_type_d].copy()
dfr_d.head(10)

In [None]:
# Total the type D billings per currency / BU / period / rebill rule / sales document.
# numeric_only=True pins the result to the numeric columns; without it, recent
# pandas versions also try to "sum" the remaining text columns.
# NOTE(review): groupby leaves out rows whose rebill_rule or sales_doc is NaN
# by default, so they never reach gb_d. Confirm that is intended.
gb_d = dfr_d.groupby(
    ["curr", "BU", "period", "rebill_rule", "sales_doc"], as_index=False
).sum(numeric_only=True)
gb_d.head(10)

In [None]:
# Remove the summed sub_term and duration columns, then preview the result.
gb_d.drop(columns=["sub_term", "duration"], inplace=True)
gb_d.head(10)

In [None]:
# Row counts per rebill rule in the aggregated type D frame.
gb_d["rebill_rule"].value_counts(dropna=False)

In [None]:
# rebill_rule codes bucketed by the billing frequency each list is named after.
list_monthly = ['Y1', 'Y2', 'Y3', 'YM']
list_qtrly = ['YQ', 'YY', 'YT']
list_semi_ann = ['YH']
list_ann = ['YA', 'YC', 'YX']
list_2yrs = ['Y4']
list_3yrs = ['Y7']

# Every bucketed code, in bucket order.
list_all_rebills = [
    rule_code
    for rule_bucket in (
        list_monthly,
        list_qtrly,
        list_semi_ann,
        list_ann,
        list_2yrs,
        list_3yrs,
    )
    for rule_code in rule_bucket
]
print(list_all_rebills)

In [None]:
def _sum_rebill_bucket(frame, rules):
    """Total the rows of `frame` whose rebill_rule is in `rules`.

    Parameters
    ----------
    frame : DataFrame
        Must contain the columns curr, BU, period and rebill_rule.
    rules : list of str
        The rebill_rule codes that make up one bucket.

    Returns
    -------
    DataFrame
        One row per (curr, BU, period), with those three as ordinary columns
        followed by the summed numeric columns.
    """
    # Chaining the drop off the boolean selection yields a new frame, so no
    # slice of `frame` is ever modified in place.
    bucket = frame[frame["rebill_rule"].isin(rules)].drop(columns="rebill_rule")
    # numeric_only=True keeps text columns such as sales_doc out of the totals.
    return bucket.groupby(["curr", "BU", "period"]).sum(numeric_only=True).reset_index()


# One aggregate per rebill-rule bucket.
gb_d_mthly = _sum_rebill_bucket(gb_d, list_monthly)
gb_d_qtrly = _sum_rebill_bucket(gb_d, list_qtrly)
gb_d_semi_ann = _sum_rebill_bucket(gb_d, list_semi_ann)
gb_d_annual = _sum_rebill_bucket(gb_d, list_ann)
gb_d_two_yrs = _sum_rebill_bucket(gb_d, list_2yrs)
gb_d_three_yrs = _sum_rebill_bucket(gb_d, list_3yrs)

print("Length of monthly", len(gb_d_mthly))
print("Length of quarterly", len(gb_d_qtrly))
print("Length of semi ann", len(gb_d_semi_ann))
print("Length of annual", len(gb_d_annual))
print("Length of two years", len(gb_d_two_yrs))
print("Length of three years", len(gb_d_three_yrs))


In [None]:
def _sum_amounts_by_key(frame):
    """Collapse `frame` to one row per (curr, BU, period).

    Only DC_amount and US_amount are summed; every other column is left out of
    the result.
    """
    key_columns = ["curr", "BU", "period"]
    return frame.groupby(key_columns, as_index=False)[["DC_amount", "US_amount"]].sum()


# list_df refers to gb_b and gb_a_1M/1Y/2Y/3Y, which no earlier cell assigns:
# the type B rows live in dfr_b and the type A splits in df_1M, df_1Y, df_2Y
# and df_3Y. Build the missing aggregates here so a fresh-kernel run does not
# stop with a NameError.
# NOTE(review): this assumes each entry of list_df should have the same
# curr / BU / period / DC_amount / US_amount layout as gb_rec and gb_svc.
# Confirm against the code that consumes list_df.
gb_b = _sum_amounts_by_key(dfr_b)
gb_a_1M = _sum_amounts_by_key(df_1M)
gb_a_1Y = _sum_amounts_by_key(df_1Y)
gb_a_2Y = _sum_amounts_by_key(df_2Y)
gb_a_3Y = _sum_amounts_by_key(df_3Y)

# Aggregated frames, in the same order as the names in list_columns.
list_df = [
    gb_rec,
    gb_svc,
    gb_b,
    gb_a_1M,
    gb_a_1Y,
    gb_a_2Y,
    gb_a_3Y,
    gb_d_mthly,
    gb_d_qtrly,
    gb_d_semi_ann,
    gb_d_annual,
    gb_d_two_yrs,
    gb_d_three_yrs,
]

# One name per frame in list_df, position for position.
list_columns = [
    "recognized",
    "service",
    "deferred_B",
    "deferred_1M_a",
    "deferred_1Y_a",
    "deferred_2Y_a",
    "deferred_3Y_a",
    "deferred_1M_d",
    "deferred_3M_d",
    "deferred_6M_d",
    "deferred_1Y_d",
    "deferred_2Y_d",
    "deferred_3Y_d",
]
