# Testing Deferred Revenue in Python
Will this be easier for everyone to use than Matlab?

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import pickle

## Step 1: Processing Base Billings Data

In [2]:
df = pd.read_excel('../data/all_billings_inputs.xlsx', sheet_name='base_billings')

In [3]:
df.sample(10)

Unnamed: 0,Document Currency,Enterprise BU Desc,Frequency,Invoice Fiscal Year Period Desc,Product Config Type,Rev Rec Category,Rule For Bill Date,Sales Type,Subscription Term,Completed Sales ( DC ),Completed Sales
32691,NOK,Document Cloud,,2017-01,,A,,DEFERRED,1,-426.0,-50.01
38144,USD,Creative,,2017-04,1V,,,RECOGNIZED,0,-375.0,-375.0
34547,RUB,Other Solutions,,2019-06,,,,RECOGNIZED,0,-386.4,-5.92
23278,GBP,Experience Cloud,1TME,2019-10,,A,,DEFERRED,0,10000.0,12211.6
3277,AUD,Experience Cloud,1TME,2019-08,,B,,DEFERRED,0,101691.0,71399.42
47664,USD,Experience Cloud,ONGO,2020-01,,D,Y1,DEFERRED,0,122994.42,122994.42
31268,JPY,Print & Publishing,,2018-12,1V,,,RECOGNIZED,0,139775.0,1238.41
16629,EUR,Experience Cloud,1TME,2020-01,,,,DEFERRED,0,3784.5,4169.22
26413,INR,Creative,,2020-04,,,,RECOGNIZED,0,103624.0,1439.25
41534,USD,Document Cloud,,2019-08,1Y,A,,DEFERRED,0,520694.8,520694.8


## Changing the column names early since they are inconsistent across other reports

In [4]:
df.columns

Index(['Document Currency', 'Enterprise BU Desc', 'Frequency',
       'Invoice Fiscal Year Period Desc', 'Product Config Type',
       'Rev Rec Category', 'Rule For Bill Date', 'Sales Type',
       'Subscription Term', 'Completed Sales ( DC )', 'Completed Sales'],
      dtype='object')

In [5]:
df.rename(index = str, columns = {'Document Currency': 'curr',
                                 'Enterprise BU Desc': 'BU',
                                 'Invoice Fiscal Year Period Desc': 'period',
                                 'Product Config Type': 'config',
                                 'Rev Rec Category': 'rev_req_type',
                                 'Rule For Bill Date': 'rebill_rule',
                                 'Completed Sales ( DC )': 'DC_amount',
                                 'Completed Sales': 'US_amount'}, inplace=True)

In [6]:
df.columns

Index(['curr', 'BU', 'Frequency', 'period', 'config', 'rev_req_type',
       'rebill_rule', 'Sales Type', 'Subscription Term', 'DC_amount',
       'US_amount'],
      dtype='object')

## Filter that removes any currency that has  < 10 transactions. 


In [7]:
# creates a list of the currencies and the number of transactions for each currency
vc = df['curr'].value_counts()
print(vc)

USD    12427
EUR     7725
GBP     5714
AUD     5068
JPY     4919
CHF     2389
SEK     2260
DKK     2172
NOK     1854
CAD     1520
HKD      486
BRL      466
RUB      461
KRW      247
CLP      211
COP      210
ARS      210
SGD      209
PEN      190
INR      181
PHP      151
TWD      148
THB      144
MYR      138
IDR      132
NZD       67
ILS       38
TRY       27
SAR        4
BMD        2
AED        1
MXP        1
Name: curr, dtype: int64


In [8]:
# Create variable that is true if the number of transaction is greater than 10, false otherwise
keep_these = vc.values > 10
# filtering only currencies that were greater than 10
keep_curr = vc[keep_these]
a = keep_curr.index
# filtering the dataframe to remove any of teh currencies not in our list
df = df[df['curr'].isin(a)]

## Just keeping track of the currencies we removed in our model_dict data structure

In [9]:
remove_these = vc[vc.values <= 10].index
model_dict = {'curr_removed': list(vc[remove_these].index)}
delete_curr = list(remove_these)

## The FX database does not have information on the following currencies
 - AED (United Arab Emirates Dirham)
 - BMD (Bermudan Dollar)
 - MXP (Mexican Peso)
 - TRY (Turkish Lira)

In [10]:
if 'TRY' not in model_dict:
    model_dict['curr_removed'].append('TRY')
    delete_curr.append('TRY')
    a = a.drop('TRY')
    
print('Model dictionary', model_dict)
print('Deleted Currencies', delete_curr)

Model dictionary {'curr_removed': ['SAR', 'BMD', 'AED', 'MXP', 'TRY']}
Deleted Currencies ['SAR', 'BMD', 'AED', 'MXP', 'TRY']


In [11]:
print("---Removing infrequent currencies from billings history---")
print('Total number of currencies in the base billings file: ', len(vc))
if len(model_dict['curr_removed'])==0:
    print('No currencies were removed, all contained 10 or more billings')
    print('Currencies in the base billings file')
    for item in a:
        print(a[item], end = " ")
else:
    print('\n Currencies were removed: ', len(model_dict['curr_removed']))

    for item in remove_these:
        print(item, ', ', end="")
        
    print("\n\n{} Remaining currencies: ".format(len(a)))
    for item in a:
        print(item, ', ', end="")

---Removing infrequent currencies from billings history---
Total number of currencies in the base billings file:  32

 Currencies were removed:  5
SAR , BMD , AED , MXP , 

27 Remaining currencies: 
USD , EUR , GBP , AUD , JPY , CHF , SEK , DKK , NOK , CAD , HKD , BRL , RUB , KRW , CLP , COP , ARS , SGD , PEN , INR , PHP , TWD , THB , MYR , IDR , NZD , ILS , 

# Removing any of the values that are zero

In [12]:
print('This is the length of the dataframe before removing zeros: ', len(df))
df = df[df['DC_amount']!=0]
print('This is the length of the dataframe after removing zeros: ', len(df))

This is the length of the dataframe before removing zeros:  49764
This is the length of the dataframe after removing zeros:  46285


In [13]:
df.head(30)

Unnamed: 0,curr,BU,Frequency,period,config,rev_req_type,rebill_rule,Sales Type,Subscription Term,DC_amount,US_amount
1,ARS,Creative,,2019-03,1Y,D,,DEFERRED,1,-11291.52,-289.26
2,ARS,Creative,,2019-03,1Y,D,Y3,DEFERRED,1,373766.0,9601.19
3,ARS,Creative,,2019-03,1Y,D,YA,DEFERRED,12,241380.0,6194.45
4,ARS,Creative,,2019-03,MTHLY,D,,DEFERRED,1,-1221.0,-31.07
5,ARS,Creative,,2019-03,MTHLY,D,Y3,DEFERRED,1,45799.0,1173.37
6,ARS,Creative,,2019-04,1Y,D,,DEFERRED,1,-40014.7,-985.24
7,ARS,Creative,,2019-04,1Y,D,Y3,DEFERRED,1,1705471.0,42032.83
8,ARS,Creative,,2019-04,1Y,D,YA,DEFERRED,12,1112854.12,27486.58
9,ARS,Creative,,2019-04,MTHLY,D,,DEFERRED,1,-14292.8,-351.04
10,ARS,Creative,,2019-04,MTHLY,D,Y3,DEFERRED,1,188938.0,4653.43


## Clearing out the Non-Revenue billings from the file
 - No Idea what these are

In [14]:
df["Sales Type"].value_counts()

DEFERRED       37628
RECOGNIZED      7183
PRO-SVC-INV     1323
NON-REV          150
Name: Sales Type, dtype: int64

In [15]:
print('Length of the dataframe before removing non-revenue billings: ', len(df))
df = df[df['Sales Type']!='NON-REV']
print('Length of the dataframe after removing non-revenue billings:  ', len(df))


Length of the dataframe before removing non-revenue billings:  46285
Length of the dataframe after removing non-revenue billings:   46135


## Starting to group the revenue by period, industry, etc

Attempting to group by the following categories
 - currency
 - period
 - sale type
 
May need to process the data differently with the deferred billings so we will start with the recognized and then the service billings

# DOING THIS ALL IN PANDAS WITH SPLIT APPLY COMBINE on Sales Type 


In [16]:
# First split the data into three dataframes
# Recognized billings
rec = df[df['Sales Type']=='RECOGNIZED'].copy()
svc = df[df['Sales Type']=='PRO-SVC-INV'].copy()
dfr = df[df['Sales Type']=='DEFERRED'].copy()

# NOW WORKING ON THE BILLINGS

### Recognized Revenue

In [17]:
rec.head(20)

Unnamed: 0,curr,BU,Frequency,period,config,rev_req_type,rebill_rule,Sales Type,Subscription Term,DC_amount,US_amount
24,ARS,Creative,,2019-07,,,,RECOGNIZED,0,6786.0,155.12
32,ARS,Creative,,2019-08,,,,RECOGNIZED,0,16472.0,390.02
39,ARS,Creative,,2019-09,,,,RECOGNIZED,0,19205.0,405.51
47,ARS,Creative,,2019-10,,,,RECOGNIZED,0,30382.19,532.13
55,ARS,Creative,,2019-11,,,,RECOGNIZED,0,59049.0,1023.67
63,ARS,Creative,,2019-12,,,,RECOGNIZED,0,71355.57,1195.94
71,ARS,Creative,,2020-01,,,,RECOGNIZED,0,79224.95,1323.41
80,ARS,Creative,,2020-02,,,,RECOGNIZED,0,36596.0,611.42
88,ARS,Creative,,2020-03,,,,RECOGNIZED,0,49107.0,810.06
96,ARS,Creative,,2020-04,,,,RECOGNIZED,0,20858.0,336.07


In [18]:
# testing groupby object
gb_rec = rec.groupby(['curr', 'BU', 'period'], as_index=False).sum()

In [19]:
# the Subscription term hangs around. We are dropping that here
gb_rec.drop(labels='Subscription Term', axis=1,inplace =True)

In [20]:
gb_rec.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3347 entries, 0 to 3346
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   curr       3347 non-null   object 
 1   BU         3347 non-null   object 
 2   period     3347 non-null   object 
 3   DC_amount  3347 non-null   float64
 4   US_amount  3347 non-null   float64
dtypes: float64(2), object(3)
memory usage: 156.9+ KB


## Now doing this for the service billings

In [21]:
# testing groupby object
gb_svc = svc.groupby(['curr', 'BU', 'period'], as_index=False).sum()

In [22]:
gb_svc.drop(labels='Subscription Term', axis=1,inplace =True)
gb_svc.head(30)

Unnamed: 0,curr,BU,period,DC_amount,US_amount
0,AUD,Experience Cloud,2015-01,25075.0,21084.51
1,AUD,Experience Cloud,2015-02,-4750.0,-4220.74
2,AUD,Experience Cloud,2015-03,424271.75,337822.37
3,AUD,Experience Cloud,2015-04,297925.0,229116.23
4,AUD,Experience Cloud,2015-05,316894.82,237310.85
5,AUD,Experience Cloud,2015-06,589499.45,465692.76
6,AUD,Experience Cloud,2015-07,656493.5,509990.4
7,AUD,Experience Cloud,2015-08,370780.74,272070.02
8,AUD,Experience Cloud,2015-09,1251726.13,917876.03
9,AUD,Experience Cloud,2015-10,291324.07,208547.57


# NOW WORKING ON DEFERRED BILLINGS

## Type B billings are service agreements that will have invoices submitted before the billings are reclassified to revenue. If no invoices are assigned to the billings, the billings become revenue in 12 months

In [23]:
# filter out the type B first then do a group_by
dfr_b = dfr[dfr['rev_req_type']=='B'].copy()

In [24]:
gb_b = dfr_b.groupby(['curr', 'BU', 'period'], as_index=False).sum()

In [25]:
gb_b.drop(labels='Subscription Term', axis=1, inplace=True)

In [26]:
gb_b.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 709 entries, 0 to 708
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   curr       709 non-null    object 
 1   BU         709 non-null    object 
 2   period     709 non-null    object 
 3   DC_amount  709 non-null    float64
 4   US_amount  709 non-null    float64
dtypes: float64(2), object(3)
memory usage: 33.2+ KB


In [27]:
print('length of deferred billings : ', len(dfr))
print('length of the type B billings: ', len(dfr_b))

length of deferred billings :  37628
length of the type B billings:  1434


## Now Type A Billings
These billings are on a billing plan. The product config tells us how long before they renew

 - '2Y' = 24 months
 - '1Y' = 12 months
 - 'MTHLY' = 1 month
 
NOTE: There are also other fields in the 'Product Configtype ID' field that do not map well to a rebill period.
To fix this, we need to load up a different file and determine the length of the sales contract (type A no config)
 

In [28]:
# filtering just the type A billings
dfr_a = dfr[dfr['rev_req_type']=='A'].copy()

In [29]:
gb_a = dfr_a.groupby(['curr', 'BU', 'period',
                     'config'], as_index=False).sum()
gb_a.drop(labels='Subscription Term', axis=1, inplace = True)

In [30]:
gb_a.head(20)

Unnamed: 0,curr,BU,period,config,DC_amount,US_amount
0,AUD,Creative,2015-01,1Y,1091293.6,872984.35
1,AUD,Creative,2015-01,2Y,29664.8,24592.98
2,AUD,Creative,2015-01,MTHLY,-1868.55,-1562.55
3,AUD,Creative,2015-02,1Y,789086.99,576028.63
4,AUD,Creative,2015-02,2Y,595.0,484.88
5,AUD,Creative,2015-02,MTHLY,-2220.75,-1809.04
6,AUD,Creative,2015-03,1Y,1764509.2,1293318.86
7,AUD,Creative,2015-03,2Y,14118.0,11045.07
8,AUD,Creative,2015-03,MTHLY,-2446.92,-1919.1
9,AUD,Creative,2015-04,1Y,1481271.41,1063257.93


In [31]:
gb_a['config'].value_counts()

1Y       2418
MTHLY     950
2Y        875
OUNIV     231
OCONS     106
3Y        101
ONORE      32
1V          1
OENSV       1
Name: config, dtype: int64

### Below is just a check to see how large the billing types are across all periods

In [32]:
gb_a_config = gb_a.groupby(['config'], as_index=False).sum()
gb_a_config

Unnamed: 0,config,DC_amount,US_amount
0,1V,6503.0,6503.0
1,1Y,119813500000.0,9655797000.0
2,2Y,247030700.0,67191010.0
3,3Y,938046400.0,4051346.0
4,MTHLY,138194000.0,39854060.0
5,OCONS,39032330.0,10282520.0
6,OENSV,-120.0,-120.0
7,ONORE,2350904.0,2015485.0
8,OUNIV,4283761.0,3473767.0


### These 'OCONS', 'ONORE' and 'OUNIV' data types are not actual product config IDs so we have to get them from a different data file. We are excluding these types below.

In [33]:
config_list = ['1Y', '2Y', '3Y', 'MTHLY']
test1 = gb_a[gb_a['config'].isin(config_list)]


In [34]:
test1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4344 entries, 0 to 4714
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   curr       4344 non-null   object 
 1   BU         4344 non-null   object 
 2   period     4344 non-null   object 
 3   config     4344 non-null   object 
 4   DC_amount  4344 non-null   float64
 5   US_amount  4344 non-null   float64
dtypes: float64(2), object(4)
memory usage: 237.6+ KB


## For now, lets just split this into gb_a_1Y, gb_a_2Y, gb_a_3y, gb_a_1M


In [35]:
gb_a_1Y = test1[test1['config']=='1Y'].copy()
gb_a_2Y = test1[test1['config']=='2Y'].copy()
gb_a_3Y = test1[test1['config']=='3Y'].copy()
gb_a_1M = test1[test1['config']=='MTHLY'].copy()

In [36]:
print('this is the lenght of type A 1M billings: ', len(gb_a_1M))
print('this is the lenght of type A 1Y billings: ', len(gb_a_1Y))
print('this is the lenght of type A 2Y billings: ', len(gb_a_2Y))
print('this is the lenght of type A 3Y billings: ', len(gb_a_3Y))

this is the lenght of type A 1M billings:  950
this is the lenght of type A 1Y billings:  2418
this is the lenght of type A 2Y billings:  875
this is the lenght of type A 3Y billings:  101


In [37]:
gb_a_2Y.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 875 entries, 1 to 4714
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   curr       875 non-null    object 
 1   BU         875 non-null    object 
 2   period     875 non-null    object 
 3   config     875 non-null    object 
 4   DC_amount  875 non-null    float64
 5   US_amount  875 non-null    float64
dtypes: float64(2), object(4)
memory usage: 47.9+ KB


# TYPE D billings
These billings have a field 'Rule For Bill Date' that determines when new billings will occur
 - Monthly [Y1, Y2, Y3, Y5]
 - Quarterly [YQ]
 - Every 4 months [YT]
 - Semi-annual [YH]
 - Annual [YA, YC]
 - Every 2 years - [Y4]
 
 We also need to track the type D billings that do not have a 'Rule for Bill Date'

In [38]:
# for now just do a groupby on the type
# filtering just the type A billings
dfr_d = dfr[dfr['rev_req_type']=='D'].copy()

In [39]:
gb_d = dfr_d.groupby(['curr', 'BU', 'period',
                     'rebill_rule'], as_index=False).sum()
gb_d.drop(labels='Subscription Term', axis=1, inplace = True)

In [40]:
gb_d['rebill_rule'].value_counts()

YA    2952
Y3    2375
YQ     991
YH     547
Y1     491
YC     490
Y2     153
Y4      57
Y7      31
YM      23
YX      12
YY      10
YT      10
Name: rebill_rule, dtype: int64

In [41]:
gb_d_mthly = gb_d[gb_d['rebill_rule'].isin(['Y1', 'Y2', 'Y3', 'YM'])].copy()
gb_d_mthly.drop(labels='rebill_rule', axis=1,inplace=True)
gb_d_mthly = gb_d_mthly.groupby(['curr', 'BU', 'period']).sum()
gb_d_mthly.reset_index(inplace=True)

gb_d_qtrly = gb_d[gb_d['rebill_rule'].isin(['YQ', 'YY'])].copy()
gb_d_qtrly.drop(labels='rebill_rule', axis=1,inplace=True)
gb_d_qtrly = gb_d_qtrly.groupby(['curr', 'BU', 'period']).sum()
gb_d_qtrly.reset_index(inplace=True)


gb_d_four_mths = gb_d[gb_d['rebill_rule']=='YT']
gb_d_semi_ann = gb_d[gb_d['rebill_rule']=='YH']

gb_d_annual = gb_d[gb_d['rebill_rule'].isin(['YA', 'YC', 'YX'])].copy()
gb_d_annual.drop(labels='rebill_rule', axis=1,inplace=True)
gb_d_annual = gb_d_annual.groupby(['curr', 'BU', 'period']).sum()
gb_d_annual.reset_index(inplace=True)


gb_d_two_yrs = gb_d[gb_d['rebill_rule']=='Y4']
gb_d_three_yrs = gb_d[gb_d['rebill_rule']=='Y7']

In [45]:
gb_d_annual.tail(10)

Unnamed: 0,curr,BU,period,DC_amount,US_amount
2942,USD,Print & Publishing,2019-07,4825618.35,4825618.35
2943,USD,Print & Publishing,2019-08,1686567.9,1686567.9
2944,USD,Print & Publishing,2019-09,4762479.88,4762479.88
2945,USD,Print & Publishing,2019-10,4001644.47,4010789.47
2946,USD,Print & Publishing,2019-11,5254275.03,5254275.03
2947,USD,Print & Publishing,2019-12,7109263.16,7109263.16
2948,USD,Print & Publishing,2020-01,2038078.16,2038078.16
2949,USD,Print & Publishing,2020-02,2489383.24,2489383.24
2950,USD,Print & Publishing,2020-03,5282552.91,5282552.91
2951,USD,Print & Publishing,2020-04,717861.76,717861.76


In [46]:
print('Length of monthly', len(gb_d_mthly))
print('Length of quarterly', len(gb_d_qtrly))
print('Length of four months', len(gb_d_four_mths))
print('Length of semi ann', len(gb_d_semi_ann))
print('Length of annual', len(gb_d_annual))
print('Length of two years', len(gb_d_two_yrs))
print('Length of three years', len(gb_d_three_yrs))

Length of monthly 2597
Length of quarterly 991
Length of four months 10
Length of semi ann 547
Length of annual 2952
Length of two years 57
Length of three years 31


In [None]:
whos

## NOW WE NEED TO BUILD A DATAFRAME THAT INTEGRATES THIS DATA

- We will have the following descriptive fields
   - Invoicing Fiscal Year-Period
   - Document Currency
   - Enterprise BU

- We will have the following fields based on rebilling rule
   - Recognized
   - Service
   - Monthly
   - Quarterly
   - Annual
   - Two Years
   - Three Years

In [None]:
print(gb_rec2['US_amount'].sum())
print(gb_svc2['US_amount'].sum())
print(gb_svc2['US_amount'].sum()+ gb_rec2['US_amount'].sum())

print(joined_df['US_amount_rec'].sum() + joined_df['US_amount_svc'].sum())


In [53]:
# We need to do it this way when we get to a .py file!
list_df = [gb_rec, gb_svc, gb_b,
        gb_a_1M,    gb_a_1Y,    gb_a_2Y,       gb_a_3Y, 
        gb_d_mthly, gb_d_qtrly, gb_d_semi_ann, gb_d_annual, gb_d_two_yrs, gb_d_three_yrs]

list_columns = ['recognized', 'service', 'deferred_B', 
    'deferred_1M_a', 'deferred_1Y_a', 'deferred_2Y_a', 'deferred_3Y_a',
    'deferred_1M_d', 'deferred_3M_d', 'deferred_6M_d', 'deferred_1Y_d', 'deferred_2Y_d', 'deferred_3Y_d']


In [61]:
def sum_USD_amt(list_df, list_columns):
    total_US = []
    for df in list_df:
        total_US.append(df['US_amount'].sum())
    total_df = pd.DataFrame(index = list_columns, columns = ['US_amounts'], data=total_US)
    return total_df

In [63]:
def merge_all_dataframes(list_df, list_columns):
    for i, df in enumerate(list_df):
        print('This is i:', i)
        #print("This is the df: ", df.head())
        print('referencing the column: ', list_columns[i])

        if i==0:
            df_merged = list_df[0].copy()
            df_merged.rename(index=str, columns={'DC_amount': list_columns[i]+'_DC', 
                                                 'US_amount': list_columns[i]+'_US'}, inplace=True)
        else:
            df_merged = merge_new_dataframe(df_merged, df, list_columns[i])

    return df_merged
    

In [64]:
def merge_new_dataframe(old_df, new_df, new_column):
    df_merged = pd.merge(old_df, new_df, how='outer', 
                     left_on=['curr', 'BU', 'period'],
                    right_on=['curr', 'BU', 'period'])
    df_merged.rename(index=str, columns={'DC_amount': new_column+'_DC', 'US_amount': new_column+'_US'}, inplace=True)
    
    #need to drop the product configtype id for merges where the new_df is of type A
    config_str = 'config'
    rule_str = 'rebill_rule'
    if config_str in df_merged.columns:
        df_merged.drop(columns=['config'], inplace=True)
    
    if rule_str in df_merged.columns:
        df_merged.drop(columns=['rebill_rule'], inplace=True)
        
    return df_merged

In [65]:
df = merge_all_dataframes(list_df, list_columns)

This is i: 0
referencing the column:  recognized
This is i: 1
referencing the column:  service
This is i: 2
referencing the column:  deferred_B
This is i: 3
referencing the column:  deferred_1M_a
This is i: 4
referencing the column:  deferred_1Y_a
This is i: 5
referencing the column:  deferred_2Y_a
This is i: 6
referencing the column:  deferred_3Y_a
This is i: 7
referencing the column:  deferred_1M_d
This is i: 8
referencing the column:  deferred_3M_d
This is i: 9
referencing the column:  deferred_6M_d
This is i: 10
referencing the column:  deferred_1Y_d
This is i: 11
referencing the column:  deferred_2Y_d
This is i: 12
referencing the column:  deferred_3Y_d


In [66]:
def clean_df_columns(df):
    
    # clean up NaNs before adding 
    df = df.fillna(value=0)
    
    # DC columns first
    # Monthly
    df['deferred_1M_DC'] = df['deferred_1M_a_DC']+df['deferred_1M_d_DC']
    df.drop(labels=['deferred_1M_a_DC', 'deferred_1M_d_DC'], axis=1, inplace=True)
    
    # Annual
    df['deferred_1Y_DC'] = df['deferred_1Y_a_DC']+df['deferred_1Y_d_DC']
    df.drop(labels=['deferred_1Y_a_DC', 'deferred_1Y_d_DC'], axis=1, inplace=True)
    
    # Two-Year
    df['deferred_2Y_DC'] = df['deferred_2Y_a_DC']+df['deferred_2Y_d_DC']
    df.drop(labels=['deferred_2Y_a_DC', 'deferred_2Y_d_DC'], axis=1, inplace=True)
    
    #Three-Year
    df['deferred_3Y_DC'] = df['deferred_3Y_a_DC']+df['deferred_3Y_d_DC']
    df.drop(labels=['deferred_3Y_a_DC', 'deferred_3Y_d_DC'], axis=1, inplace=True)
    
    # renaming 3M and 6M
    df.rename(index=str, columns = {'deferred_3M_d_DC':'deferred_3M_DC', 
                               'deferred_6M_d_DC': 'deferred_6M_DC'}, inplace=True)

    # US columns
    # Monthly
    df['deferred_1M_US'] = df['deferred_1M_a_US']+df['deferred_1M_d_US']
    df.drop(labels=['deferred_1M_a_US', 'deferred_1M_d_US'], axis=1, inplace=True)
    
    # Annual
    df['deferred_1Y_US'] = df['deferred_1Y_a_US']+df['deferred_1Y_d_US']
    df.drop(labels=['deferred_1Y_a_US', 'deferred_1Y_d_US'], axis=1, inplace=True)
    
    # Two-Year
    df['deferred_2Y_US'] = df['deferred_2Y_a_US']+df['deferred_2Y_d_US']
    df.drop(labels=['deferred_2Y_a_US', 'deferred_2Y_d_US'], axis=1, inplace=True)
    
    # Three-Year
    df['deferred_3Y_US'] = df['deferred_3Y_a_US']+df['deferred_3Y_d_US']
    df.drop(labels=['deferred_3Y_a_US', 'deferred_3Y_d_US'], axis=1, inplace=True)
    
    # renaming 3M and 6M
    df.rename(index=str, columns = {'deferred_3M_d_US':'deferred_3M_US', 
                               'deferred_6M_d_US': 'deferred_6M_US'}, inplace=True)

    
    #cleaning up the longer column names
    df.rename(index=str, columns = {'curr': 'curr',
                               'BU':'BU',
                               'period':'period'}, inplace=True)
    
    return df

In [67]:
df = clean_df_columns(df)

In [69]:
df.sum()

curr              ARSARSARSARSARSARSARSARSARSARSAUDAUDAUDAUDAUDA...
BU                CreativeCreativeCreativeCreativeCreativeCreati...
period            2019-072019-082019-092019-102019-112019-122020...
recognized_DC                                           3.72773e+10
recognized_US                                           4.68828e+09
service_DC                                              3.96039e+09
service_US                                              9.38328e+08
deferred_B_DC                                           9.59237e+09
deferred_B_US                                           5.73855e+08
deferred_3M_DC                                          2.83187e+09
deferred_3M_US                                          1.75483e+09
deferred_6M_DC                                          1.01191e+09
deferred_6M_US                                          1.78882e+08
deferred_1M_DC                                          1.61229e+11
deferred_1Y_DC                                  

In [62]:
total_df = sum_USD_amt(list_df, list_columns)
total_df

Unnamed: 0,US_amounts
recognized,4688280000.0
service,938328300.0
deferred_B,573854900.0
deferred_1M_a,39854060.0
deferred_1Y_a,9655797000.0
deferred_2Y_a,67191010.0
deferred_3Y_a,4051346.0
deferred_1M_d,12393920000.0
deferred_3M_d,1754833000.0
deferred_6M_d,178882000.0


In [73]:
total_df.loc['deferred_1M_d']+total_df.loc['deferred_1M_a']

US_amounts    1.243378e+10
dtype: float64

In [74]:
# Make this a function to be cleaned up somehow
del dfr
del dfr_a
del dfr_b
del dfr_d
del gb_a
del gb_a_1M
del gb_a_1Y
del gb_a_2Y
del gb_a_3Y
del gb_b, 
del gb_d
del gb_svc, gb_rec, gb_d_two_yrs
del gb_d_four_mths, gb_d_qtrly, gb_d_semi_ann


# Now working on the ZCC billings

These billings are type D billings that did not populate the rebill_rule field of the database.

They have a 'sales document type' = 'ZCC"

The billings themselves are being created from a tableau report that looks for additions to the deferred revenue waterfall based on billings of type D and have a sales document type of ZCC

# TO BE DONE:

1. Clean up the type F billings (at least check to see if they are necessary)
2. Make a function to delete all intermediate dataframes
3. Add type A no config function
4. Add type D ZCC billings

5. Work on the forecast part of this

6. Load up FX rates

In [None]:
# Adobe financial calendar
df_cal = pd.read_excel('../data/old/ADOBE_FINANCIAL_CALENDAR.xlsx', 'ADBE_cal')

In [None]:
df_cal.head()

## Working on Type A billings with no configuration

 - this is included in the all_billings_inputs file on the 'type_A_no_config' sheet

# Type A No Config Type Billings

This file contains type A billings that have a revenue contract start date and end date. We need to map these into the terms of our dataframe.

### Steps:
1. Rename the columns
2. This file has entries for pennies. Need to clear out anything less than $10 in absolute value
3. Determine the length of time between start date and end date
4. Group this dataframe by currency, period and BU
5. Merge this final dataframe with the larger dataframe

## NOTE: This file contains two different start date and end date columns. We need to look at all of them

In [None]:
df_A = pd.read_excel('../data/all_billings_inputs.xlsx', sheet_name='type_A_no_config')


In [None]:
df_A.sample(10)

In [None]:
df_A.columns

In [None]:
df_A.rename(index=str, columns={'Document Currency':'currency', 
                               'Enterprise Bu':'BU',
                               'Invoicing Fiscal Year-Period Desc':'period',
                               'Rev Rec Contract End Date Hdr':'end_date_1',
                               'Rev Rec Contract End Date Item':'end_date_2',
                               'Rev Rec Contract Start Date Hdr': 'start_date_1',
                               'Rev Rec Contract Start Date Item': 'start_date_2',
                               'Completed Sales ( DC )':'DC_amount',
                               'Completed Sales': 'US_amount'
                               }, inplace=True)



In [None]:
df_A.columns

### Dealing with the duplicate dates by taking a max

In [None]:
df_A.head()

In [None]:
df_A['start_date_str'] = df_A[['start_date_1','start_date_2']].max(axis=1).astype(str)
df_A['end_date_str'] = df_A[['end_date_1','end_date_2']].max(axis=1).astype(str)

In [None]:
df_A.sample(10)

In [None]:
df_A['start_date'] = pd.to_datetime(df_A['start_date_str'])
df_A['end_date'] = pd.to_datetime(df_A['end_date_str'])

In [None]:
df_A.drop(labels=['end_date_1', 'end_date_2', 'start_date_1', 'start_date_2',
                  'start_date_str', 'end_date_str'], axis=1, inplace=True)

In [None]:
df_A['month_interval']=(df_A['end_date']-df_A['start_date'])

In [None]:
df_A.sample(10)

In [None]:
df_A['months']= (df_A['month_interval']/ np.timedelta64(1,'M')).round(0)

In [None]:
df_A.month_interval.describe()

### Now I need to map the months into the different integers in my dataframe


In [None]:
list_rebills = [1, 3, 6, 12, 24, 36]
temp_rebill = np.zeros_like(df_A['months'])
for i in range(len(df_A)):
    temp_rebill[i] = min(list_rebills, key=lambda x:abs(x-df_A['months'][i]))
df_A['rebill_months']=temp_rebill


In [None]:
df_A.sample(10)

In [None]:
plt.scatter(df_A['months'], df_A['rebill_months'])

## Grouping the dataframe by rebill_months

In [None]:
# drop what we dont need
df_A.drop(columns = ['start_date', 'end_date', 'month_interval', 'months'], axis=1, inplace=True)

In [None]:
df_A.head()

In [None]:
df_A.rename(index = str, columns={'currency': 'curr', 'Enterprise BU Desc':'BU', 
             'Invoice Fiscal Year Period Desc': 'period'}, inplace=True)


In [None]:
df_A.head(10)

In [None]:
#medals = df.pivot_table('no of medals', ['Year', 'Country'], 'medal')
temp_DC = df_A.pivot_table('DC_amount', ['curr', 'BU', 'period'], 'rebill_months')
temp_US = df_A.pivot_table('US_amount', ['curr', 'BU', 'period'], 'rebill_months')


In [None]:
temp_DC

In [None]:
temp_DC = temp_DC.fillna(0)
temp_US = temp_DC.fillna(0)

In [None]:
temp_US

In [None]:
temp_flat_DC = pd.DataFrame(temp_DC.to_records())
temp_flat_US = pd.DataFrame(temp_US.to_records())

In [None]:
temp_flat_DC.info()

In [None]:
temp_flat_DC.rename(index=str, columns={'1.0':'deferred_1M_DC', 
                               '3.0':'deferred_3M_DC',
                               '6.0':'deferred_6M_DC',
                               '12.0':'deferred_1Y_DC',
                               '24.0':'deferred_2Y_DC',
                               '36.0': 'deferred_3Y_DC'}, inplace=True)

temp_flat_US.rename(index=str, columns={'1.0':'deferred_1M_US', 
                               '3.0':'deferred_3M_US',
                               '6.0':'deferred_6M_US',
                               '12.0':'deferred_1Y_US',
                               '24.0':'deferred_2Y_US',
                               '36.0': 'deferred_3Y_US'}, inplace=True)


In [None]:
temp_flat_DC.tail(20)

In [None]:
df.head(30)

In [None]:
df_test_dup = df.copy()
print(len(df_test_dup))

In [None]:
df_test_dup =df_test_dup.drop_duplicates(subset=['curr', 'BU', 'period'])

In [None]:
print(len(df_test_dup))

## Now we have to merge these two dataframes with the other billings dataframe

In [None]:
''' def merge_new_dataframe(old_df, new_df, new_column):
    df_merged = pd.merge(old_df, new_df, how='outer', 
                     left_on=['curr', 'BU', 'period'],
                    right_on=['curr', 'BU', 'period'])
    df_merged.rename(index=str, columns={'DC_amount': new_column+'_DC', 'US_amount': new_column+'_US'}, inplace=True)
    
    #need to drop the product configtype id for merges where the new_df is of type A
    config_str = 'config'
    rule_str = 'rebill_rule'
    if config_str in df_merged.columns:
        df_merged.drop(columns=['config'], inplace=True)
    
    if rule_str in df_merged.columns:
        df_merged.drop(columns=['rebill_rule'], inplace=True)
        
    return df_merged
'''

In [None]:
temp_flat_DC.head(30)

In [None]:
df_with_A = pd.merge(df, temp_flat_DC, how='outer',
                    left_on= ['curr', 'BU', 'period'],
                    right_on=['curr', 'BU', 'period'], indicator=True, validate='one_to_one')
df_with_A = df_with_A.fillna(0)

In [None]:
df_with_A.head(30)

In [None]:
df_with_all = pd.merge(df_with_A, temp_flat_US, how='outer',
                    left_on= ['curr', 'BU', 'period'],
                    right_on=['curr', 'BU', 'period'])
df_with_all = df_with_all.fillna(0)

In [None]:
df_with_all.fillna(0)

In [None]:
df_with_all.head(10)

In [None]:
df_with_all['deferred_1M_DC']= df_with_all['deferred_1M_DC_x']+df_with_all['deferred_1M_DC_y']
df_with_all['deferred_3M_DC']= df_with_all['deferred_3M_DC_x']+df_with_all['deferred_3M_DC_y']
df_with_all['deferred_6M_DC']= df_with_all['deferred_6M_DC_x']+df_with_all['deferred_6M_DC_y']
df_with_all['deferred_1Y_DC']= df_with_all['deferred_1Y_DC_x']+df_with_all['deferred_1Y_DC_y']
df_with_all['deferred_2Y_DC']= df_with_all['deferred_2Y_DC_x']+df_with_all['deferred_2Y_DC_y']
df_with_all['deferred_3Y_DC']= df_with_all['deferred_3Y_DC_x']+df_with_all['deferred_3Y_DC_y']

df_with_all['deferred_1M_US']= df_with_all['deferred_1M_US_x']+df_with_all['deferred_1M_US_y']
df_with_all['deferred_3M_US']= df_with_all['deferred_3M_US_x']+df_with_all['deferred_3M_US_y']
df_with_all['deferred_6M_US']= df_with_all['deferred_6M_US_x']+df_with_all['deferred_6M_US_y']
df_with_all['deferred_1Y_US']= df_with_all['deferred_1Y_US_x']+df_with_all['deferred_1Y_US_y']
df_with_all['deferred_2Y_US']= df_with_all['deferred_2Y_US_x']+df_with_all['deferred_2Y_US_y']
df_with_all['deferred_3Y_US']= df_with_all['deferred_3Y_US_x']+df_with_all['deferred_3Y_US_y']

In [None]:
df_with_all.drop(labels = ['deferred_1M_DC_x','deferred_1M_DC_y',
                        'deferred_3M_DC_x','deferred_3M_DC_y',
                        'deferred_6M_DC_x','deferred_6M_DC_y',
                        'deferred_1Y_DC_x','deferred_1Y_DC_y',
                        'deferred_2Y_DC_x','deferred_2Y_DC_y',
                        'deferred_3Y_DC_x','deferred_3Y_DC_y',
                        'deferred_1M_US_x','deferred_1M_US_y',   
                        'deferred_3M_US_x','deferred_3M_US_y',
                        'deferred_6M_US_x','deferred_6M_US_y',
                        'deferred_1Y_US_x','deferred_1Y_US_y',
                        'deferred_2Y_US_x','deferred_2Y_US_y',
                        'deferred_3Y_US_x','deferred_3Y_US_y'],
                 axis=1, inplace=True)

In [None]:
df_with_all.head()

In [None]:
df_with_all.columns

In [None]:
df_with_all['deferred_1Y_US'].describe()

In [None]:
df['deferred_1Y_US'].describe()

In [None]:
print('sum of temp flat DC 1M', temp_flat_DC['deferred_1M_DC'].sum())
print('sum of base_df before DC 1M', df['deferred_1M_DC'].sum())
print('sum of final DC 1M', df_with_all['deferred_1M_DC'].sum())

a = temp_flat_DC['deferred_1M_DC'].sum()
b = df['deferred_1M_DC'].sum()
c = df_with_all['deferred_1M_DC'].sum()
print(c)
print(a+b)

In [None]:
df['deferred_1M_DC'].sum()

In [None]:
temp_flat_DC['deferred_1M_DC'].sum()

In [None]:
df_with_all['deferred_1M_DC'].sum()

# Need to create a table that contains the total billings by DC for each dataframe and each step for auditing

 - start with all of the DC
 - then create function that appends and adds rows
 - then do the same for the DC stuff type_A
 - then check the totals


In [None]:
df = df_with_all.copy()

In [None]:
drop_index= df[df['period']=='2020-04'].index
df.drop(drop_index, inplace=True)

In [None]:
whos

In [None]:
# dump the pickle
#pickle.dump(df, open('../data/processed/all_billings.p', 'wb'))

# open the pickle file
df = pickle.load(open('../data/processed/all_billings.p', 'rb'))

In [None]:
df['period'].unique()

In [None]:
len(df)

## Loading All of the other information we need here
 - filename = currency_map

In [None]:
df_curr_map= pd.read_excel('../data/currency_map.xlsx', sheet_name='curr_map')
df_curr_map['Country'] = df_curr_map['Country'].str.replace('\(MA\)', '', case=False)

In [None]:
df_weeks = pd.read_excel('../data/Period_weeks.xlsx', sheet_name='period_weeks')


In [None]:
df_fx_fwds = pd.read_excel('../data/FX_forward_rates.xlsx', sheet_name='forward_data', 
                          skiprows = 1, usecols="C,G")

In [None]:
df_fx_fwds.rename(index=str, columns={'Unnamed: 2': 'curr', 'FWD REF':'forward'}, inplace=True)

In [None]:
df_fx_fwds

## Working on the func_load_bookings now

In [None]:
df_bookings = pd.read_excel('../data/2020_bookings_fcst_Q1.xlsx', sheet_name='bookings')


## Cleaning up the bookings data
 - remove odd strings such as '(EB)' from BU
 - (IS) from internal segment
 - etc
 
 
 NOTE: The '(' is a special character so we need to precede these with the escape character '\'

In [None]:
df_bookings['EBU'] = df_bookings['EBU'].str.replace(' \(EB\)', '', case=False)
df_bookings['Internal Segment'] = df_bookings['Internal Segment'].str.replace('\(IS\)', '')

df_bookings['PMBU'] = df_bookings['PMBU'].str.replace('\(PMBU\)', '')
df_bookings['GEO'] = df_bookings['GEO'].str.replace('\(G\)', '')
df_bookings['Market Area'] = df_bookings['Market Area'].str.replace('\(MA\)', '')


In [None]:
df_bookings.sample(5)

In [None]:
df_bookings.EBU.value_counts()

In [None]:
df_bookings.PMBU.value_counts()

In [None]:
df_bookings.GEO.value_counts()

In [None]:
df_bookings['Mark Segment'].value_counts()

In [None]:
df_bookings['Type'].value_counts()

In [None]:
df_bookings['Bookings Type'].value_counts()

In [None]:
df_bookings['Scenario'].value_counts()

In [None]:
df_bookings['FX'].value_counts()

In [None]:
df_bookings.sample(5)

In [None]:
df_bookings.drop(columns = ['Hedge', 'Mark Segment', 'Type', 'Scenario', 'FX'], inplace = True)

In [None]:
df_bookings.rename(index=str, columns = {'EBU': 'BU', 
                                        'Internal Segment': 'segment',
                                        'PMBU': 'product',
                                        'GEO':'geo',
                                        'Market Area': 'country',
                                        'Bookings Type': 'booking_type',
                                        'value': 'US_amount'}, inplace =True)

In [None]:
df_curr_map.head(5)

In [None]:
df_bookings['country'].unique()

In [None]:
list_book_ctry = df_bookings['country'].unique()


In [None]:
list_curr_map = df_curr_map['Country'].unique()

In [None]:
a = list(set(list_book_ctry) & set(list_curr_map))
a

In [None]:
not_in_map = set(list_book_ctry).difference(set(list_curr_map))
not_in_map

In [None]:
df_bookings = pd.merge(df_bookings, df_curr_map, how='left', left_on='country', right_on='Country')

In [None]:
df_bookings.sample(10)

## Percent period bookings 

In [None]:
# need to create a list of periods we want to use and keep


list_q2 = ['2019-04', '2019-05', '2019-06']
list_q3 = ['2019-07', '2019-08', '2019-09']
list_q4 = ['2019-10', '2019-11', '2019-12']
list_q1 = [ '2020-01', '2020-02', '2020-03']

list_quarters = [list_q1, list_q2, list_q3, list_q4]

In [None]:
list_BUs = df_bookings['BU'].unique()
list_curr = df_bookings['Currency'].unique()

In [None]:
this_BU

In [None]:
i = 0
j=1
this_BU = list_BUs[i]
this_curr = list_curr[j]

print(this_BU, this_curr)

In [None]:
df_test = df[(df['BU']==this_BU)&
             (df['curr']==this_curr)&
             (df['period'].isin(list_q2))]

In [None]:
df_test.head(20)

In [None]:
df.head()

In [None]:
df.BU.unique()

In [None]:
df_test1= df[(df['curr']=='EUR')&
            (df['BU']=='Experience Cloud')&
             (df['period']=='2019-03')]

             

In [None]:
df_test1

In [None]:
df.tail(20)

In [None]:
print(gb_d_mthly['US_amount'].sum())

In [None]:
gb_d_mthly.tail(20)

In [None]:
gb_test = gb_d[gb_d['rebill_rule'].isin(['Y1', 'Y2', 'Y3', 'YM'])]
gb_test = gb_test.copy()
gb_test.tail(20)

In [None]:
gb_test.drop(labels='rebill_rule', axis=1,inplace=True)
gb_test.tail(20)

In [None]:
gb_test2 = gb_test.groupby(['curr', 'BU', 'period']).sum()

In [None]:
gb_test2.tail(20)

In [None]:
gb_test2.info()

In [None]:
gb_d_mthly = gb_d[gb_d['rebill_rule'].isin(['Y1', 'Y2', 'Y3', 'YM'])].copy()
gb_d_mthly.drop(labels='rebill_rule', axis=1,inplace=True)
gb_d_mthly = gb_d_mthly.groupby(['curr', 'BU', 'period']).sum()
gb_d_mthly.tail(20)

In [None]:
gb_d_mthly.info()

In [None]:
gb_2 = gb_d_mthly.copy()
gb_2.head(10)

In [None]:
gb_2.reset_index(inplace=True)
gb_2.head(10)

In [None]:
gb_d.head(10)

In [None]:
gb_d_qtrly.tail(20)

def join_new_dataframe(old_df, new_df, new_column):
    df_joined = old_df.join(new_df, how='outer')
    df_joined.rename(index=str, columns={'DC_amount': new_column+'_DC', 'US_amount': new_column+'_US'}, inplace=True)
    df_joined.fillna(value=0, inplace=True)
    
    #need to drop the product configtype id for merges where the new_df is of type A
    config_str = 'config'
    rule_str = 'rebill_rule'
    if config_str in df_joined.columns:
        df_joined.drop(columns=['config'], inplace=True)
    
    if rule_str in df_joined.columns:
        df_joined.drop(columns=['rebill_rule'], inplace=True)
        
    return df_joined


def join_all_dataframes(list_df, list_columns):
    for i, df in enumerate(list_df):
        print('This is i:', i)
        print('referencing the column: ', list_columns[i])
        #print(df.columns)
        # setting the index to be 'curr', 'BU' and 'period'
        df.set_index(['curr', 'BU', 'period'], drop=True, inplace=True, verify_integrity=True)
        print(df.head(4))
        #if i==0:
            #df_joined = df.copy()
            #df_joined = list_df[0].copy()
            #df_joined.rename(index=str, columns={'DC_amount': list_columns[i]+'_DC', 
            #                                     'US_amount': list_columns[i]+'_US'}, inplace=True)
        #else:
            #df_joined = join_new_dataframe(df_joined, df, list_columns[i])

    #return df_joined
    x=1
    return x 