# This workbook explores the results of the TV Test by price point

### Load packages

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the core ASIN-level dataset. The file is tab-separated, has no header row
# (column names are supplied below) and is read as ISO-8859-1.
df = pd.read_csv("../TV_test/asin_date_4.txt", sep="\t",
                 encoding = "ISO-8859-1", header=None, 
                 names = ['week_ending','is_promo','asin','is_promo_asin',
                          'content','title_name',
                         'studio_name','contract_id',
                          'release_date','is_hd',
                          'display_price','lifecycle','season_number','customers',
                          'new_customers','OPS','units','cogs','ppm'])

In [3]:
def initial_setup(df,content):
    """Prepare the raw ASIN-level frame for one content type.

    Converts `week_ending` and `release_date` to datetime, derives the calendar
    `month` of `week_ending`, and collapses `lifecycle` into two classes in
    `new_lifecycles`: 'RR' for 'New Release' / 'Recent release', 'Catalogue'
    for everything else. Finally keeps only rows whose `content` equals
    `content`.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame with at least the columns `week_ending`, `release_date`,
        `lifecycle` and `content`. It is NOT modified.
    content : str
        Value of the `content` column to keep (e.g. "Season").

    Returns
    -------
    pandas.DataFrame
        An independent frame with the converted / derived columns, restricted
        to the requested content type.
    """
    # Work on a copy so the caller's frame is left untouched and the cell can
    # be re-run safely.
    prepared = df.copy()

    prepared['week_ending'] = pd.to_datetime(prepared['week_ending'])
    prepared['release_date'] = pd.to_datetime(prepared['release_date'])
    prepared['month'] = prepared['week_ending'].dt.month
    prepared['new_lifecycles'] = np.where(
        prepared['lifecycle'].isin(['New Release','Recent release']),'RR','Catalogue')

    # Subset to the requested content. The explicit copy detaches the result
    # from `prepared`, so later column assignments on it do not trigger
    # SettingWithCopyWarning.
    return prepared[prepared['content'] == content].copy()

In [4]:
# Convert dtypes, derive month / RR-vs-Catalogue, and keep only "Season" rows
df = initial_setup(df, "Season")

In [5]:
# Shape and first rows of the Season-only frame
print(df.shape)
df.head()

(158673, 21)


Unnamed: 0,week_ending,is_promo,asin,is_promo_asin,content,title_name,studio_name,contract_id,release_date,is_hd,...,lifecycle,season_number,customers,new_customers,OPS,units,cogs,ppm,month,new_lifecycles
15,2018-01-27,non_promo_period,B06W5GC1WV,non_promo_asins,Season,Feuerwehrmann Sam - Eine Insel voller Abenteuer,Hit Entertainment,4GB9Q_de,2017-03-03,Y,...,6mo_1yr_Catalogue,1.0,59,0,584.2,59,,,1,Catalogue
28,2018-02-17,non_promo_period,B00KX2UOGK,promo_asins,Season,Sherlock - Staffel 3 [dt./OV],Hartswood Films/BBC/MASTERPIECE,UXBB4,2014-06-10,Y,...,Deep Catalogue,3.0,158,3,1798.82,160,1255.37,0.3021,2,Catalogue
30,2018-02-17,non_promo_period,B01MXUUQCB,non_promo_asins,Season,Ein Tag in ?,Story House,UXZD4,2016-12-05,Y,...,1_2yr_Catalogue,1.0,14,0,40.98,14,25.44,0.3792,2,Catalogue
35,2018-03-03,non_promo_period,B00GNYNJ7I,promo_asins,Season,Rizzoli & Isles - Staffel 3,Warner Bros,UXWB4,2014-01-01,Y,...,Deep Catalogue,3.0,7,0,117.46,7,74.06,0.3694,3,Catalogue
39,2018-03-10,promo_period,B00ET0NHX4,promo_asins,Season,Drawn Together - Staffel 1 [dt./OV],Comedy Central,UXMT4,2014-01-01,N,...,Deep Catalogue,1.0,58,3,234.38,58,166.1,0.2913,3,Catalogue


In [6]:
# Sanity check: total OPS of promo ASINs during the promo period
df[(df['is_promo'] == 'promo_period')&(df['is_promo_asin'] == "promo_asins")]['OPS'].sum()

2249635.3300000001

# Distribution of promo OPS by price points

<p> Here I do a basic exploration of how the OPS of promo ASINs in the promo period is distributed across price points. </p>

In [7]:
def dist_by_revenue(df):
    """Distribution of promo OPS across display prices.

    Restricts `df` to promo ASINs in the promo period, sums `OPS` per
    `display_price`, and adds each price's `share` of the total together with
    the running `cumsum` of those shares (prices in ascending order).
    """
    in_promo = (df['is_promo'] == 'promo_period') & (df['is_promo_asin'] == "promo_asins")
    by_price = df[in_promo].groupby("display_price")['OPS'].sum().reset_index()
    by_price.columns = ['display_price','OPS']

    total_ops = by_price['OPS'].sum()
    by_price['share'] = by_price['OPS'] / total_ops
    by_price['cumsum'] = by_price['share'].cumsum()
    return by_price

In [8]:
# OPS per display price for promo ASINs in the promo period; the printed total
# should match the sanity-check sum computed directly from df.
t = dist_by_revenue(df)
print("Revenue generated from promo ASINs in the promo period:", t['OPS'].sum())
t.head()

Revenue generated from promo ASINs in the promo period: 2249635.33


Unnamed: 0,display_price,OPS,share,cumsum
0,2.49,2.09,9.290395e-07,9.290395e-07
1,2.99,12.53,5.569792e-06,6.498831e-06
2,3.49,35.16,1.56292e-05,2.212803e-05
3,3.99,2422.06,0.001076646,0.001098774
4,4.47,33.84,1.504244e-05,0.001113816


In [9]:
# Share of promo OPS covered by the three price points 4.99, 9.99 and 14.99
t[t['display_price'].isin([4.99, 9.99, 14.99])]['OPS'].sum() / t['OPS'].sum()

0.92875611977519745

# Adjust the time frame of the non-promo period

<p> This step is optional: it only makes the promo and non-promo periods equal in length. </p>

In [10]:
# Keep only weeks ending on or after 2018-01-20
df = df[df['week_ending'] >= '2018-01-20']

# Make a list of seasonal coefficients

<p> This section deals with seasonality and requires another raw data file.
    The idea is to get a seasonality factor for each month, separately for RR and Catalogue. </p>

In [11]:
# Daily data used for the seasonality factors: a tab-separated file read with
# the explicit column names below.
new_data_daily = pd.read_csv("./forecast_DE_lifecycles_daily.txt", sep="\t",
                       names = ['promo_period','week_ending','transaction_date','promo_status','content','lifecycle',
                        'total_customers','new_customers','units','ops','cogs'])

In [12]:
def make_prep_for_lifecycles(df, dimension1, dimension2):
    """Sum `ops` per date and lifecycle class and return it in wide format.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least `lifecycle`, `content`, `ops` and the two date
        columns named by `dimension1` / `dimension2`. It is NOT modified.
    dimension1 : str
        Date column used as the grouping key / index of the pivot.
    dimension2 : str
        Second date column; it is only converted to datetime.

    Returns
    -------
    pandas.DataFrame
        One row per `dimension1` value, with one column per lifecycle class
        present in the data ('Catalogue', 'RR') holding the summed `ops`.
        Missing combinations are filled with 0.
    """
    # Copy first: the original wrote the converted / derived columns straight
    # into the caller's frame.
    data = df.copy()

    data[dimension1] = pd.to_datetime(data[dimension1])
    data[dimension2] = pd.to_datetime(data[dimension2])
    data['new_lifecycles'] = np.where(
        data['lifecycle'].isin(['New Release','Recent release']),'RR','Catalogue')

    # Only TV seasons are considered.
    data = data[data['content'].isin(['Season'])]

    summed = data.groupby([dimension1,'new_lifecycles'])['ops'].sum().reset_index()
    pivoted = pd.pivot_table(summed, values='ops', columns='new_lifecycles',
                             index=dimension1).reset_index()
    return pivoted.fillna(0)

In [13]:
def prep_seasonality_monthly(df):
    """Aggregate daily OPS to monthly totals used for the seasonality factors.

    Calls `make_prep_for_lifecycles` on `transaction_date`, adds a `total`
    (Catalogue + RR), sums everything per (month, year), keeps years from 2015
    on and sorts chronologically.

    The `*_new` columns equal the raw monthly sums except for July 2017, whose
    value is replaced by the mean of January-June 2017.
    NOTE(review): presumably July 2017 is treated as an outlier month - confirm
    the reason for this override.

    Returns
    -------
    pandas.DataFrame
        Columns: month, year, Catalogue, RR, total, total_new, RR_new,
        Catalogue_new.
    """
    prep = make_prep_for_lifecycles(df,"transaction_date","transaction_date")

    prep['total'] = prep['Catalogue'] + prep['RR']
    prep['month'] = prep['transaction_date'].dt.month
    prep['year']  = prep['transaction_date'].dt.year

    # Columns must be selected with a list: the tuple form
    # ['Catalogue','RR','total'] is rejected by pandas >= 2.0.
    t = prep.groupby(['month','year'])[['Catalogue','RR','total']].sum().reset_index()
    t = t[t['year'] >= 2015]

    t = t.sort_values(['year','month'], ascending = [True,True]).reset_index(drop=True)

    is_july_2017 = (t['year'] == 2017) & (t['month'] == 7)
    first_half_2017 = (t['year'] == 2017) & (t['month'] < 7)
    for column in ('total', 'RR', 'Catalogue'):
        t[column + '_new'] = np.where(is_july_2017,
                                      t.loc[first_half_2017, column].mean(),
                                      t[column])

    return t

In [14]:
# Monthly OPS totals per lifecycle class; note this reuses (overwrites) the name `t`
t = prep_seasonality_monthly(new_data_daily)

In [16]:
def combine_seasonality(monthly=None, min_year=2017):
    """Return one seasonality factor per calendar month and lifecycle class.

    For each of `total_new`, `RR_new` and `Catalogue_new` the factor of a month
    is the mean of that month's values divided by the mean over all rows,
    both computed on rows with `year >= min_year`.

    Parameters
    ----------
    monthly : pandas.DataFrame, optional
        Monthly frame with the columns `year`, `month`, `total_new`, `RR_new`
        and `Catalogue_new`. Defaults to the notebook-level frame `t`, which is
        what the original zero-argument call used.
    min_year : int, optional
        First year included in the averages (default 2017, as before).

    Returns
    -------
    pandas.DataFrame
        Columns: month, total, RR, Catalogue.
    """
    if monthly is None:
        # Backward-compatible fallback to the notebook global.
        monthly = t

    recent = monthly[monthly['year'] >= min_year]

    def get_shape(df,dimension):
        # Uses the frame it is given; the original ignored `df` and always
        # read the global `t`.
        monthly_mean = df.groupby("month")[dimension].mean()
        all_mean = df[dimension].mean()
        return (monthly_mean / all_mean).rename(dimension).reset_index()

    l1 = get_shape(recent,"total_new")
    l2 = get_shape(recent,"RR_new")
    l3 = get_shape(recent,"Catalogue_new")

    g1 = pd.merge(left=l1,right=l2,on="month",how='left')
    g2 = pd.merge(left=g1,right=l3,on="month",how="left")

    g2.columns = ['month','total','RR','Catalogue']
    return g2

In [17]:
# Seasonality factors per month (wide format: total / RR / Catalogue)
sindex = combine_seasonality()

In [58]:
# Display the seasonality factors
sindex

Unnamed: 0,month,total,RR,Catalogue
0,1,1.131508,1.07013,1.167832
1,2,0.969689,1.008208,0.946893
2,3,0.956748,0.900552,0.990004
3,4,0.884616,0.829465,0.917254
4,5,0.878059,0.812552,0.916825
5,6,0.800869,0.880618,0.753674
6,7,0.79861,0.778905,0.810272
7,8,1.267936,1.411177,1.183166
8,9,1.158129,1.229705,1.115771
9,10,1.165321,1.261044,1.108673


In [22]:
def get_melt(df):
    """Reshape the seasonality table from wide to long format.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide frame with a `month` column and one column per lifecycle class.

    Returns
    -------
    pandas.DataFrame
        Columns: month, new_lifecycles (the former column name), factor.
    """
    # Melt the frame that was passed in; the original ignored `df` and always
    # melted the global `sindex`.
    long_format = pd.melt(df, id_vars=['month'])
    long_format.columns = ['month','new_lifecycles','factor']
    return long_format

In [23]:
# Long-format factors: one row per (month, lifecycle class)
sindex_melt = get_melt(sindex)

In [24]:
# Last rows of the long-format factor table
sindex_melt.tail()

Unnamed: 0,month,new_lifecycles,factor
31,8,Catalogue,1.183166
32,9,Catalogue,1.115771
33,10,Catalogue,1.108673
34,11,Catalogue,1.014306
35,12,Catalogue,1.136522


## Get ASINs in promo at these price points in the non-promo period

<p> The idea is to take the ASINs that were sold at these price points during the promo period and compare them with the same ASINs in the non-promo period. </p>

In [35]:
def get_asin_at_price_points(df, price_points=(4.99, 9.99, 14.99)):
    """Select the ASINs that were sold at the given price points in the promo.

    An ASIN qualifies if it has at least one row that is a promo ASIN in the
    promo period with `display_price` in `price_points`. All rows of the
    qualifying ASINs are returned (promo and non-promo period), except promo
    ASIN rows in the promo period at any other price.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with `asin`, `is_promo`, `is_promo_asin` and `display_price`.
    price_points : iterable of float, optional
        Promo price points of interest. Defaults to 4.99, 9.99 and 14.99, the
        values that were previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        Independent copy of the selected rows, original index preserved.
    """
    price_points = list(price_points)

    promo_rows = (df['is_promo'] == 'promo_period') & (df['is_promo_asin'] == "promo_asins")
    at_price_point = df['display_price'].isin(price_points)

    qualifying_asins = df.loc[promo_rows & at_price_point, 'asin']

    # Filter with a boolean mask instead of dropping by index label: a label
    # based drop also removes unrelated rows when index labels repeat.
    keep = df['asin'].isin(qualifying_asins) & ~(promo_rows & ~at_price_point)
    return df[keep].copy()

In [36]:
def seasonality_factors(df1,df2):
    """Attach the seasonality factor to every row of `df1`.

    Left-joins `df2` onto `df1` using `month` and `new_lifecycles` as keys, so
    rows of `df1` without a matching factor are kept.
    """
    join_keys = ['month','new_lifecycles']
    return df1.merge(df2, how='left', on=join_keys)

In [37]:
def prep_dataset(df):
    """Build the per-ASIN, per-period overview with raw and deseasonalised metrics.

    Parameters
    ----------
    df : pandas.DataFrame
        Row-level frame with `is_promo`, `asin`, `week_ending`, `OPS`, `units`,
        `factor` and `display_price`. The columns `OPS_deseas` and
        `units_deseas` are added to it in place.

    Returns
    -------
    pandas.DataFrame
        One row per (is_promo, asin) with the summed metrics, the number of
        rows in `week_ending`, the four `*_per_week` averages, and two prices:
        `display_price_x` is the price in that row's period (minimum in the
        promo period, mean in the non-promo period) and `display_price_y` is
        the ASIN's minimum promo-period price.
    """
    # Written in place on purpose: later cells read these two columns from the
    # same frame.
    df['OPS_deseas'] = df['OPS'] / df['factor']
    df['units_deseas'] = df['units'] / df['factor']

    keys = ['is_promo','asin']
    grouped = df.groupby(keys)

    # Row count per ASIN and period; the column keeps the name `week_ending`.
    weeks = grouped['week_ending'].size().reset_index(name='week_ending')
    # List-based column selection (the tuple form fails on pandas >= 2.0).
    key_metrics = grouped[['OPS','OPS_deseas','units','units_deseas']].sum().reset_index()
    m = pd.merge(left=key_metrics,right=weeks,on=keys,how='left')

    for column in ('OPS','OPS_deseas','units','units_deseas'):
        m[column + '_per_week'] = m[column] / m['week_ending']

    # Prices are taken from `df` itself; the original read the notebook global
    # `asin_subset` here, silently ignoring the argument.
    after = (df[df['is_promo']=='promo_period']
             .groupby(["asin",'is_promo'])["display_price"].min().reset_index())
    before = (df[df['is_promo']=='non_promo_period']
              .groupby(["asin",'is_promo'])["display_price"].mean().reset_index())

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    all_prices = pd.concat([after, before], ignore_index=True)
    key_out = pd.merge(left=m,right=all_prices,on=keys,how="left")
    key_out = pd.merge(left=key_out,right=after[['asin','display_price']],on="asin",how="left")

    return key_out

In [38]:
# Rows of the ASINs that were promo ASINs at 4.99 / 9.99 / 14.99 in the promo period
asin_subset = get_asin_at_price_points(df)

In [39]:
# Attach the seasonality factor per (month, lifecycle class) to every row
asin_subset = seasonality_factors(asin_subset,sindex_melt)

In [43]:
# Last row, to check that the `factor` column was merged in
asin_subset[-1:]

Unnamed: 0,week_ending,is_promo,asin,is_promo_asin,content,title_name,studio_name,contract_id,release_date,is_hd,...,season_number,customers,new_customers,OPS,units,cogs,ppm,month,new_lifecycles,factor
32551,2018-03-24,promo_period,B00JKG6M4S,promo_asins,Season,Mr. Selfridge Staffel 1,NBC,UXNB4,2014-08-01,N,...,1.0,1,0,4.19,1,2.94,0.2983,3,Catalogue,0.990004


In [44]:
# Per-ASIN, per-period metrics; also adds OPS_deseas / units_deseas to asin_subset
key_metrics = prep_dataset(asin_subset)

In [46]:
# Mean weekly OPS / units per ASIN, promo vs non-promo period.
# Columns are selected with a list; the tuple form fails on pandas >= 2.0.
key_metrics.groupby("is_promo")[['OPS_per_week','OPS_deseas_per_week',
                                 'units_per_week','units_deseas_per_week']].mean()

Unnamed: 0_level_0,OPS_per_week,OPS_deseas_per_week,units_per_week,units_deseas_per_week
is_promo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
non_promo_period,95.376307,95.149557,6.999844,6.985206
promo_period,83.669155,88.543896,14.566081,15.397548


In [29]:
#key_metrics.groupby("is_promo")['OPS_per_week','units_per_week'].mean().to_csv("UK_overall.csv")

In [59]:
def get_output(df, metric):
    """Compare the mean of `metric` between periods for each promo price point.

    Parameters
    ----------
    df : pandas.DataFrame
        Per-ASIN frame with `is_promo`, `display_price_y` and the `metric`
        column.
    metric : str
        Name of the column to average (e.g. 'OPS_deseas_per_week').

    Returns
    -------
    pandas.DataFrame
        One row per `display_price_y` with the columns `non_promo_period` and
        `promo_period` (mean of `metric`), `change_in_metric`
        = (non_promo - promo) / non_promo, and
        `change_promo_vs_non_promo_in_%` = (promo / non_promo - 1) * 100.
        Note the two change columns have opposite signs by construction.
    """
    # Only the requested metric is averaged, selected with a list: the tuple
    # style selection of several columns fails on pandas >= 2.0.
    averaged = df.groupby(["is_promo",'display_price_y'])[[metric]].mean().reset_index()

    # Passing `values` as a scalar yields flat period columns directly, so no
    # droplevel is needed.
    kk = pd.pivot_table(averaged, index = "display_price_y",
                        columns="is_promo", values = metric)
    kk = kk.reset_index()
    kk['change_in_metric'] = (kk['non_promo_period'] - kk['promo_period']) / kk['non_promo_period']
    kk['change_promo_vs_non_promo_in_%'] = (kk['promo_period'] / kk['non_promo_period']-1)*100
    return kk

## Initialize metrics to break down the dataset by 

In [155]:
# Column used to break the results down further
dimension = "season_number"
# OPS metric; deseasonalised by default
metric1 = "OPS_deseas_per_week"
# To reproduce the old (non-deseasonalised) result, set metric1 = 'OPS_per_week'
# Units metric; deseasonalised by default
metric2 = "units_deseas_per_week"
# To reproduce the old (non-deseasonalised) result, set metric2 = 'units_per_week'

In [156]:
# OPS metric per price point, non-promo vs promo period
output = get_output(key_metrics,metric1)
output

is_promo,display_price_y,non_promo_period,promo_period,change_in_metric,change_promo_vs_non_promo_in_%
0,4.99,68.437069,61.6037,0.099849,-9.984895
1,9.99,124.475017,132.553931,-0.064904,6.49039
2,14.99,283.741195,245.247152,0.135666,-13.566604


# Evaluate elasticity of demand
<p> E = (percentage change in quantity) / (percentage change in price) </p>

In [157]:
def get_change_in_price(df, metric, ops_metric=None):
    """Compute the average price change and the elasticity per price point.

    Parameters
    ----------
    df : pandas.DataFrame
        Per-ASIN frame with `is_promo`, `display_price_x`, `display_price_y`
        and the metric columns. A `per_change_price` column is added to it in
        place.
    metric : str
        Quantity metric used for the elasticity (e.g. 'units_deseas_per_week').
    ops_metric : str, optional
        OPS metric shown next to it. Defaults to the notebook-level `metric1`,
        which is what the original always used.

    Returns
    -------
    pandas.DataFrame
        One row per price point. The column labels are fixed
        ('..._units', '..._ops') regardless of the metric names passed in.
    """
    if ops_metric is None:
        # Backward-compatible fallback to the notebook global.
        ops_metric = metric1

    # Relative price reduction: (period price - promo price) / period price.
    df['per_change_price'] = (df['display_price_x']
                                   - df['display_price_y']) / df['display_price_x']
    k = (df[df['is_promo'] == "non_promo_period"]
         .groupby("display_price_y")['per_change_price'].mean().reset_index())

    f = get_output(df, metric)
    final = pd.merge(left=f,right=k,on="display_price_y",how="left")
    final['elasticity'] = final['change_in_metric'] / final['per_change_price']

    # The OPS comparison is computed from the same frame as the quantity
    # comparison; the original read the global `key_metrics` here.
    mp = pd.merge(left=final, right=get_output(df, ops_metric),
                  on="display_price_y", how = "left")
    mp.columns = ['price_point','avg_non_period_units','avg_promo_units',
                 'change_in_units','change_in_units_in_percentages',
                 'change_in_price','elasticity','non_promo_ops',
                 'promo_ops','change_in_ops','change_in_ops_in_percentages']
    return mp

In [158]:
# Elasticity per price point, based on the units metric
final = get_change_in_price(key_metrics,metric2)
final

Unnamed: 0,price_point,avg_non_period_units,avg_promo_units,change_in_units,change_in_units_in_percentages,change_in_price,elasticity,non_promo_ops,promo_ops,change_in_ops,change_in_ops_in_percentages
0,4.99,5.842412,14.80367,-1.533829,153.382875,0.646288,-2.373291,68.437069,61.6037,0.099849,-9.984895
1,9.99,8.638517,15.977265,-0.849538,84.953787,0.41898,-2.027631,124.475017,132.553931,-0.064904,6.49039
2,14.99,13.34999,20.568845,-0.540739,54.07386,0.338411,-1.597874,283.741195,245.247152,0.135666,-13.566604


In [36]:
#final.to_csv("output_UK.csv")

# Add dimensions to the analysis

## Here add complexity to get new dimensions

In [159]:
def prep_dataset_dimension(df, dimension):
    """Per-ASIN, per-period overview with one additional breakdown column.

    Same aggregation as `prep_dataset`, plus the per-ASIN maximum of
    `dimension`.

    Parameters
    ----------
    df : pandas.DataFrame
        Row-level frame with `is_promo`, `asin`, `week_ending`, `OPS`, `units`,
        `display_price` and `dimension`. If `OPS_deseas` / `units_deseas` are
        missing they are derived from `factor` on a local copy.
    dimension : str
        Column to carry along (e.g. 'season_number').

    Returns
    -------
    pandas.DataFrame
        One row per (is_promo, asin) with summed metrics, the row count in
        `week_ending`, the `*_per_week` averages, `dimension`,
        `display_price_x` (price in that row's period) and `display_price_y`
        (minimum promo-period price of the ASIN).
    """
    # Do not depend on prep_dataset having been run first on the same frame.
    if not {'OPS_deseas','units_deseas'}.issubset(df.columns):
        df = df.assign(OPS_deseas=df['OPS'] / df['factor'],
                       units_deseas=df['units'] / df['factor'])

    keys = ['is_promo','asin']
    grouped = df.groupby(keys)

    weeks = grouped['week_ending'].size().reset_index(name='week_ending')
    # List-based column selection (the tuple form fails on pandas >= 2.0).
    key_metrics = grouped[['OPS','OPS_deseas','units','units_deseas']].sum().reset_index()
    m = pd.merge(left=key_metrics,right=weeks,on=keys,how='left')

    for column in ('OPS','OPS_deseas','units','units_deseas'):
        m[column + '_per_week'] = m[column] / m['week_ending']

    # Everything below reads from `df`; the original used the notebook global
    # `asin_subset` instead of its argument.
    lifecycles = df.groupby(['asin'])[dimension].max().reset_index()
    ops = pd.merge(left=m,right=lifecycles,on="asin",how = "left")

    after = (df[df['is_promo']=='promo_period']
             .groupby(["asin",'is_promo'])["display_price"].min().reset_index())
    before = (df[df['is_promo']=='non_promo_period']
              .groupby(["asin",'is_promo'])["display_price"].mean().reset_index())

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    all_prices = pd.concat([after, before], ignore_index=True)
    key_out = pd.merge(left=ops,right=all_prices,on=keys,how="left")
    key_out = pd.merge(left=key_out,right=after[['asin','display_price']],
                       on="asin",how="left")

    return key_out

In [160]:
def get_output_dimension(df, metric, dimension):
    """Compare the mean of `metric` between periods per price point and `dimension`.

    Parameters
    ----------
    df : pandas.DataFrame
        Per-ASIN frame with `is_promo`, `display_price_y`, `dimension` and the
        `metric` column.
    metric : str
        Name of the column to average.
    dimension : str
        Additional grouping column (e.g. 'season_number').

    Returns
    -------
    pandas.DataFrame
        One row per (display_price_y, dimension) with `non_promo_period`,
        `promo_period`, `change_in_metric` = (non_promo - promo) / non_promo
        and `change_promo_vs_non_promo_in_%` = (promo / non_promo - 1) * 100.
    """
    # Only the requested metric is averaged, selected with a list: the tuple
    # style selection of several columns fails on pandas >= 2.0.
    averaged = (df.groupby(["is_promo",'display_price_y',dimension])[[metric]]
                .mean().reset_index())

    # A scalar `values` gives flat period columns, so no droplevel is needed.
    kk = pd.pivot_table(averaged, index = ["display_price_y",dimension],
                        columns="is_promo", values = metric)
    kk = kk.reset_index()
    kk['change_in_metric'] = (kk['non_promo_period'] - kk['promo_period']) / kk['non_promo_period']
    kk['change_promo_vs_non_promo_in_%'] = (kk['promo_period'] / kk['non_promo_period']-1)*100
    return kk

In [161]:
def get_change_in_price_dimension(df, metric, dimension):
    """Elasticity table per price point and `dimension`.

    Adds `per_change_price` to `df` in place, averages it over the non-promo
    rows per (display_price_y, dimension), joins that onto the output of
    `get_output_dimension` and divides `change_in_metric` by it to obtain
    `elasticity`.
    """
    join_keys = ["display_price_y", dimension]

    period_price = df['display_price_x']
    df['per_change_price'] = (period_price - df['display_price_y']) / period_price

    non_promo_rows = df[df['is_promo'] == "non_promo_period"]
    avg_price_change = non_promo_rows.groupby(join_keys)['per_change_price'].mean().reset_index()

    by_dimension = get_output_dimension(df, metric, dimension)

    final = by_dimension.merge(avg_price_change, on=join_keys, how="left")
    final['elasticity'] = final['change_in_metric'] / final['per_change_price']
    return final

In [162]:
# Size of the working subset: rows/columns and number of distinct ASINs
print("ASIN subset shape: ", asin_subset.shape)
print("Number of unique ASINs in the dataset: ", asin_subset['asin'].nunique())

ASIN subset shape:  (32552, 24)
Number of unique ASINs in the dataset:  2594


# Run by dimensions

In [163]:
# Per-ASIN metrics with the breakdown column (`dimension`) attached
key_metrics_l = prep_dataset_dimension(asin_subset,dimension)
key_metrics_l.head()

Unnamed: 0,is_promo,asin,OPS,OPS_deseas,units,units_deseas,week_ending,OPS_per_week,OPS_deseas_per_week,units_per_week,units_deseas_per_week,season_number,display_price_x,display_price_y
0,non_promo_period,B00ERIF4RK,4138.37,4111.650261,314,306.660684,10,413.837,411.165026,31.4,30.666068,2.0,17.64,4.99
1,non_promo_period,B00ERIF6S2,144.43,141.407473,7,6.947015,6,24.071667,23.567912,1.166667,1.157836,3.0,23.315,4.99
2,non_promo_period,B00ERIFUK6,43.64,46.050855,5,5.269075,3,14.546667,15.350285,1.666667,1.756358,1.0,10.656667,4.99
3,non_promo_period,B00ERIG3Z2,45.35,44.773251,3,2.92247,3,15.116667,14.924417,1.0,0.974157,3.0,17.986667,4.99
4,non_promo_period,B00ERIHD1K,237.18,239.638614,25,25.042681,8,29.6475,29.954827,3.125,3.130335,1.0,11.485,4.99


In [164]:
# Units metric per price point and dimension value, non-promo vs promo period
outputl = get_output_dimension(key_metrics_l,metric2,dimension)
outputl.head()

is_promo,display_price_y,season_number,non_promo_period,promo_period,change_in_metric,change_promo_vs_non_promo_in_%
0,4.99,0.0,6.388617,21.851902,-2.420444,242.044352
1,4.99,1.0,7.24065,17.672853,-1.440783,144.078266
2,4.99,2.0,5.020069,12.535812,-1.497139,149.713908
3,4.99,3.0,5.335762,14.83756,-1.780776,178.077632
4,4.99,4.0,5.496348,13.408933,-1.439608,143.960776


In [165]:
# Elasticity per price point and dimension value, based on the units metric.
# Note: this call also adds a `per_change_price` column to key_metrics_l.

final_d = get_change_in_price_dimension(key_metrics_l,metric2,dimension)
#final_d[final_d['season_number'].isin([1.0,2.0,3.0,4.0])]


## Add on to the table

<p> 
For DE, according to the data here, total promo OPS was 2.089m. This figure excludes all ASINs that were not sold at the price points of 4.99, 9.99 or 14.99, yet it still represents over 90% of total OPS in the promo. Below, I break the revenue down by dimension, either by content age or by season number. The corresponding share in the UK is similar. </p>

In [166]:
# Total promo-period OPS of the selected ASINs
key_metrics_l[key_metrics_l['is_promo'] == 'promo_period']["OPS"].sum()

2089362.5800000001

In [167]:
# Promo-period rows only (one row per ASIN), with a shape / OPS check
in_promo = key_metrics_l[key_metrics_l['is_promo'] == 'promo_period']
print(in_promo.shape, in_promo['OPS'].sum())
in_promo.head()

(2594, 15) 2089362.58


Unnamed: 0,is_promo,asin,OPS,OPS_deseas,units,units_deseas,week_ending,OPS_per_week,OPS_deseas_per_week,units_per_week,units_deseas_per_week,season_number,display_price_x,display_price_y,per_change_price
2315,promo_period,B00ERIF4RK,5862.71,6174.766511,1421,1496.264565,10,586.271,617.476651,142.1,149.626456,2.0,4.99,4.99,0.0
2316,promo_period,B00ERIF6S2,163.33,171.698522,41,43.099418,10,16.333,17.169852,4.1,4.309942,3.0,4.99,4.99,0.0
2317,promo_period,B00ERIFGY6,4.19,4.232306,1,1.010097,1,4.19,4.232306,1.0,1.010097,3.0,4.99,4.99,0.0
2318,promo_period,B00ERIFUK6,35.59,37.965273,9,9.572573,7,5.084286,5.42361,1.285714,1.36751,1.0,4.99,4.99,0.0
2319,promo_period,B00ERIG3Z2,87.96,91.533266,22,22.863551,9,9.773333,10.170363,2.444444,2.540395,3.0,4.99,4.99,0.0


In [168]:
def get_asin_by_dimension(df,dimension):
    """Summarise rows per promo price point and `dimension`, with shares.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with `display_price_y`, `dimension`, `OPS`, `OPS_deseas`,
        `units` and `units_deseas`.
    dimension : str
        Breakdown column (e.g. 'season_number').

    Returns
    -------
    pandas.DataFrame
        One row per (display_price_y, dimension) with the summed `ops`,
        `ops_deseas`, `units`, `units_deseas`, the number of rows in `size`,
        the per-price-point totals `total_ops` / `total_units`, and
        `share_ops` / `share_units`. Totals and shares are based on the
        deseasonalised columns.
    """
    keys = ['display_price_y', dimension]
    grouped = df.groupby(keys)

    counts = grouped.size().reset_index(name='size')

    # List-based column selection (the tuple form fails on pandas >= 2.0).
    sums = grouped[['OPS','OPS_deseas','units','units_deseas']].sum().reset_index()
    sums.columns = ['display_price_y', dimension, "ops",'ops_deseas', 'units','units_deseas']

    t = pd.merge(left=sums,right=counts,on=keys,how='left')

    # Totals per price point use the deseasonalised metrics; switch to
    # 'ops' / 'units' here and in the shares below for the raw version.
    total = t.groupby("display_price_y")[['ops_deseas','units_deseas']].sum().reset_index()
    total.columns = ['display_price_y','total_ops', 'total_units']
    t = pd.merge(left=t,right=total,on="display_price_y",how="left")

    t['share_ops'] = t['ops_deseas'] / t['total_ops']
    t['share_units'] = t['units_deseas'] / t['total_units']

    return t

In [169]:
# Promo-period totals and shares per price point and dimension value
asin_dim = get_asin_by_dimension(in_promo,dimension)

In [170]:
# Deseasonalised OPS at the 4.99 price point, then the rows behind that total
print(asin_dim[asin_dim['display_price_y'] == 4.99]['ops_deseas'].sum())
asin_dim[asin_dim['display_price_y'] == 4.99]

1112686.46594


Unnamed: 0,display_price_y,season_number,ops,ops_deseas,units,units_deseas,size,total_ops,total_units,share_ops,share_units
0,4.99,0.0,2062.86,2168.547235,492,517.441517,2,1112686.0,266240.145901,0.001949,0.001944
1,4.99,1.0,430263.55,454433.947422,101011,106562.744635,616,1112686.0,266240.145901,0.408411,0.40025
2,4.99,2.0,155028.24,163812.021517,37508,39627.604128,317,1112686.0,266240.145901,0.147222,0.148842
3,4.99,3.0,122975.6,129516.387741,28659,30144.42203,211,1112686.0,266240.145901,0.1164,0.113223
4,4.99,4.0,72822.27,76722.320676,17848,18800.22475,144,1112686.0,266240.145901,0.068952,0.070614
5,4.99,5.0,54817.78,57752.534305,13131,13820.869519,94,1112686.0,266240.145901,0.051904,0.051911
6,4.99,6.0,32268.37,34003.589296,7894,8318.252086,61,1112686.0,266240.145901,0.03056,0.031243
7,4.99,7.0,30908.85,32749.627886,7479,7919.548812,57,1112686.0,266240.145901,0.029433,0.029746
8,4.99,8.0,17453.69,18493.08176,4297,4554.250282,35,1112686.0,266240.145901,0.01662,0.017106
9,4.99,9.0,5410.96,5712.660866,1318,1391.444352,21,1112686.0,266240.145901,0.005134,0.005226


In [171]:
# Elasticity rows for the 9.99 price point
final_d[final_d['display_price_y'] == 9.99]

Unnamed: 0,display_price_y,season_number,non_promo_period,promo_period,change_in_metric,change_promo_vs_non_promo_in_%,per_change_price,elasticity
46,9.99,1.0,5.546899,9.738765,-0.755713,75.571331,0.40029,-1.887915
47,9.99,2.0,7.587284,14.654241,-0.931421,93.14213,0.400293,-2.32685
48,9.99,3.0,6.756539,13.779832,-1.039481,103.948081,0.447446,-2.323143
49,9.99,4.0,9.42576,19.778115,-1.098304,109.830448,0.415918,-2.640678
50,9.99,5.0,17.280936,26.837017,-0.552984,55.298403,0.422508,-1.308813
51,9.99,6.0,14.012053,24.328834,-0.736279,73.627908,0.412226,-1.786107
52,9.99,7.0,7.296733,11.56466,-0.584909,58.490917,0.459114,-1.273995
53,9.99,8.0,6.650895,21.045495,-2.16431,216.430991,0.481875,-4.491435
54,9.99,9.0,2.831641,5.509467,-0.94568,94.567963,0.372229,-2.540582
55,9.99,10.0,2.859255,4.693765,-0.641604,64.160408,0.343706,-1.866725


In [172]:
def final_join(d1,d2,dimension):
    """Left-join `d2` onto `d1` on `display_price_y` and `dimension`.

    Every row of `d1` is kept; columns of `d2` are appended where the key
    pair matches.
    """
    join_keys = ['display_price_y', dimension]
    return d1.merge(d2, how='left', on=join_keys)

In [173]:
# Elasticity table joined with the promo-period totals and shares
fff = final_join(final_d,asin_dim,dimension)

In [174]:
# First rows of the joined table
fff.head()

Unnamed: 0,display_price_y,season_number,non_promo_period,promo_period,change_in_metric,change_promo_vs_non_promo_in_%,per_change_price,elasticity,ops,ops_deseas,units,units_deseas,size,total_ops,total_units,share_ops,share_units
0,4.99,0.0,6.388617,21.851902,-2.420444,242.044352,0.637775,-3.795135,2062.86,2168.547235,492,517.441517,2,1112686.0,266240.145901,0.001949,0.001944
1,4.99,1.0,7.24065,17.672853,-1.440783,144.078266,0.604733,-2.382512,430263.55,454433.947422,101011,106562.744635,616,1112686.0,266240.145901,0.408411,0.40025
2,4.99,2.0,5.020069,12.535812,-1.497139,149.713908,0.654365,-2.287925,155028.24,163812.021517,37508,39627.604128,317,1112686.0,266240.145901,0.147222,0.148842
3,4.99,3.0,5.335762,14.83756,-1.780776,178.077632,0.667272,-2.668743,122975.6,129516.387741,28659,30144.42203,211,1112686.0,266240.145901,0.1164,0.113223
4,4.99,4.0,5.496348,13.408933,-1.439608,143.960776,0.678733,-2.121023,72822.27,76722.320676,17848,18800.22475,144,1112686.0,266240.145901,0.068952,0.070614


## Add change in the OPS to the table

In [175]:
def stack_ops_to_table(df, metric, dimension):
    """Period comparison of an OPS metric per price point and `dimension`.

    Parameters
    ----------
    df : pandas.DataFrame
        Per-ASIN frame accepted by `get_output_dimension`.
    metric : str
        OPS metric to compare (e.g. 'OPS_deseas_per_week').
    dimension : str
        Breakdown column.

    Returns
    -------
    pandas.DataFrame
        Output of `get_output_dimension` with the period columns renamed to
        `non_promo_OPS` and `promo_OPS`.
    """
    # Use the frame that was passed in; the original ignored `df` and always
    # read the global `key_metrics_l`.
    ops_change = get_output_dimension(df,metric,dimension)
    # rename() returns a new frame; the original discarded it, so the columns
    # were never renamed. Only the columns are renamed (the original also
    # passed index=str, which would have turned row labels into strings).
    return ops_change.rename(columns={"non_promo_period": "non_promo_OPS",
                                      "promo_period": "promo_OPS"})

In [176]:
# OPS metric comparison per price point and dimension value
ops = stack_ops_to_table(key_metrics_l, metric1, dimension)

In [177]:
def just_merge(df1,df2, dimension):
    """Left-join the OPS comparison onto the units table and relabel the columns.

    The join keys are `display_price_y` and `dimension`. The merged frame's
    columns are then renamed by position, so the result must have exactly 21
    columns in the order listed below.
    """
    # Reader-friendly labels, applied positionally to the merged frame.
    readable_names = [
        'display_price', dimension,
        'non_promo_units', 'promo_units',
        'change_in_units', 'percentage_change_units',
        'change_price_ratio', 'elasticity',
        'ops', 'ops_deseas', 'units', 'units_deseas', 'size',
        'total_ops', 'total_units', 'share_ops', 'share_units',
        'non_promo_ops', 'promo_ops', 'change_ops', 'percentage_change_ops',
    ]

    join_keys = ['display_price_y', dimension]
    final_output = df1.merge(df2, how='left', on=join_keys)
    final_output.columns = readable_names
    return final_output

In [178]:
# Final table: units elasticity, promo totals/shares and OPS comparison side by side
rff = just_merge(fff,ops, dimension)

In [179]:
# Shape of the final table and its rows for the 9.99 price point
print(rff.shape)
rff[rff['display_price'] == 9.99]

(90, 21)


Unnamed: 0,display_price,season_number,non_promo_units,promo_units,change_in_units,percentage_change_units,change_price_ratio,elasticity,ops,ops_deseas,...,units_deseas,size,total_ops,total_units,share_ops,share_units,non_promo_ops,promo_ops,change_ops,percentage_change_ops
46,9.99,1.0,5.546899,9.738765,-0.755713,75.571331,0.40029,-1.887915,109946.55,118001.863839,...,14193.005043,158,809288.994222,97199.659982,0.145809,0.146019,72.12218,80.410732,-0.114924,11.492375
47,9.99,2.0,7.587284,14.654241,-0.931421,93.14213,0.400293,-2.32685,130207.19,137573.797859,...,16202.792484,113,809288.994222,97199.659982,0.169993,0.166696,101.373821,123.882884,-0.22204,22.20402
48,9.99,3.0,6.756539,13.779832,-1.039481,103.948081,0.447446,-2.323143,81136.5,86064.237695,...,10429.93877,79,809288.994222,97199.659982,0.106345,0.107304,103.334552,113.52819,-0.098647,9.864694
49,9.99,4.0,9.42576,19.778115,-1.098304,109.830448,0.415918,-2.640678,136629.4,147823.269457,...,17587.031443,88,809288.994222,97199.659982,0.182658,0.180937,132.126983,165.677332,-0.253925,25.392503
50,9.99,5.0,17.280936,26.837017,-0.552984,55.298403,0.422508,-1.308813,104709.88,111695.118836,...,13291.267361,50,809288.994222,97199.659982,0.138016,0.136742,266.319339,224.622562,0.156567,-15.656684
51,9.99,6.0,14.012053,24.328834,-0.736279,73.627908,0.412226,-1.786107,70086.9,73820.896615,...,8770.526739,36,809288.994222,97199.659982,0.091217,0.090232,207.948511,203.647158,0.020685,-2.06847
52,9.99,7.0,7.296733,11.56466,-0.584909,58.490917,0.459114,-1.273995,19140.35,20160.502846,...,2465.619635,22,809288.994222,97199.659982,0.024911,0.025367,115.483814,94.635043,0.180534,-18.053413
53,9.99,8.0,6.650895,21.045495,-2.16431,216.430991,0.481875,-4.491435,15903.76,16790.562772,...,2060.807071,10,809288.994222,97199.659982,0.020747,0.021202,104.899834,171.246911,-0.63248,63.248029
54,9.99,9.0,2.831641,5.509467,-0.94568,94.567963,0.372229,-2.540582,1209.37,1275.434925,...,152.886906,3,809288.994222,97199.659982,0.001576,0.001573,35.838658,45.95753,-0.282345,28.234518
55,9.99,10.0,2.859255,4.693765,-0.641604,64.160408,0.343706,-1.866725,661.89,697.354168,...,84.260796,2,809288.994222,97199.659982,0.000862,0.000867,31.840296,38.88384,-0.221215,22.121477


In [153]:
#Sanity check the directions

# a = rff[rff['display_price'] == 4.99]['share_ops']
# b = rff[rff['display_price'] == 4.99]['percentage_change_ops']
#np.dot(a,b)

In [154]:
#Sanity check shares of total ops and units
#rff[rff['display_price'] == 9.99]['share_units'].sum()

In [None]:
#rff.to_csv("UK_table_2.csv")

# Sales distribution

In [None]:
# print(df.shape)
# df.head()

In [None]:
# df[(df['is_promo'] == 'promo_period') & (df['is_promo_asin'] == 'promo_asins')]['OPS'].sum()

In [None]:
# def sales_dist(df,dimension):
#     sales_dist = pd.DataFrame(df[(df['is_promo'] == 'promo_period') & (df['is_promo_asin'] == 'promo_asins')].groupby(dimension)['OPS'].sum().reset_index())
#     sales_dist['share'] = sales_dist['OPS'] / sales_dist['OPS'].sum()
#     return sales_dist

In [None]:
# sales_dist_df = sales_dist(df,"lifecycle")

In [None]:
# sales_dist_df.head()

# Check ARPU per user for TV Season

In [None]:
# arpu = pd.read_csv("../TV_test/data/ARPU.txt", sep = "\t", 
#                    names = ['total','new','ops','units','cogs','week','marketplace','content'])
# arpu['week'] = pd.to_datetime(arpu['week']) 

In [None]:
# arpu.head()
# arpu['arpu_per_user'] = arpu['ops'] / arpu['total']
# arpu['year_month'] = arpu['week'].map(lambda x: 1000*x.year + x.month)
# arpu['year'] = arpu['week'].map(lambda x: x.year)
# arpu['month'] = arpu['week'].map(lambda x: x.month)

In [None]:
# arpu  = arpu[arpu['content'] == "TV Season"]

In [None]:
# arpu.head()

In [None]:
# arpu.plot(x = "week", y = 'arpu_per_user', figsize = (20,10))
# plt.show()

In [None]:
# arpu[(arpu['week'] >= "2017-01-01") & (arpu['week'] <= '2017-05-01')].plot(x = "week", y = 'arpu_per_user', figsize = (20,10))

In [None]:
# g = pd.DataFrame(arpu[(arpu['week'] >= "2016-01-01")
#               & (arpu['week'] <= '2017-06-01')].groupby(['year_month'])['ops','total'].sum().reset_index())
# g['arpu_per_user'] = g['ops'] / g['total']
# print("Here is the ARPU per user for TV Season in 2017 per month")
# g

## Break down the data by lifecycle

<p> Make the prediction for two categories, new release vs. deep catalogue (new release = 0 to 6 months, everything else is deep catalogue), and for promo vs. non-promo. </p>
<p> Take seasonality into account. </p>