# EDA Vouchers and gifted Subscriptions

## Read in needed tables

In [1]:
import pandas as pd
import numpy as np  
import datetime
from sql_functions import get_engine

In [2]:
schema = 'capstone_filmingo' 
engine = get_engine()

In [3]:
table_name = 'accounts'
df_accounts = pd.read_sql_query(f'select * from {schema}.{table_name}', engine)

In [4]:
table_name = 'vouchers'
df_vouchers = pd.read_sql_query(f'select * from {schema}.{table_name}', engine)


In [5]:
table_name = 'subscriptions'
df_subscriptions = pd.read_sql_query(f'select * from {schema}.{table_name}', engine)


In [6]:
table_name = 'playbacks'
df_playbacks = pd.read_sql_query(f'select * from {schema}.{table_name}', engine)


## Truncating subscription dates

### Truncate subscription start date according to our time period 

In [7]:
# define new EDA table
df_subscriptions_eda = df_subscriptions

In [8]:
# truncating subscription start to 01.10.2020 if subscription started before
df_subscriptions_eda['subscription_start_adj_period'] = df_subscriptions['subscription_start'].apply(lambda x: x if str(x) > '2020-10-01' else '2020-10-01')

In [9]:
#set data types for subscription dates to datetime
df_subscriptions_eda['subscription_start_adj_period'] = pd.to_datetime(df_subscriptions_eda['subscription_start_adj_period'])

In [10]:
#create new column and calculate new subscription duration
df_subscriptions_eda['subscription_months_raw_adj_period'] = ((df_subscriptions_eda.subscription_end) - df_subscriptions_eda.subscription_start_adj_period)/np.timedelta64(1, 'M')

In [11]:
#create new column for rounded subscription months for easier further processing
#generally round up from 0.1 to be able to allow some discrepancies due to day to day calculation of subscription duration (deduct 0.1 to be able to use .ceil)
df_subscriptions_eda['subscription_months_adj_period'] = df_subscriptions_eda['subscription_months_raw_adj_period'] - 0.1
df_subscriptions_eda['subscription_months_adj_period'] = df_subscriptions_eda['subscription_months_adj_period'].apply(np.ceil)

# Vouchers


### First subscription date per account

In [12]:
# make new table for the first subscription date per account
df_account_sub_min_date = df_subscriptions.groupby('account_key')['subscription_start'].min()

In [22]:
# merge with subscriptions EDA and accounts EDA table
df_subscriptions_eda = pd.merge(df_subscriptions_eda, df_account_sub_min_date, on='account_key', how='left')
df_accounts_eda = pd.merge(df_accounts, df_account_sub_min_date, on='account_key', how='left')

In [23]:
# rename columns
df_subscriptions_eda.rename(columns = {'subscription_start_y':'min_subscription_start'}, inplace = True)
df_accounts_eda.rename(columns = {'subscription_start':'min_subscription_start'}, inplace = True)


### First voucher created per account

In [16]:
# make new table for first voucher creation date per account
df_voucher_min_date = df_vouchers.groupby('email_hash_receiver')['creation_date'].min()

In [25]:
# merge with accounts EDA table
df_accounts_eda = pd.merge(df_accounts_eda, df_voucher_min_date, left_on='email_hash', right_on='email_hash_receiver', how='left')

In [27]:
# rename column
df_accounts_eda.rename(columns = {'creation_date':'min_voucher_creation_date'}, inplace = True)


### First One Time Rental (OTR) date per account

In [110]:
# make new table only for OTR playbacks
df_otr_playbacks = df_playbacks.query('subscription_playback == 0')

In [112]:
# make new table for first OTR playback per account
df_otr_playbacks_min = df_otr_playbacks.groupby('account_key')['date_start'].min()

In [113]:
# merge with accounts EDA table
df_accounts_eda = pd.merge(df_accounts_eda, df_otr_playbacks_min, on='account_key', how='left')

In [138]:
# rename columns
df_accounts_eda.rename(columns = {'date_start':'min_otr_playback'}, inplace = True)


## Push to SQL

In [117]:
schema = 'capstone_filmingo' 
engine = get_engine()

In [139]:
# table_name = 'accounts_eda'

# if engine!=None:
#     try:
#         df_accounts_eda.to_sql(name=table_name, # Name of SQL table
#                         con=engine, # Engine or connection
#                         if_exists='replace', # Drop the table before inserting new values 
#                         schema=schema, # Use schema that was defined earlier
#                         index=False, # Write DataFrame index as a column
#                         chunksize=5000, # Specify the number of rows in each batch to be written at a time
#                         method='multi') # Pass multiple values in a single INSERT clause
#         print(f"The {table_name} table was imported successfully.")
#     # Error handling
#     except (Exception, psycopg2.DatabaseError) as error:
#         print(error)
#         engine = None
# else:
#      print('Push did not work')

The accounts_eda table was imported successfully.


In [140]:
# Just to be sure: Check if the number of rows match
table_name_sql = f'''SELECT count(*) 
                    FROM {schema}.{table_name}
                    '''
engine.execute(table_name_sql).fetchall()[0][0] == df_accounts_eda.shape[0]

True

# Gifted Subscriptions

In [120]:
df_subscriptions_eda.head()

Unnamed: 0,subscription_key,account_key,currency,price,subscription_type,subscription_monthly,subscription_start_x,subscription_end,gift_subscription,subscription_months_raw,subscription_months,price_chf,price_eur,total_price_chf,total_price_eur,subscription_start_truncate,subscription_start_adj_period,subscription_months_raw_adj_period,subscription_months_adj_period,min_subscription_start
0,1b82b2308dcae546ad0194d03b9d23edd5d2781109de28...,ab1bcf2c1b8d4eb422079becc0c59b1c7db2fd009235d2...,chf,90.0,BASIC,0,2017-02-24,2021-02-23,False,47.968131,48.0,90.0,75.0,360.0,300.0,2020-10-01,2020-10-01,4.763958,5.0,2017-02-24
1,4cb9f67d44d60eca0485270d73b3028143021296081847...,ee1a45a439dc6968f1cfc2b9840264deeada4c0d93f389...,chf,90.0,BASIC,0,2017-08-21,2023-08-20,False,71.952196,72.0,90.0,75.0,540.0,450.0,2020-10-01,2020-10-01,34.596193,35.0,2017-08-21
2,40f9e218b70a6f482ecfcde0011305d93fa6059f624a50...,8bf8f58e615549df5f05b717b6e914da80e213edab3dba...,chf,9.0,BASIC,1,2017-09-26,2022-10-08,False,60.387277,61.0,9.0,7.5,549.0,457.5,2020-10-01,2020-10-01,24.21405,25.0,2017-09-26
3,69d21e601612cc97fc7df9bd17a281879f785358d1c681...,cf914aa939499e8e18fdd9494fce5297b9dce2e1f909ea...,chf,90.0,BASIC,0,2017-09-28,2021-09-27,False,47.968131,48.0,90.0,75.0,360.0,300.0,2020-10-01,2020-10-01,11.860613,12.0,2017-09-28
4,29f98a3154a679f2fe953b1f3b148ddffcb28e60fff719...,8759ab0e44c1cf688770220cf1b87efac2726baca6bfcb...,eur,11.0,STANDARD,1,2017-10-13,2021-10-12,False,47.968131,48.0,15.0,12.5,720.0,600.0,2020-10-01,2020-10-01,12.353436,13.0,2017-10-13


In [121]:
# make tables with only gifted/paid subs
gifted_df_subscriptions = df_subscriptions.query('gift_subscription == True')
paid_df_subscriptions = df_subscriptions.query('gift_subscription == False')

In [122]:
# make tables with minimum start date of gifted/paid subscriptions
gifted_sub_min_date = gifted_df_subscriptions.groupby('account_key')['subscription_start'].min()
paid_sub_min_date = paid_df_subscriptions.groupby('account_key')['subscription_start'].min()

In [123]:
# merge into a new subscription EDA dataframe
df_subscriptions_eda = pd.merge(df_subscriptions_eda, gifted_sub_min_date, on='account_key', how='left')

In [124]:
# merge into a new subscription EDA dataframe
df_subscriptions_eda = pd.merge(df_subscriptions_eda, paid_sub_min_date, on='account_key', how='left')

  df_subscriptions_eda = pd.merge(df_subscriptions_eda, paid_sub_min_date, on='account_key', how='left')


In [125]:
df_subscriptions_eda.rename(columns = {'subscription_start_x':'subscription_start', 'subscription_start_y':'min_gifted_subscription', 'subscription_start':'min_paid_subscription'}, inplace = True)


## Push to SQL

In [126]:
schema = 'capstone_filmingo' 
engine = get_engine()

In [132]:
# table_name = 'subscriptions_eda'

# if engine!=None:
#     try:
#         df_subscriptions_eda.to_sql(name=table_name, # Name of SQL table
#                         con=engine, # Engine or connection
#                         if_exists='replace', # Drop the table before inserting new values 
#                         schema=schema, # Use schema that was defined earlier
#                         index=False, # Write DataFrame index as a column
#                         chunksize=5000, # Specify the number of rows in each batch to be written at a time
#                         method='multi') # Pass multiple values in a single INSERT clause
#         print(f"The {table_name} table was imported successfully.")
#     # Error handling
#     except (Exception, psycopg2.DatabaseError) as error:
#         print(error)
#         engine = None
# else:
#      print('Push did not work')

The subscriptions_eda table was imported successfully.


In [133]:
# Just to be sure: Check if the number of rows match
table_name_sql = f'''SELECT count(*) 
                    FROM {schema}.{table_name}
                    '''
engine.execute(table_name_sql).fetchall()[0][0] == df_subscriptions_eda.shape[0]

True

## tests

In [129]:
df_subscriptions_eda.head()

Unnamed: 0,subscription_key,account_key,currency,price,subscription_type,subscription_monthly,subscription_start,subscription_end,gift_subscription,subscription_months_raw,...,price_eur,total_price_chf,total_price_eur,subscription_start_truncate,subscription_start_adj_period,subscription_months_raw_adj_period,subscription_months_adj_period,min_subscription_start,subscription_start.1,min_gifted_subscription
0,1b82b2308dcae546ad0194d03b9d23edd5d2781109de28...,ab1bcf2c1b8d4eb422079becc0c59b1c7db2fd009235d2...,chf,90.0,BASIC,0,2017-02-24,2021-02-23,False,47.968131,...,75.0,360.0,300.0,2020-10-01,2020-10-01,4.763958,5.0,2017-02-24,NaT,2017-02-24
1,4cb9f67d44d60eca0485270d73b3028143021296081847...,ee1a45a439dc6968f1cfc2b9840264deeada4c0d93f389...,chf,90.0,BASIC,0,2017-08-21,2023-08-20,False,71.952196,...,75.0,540.0,450.0,2020-10-01,2020-10-01,34.596193,35.0,2017-08-21,NaT,2017-08-21
2,40f9e218b70a6f482ecfcde0011305d93fa6059f624a50...,8bf8f58e615549df5f05b717b6e914da80e213edab3dba...,chf,9.0,BASIC,1,2017-09-26,2022-10-08,False,60.387277,...,7.5,549.0,457.5,2020-10-01,2020-10-01,24.21405,25.0,2017-09-26,NaT,2017-09-26
3,69d21e601612cc97fc7df9bd17a281879f785358d1c681...,cf914aa939499e8e18fdd9494fce5297b9dce2e1f909ea...,chf,90.0,BASIC,0,2017-09-28,2021-09-27,False,47.968131,...,75.0,360.0,300.0,2020-10-01,2020-10-01,11.860613,12.0,2017-09-28,NaT,2017-09-28
4,29f98a3154a679f2fe953b1f3b148ddffcb28e60fff719...,8759ab0e44c1cf688770220cf1b87efac2726baca6bfcb...,eur,11.0,STANDARD,1,2017-10-13,2021-10-12,False,47.968131,...,12.5,720.0,600.0,2020-10-01,2020-10-01,12.353436,13.0,2017-10-13,NaT,2017-10-13


In [130]:
df_subscriptions_eda.query("account_key == '1c1b404383a521430af05c648cf1b6e76dda59cb26218db8d32f92e188b49516'")

Unnamed: 0,subscription_key,account_key,currency,price,subscription_type,subscription_monthly,subscription_start,subscription_end,gift_subscription,subscription_months_raw,...,price_eur,total_price_chf,total_price_eur,subscription_start_truncate,subscription_start_adj_period,subscription_months_raw_adj_period,subscription_months_adj_period,min_subscription_start,subscription_start.1,min_gifted_subscription
5649,c67a2878182d37de8a0c84261fe72607025aaedd5ecb1e...,1c1b404383a521430af05c648cf1b6e76dda59cb26218d...,chf,,BASIC,0,2020-12-27,2021-06-26,True,5.946734,...,75.0,49.0,41.0,2020-12-27 00:00:00,2020-12-27,5.946734,6.0,2020-12-27,2020-12-27,2021-12-27
9892,80a0030f578a70bf5ca9b65027a6dc956dc96221d5696e...,1c1b404383a521430af05c648cf1b6e76dda59cb26218d...,chf,9.0,BASIC,1,2021-12-27,2022-01-23,False,0.887082,...,7.5,9.0,7.5,2021-12-27 00:00:00,2021-12-27,0.887082,1.0,2020-12-27,2020-12-27,2021-12-27
10265,9bafc28c684246a7e1c3f8c6e8422ceb08fc8cba878d73...,1c1b404383a521430af05c648cf1b6e76dda59cb26218d...,chf,90.0,BASIC,0,2022-01-23,2023-01-22,False,11.959178,...,75.0,90.0,75.0,2022-01-23 00:00:00,2022-01-23,11.959178,12.0,2020-12-27,2020-12-27,2021-12-27
