# EDA vouchers and gifted subscriptions

## Read in tables

In [40]:
import pandas as pd
import numpy as np  
import datetime
from sql_functions import get_engine

In [41]:
# engine used for all queries below, and the database schema the tables are read from
engine = get_engine()
schema = 'capstone_filmingo'

In [42]:
# load the complete accounts table into a DataFrame
table_name = 'accounts'
df_accounts = pd.read_sql_query(
    sql=f'select * from {schema}.{table_name}',
    con=engine,
)

In [43]:
# load the complete vouchers table into a DataFrame
table_name = 'vouchers'
df_vouchers = pd.read_sql_query(
    sql=f'select * from {schema}.{table_name}',
    con=engine,
)


In [44]:
# load the complete subscriptions table into a DataFrame
table_name = 'subscriptions'
df_subscriptions = pd.read_sql_query(
    sql=f'select * from {schema}.{table_name}',
    con=engine,
)


In [45]:
# load the complete playbacks table into a DataFrame
table_name = 'playbacks'
df_playbacks = pd.read_sql_query(
    sql=f'select * from {schema}.{table_name}',
    con=engine,
)


## Truncating subscription start dates

### Truncate subscription start dates to the start of our analysis period (01.10.2020)

In [46]:
# define new EDA table as an independent copy of the raw subscriptions table;
# a plain assignment would only create a second name for the same DataFrame, so every
# column added to the EDA table below would also be added to df_subscriptions
df_subscriptions_eda = df_subscriptions.copy()

In [47]:
# truncating subscription start to 01.10.2020 if subscription started before
# compare real datetimes instead of their string representation: the result then does not
# depend on how a date happens to be printed, and the new column holds datetimes only
# instead of a mix of date objects and the string '2020-10-01'
period_start = pd.Timestamp('2020-10-01')
df_subscriptions_eda['subscription_start_adj_period'] = pd.to_datetime(df_subscriptions['subscription_start']).clip(lower=period_start)

In [48]:
# make sure the adjusted subscription start is stored with a datetime data type
df_subscriptions_eda['subscription_start_adj_period'] = pd.to_datetime(
    df_subscriptions_eda.subscription_start_adj_period
)

In [49]:
# create new column and calculate new subscription duration (in fractional months)
# a calendar month has no fixed length, and pandas >= 2.0 refuses np.timedelta64(1, 'M') as a divisor;
# divide by a fixed average month of 30.436875 days (= 2629746 seconds) instead, which is the
# month length the existing values in this notebook are based on (e.g. 1460 days -> 47.968131 months)
average_month = pd.Timedelta(seconds=2629746)
# pd.to_datetime makes the subtraction work even if the end date was read in as plain date objects
df_subscriptions_eda['subscription_months_raw_adj_period'] = (pd.to_datetime(df_subscriptions_eda['subscription_end']) - df_subscriptions_eda['subscription_start_adj_period']) / average_month

In [50]:
# rounded subscription months for easier further processing:
# the raw duration is lowered by 0.1 and then rounded up, i.e. a value is rounded up as soon as it
# exceeds a whole month by more than 0.1 (tolerates small day-to-day discrepancies in the durations)
df_subscriptions_eda['subscription_months_adj_period'] = np.ceil(
    df_subscriptions_eda['subscription_months_raw_adj_period'] - 0.1
)

### Calculate prices per subscription based on adjusted start date

In [83]:
# calculate total price per subscription (price / 12 * subscription months)
df_subscriptions_eda['total_price_chf_adj_period'] = df_subscriptions_eda['price_chf'] / 12 * df_subscriptions_eda['subscription_months_adj_period']
df_subscriptions_eda['total_price_eur_adj_period'] = df_subscriptions_eda['price_eur'] / 12 * df_subscriptions_eda['subscription_months_adj_period']


# conditional calculation for exceptions:

# if the subscription is monthly only calculate price * months
is_monthly = df_subscriptions_eda['subscription_monthly'] == 1
df_subscriptions_eda.loc[is_monthly, 'total_price_chf_adj_period'] = df_subscriptions_eda.loc[is_monthly, 'price_chf'] * df_subscriptions_eda.loc[is_monthly, 'subscription_months_adj_period']
df_subscriptions_eda.loc[is_monthly, 'total_price_eur_adj_period'] = df_subscriptions_eda.loc[is_monthly, 'price_eur'] * df_subscriptions_eda.loc[is_monthly, 'subscription_months_adj_period']

# if the subscription is gifted & 6 months long, a different price is applicable (there is only a 6 month subscription available for gifted subscriptions)
# the fixed prices are assigned as numbers, not as strings: writing '49' / '41' into the numeric
# price columns would turn them into mixed-type columns that can no longer be summed or averaged
is_gifted_six_months = (df_subscriptions_eda['gift_subscription'] == True) & (df_subscriptions_eda['subscription_months_adj_period'] == 6)
df_subscriptions_eda.loc[is_gifted_six_months, 'total_price_chf_adj_period'] = 49.0
df_subscriptions_eda.loc[is_gifted_six_months, 'total_price_eur_adj_period'] = 41.0

# Vouchers


### First subscription date per account

In [51]:
# earliest subscription start per account, as a Series indexed by account_key
df_account_sub_min_date = (
    df_subscriptions
    .groupby('account_key')['subscription_start']
    .agg('min')
)

In [52]:
# attach the first subscription date to every row of the subscriptions EDA table
# and start the accounts EDA table from the raw accounts plus that date
df_subscriptions_eda = df_subscriptions_eda.merge(df_account_sub_min_date, how='left', on='account_key')
df_accounts_eda = df_accounts.merge(df_account_sub_min_date, how='left', on='account_key')

In [53]:
# give the merged-in first subscription date a descriptive name in both tables
# (in the subscriptions table it carries the '_y' suffix added by the merge)
df_subscriptions_eda = df_subscriptions_eda.rename(columns={'subscription_start_y': 'min_subscription_start'})
df_accounts_eda = df_accounts_eda.rename(columns={'subscription_start': 'min_subscription_start'})


In [54]:
# preview the first five rows of the subscriptions EDA table
df_subscriptions_eda.head(n=5)

Unnamed: 0,subscription_key,account_key,currency,price,subscription_type,subscription_monthly,subscription_start_x,subscription_end,gift_subscription,subscription_months_raw,subscription_months,price_chf,price_eur,total_price_chf,total_price_eur,subscription_start_adj_period,subscription_months_raw_adj_period,subscription_months_adj_period,min_subscription_start
0,1b82b2308dcae546ad0194d03b9d23edd5d2781109de28...,ab1bcf2c1b8d4eb422079becc0c59b1c7db2fd009235d2...,chf,90.0,BASIC,0,2017-02-24,2021-02-23,False,47.968131,48.0,90.0,75.0,360.0,300.0,2020-10-01,4.763958,5.0,2017-02-24
1,4cb9f67d44d60eca0485270d73b3028143021296081847...,ee1a45a439dc6968f1cfc2b9840264deeada4c0d93f389...,chf,90.0,BASIC,0,2017-08-21,2023-08-20,False,71.952196,72.0,90.0,75.0,540.0,450.0,2020-10-01,34.596193,35.0,2017-08-21
2,40f9e218b70a6f482ecfcde0011305d93fa6059f624a50...,8bf8f58e615549df5f05b717b6e914da80e213edab3dba...,chf,9.0,BASIC,1,2017-09-26,2022-10-08,False,60.387277,61.0,9.0,7.5,549.0,457.5,2020-10-01,24.21405,25.0,2017-09-26
3,69d21e601612cc97fc7df9bd17a281879f785358d1c681...,cf914aa939499e8e18fdd9494fce5297b9dce2e1f909ea...,chf,90.0,BASIC,0,2017-09-28,2021-09-27,False,47.968131,48.0,90.0,75.0,360.0,300.0,2020-10-01,11.860613,12.0,2017-09-28
4,29f98a3154a679f2fe953b1f3b148ddffcb28e60fff719...,8759ab0e44c1cf688770220cf1b87efac2726baca6bfcb...,eur,11.0,STANDARD,1,2017-10-13,2021-10-12,False,47.968131,48.0,15.0,12.5,720.0,600.0,2020-10-01,12.353436,13.0,2017-10-13


### First voucher created per account

In [55]:
# earliest voucher creation date per receiver, as a Series indexed by email_hash_receiver
df_voucher_min_date = (
    df_vouchers
    .groupby('email_hash_receiver')['creation_date']
    .agg('min')
)

In [56]:
# attach the first voucher date to the accounts EDA table
# (an account's email_hash is matched against the voucher receiver's hash)
df_accounts_eda = df_accounts_eda.merge(
    df_voucher_min_date,
    how='left',
    left_on='email_hash',
    right_on='email_hash_receiver',
)

In [57]:
# give the merged-in voucher date a descriptive name
df_accounts_eda = df_accounts_eda.rename(columns={'creation_date': 'min_voucher_creation_date'})


### First One Time Rental (OTR) date per account

In [58]:
# keep only the playbacks that are not subscription playbacks (i.e. the OTR playbacks)
is_otr_playback = df_playbacks['subscription_playback'] == 0
df_otr_playbacks = df_playbacks[is_otr_playback]

In [59]:
# earliest OTR playback start per account, as a Series indexed by account_key
df_otr_playbacks_min = (
    df_otr_playbacks
    .groupby('account_key')['date_start']
    .agg('min')
)

In [60]:
# attach the first OTR playback date to the accounts EDA table
df_accounts_eda = df_accounts_eda.merge(df_otr_playbacks_min, how='left', on='account_key')

In [61]:
# give the merged-in playback date a descriptive name
df_accounts_eda = df_accounts_eda.rename(columns={'date_start': 'min_otr_playback'})


# Gifted Subscriptions

In [65]:
# split the subscriptions into gifted and paid ones
is_gifted = df_subscriptions['gift_subscription'] == True
is_paid = df_subscriptions['gift_subscription'] == False
gifted_df_subscriptions = df_subscriptions[is_gifted]
paid_df_subscriptions = df_subscriptions[is_paid]

In [66]:
# earliest gifted / paid subscription start per account, each as a Series indexed by account_key
gifted_sub_min_date = gifted_df_subscriptions.groupby('account_key')['subscription_start'].agg('min')
paid_sub_min_date = paid_df_subscriptions.groupby('account_key')['subscription_start'].agg('min')

In [67]:
# add the first gifted subscription date to the subscriptions EDA and accounts EDA data frames
# the subscriptions table holds its own start date as 'subscription_start_x' at this point, so the
# merged-in column keeps the plain name 'subscription_start'; a single rename restores the
# original column name and labels the new column
df_subscriptions_eda = df_subscriptions_eda.merge(gifted_sub_min_date, how='left', on='account_key')
df_subscriptions_eda = df_subscriptions_eda.rename(
    columns={
        'subscription_start_x': 'subscription_start',
        'subscription_start': 'min_gifted_subscription',
    }
)

df_accounts_eda = df_accounts_eda.merge(gifted_sub_min_date, how='left', on='account_key')
df_accounts_eda = df_accounts_eda.rename(columns={'subscription_start': 'min_gifted_subscription'})


In [69]:
# add the first paid subscription date to the subscriptions EDA and accounts EDA data frames
# both sides of the subscriptions merge carry a 'subscription_start' column, so pandas adds the
# '_x' (left) and '_y' (right) suffixes, which are renamed right away
df_subscriptions_eda = df_subscriptions_eda.merge(paid_sub_min_date, how='left', on='account_key')
df_subscriptions_eda = df_subscriptions_eda.rename(
    columns={
        'subscription_start_x': 'subscription_start',
        'subscription_start_y': 'min_paid_subscription',
    }
)

df_accounts_eda = df_accounts_eda.merge(paid_sub_min_date, how='left', on='account_key')
df_accounts_eda = df_accounts_eda.rename(columns={'subscription_start': 'min_paid_subscription'})

In [77]:
# spot check: show the accounts whose account_key contains the given fragment
# (rows without an account_key are treated as no match)
df_accounts_eda.loc[
    df_accounts_eda['account_key'].str.contains('ab1bcf2c1b8d4eb422079becc0c59b1c7db2fd009235d2', na=False)
]

Unnamed: 0,account_key,city_original,language,country_code,email_hash,onetime_rental_count,subscription_count,registration_date,lastlogin_date,postal_code_clean,...,language_code,country_name,region,sub_region,voucher_used,min_subscription_start,min_voucher_creation_date,min_otr_playback,min_gifted_subscription,min_paid_subscription
51,ab1bcf2c1b8d4eb422079becc0c59b1c7db2fd009235d2...,Sils->Maria,de,CH,45f699aaf1e364509ef9606e62ee49dc04e7b4ab25432a...,0,2,2007-06-21,2022-09-17 20:11:05,7514,...,1.0,Switzerland,Europe,Western Europe,,2017-02-24,NaT,NaT,NaT,2017-02-24


## Push EDA Tables to SQL

In [31]:
# engine and target schema for pushing the EDA tables back to the database
engine = get_engine()
schema = 'capstone_filmingo'

In [32]:
# table_name = 'subscriptions_eda'

# if engine!=None:
#     try:
#         df_subscriptions_eda.to_sql(name=table_name, # Name of SQL table
#                         con=engine, # Engine or connection
#                         if_exists='replace', # Drop the table before inserting new values 
#                         schema=schema, # Use schema that was defined earlier
#                         index=False, # Do not write the DataFrame index as a column
#                         chunksize=5000, # Specify the number of rows in each batch to be written at a time
#                         method='multi') # Pass multiple values in a single INSERT clause
#         print(f"The {table_name} table was imported successfully.")
#     # Error handling
#     except (Exception, psycopg2.DatabaseError) as error:
#         print(error)
#         engine = None
# else:
#      print('Push did not work')

In [33]:
# Just to be sure: Check if the number of rows match
# name the pushed table explicitly: the assignment in the push cell is commented out, so
# table_name would otherwise still hold the name of whichever table was handled last and the
# comparison would be made against the wrong table
table_name = 'subscriptions_eda'
table_name_sql = f'''SELECT count(*) 
                    FROM {schema}.{table_name}
                    '''
# read the count through pandas like the other queries in this notebook
# (Engine.execute() was removed in SQLAlchemy 2.0)
int(pd.read_sql_query(table_name_sql, engine).iloc[0, 0]) == df_subscriptions_eda.shape[0]

False

In [None]:
# table_name = 'accounts_eda'

# if engine!=None:
#     try:
#         df_accounts_eda.to_sql(name=table_name, # Name of SQL table
#                         con=engine, # Engine or connection
#                         if_exists='replace', # Drop the table before inserting new values 
#                         schema=schema, # Use schema that was defined earlier
#                         index=False, # Do not write the DataFrame index as a column
#                         chunksize=5000, # Specify the number of rows in each batch to be written at a time
#                         method='multi') # Pass multiple values in a single INSERT clause
#         print(f"The {table_name} table was imported successfully.")
#     # Error handling
#     except (Exception, psycopg2.DatabaseError) as error:
#         print(error)
#         engine = None
# else:
#      print('Push did not work')

In [None]:
# Just to be sure: Check if the number of rows match
# name the pushed table explicitly: the assignment in the push cell is commented out, so
# table_name would otherwise still hold the name of whichever table was handled last and the
# comparison would be made against the wrong table
table_name = 'accounts_eda'
table_name_sql = f'''SELECT count(*) 
                    FROM {schema}.{table_name}
                    '''
# read the count through pandas like the other queries in this notebook
# (Engine.execute() was removed in SQLAlchemy 2.0)
int(pd.read_sql_query(table_name_sql, engine).iloc[0, 0]) == df_accounts_eda.shape[0]

False