# EDA vouchers and gifted subscriptions

## Read in tables

In [40]:
import pandas as pd
import numpy as np  
import datetime
from sql_functions import get_engine

In [41]:
# Database engine from the project helper, plus the schema that holds the raw tables
engine = get_engine()
schema = 'capstone_filmingo'

In [42]:
# Load every row and column of the accounts table
table_name = 'accounts'
df_accounts = pd.read_sql_query(
    sql=f'select * from {schema}.{table_name}',
    con=engine,
)

In [43]:
# Load every row and column of the vouchers table
table_name = 'vouchers'
df_vouchers = pd.read_sql_query(
    sql=f'select * from {schema}.{table_name}',
    con=engine,
)

In [44]:
# Load every row and column of the subscriptions table
table_name = 'subscriptions'
df_subscriptions = pd.read_sql_query(
    sql=f'select * from {schema}.{table_name}',
    con=engine,
)

In [45]:
# Load every row and column of the playbacks table
table_name = 'playbacks'
df_playbacks = pd.read_sql_query(
    sql=f'select * from {schema}.{table_name}',
    con=engine,
)

## Truncating subscription start dates

### Truncate subscription start date according to our time period (as of 01.10.2020)

In [46]:
# Define new EDA table.
# Take an independent copy: a plain assignment would only create a second name for
# df_subscriptions, so every column added to the EDA table would also be written
# into the raw subscriptions table.
df_subscriptions_eda = df_subscriptions.copy()

In [47]:
# Truncating subscription start to 01.10.2020 if subscription started before.
# The dates are parsed first and clipped as datetimes: comparing their str() forms
# row by row only works while every value happens to print in ISO order, and it left
# a mix of strings and date objects in the column. Missing start dates stay missing.
# NOTE(review): assumes subscription_start is timezone-naive - confirm against the table.
df_subscriptions_eda['subscription_start_adj_period'] = (
    pd.to_datetime(df_subscriptions['subscription_start'])
    .clip(lower=pd.Timestamp('2020-10-01'))
)

In [48]:
# Convert the adjusted subscription start column to pandas datetimes
df_subscriptions_eda['subscription_start_adj_period'] = (
    df_subscriptions_eda['subscription_start_adj_period'].pipe(pd.to_datetime)
)

In [49]:
# Create new column and calculate the new subscription duration in fractional months
# (subscription end minus the adjusted start).
# A calendar month is not a fixed duration, so np.timedelta64(1, 'M') is ambiguous as a
# divisor and recent pandas versions refuse it. The average Gregorian month is spelled
# out instead: 365.2425 days / 12 = 30.436875 days = 2,629,746 seconds, which is the
# length older pandas versions assumed for the 'M' unit.
df_subscriptions_eda['subscription_months_raw_adj_period'] = (
    (df_subscriptions_eda['subscription_end'] - df_subscriptions_eda['subscription_start_adj_period'])
    / pd.Timedelta(seconds=2_629_746)
)

In [50]:
# Whole-month subscription duration for easier further processing.
# The raw month count is shifted down by 0.1 before taking the ceiling, so an overshoot of
# up to 0.1 months (day-to-day discrepancies in the duration calculation) does not count
# as an additional month.
df_subscriptions_eda['subscription_months_adj_period'] = np.ceil(
    df_subscriptions_eda['subscription_months_raw_adj_period'] - 0.1
)

### Calculate prices per subscription based on adjusted start date

In [83]:
# Calculate total price per subscription (price / 12 * subscription months)
df_subscriptions_eda['total_price_chf_adj_period'] = df_subscriptions_eda['price_chf'] / 12 * df_subscriptions_eda['subscription_months_adj_period']
df_subscriptions_eda['total_price_eur_adj_period'] = df_subscriptions_eda['price_eur'] / 12 * df_subscriptions_eda['subscription_months_adj_period']


# Conditional calculation for exceptions (applied in this order, later rules win):

# If the subscription is monthly only calculate price * months
monthly_mask = df_subscriptions_eda['subscription_monthly'] == 1
df_subscriptions_eda.loc[monthly_mask, 'total_price_chf_adj_period'] = df_subscriptions_eda['price_chf'] * df_subscriptions_eda['subscription_months_adj_period']
df_subscriptions_eda.loc[monthly_mask, 'total_price_eur_adj_period'] = df_subscriptions_eda['price_eur'] * df_subscriptions_eda['subscription_months_adj_period']

# If the subscription is gifted & 6 months long, a different price is applicable (there is only a 6 month subscription available for gifted subscriptions)
# The fixed prices are written as numbers: string values ('49' / '41') would turn the
# otherwise numeric price columns into mixed-type columns.
gifted_six_months_mask = (df_subscriptions_eda['gift_subscription'] == True) & (df_subscriptions_eda['subscription_months_adj_period'] == 6)
df_subscriptions_eda.loc[gifted_six_months_mask, 'total_price_chf_adj_period'] = 49
df_subscriptions_eda.loc[gifted_six_months_mask, 'total_price_eur_adj_period'] = 41

# Vouchers


### First subscription date per account

In [51]:
# Earliest subscription_start per account_key (Series indexed by account_key)
df_account_sub_min_date = (
    df_subscriptions
    .groupby(by='account_key')
    .subscription_start
    .min()
)

In [52]:
# Left-join the first subscription date per account onto the subscriptions EDA table
# and onto the accounts table (the latter becomes the accounts EDA table)
df_subscriptions_eda = df_subscriptions_eda.merge(df_account_sub_min_date, how='left', on='account_key')
df_accounts_eda = df_accounts.merge(df_account_sub_min_date, how='left', on='account_key')

In [53]:
# Give the merged first-subscription column a descriptive name in both EDA tables
df_subscriptions_eda = df_subscriptions_eda.rename(columns={'subscription_start_y': 'min_subscription_start'})
df_accounts_eda = df_accounts_eda.rename(columns={'subscription_start': 'min_subscription_start'})


In [None]:
# Preview the first five rows of the subscriptions EDA table
df_subscriptions_eda.head(n=5)

### First voucher created per account

In [55]:
# Earliest voucher creation_date per receiver e-mail hash (Series indexed by email_hash_receiver)
df_voucher_min_date = (
    df_vouchers
    .groupby(by='email_hash_receiver')
    .creation_date
    .min()
)

In [56]:
# Left-join the first voucher date onto the accounts EDA table,
# matching the account's email_hash against the voucher's email_hash_receiver
df_accounts_eda = df_accounts_eda.merge(
    df_voucher_min_date,
    how='left',
    left_on='email_hash',
    right_on='email_hash_receiver',
)

In [57]:
# Give the merged voucher date column a descriptive name
df_accounts_eda = df_accounts_eda.rename(columns={'creation_date': 'min_voucher_creation_date'})

### First One Time Rental (OTR) date per account

In [58]:
# Keep only OTR playbacks, i.e. rows where subscription_playback equals 0
df_otr_playbacks = df_playbacks.query(expr='subscription_playback == 0')

In [59]:
# Earliest OTR playback date_start per account_key (Series indexed by account_key)
df_otr_playbacks_min = (
    df_otr_playbacks
    .groupby(by='account_key')
    .date_start
    .min()
)

In [60]:
# Left-join the first OTR playback date onto the accounts EDA table
df_accounts_eda = df_accounts_eda.merge(df_otr_playbacks_min, how='left', on='account_key')

In [61]:
# Give the merged OTR playback date column a descriptive name
df_accounts_eda = df_accounts_eda.rename(columns={'date_start': 'min_otr_playback'})

# Gifted Subscriptions

In [65]:
# Split the subscriptions by the gift_subscription flag: paid first, then gifted
paid_df_subscriptions = df_subscriptions.query(expr='gift_subscription == False')
gifted_df_subscriptions = df_subscriptions.query(expr='gift_subscription == True')

In [66]:
# Earliest subscription_start per account_key, separately for gifted and paid subscriptions
gifted_sub_min_date = gifted_df_subscriptions.groupby(by='account_key').subscription_start.min()
paid_sub_min_date = paid_df_subscriptions.groupby(by='account_key').subscription_start.min()

In [67]:
# Left-join the first gifted subscription date onto the subscriptions EDA table.
# The rename then does two things at once: 'subscription_start_x' gets its plain name
# 'subscription_start' back, and the freshly merged 'subscription_start' column becomes
# 'min_gifted_subscription'.
df_subscriptions_eda = df_subscriptions_eda.merge(gifted_sub_min_date, how='left', on='account_key')
df_subscriptions_eda = df_subscriptions_eda.rename(
    columns={
        'subscription_start_x': 'subscription_start',
        'subscription_start': 'min_gifted_subscription',
    }
)

# Same join for the accounts EDA table; here only the merged column needs a new name
df_accounts_eda = df_accounts_eda.merge(gifted_sub_min_date, how='left', on='account_key')
df_accounts_eda = df_accounts_eda.rename(columns={'subscription_start': 'min_gifted_subscription'})


In [69]:
# Left-join the first paid subscription date onto the subscriptions EDA table.
# The rename maps 'subscription_start_x' back to 'subscription_start' and
# 'subscription_start_y' to 'min_paid_subscription'.
df_subscriptions_eda = df_subscriptions_eda.merge(paid_sub_min_date, how='left', on='account_key')
df_subscriptions_eda = df_subscriptions_eda.rename(
    columns={
        'subscription_start_x': 'subscription_start',
        'subscription_start_y': 'min_paid_subscription',
    }
)

# Same join for the accounts EDA table; here only the merged column needs a new name
df_accounts_eda = df_accounts_eda.merge(paid_sub_min_date, how='left', on='account_key')
df_accounts_eda = df_accounts_eda.rename(columns={'subscription_start': 'min_paid_subscription'})

In [None]:
# Spot check: show the accounts whose account_key contains the given fragment
# (rows with a missing account_key are treated as non-matching)
df_accounts_eda.loc[
    df_accounts_eda['account_key'].str.contains('ab1bcf2c1b8d4eb422079becc0c59b1c7db2fd009235d2', na=False)
]

## Push EDA Tables to SQL

In [31]:
# Fresh database engine and target schema for pushing the EDA tables
engine = get_engine()
schema = 'capstone_filmingo'

In [32]:
# table_name = 'subscriptions_eda'

# if engine!=None:
#     try:
#         df_subscriptions_eda.to_sql(name=table_name, # Name of SQL table
#                         con=engine, # Engine or connection
#                         if_exists='replace', # Drop the table before inserting new values 
#                         schema=schema, # Use schema that was defined earlier
#                         index=False, # Do not write the DataFrame index as a column
#                         chunksize=5000, # Specify the number of rows in each batch to be written at a time
#                         method='multi') # Pass multiple values in a single INSERT clause
#         print(f"The {table_name} table was imported successfully.")
#     # Error handling
#     except (Exception, psycopg2.DatabaseError) as error:
#         print(error)
#         engine = None
# else:
#      print('Push did not work')

In [None]:
# Just to be sure: Check if the number of rows in the pushed SQL table matches the DataFrame.
# table_name is set explicitly here: the assignment in the push cell is commented out, so
# the variable would otherwise still hold the name of the last table that was read.
table_name = 'subscriptions_eda'
table_name_sql = f'''SELECT count(*) 
                    FROM {schema}.{table_name}
                    '''
# Run the count through pd.read_sql_query (already used for all reads in this notebook);
# Engine.execute() is no longer available in SQLAlchemy 2.0.
int(pd.read_sql_query(table_name_sql, engine).iloc[0, 0]) == df_subscriptions_eda.shape[0]

In [None]:
# table_name = 'accounts_eda'

# if engine!=None:
#     try:
#         df_accounts_eda.to_sql(name=table_name, # Name of SQL table
#                         con=engine, # Engine or connection
#                         if_exists='replace', # Drop the table before inserting new values 
#                         schema=schema, # Use schema that was defined earlier
#                         index=False, # Do not write the DataFrame index as a column
#                         chunksize=5000, # Specify the number of rows in each batch to be written at a time
#                         method='multi') # Pass multiple values in a single INSERT clause
#         print(f"The {table_name} table was imported successfully.")
#     # Error handling
#     except (Exception, psycopg2.DatabaseError) as error:
#         print(error)
#         engine = None
# else:
#      print('Push did not work')

In [None]:
# Just to be sure: Check if the number of rows in the pushed SQL table matches the DataFrame.
# table_name is set explicitly here: the assignment in the push cell is commented out, so
# the variable would otherwise still hold whatever table name was assigned last.
table_name = 'accounts_eda'
table_name_sql = f'''SELECT count(*) 
                    FROM {schema}.{table_name}
                    '''
# Run the count through pd.read_sql_query (already used for all reads in this notebook);
# Engine.execute() is no longer available in SQLAlchemy 2.0.
int(pd.read_sql_query(table_name_sql, engine).iloc[0, 0]) == df_accounts_eda.shape[0]