In [1]:
import pandas as pd
import numpy as np


In [2]:
# Relative paths of the raw CSV exports used in this notebook.
# `subscriptions` and `accounts` are passed to pd.read_csv in the sections below;
# `playbacks` is defined here but not read in the cells shown in this notebook.
accounts = './data/accounts.csv'
playbacks = './data/playbacks.csv'
subscriptions = './data/subscriptions.csv'

# Kanton

In [None]:

# Load the semicolon-separated postal-code directory and preview its size and first rows.
kanton = './data/plz_verzeichnis.csv'
df_kanton = pd.read_csv(kanton, sep=';')
display(df_kanton.shape, df_kanton.head())

# SUBSCRIPTIONS

In [None]:
# Load the raw subscriptions export and lower-case the header names.
df_subscriptions = pd.read_csv(subscriptions).rename(columns=str.lower)

# Quick sanity check: table size and the first ten rows.
display(df_subscriptions.shape, df_subscriptions.head(10))

In [None]:
# Count subscription types within every (currency, price) combination.
df_subscriptions.groupby(['currency', 'price'])['subscription_type'].value_counts()

In [None]:
# Parse the subscription start/end columns into datetime64 values.
for date_column in ('subscription_start', 'subscription_end'):
    df_subscriptions[date_column] = pd.to_datetime(df_subscriptions[date_column])

In [None]:
# Create a new 'clean' DataFrame.
# Use an explicit copy: a plain assignment would only create a second name for the
# same object, so every later in-place change would silently alter df_subscriptions too.
df_subscriptions_clean = df_subscriptions.copy()

In [None]:
# Drop the "wrong" subscription type (FULLACCESS - unknown type to us; only one line, therefore dropped).
# The mask is built from the frame that is being filtered, and the result is a fresh copy,
# which makes this cell safe to re-run and avoids mutating any other frame.
is_fullaccess = df_subscriptions_clean['subscription_type'] == 'FULLACCESS'
df_subscriptions_clean = df_subscriptions_clean.loc[~is_fullaccess].copy()

In [None]:
# Create a new column with the subscription duration in (fractional) months; it is the basis
# for calculating the actual price per subscription.
# pandas 2.x rejects np.timedelta64(1, 'M') because a month is not a fixed duration, so the
# average month length that NumPy used for that unit (365.2425 / 12 days) is spelled out.
AVG_MONTH = pd.Timedelta(days=30.436875)
subscription_duration = df_subscriptions_clean['subscription_end'] - df_subscriptions_clean['subscription_start']
df_subscriptions_clean['subscription_months_raw'] = subscription_duration / AVG_MONTH

In [None]:
# Rounded subscription months: subtract a 0.1-month tolerance and round up, so durations that
# exceed a whole month only slightly (day-to-day calculation discrepancies) are not pushed
# into the next month.
df_subscriptions_clean['subscription_months'] = np.ceil(df_subscriptions_clean['subscription_months_raw'] - 0.1)

In [None]:
# Create two list-price columns (CHF and EUR) based on the subscription type and billing
# interval, using the prices from the filmingo website.

subscription_type = df_subscriptions_clean['subscription_type']
billed_yearly = df_subscriptions_clean['subscription_monthly'] == 0
billed_monthly = df_subscriptions_clean['subscription_monthly'] == 1

# One condition per (type, billing interval) combination; order matches the value lists below.
conditions = [
    (subscription_type == 'BASIC') & billed_yearly,
    (subscription_type == 'BASIC') & billed_monthly,
    (subscription_type == 'STANDARD') & billed_yearly,
    (subscription_type == 'STANDARD') & billed_monthly,
    (subscription_type == 'PATRON') & billed_yearly,
]

# Prices are numeric from the start. String choices combined with np.select's integer default
# cannot be promoted to a common dtype on recent NumPy versions and raise a TypeError.
values_chf = [90.0, 9.0, 150.0, 15.0, 240.0]
values_eur = [75.0, 7.5, 125.0, 12.5, 200.0]

# Rows matching none of the conditions get 0.0 (same value the previous string-based version
# produced after the float conversion).
df_subscriptions_clean['price_chf'] = np.select(conditions, values_chf, default=0.0)
df_subscriptions_clean['price_eur'] = np.select(conditions, values_eur, default=0.0)

# Decided to use these prices for all subscriptions, regardless of whether the source lists a
# different price (possibly due to discounts; 39 lines with abnormal prices) or the subscription
# is gifted (price: NaN; 1,636 lines).


In [None]:
# Calculate the total price per subscription in both currencies.
#   default           : yearly list price / 12 * subscription months
#   monthly plans     : monthly list price * subscription months
#   6-month gift plans: fixed price (only a 6 month subscription is available as a gift)
# The gift prices are numeric so the total_price columns stay float; assigning the strings
# '49' / '41' would turn them into mixed object columns and break later arithmetic.
subscription_months = df_subscriptions_clean['subscription_months']
is_monthly_plan = df_subscriptions_clean['subscription_monthly'] == 1
is_six_month_gift = (df_subscriptions_clean['gift_subscription'] == True) & (subscription_months == 6)

for currency, gift_price in (('chf', 49.0), ('eur', 41.0)):
    list_price = df_subscriptions_clean['price_' + currency]
    total_price = list_price / 12 * subscription_months
    total_price = total_price.mask(is_monthly_plan, list_price * subscription_months)
    total_price = total_price.mask(is_six_month_gift, gift_price)
    df_subscriptions_clean['total_price_' + currency] = total_price

In [None]:
# Preview the first rows of the cleaned subscriptions table.
df_subscriptions_clean.head()

In [None]:
# Spot check: show only the gifted subscriptions.
df_subscriptions_clean[df_subscriptions_clean['gift_subscription'] == True]

In [None]:
# Spot check: show only the subscriptions flagged as monthly.
df_subscriptions_clean[df_subscriptions_clean['subscription_monthly'] == 1]

In [None]:
# Sum all numeric columns per (subscription months, monthly flag) combination.
# numeric_only=True is required on pandas 2.x, where sum() no longer silently skips
# non-numeric columns and raises on the datetime columns of this table.
display(
    df_subscriptions_clean
    .groupby(['subscription_months', 'subscription_monthly'])
    .sum(numeric_only=True)
)

In [None]:
# Persist the cleaned subscriptions table as CSV (the index is written as well, since no
# `index` argument is passed).
# NOTE(review): the file name is spelled 'subcriptions' (missing "s") - confirm whether any
# consumer relies on this name before renaming it.
df_subscriptions_clean.to_csv('./data/subcriptions_clean.csv')

In [None]:
# gifted_df= df_subscriptions_clean.loc[df_subscriptions_clean['gift_subscription']==True]
# gifted_df['months'] = gifted_df['subscription_months'].round()
# gifted_df.tail(15)
# #display(gifted_df.groupby('subscription_months').max())
# gifted_df.groupby('months').sum()



In [None]:
# nongifted_df= df_subscriptions_clean.loc[df_subscriptions_clean['gift_subscription']==False]

# nongifted_df.groupby(['months', 'subscription_monthly']).sum()

In [None]:
#nongifted_df.query('account_key == b43a87d35bf285afdbb1c931b68ea2e6dad1f9dcc62947')

#nongifted_df[nongifted_df['account_key'].str.contains('d8aa9f6793e94bc168a65808c9fe5809d4516448eae392a33edec16391c71d1e', na=False)]


In [None]:
# df_subscriptions_clean = df_subscriptions_clean.drop('calc_price_chf', axis=1)

# ACCOUNTS

In [3]:
# Load the accounts export (21.10.2020 - 01.10.2022) and lower-case the header names.
df_accounts = pd.read_csv(accounts).rename(columns=str.lower)

In [4]:
# Number of rows and columns of the freshly loaded accounts table.
df_accounts.shape

(22154, 8)

## Postal Code

In [None]:
# TODO: drop duplicates from vouchers before merging voucher information into accounts.
# The merge below is disabled: `df_vouchers` is only loaded in the "Vouchers" section further
# down, so executing it here fails on a fresh kernel (Restart & Run All). It also refers to the
# columns 'email_hash_receiver' / 'voucher_used', whereas the working merge in the "Vouchers"
# section uses 'email_hash' / 'voucherused'. The result `df_accounts_new` was not used anywhere.
# df_accounts_new = pd.merge(df_accounts, df_vouchers [['email_hash_receiver', 'voucher_used']], left_on='email_hash', right_on='email_hash_receiver', how='left')

In [None]:
# Build a purely numeric postal code:
#   1. remove hyphens (literal replacement, no regex needed),
#   2. keep the first run of digits (raw string avoids the invalid '\d' escape warning;
#      expand=False returns a Series instead of a one-column DataFrame),
#   3. fill missing values with 0.
# The result is assigned back explicitly; calling fillna(inplace=True) on a selected column
# is a chained assignment that does not update the frame under pandas Copy-on-Write.
postal_code_digits = (
    df_accounts['postal_code']
    .str.replace('-', '', regex=False)
    .str.extract(r'(\d+)', expand=False)
)
df_accounts['postal_code_clean'] = postal_code_digits.fillna(0)

In [None]:
# Convert the cleaned postal code to integers so it can be matched against the PLZ tables.
df_accounts = df_accounts.astype({'postal_code_clean': int})

## City

### Mapping plz_files to accounts table for further geographical information

In [None]:
# Locations of the postal-code reference files.
plz_ch = './data/plz_verzeichnis_ch.csv'
plz_kanton = './data/plz_kantone_ch.csv'
plz_de = './data/plz_verzeichnis_de.csv'
plz_at = './data/plz_verzeichnis_at.csv'

# Load them; the German file is comma-separated, the others use semicolons.
df_plz_ch = pd.read_csv(plz_ch, sep=';')
df_plz_kanton = pd.read_csv(plz_kanton, sep=';')
df_plz_de = pd.read_csv(plz_de, sep=',')
df_plz_at = pd.read_csv(plz_at, sep=';')

# Lower-case the header names of every reference table.
for plz_frame in (df_plz_ch, df_plz_kanton, df_plz_de, df_plz_at):
    plz_frame.columns = plz_frame.columns.str.lower()


In [None]:
# Clean plz_kanton: keep the relevant columns, give them the shared names, drop duplicate
# rows and tag every row with the country code for differentiation.
# Written as one chain that returns a new frame; the previous in-place calls operated on a
# column-subset slice, which triggers pandas' SettingWithCopyWarning.
df_plz_kanton = (
    df_plz_kanton[['postleitzahl / code postal / codice postale', 'ort / ville / città', 'kanton']]
    .rename(columns={'postleitzahl / code postal / codice postale': 'postal_code', 'ort / ville / città': 'city', 'kanton': 'state'})
    .drop_duplicates()
    .assign(country_code='CH')
)

In [None]:
# Clean plz_de: keep the relevant columns, give them the shared names, drop duplicate rows
# and tag every row with the country code for differentiation.
# Written as one chain that returns a new frame; the previous in-place calls operated on a
# column-subset slice, which triggers pandas' SettingWithCopyWarning.
df_plz_de = (
    df_plz_de[['plz', 'ort', 'bundesland']]
    .rename(columns={'plz': 'postal_code', 'ort': 'city', 'bundesland': 'state'})
    .drop_duplicates()
    .assign(country_code='DE')
)

In [None]:
# Clean plz_at: keep the relevant columns, give them the shared names, drop duplicate rows
# and tag every row with the country code for differentiation.
# Written as one chain that returns a new frame; the previous in-place calls operated on a
# column-subset slice, which triggers pandas' SettingWithCopyWarning.
df_plz_at = (
    df_plz_at[['plz', 'ort', 'bundesland']]
    .rename(columns={'plz': 'postal_code', 'ort': 'city', 'bundesland': 'state'})
    .drop_duplicates()
    .assign(country_code='AT')
)

In [None]:
# Unify the three country tables for merging.
df_plz_all = pd.concat([df_plz_kanton, df_plz_de, df_plz_at])

# Drop postal codes that map to several cities, keeping the first entry each.
# - The de-duplication key includes country_code because the accounts merge joins on
#   (postal code, country code); grouping on the postal code alone would keep only the first
#   country for codes that exist in more than one country and lose the other countries' rows.
# - Columns are selected with a list: tuple-style selection on a groupby raises on pandas 2.x.
df_plz_all = (
    df_plz_all
    .groupby(['postal_code', 'country_code'], as_index=False)[['city', 'state']]
    .first()
    [['postal_code', 'city', 'state', 'country_code']]
)

In [None]:
# Attach city and state to every account by matching the cleaned postal code together with
# the country code; accounts without a match keep NaN (left join).
df_accounts = df_accounts.merge(
    df_plz_all,
    how='left',
    left_on=['postal_code_clean', 'country_code'],
    right_on=['postal_code', 'country_code'],
)

In [None]:
# Clean plz_ch for the language information: keep the relevant columns, rename the postal
# code column, drop duplicate rows and tag every row with the country code.
# Written as one chain that returns a new frame; the previous in-place calls operated on a
# column-subset slice, which triggers pandas' SettingWithCopyWarning.
# NOTE(review): a postal code listed with more than one sprachcode stays duplicated here and
# would multiply account rows in the following merge - verify against the data.
df_plz_ch = (
    df_plz_ch[['postleitzahl', 'sprachcode']]
    .rename(columns={'postleitzahl': 'postal_code'})
    .drop_duplicates()
    .assign(country_code='CH')
)

# Mapping of the language code (sprachcode):
#   1 = German
#   2 = French
#   3 = Italian
# (kept as comments; a bare string at the end of a cell is echoed as cell output)

In [None]:
# Attach the language code to every account by matching the cleaned postal code together
# with the country code (left join, so accounts without a match keep NaN).
df_accounts = df_accounts.merge(
    df_plz_ch,
    how='left',
    left_on=['postal_code_clean', 'country_code'],
    right_on=['postal_code', 'country_code'],
)

In [None]:
# Drop the postal code columns that came from the PLZ reference tables and give the remaining
# merge-suffixed columns descriptive names.
# 'postal_code_x' (the original value from the accounts table) is no longer dropped:
# it was removed before the rename that is meant to keep it as 'postal_code_original',
# which made that rename a silent no-op and lost the original column.
df_accounts = (
    df_accounts
    .drop(['postal_code_y', 'postal_code'], axis=1)
    .rename(columns={'postal_code_x': 'postal_code_original', 'city_x': 'city_original', 'city_y': 'city_clean'})
)

In [None]:
# Preview the accounts table after the postal code / city / language merges.
df_accounts.head()

In [None]:
# Row/column count after the merges; a higher row count than at load time would mean that a
# merge duplicated accounts.
df_accounts.shape

## Country_Code

### Add country name and region information

In [None]:
# Load the country reference table and lower-case its header names.
country = './data/country_code.csv'
df_country = pd.read_csv(country).rename(columns=str.lower)

In [None]:
# Only keep the relevant columns and rename them to the naming used in the accounts table.
# rename() returns a new frame here; renaming in place on a column-subset slice triggers
# pandas' SettingWithCopyWarning.
df_country = (
    df_country[['name', 'alpha-2', 'region', 'sub-region']]
    .rename(columns={'alpha-2': 'country_code', 'name': 'country_name', 'sub-region': 'sub_region'})
)

In [None]:
# Attach country name, region and sub-region to the accounts table via the country code
# (left join: accounts with an unknown code keep NaN).
df_accounts = df_accounts.merge(df_country, how='left', on='country_code')

## Language

In [None]:
# Fill missing language values with the placeholder 'na'.
# Assign the result back instead of calling fillna(inplace=True) on the selected column:
# that chained form is deprecated and leaves df_accounts unchanged under pandas Copy-on-Write.
df_accounts['language'] = df_accounts['language'].fillna('na')

In [None]:
# Preview the accounts table after adding the country information and filling `language`.
df_accounts.head()

# Vouchers

Findings summary:
* Multiple vouchers were sent to the same email (max. 14 times) -> not really the idea of a voucher campaign -> the recipient is not a "new customer"  
* Most of these repeatedly sent vouchers come from the same account -> account sharing? -> but these look more like individual cases  
* Approx. 50% of recipients do not redeem their voucher  
* 718 unique email_hash values are connected to an account (of 5827 -> 12%)

Possible further investigation:
* How many email_hashes were sent only 1 or 2 vouchers? -> more in line with the campaign idea
* The emails that were sent multiple vouchers -> are they paying customers as well?
* How many of the 718 accounts have a subscription / OTR?

## Cleaning

In [None]:
# Paths to the CSV files. The first three repeat the definitions from the top of the
# notebook with identical values; `vouchers` is new and is read in the next cell.
accounts = './data/accounts.csv'
playbacks = './data/playbacks.csv'
subscriptions = './data/subscriptions.csv'
vouchers = './data/promo_vouchers.csv'

In [None]:
# Load the semicolon-separated promo vouchers export and lower-case the header names.
df_vouchers = pd.read_csv(vouchers, sep=';').rename(columns=str.lower)

In [None]:
# Preview the first rows of the raw vouchers table.
df_vouchers.head()

In [None]:
# Parse the voucher creation and expiration columns into datetime64 values.
for date_column in ('creationdate', 'expirationdate'):
    df_vouchers[date_column] = pd.to_datetime(df_vouchers[date_column])

In [None]:
# Cut off the time of creation (assumed not to be relevant for the analysis) by resetting
# every timestamp to midnight. `.dt.normalize()` keeps the datetime dtype; the alternative
# `df_vouchers['creationdate'].dt.date` was rejected because it returns an object column.
df_vouchers['creationdate'] = df_vouchers['creationdate'].dt.normalize()

In [None]:
# Check the column dtypes after the datetime conversion.
df_vouchers.dtypes

## Preliminary EDA

In [None]:
# Non-null counts per receiving email_hash, largest `account_key` count first.
# Finding: up to 14 vouchers went to the same email - not the idea of the promo campaign
# (not a "new customer"); possibly only individual cases.
# TODO: check how many email hashes received only 1 (or at most 2) vouchers.
email_counts = df_vouchers.groupby('email_hash').count()
email_counts.sort_values('account_key', ascending=False)

In [None]:
# df_vouchers_acc.groupby(['email_hash']).count().sort_values(by='account_key_x', ascending=False)

In [None]:
# Non-null counts per (email_hash, account_key) pair, largest `movie_id` count first.
# Finding: most emails at the top receive their vouchers from the same account_key.
pair_counts = df_vouchers.groupby(['email_hash', 'account_key']).count()
pair_counts.sort_values('movie_id', ascending=False)

In [None]:
# Non-null counts per `voucherused` value, largest `movie_id` count first.
# Finding: approx. 50% do not redeem their voucher.
usage_counts = df_vouchers.groupby('voucherused').count()
usage_counts.sort_values('movie_id', ascending=False)

In [None]:
# merge vouchers and accounts - not used as it takes in all information from accounts (possibly needed later)
# df_vouchers_new = df_vouchers.merge(df_accounts, how='left', on='email_hash')

In [None]:
# Left-join the vouchers with a slim view of the accounts table (account key, subscription
# count and one-time-rental count only) on the email hash, to keep the result easy to handle.
account_columns = ['email_hash', 'account_key', 'subscription_count', 'onetime_rental_count']
df_vouchers_new = df_vouchers.merge(df_accounts[account_columns], how='left', on='email_hash')

In [None]:
# Preview the vouchers table enriched with the account columns.
df_vouchers_new.head()

In [None]:
# df_vouchers_acc.account_key_y.value_counts()

In [None]:
# Print the size of the merged table, then count the voucher rows without a matching account.
# `account_key_y` is the account key contributed by df_accounts in the merge (pandas adds the
# _x/_y suffixes because both tables contain an `account_key` column).
print(df_vouchers_new.shape)
df_vouchers_new['account_key_y'].isna().sum()
# Recorded result: 6619 NaN (no account connected) of 7790 rows, i.e. 1,171 rows with an
# account connected. Be aware of repeated email hashes in vouchers.email_hash - see the
# unique counts further below.


In [None]:
# Missing values per column of the merged vouchers table.
df_vouchers_new.isna().sum()

In [None]:
# Make a new DataFrame without any rows containing NaN, to see only the emails with a
# connected account for further investigation.
# dropna() returns a new frame; assigning df_vouchers_new to a second name and dropping in
# place would also have removed the rows from df_vouchers_new itself.
# NOTE(review): dropna() removes rows with a NaN in ANY column, not only in the account
# columns - confirm with the per-column NaN counts that this matches the intention.
df_vouchers_acc = df_vouchers_new.dropna()


In [None]:
# Check that the expected number of rows is left after dropping the rows with NaN.
df_vouchers_acc.shape

In [None]:
# Preview the first ten vouchers that are connected to an account.
df_vouchers_acc.head(10)

In [None]:
# Number of distinct receiver email hashes that are connected to an account.
df_vouchers_acc['email_hash'].nunique()
# Recorded result: 718 unique email_hash values / accounts

In [None]:
# Number of distinct receiver email hashes across all vouchers (reference for the share above).
df_vouchers.email_hash.nunique()
# Recorded result: 5827 unique email hashes in total

In [None]:
# Look at the vouchers columns again before merging them into the accounts table.
df_vouchers.head()

In [None]:
# Look at the accounts columns again before merging the voucher information in.
df_accounts.head()

In [None]:
# Merge the voucher information into accounts, adding only the relevant column (voucherused).
# The vouchers table holds several rows per email_hash (up to 14), so it is collapsed to one
# row per email_hash first; merging the raw table would duplicate every affected account row.
# NOTE(review): max() is used as "at least one voucher redeemed", which assumes voucherused
# is a boolean / 0-1 flag - confirm against the data.
voucher_flag_per_email = df_vouchers.groupby('email_hash', as_index=False)['voucherused'].max()
# validate='many_to_one' makes pandas raise if the right-hand side still had duplicate keys.
df_accounts = pd.merge(df_accounts, voucher_flag_per_email, on='email_hash', how='left', validate='many_to_one')

In [None]:
# Preview the accounts table with the added voucherused column.
df_accounts.head()

In [None]:
#add new column vouchers in accounts


In [None]:
# Disabled: this repeats the voucherused merge that an earlier cell of this section already
# applied to df_accounts. Running it a second time adds the column twice (voucherused_x /
# voucherused_y) and multiplies the rows of every account whose email_hash occurs more than
# once in the vouchers table.
# df_accounts = pd.merge(df_accounts, df_vouchers [['email_hash', 'voucherused']], left_on='email_hash', right_on='email_hash', how='left')

In [None]:
# add voucher information to accounts table - have to add both columns
# df_accounts = pd.merge(df_accounts, df_vouchers [['email_hash_receiver', 'voucher_used']], left_on='email_hash', right_on='email_hash_receiver', how='left')

In [None]:
#drop email_hash_receiver as it is not needed
# df_accounts.drop('email_hash_receiver', axis=1, inplace = True)

In [None]:
# Final row/column count of the accounts table; compare with the count at load time to
# detect rows duplicated by the merges.
df_accounts.shape