# SETUP

In [41]:
import pandas as pd
import numpy as np

In [42]:
# define the name of the csv file you want to read in
accounts = './data/accounts.csv'
#playbacks = './data/playbacks.csv'
#subscriptions = './data/subscriptions.csv'

# ACCOUNTS

In [43]:
# Read accounts (01.10.2020 - 01.10.2022)
df_accounts = pd.read_csv(accounts)
# set column names to lowercase
df_accounts.columns = df_accounts.columns.str.lower()
# show first rows
display(df_accounts.head())

Unnamed: 0,account_key,postal_code,city,language,country_code,onetime_rental_count,subscription_count
0,60a90104f44414d9036aed7d96d1468a5a9e1d104b6791...,5430,Wettingen,de,CH,0,1
1,a48b28809457e680de54b4b560e00117308431c574aab2...,8706,Meilen,de,CH,0,1
2,7a280859423057ac5f1e0bfc15af602edd23900f3cf7cb...,1005,Lausanne,fr,CH,1,0
3,23e7ac18b391549e95a98d85a3adae1f3f90c4fcc09732...,4436,Oberdorf,de,CH,0,2
4,a39dbaa7972fb67c15db79d4a66cf5d1b94855ae530774...,6005,Luzern,de,CH,8,2


In [78]:
#check basic information and null values
display(df_accounts.shape)
display(df_accounts.dtypes)
display(df_accounts.describe())
display(df_accounts.isna().sum())

(17079, 8)

account_key             object
postal_code             object
city                    object
language                object
country_code            object
onetime_rental_count     int64
subscription_count       int64
postal_code_clean        int64
dtype: object

Unnamed: 0,onetime_rental_count,subscription_count,postal_code_clean
count,17079.0,17079.0,17079.0
mean,1.686867,0.709292,29516.39
std,3.592025,0.877215,1357043.0
min,0.0,0.0,0.0
25%,0.0,0.0,1985.0
50%,1.0,1.0,4600.0
75%,2.0,1.0,8049.0
max,106.0,13.0,104522100.0


account_key              0
postal_code             13
city                    10
language                 0
country_code             0
onetime_rental_count     0
subscription_count       0
postal_code_clean        0
dtype: int64

## Postal Code


### Cleaning postal code column

In [46]:
# df_accounts.groupby('postal_code').sum()

In [47]:
# remove non numeric characters
df_accounts['postal_code_clean'] = df_accounts['postal_code'].str.replace('-', '')
df_accounts['postal_code_clean'] = df_accounts['postal_code_clean'].str.extract('(\d+)')
# fill null-values with 0
df_accounts['postal_code_clean'].fillna(0, inplace=True)
df_accounts.groupby(['postal_code_clean', 'postal_code']).sum().head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,onetime_rental_count,subscription_count
postal_code_clean,postal_code,Unnamed: 2_level_1,Unnamed: 3_level_1
0,-,0,2
0,Bern,3,3
0,Biel,27,0
0,Fribourg,3,1
0,Genève,0,4
0,Rudolfstette,0,2
0,Sierre,1,0
0,XXXX,8,1
0,asdf,0,1
0,rossens,1,0


In [48]:
# change data type to integer
df_accounts['postal_code_clean'] = df_accounts['postal_code_clean'].astype(int)
df_accounts.dtypes

account_key             object
postal_code             object
city                    object
language                object
country_code            object
onetime_rental_count     int64
subscription_count       int64
postal_code_clean        int64
dtype: object

## City

### manual cleaning

In [49]:
'''
# remove numbers
df_accounts['city_clean'] = df_accounts['city'].str.replace('\d+', '')
# fill null values and postal_code 0 with 'na'
df_accounts['city_clean'].fillna('na', inplace=True)
df_accounts.loc[df_accounts["postal_code_clean"] == 0, "city_clean"] = 'na'
# remove and replace special characters
df_accounts['city_clean'] = df_accounts['city_clean'].str.replace('-', ' ')
df_accounts['city_clean'] = df_accounts['city_clean'].str.replace('\.|<', '')
# remove leading and trailing whitespace
df_accounts['city_clean'] = df_accounts['city_clean'].str.strip()
# set city to lowercase for easier comparison
df_accounts['city_clean'] = df_accounts['city_clean'].str.lower()
# zürich cleanup
df_accounts['city_clean'] = df_accounts['city_clean'].str.replace('z..?r.ch.*', 'zürich')
# geneva cleanup
df_accounts['city_clean'] = df_accounts['city_clean'].str.replace('gen.v.*|genf', 'genève')
# biel cleanup
df_accounts['city_clean'] = df_accounts['city_clean'].str.replace('biel..?bienne|bienne.*', 'biel')
'''

'\n# remove numbers\ndf_accounts[\'city_clean\'] = df_accounts[\'city\'].str.replace(\'\\d+\', \'\')\n# fill null values and postal_code 0 with \'na\'\ndf_accounts[\'city_clean\'].fillna(\'na\', inplace=True)\ndf_accounts.loc[df_accounts["postal_code_clean"] == 0, "city_clean"] = \'na\'\n# remove and replace special characters\ndf_accounts[\'city_clean\'] = df_accounts[\'city_clean\'].str.replace(\'-\', \' \')\ndf_accounts[\'city_clean\'] = df_accounts[\'city_clean\'].str.replace(\'\\.|<\', \'\')\n# remove leading and trailing whitespace\ndf_accounts[\'city_clean\'] = df_accounts[\'city_clean\'].str.strip()\n# set city to lowercase for easier comparison\ndf_accounts[\'city_clean\'] = df_accounts[\'city_clean\'].str.lower()\n# zürich cleanup\ndf_accounts[\'city_clean\'] = df_accounts[\'city_clean\'].str.replace(\'z..?r.ch.*\', \'zürich\')\n# geneva cleanup\ndf_accounts[\'city_clean\'] = df_accounts[\'city_clean\'].str.replace(\'gen.v.*|genf\', \'genève\')\n# biel cleanup\ndf_accounts[\'

### mapping plz_files

In [50]:
# set file paths
plz_ch = './data/plz_verzeichnis_ch.csv'
plz_kanton = './data/plz_kantone_ch.csv'
plz_de = './data/plz_verzeichnis_de.csv'
plz_at = './data/plz_verzeichnis_at.csv'
# read csv files
df_plz_ch = pd.read_csv(plz_ch, sep=';')
df_plz_kanton = pd.read_csv(plz_kanton, sep=';')
df_plz_de = pd.read_csv(plz_de, sep=',')
df_plz_at = pd.read_csv(plz_at, sep=';')
# set column names to lowercase
df_plz_ch.columns = df_plz_ch.columns.str.lower()
df_plz_kanton.columns = df_plz_kanton.columns.str.lower()
df_plz_de.columns = df_plz_de.columns.str.lower()
df_plz_at.columns = df_plz_at.columns.str.lower()
# show first rows
'''
display(df_plz_ch.head())
display(df_plz_kanton.head())
display(df_plz_de.head())
display(df_plz_at.head())
'''


'\ndisplay(df_plz_ch.head())\ndisplay(df_plz_kanton.head())\ndisplay(df_plz_de.head())\ndisplay(df_plz_at.head())\n'

In [51]:
# clean plz_kanton
# only keep relevant columns, rename
df_plz_kanton = df_plz_kanton[['postleitzahl / code postal / codice postale', 'ort / ville / città', 'kanton']]
df_plz_kanton.rename(columns = {'postleitzahl / code postal / codice postale':'postal_code', 'ort / ville / città':'city', 'kanton':'state'}, inplace = True)
df_plz_kanton.drop_duplicates(inplace = True)
# add country_code for differentiation
df_plz_kanton['country_code'] = 'CH'

In [52]:
# clean plz_de
# only keep relevant columns, rename, drop duplicates
df_plz_de = df_plz_de[['plz', 'ort', 'bundesland']]
df_plz_de.rename(columns = {'plz':'postal_code', 'ort':'city', 'bundesland':'state'}, inplace = True)
df_plz_de.drop_duplicates(inplace = True)
# add country_code for differentiation
df_plz_de['country_code'] = 'DE'

In [53]:
# clean plz_at
# only keep relevant columns, rename, drop duplicates
df_plz_at = df_plz_at[['plz', 'ort', 'bundesland']]
df_plz_at.rename(columns = {'plz':'postal_code', 'ort':'city', 'bundesland':'state'}, inplace = True)
df_plz_at.drop_duplicates(inplace = True)
# add country_code for differentiation
df_plz_at['country_code'] = 'AT'

In [54]:
#check results 
display(df_plz_kanton.head())
display(df_plz_kanton.shape)
display(df_plz_de.head())
display(df_plz_de.shape)
display(df_plz_at.head())
display(df_plz_at.shape)

Unnamed: 0,postal_code,city,state,country_code
0,1000,Lausanne,Waadt,CH
1,1003,Lausanne,Waadt,CH
2,1004,Lausanne,Waadt,CH
3,1005,Lausanne,Waadt,CH
4,1006,Lausanne,Waadt,CH


(4120, 4)

Unnamed: 0,postal_code,city,state,country_code
0,78267,Aach,Baden-Württemberg,DE
1,54298,Aach,Rheinland-Pfalz,DE
2,52062,Aachen,Nordrhein-Westfalen,DE
3,52064,Aachen,Nordrhein-Westfalen,DE
4,52066,Aachen,Nordrhein-Westfalen,DE


(12869, 4)

Unnamed: 0,postal_code,city,state,country_code
0,1000,Wien,W,AT
1,1004,Wien,W,AT
2,1006,Wien,W,AT
3,1010,Wien,W,AT
4,1011,Wien Postfach,W,AT


(2520, 4)

In [55]:
# unify for merging, check shape
df_plz_all = pd.concat([df_plz_kanton, df_plz_de, df_plz_at])
# dropping plz duplicates with multiple city, keeping the first entry each
df_plz_all = df_plz_all.groupby(['postal_code'])['city', 'state', 'country_code'].first().reset_index()
#check result
display(df_plz_all.head())
display(df_plz_all.shape)

  df_plz_all = df_plz_all.groupby(['postal_code'])['city', 'state', 'country_code'].first().reset_index()


Unnamed: 0,postal_code,city,state,country_code
0,1000,Lausanne,Waadt,CH
1,1003,Lausanne,Waadt,CH
2,1004,Lausanne,Waadt,CH
3,1005,Lausanne,Waadt,CH
4,1006,Lausanne,Waadt,CH


(12548, 4)

In [56]:
# merge city and state information to accounts table on plz and country code
df_accounts_new = pd.merge(df_accounts, df_plz_all, left_on=['postal_code_clean', 'country_code'], right_on=['postal_code', 'country_code'], how='left')

In [57]:
# check results
display(df_accounts_new.head())
# check shape of original and new df
display(df_accounts.shape)
display(df_accounts_new.shape)

Unnamed: 0,account_key,postal_code_x,city_x,language,country_code,onetime_rental_count,subscription_count,postal_code_clean,postal_code_y,city_y,state
0,60a90104f44414d9036aed7d96d1468a5a9e1d104b6791...,5430,Wettingen,de,CH,0,1,5430,5430.0,Wettingen,Aargau
1,a48b28809457e680de54b4b560e00117308431c574aab2...,8706,Meilen,de,CH,0,1,8706,8706.0,Meilen,Zürich
2,7a280859423057ac5f1e0bfc15af602edd23900f3cf7cb...,1005,Lausanne,fr,CH,1,0,1005,1005.0,Lausanne,Waadt
3,23e7ac18b391549e95a98d85a3adae1f3f90c4fcc09732...,4436,Oberdorf,de,CH,0,2,4436,4436.0,Liedertswil,Basel-Landschaft
4,a39dbaa7972fb67c15db79d4a66cf5d1b94855ae530774...,6005,Luzern,de,CH,8,2,6005,6005.0,Luzern,Luzern


(17079, 8)

(17079, 11)

In [58]:
# clean plz_a for language information
# only keep relevant columns, rename, drop duplicates
df_plz_ch = df_plz_ch[['postleitzahl', 'sprachcode']]
df_plz_ch.rename(columns = {'postleitzahl':'postal_code'}, inplace = True)
df_plz_ch.drop_duplicates(inplace=True)
# add country_code for differentiation
df_plz_ch['country_code'] = 'CH'
# check result
display(df_plz_ch.head())
display(df_plz_ch.shape)


Unnamed: 0,postal_code,sprachcode,country_code
0,1000,2,CH
9,1001,2,CH
10,1004,2,CH
11,1009,2,CH
12,1015,2,CH


(3495, 3)

In [59]:
# mapping sprachcode to actual language here or in tableau?
#1 = deutsch,
#2 = französisch,
#3 = italienisch

In [60]:
# merge sprachcode to accounts table
df_accounts_new = pd.merge(df_accounts_new, df_plz_ch, left_on=['postal_code_clean', 'country_code'], right_on=['postal_code', 'country_code'], how='left')

In [61]:
#drop duplicate postal code columns
df_accounts_new = df_accounts_new.drop(['postal_code_x', 'postal_code_y', 'postal_code'], axis=1)
#rename original postal code column
df_accounts_new.rename(columns = {'postal_code_x':'postal_code_original', 'city_x':'city_original', 'city_y':'city_clean'}, inplace = True)

In [62]:
#check results
#display(df_accounts_new.head())
#display(df_accounts_new.shape)

In [63]:
# check result
df_accounts_new.groupby(['state', 'city_original', 'city_clean', 'postal_code_clean', 'sprachcode']).sum().head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,onetime_rental_count,subscription_count
state,city_original,city_clean,postal_code_clean,sprachcode,Unnamed: 5_level_1,Unnamed: 6_level_1
Aargau,Ennetbaden,Ennetbaden,5408,1.0,3,0
Aargau,5000 Aarau,Aarau,5000,1.0,1,0
Aargau,5024 Küttigen,Küttigen,5024,1.0,1,1
Aargau,5225 Bözberg,Oberbözberg,5225,1.0,0,1
Aargau,5400 Baden,Baden,5400,1.0,0,1
Aargau,Aarau,Aarau,5000,1.0,81,32
Aargau,Aarau,Aarau,5000,1.0,3,0
Aargau,Aarau Rohr,Aarau Rohr,5032,1.0,1,5
Aargau,Aarau-Rohr,Aarau Rohr,5032,1.0,0,4
Aargau,Aarburg,Aarburg,4663,1.0,0,1


## Country_Code

In [64]:
# show country codes 
#display(df_accounts.country_code.nunique())
#df_accounts.country_code.unique()

In [65]:
#df_accounts.country_code.value_counts().plot(kind='pie');

### Add country name information

In [66]:
# add country information
country= './data/country_code.csv'
df_country = pd.read_csv(country)
df_country.columns = df_country.columns.str.lower()
# check result
display(df_country.shape)
display(df_country.head())

(249, 11)

Unnamed: 0,name,alpha-2,alpha-3,country-code,iso_3166-2,region,sub-region,intermediate-region,region-code,sub-region-code,intermediate-region-code
0,Afghanistan,AF,AFG,4,ISO 3166-2:AF,Asia,Southern Asia,,142.0,34.0,
1,Åland Islands,AX,ALA,248,ISO 3166-2:AX,Europe,Northern Europe,,150.0,154.0,
2,Albania,AL,ALB,8,ISO 3166-2:AL,Europe,Southern Europe,,150.0,39.0,
3,Algeria,DZ,DZA,12,ISO 3166-2:DZ,Africa,Northern Africa,,2.0,15.0,
4,American Samoa,AS,ASM,16,ISO 3166-2:AS,Oceania,Polynesia,,9.0,61.0,


In [67]:
#only keep relevant columns, rename
df_country = df_country[['name', 'alpha-2', 'region', 'sub-region']]
df_country.rename(columns = {'alpha-2':'country_code', 'name':'country_name', 'sub-region':'sub_region'}, inplace = True)
df_country.head()

Unnamed: 0,country_name,country_code,region,sub_region
0,Afghanistan,AF,Asia,Southern Asia
1,Åland Islands,AX,Europe,Northern Europe
2,Albania,AL,Europe,Southern Europe
3,Algeria,DZ,Africa,Northern Africa
4,American Samoa,AS,Oceania,Polynesia


In [68]:
#merge to accounts_new table
df_accounts_new = pd.merge(df_accounts_new, df_country, on='country_code', how='left')
#df_accounts_new.head()

In [69]:
#check result
#df_accounts_new.groupby(['region', 'sub_region', 'country_code', 'country_name']).sum()

## Language

In [70]:
display(df_accounts.language.nunique())
display(df_accounts.language.unique())
display(df_accounts.language.value_counts())

4

array(['de', 'fr', 'en', nan, 'nl'], dtype=object)

de    10942
fr     4286
en     1794
nl        1
Name: language, dtype: int64

In [71]:
# check for null values
df_accounts.language.isna().sum()

56

In [72]:
#fill null values
df_accounts['language'].fillna('na', inplace=True)

In [73]:
df_accounts.groupby('language').sum()

Unnamed: 0_level_0,onetime_rental_count,subscription_count,postal_code_clean
language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
de,19546,7600,119303153
en,2350,1119,361492188
fr,6864,3353,22843771
na,50,41,468865
nl,0,1,2394


## Onetime rental and Subscription Count


In [74]:
#general information on counts
display(df_accounts.onetime_rental_count.describe())
display(df_accounts.onetime_rental_count.value_counts())

count    17079.000000
mean         1.686867
std          3.592025
min          0.000000
25%          0.000000
50%          1.000000
75%          2.000000
max        106.000000
Name: onetime_rental_count, dtype: float64

0      6837
1      5804
2      1571
3       809
4       514
5       316
6       262
7       178
8       170
9       109
10       85
11       63
12       52
13       47
14       37
15       28
18       24
16       22
17       20
21       17
19       11
20       11
22        8
23        8
30        7
26        7
25        7
29        6
28        6
24        6
27        5
47        4
32        4
31        3
35        3
51        2
37        2
52        2
39        2
45        1
57        1
61        1
38        1
53        1
65        1
44        1
48        1
106       1
79        1
Name: onetime_rental_count, dtype: int64

In [75]:
#Check for accounts with no subscription
#display(df_accounts.subscription_count.unique())
df_accounts['subscription_count'].apply(lambda x: True if x == 0 else False).value_counts()

False    8976
True     8103
Name: subscription_count, dtype: int64

## CSV Export

In [76]:
#last check for null values
df_accounts_new.isnull().sum()
# high amount of null values for city/state/sprachcode --> added information only for D/A/CH region

account_key                0
city_original             10
language                  57
country_code               0
onetime_rental_count       0
subscription_count         0
postal_code_clean          0
city_clean               705
state                    705
sprachcode              1983
country_name               0
region                     0
sub_region                 0
dtype: int64

In [77]:
# export to csv for tableau eda
#df_accounts_new.to_csv('./data/accounts_new.csv')