### Basic model:
Features: Income, Household composition, Property type
Predict installed pV (aggregation level: buurt, no time dependency) 

Income: CBS data '84799NED' (Kerncijfers wijken en buurten 2020)

In [None]:
# pip install cbsodata

In [None]:
import cbsodata
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
#Read in (Kerncijfers wijken en buurten 2020
kerncijfers_2020 = '84799NED'
df_kerncijfers_2020 = pd.DataFrame(cbsodata.get_data(kerncijfers_2020))

Keep only the data on Buurt level

In [None]:
#remove whitespaces from beginning and end of string column labels
df_kerncijfers = df_kerncijfers_2020.apply(lambda x: x.str.strip() if x.dtype == "object" else x)   
is_buurt = df_kerncijfers['SoortRegio_2']=='Buurt'
df_kerncijfers = df_kerncijfers[is_buurt]

### Feature #1 - Income

Take a look at the number of buurten where the incomes are unknown

In [None]:
fraction_unfilled_incomes = df_kerncijfers['GemiddeldInkomenPerInwoner_72'].isna().sum()/ df_kerncijfers['GemiddeldInkomenPerInwoner_72'].sum() *100
print("Income not specified in: %.0f" % fraction_unfilled_incomes, "% of the buurten. Removing these entries.")

In [None]:
cutoff_income = 60
df_income_specified = df_kerncijfers[(df_kerncijfers['GemiddeldInkomenPerInwoner_72'].isna() == False) & (df_kerncijfers['GemiddeldInkomenPerInwoner_72'] < cutoff_income)]
df_income_specified['GemiddeldInkomenPerInwoner_72'].hist(bins=54, figsize=(8, 6))

### Feature #2 - Household composition

In [None]:
df_kerncijfers['GemiddeldeHuishoudensgrootte_32']
fraction_unfilled_huishoudensgroottes = df_kerncijfers['GemiddeldeHuishoudensgrootte_32'].isna().sum()/ df_kerncijfers['GemiddeldeHuishoudensgrootte_32'].sum() *100

print("Average size of household not specified in: %.0f" % fraction_unfilled_huishoudensgroottes, "% of the buurten. Removing these entries.")

In [None]:
cut_off_household_size = 5
df_average_household_size_specified = df_kerncijfers[(df_kerncijfers['GemiddeldeHuishoudensgrootte_32'].isna() == False) & (df_kerncijfers['GemiddeldeHuishoudensgrootte_32']<cut_off_household_size)]
df_average_household_size_specified['GemiddeldeHuishoudensgrootte_32'].hist(bins=140, figsize=(8, 6))

### Feature #3 - Percentage owned property (koopwoningen)

In [None]:
df_kerncijfers['Koopwoningen_40']
fraction_unfilled_owned_property_percentage = df_kerncijfers['Koopwoningen_40'].isna().sum()/ df_kerncijfers['Koopwoningen_40'].sum() *100

print("Average percentage of owned properties not specified in: %.0f" % fraction_unfilled_owned_property_percentage, "% of the buurten. Removing these entries.")


In [None]:
df_average_owned_property_percentage_specified = df_kerncijfers[df_kerncijfers['Koopwoningen_40'].isna() == False]
df_average_owned_property_percentage_specified['Koopwoningen_40'].hist(bins=346, figsize=(8, 6))

## Target variable - opgesteld vermogen

Load the data from the Enexis supplied data file.

In [None]:
decentral_generation_072020 = './data/Enexis_decentrale_opwek_kv_(zon_pv)_01072020.csv'
df_decentral_generation = pd.read_csv(decentral_generation_072020,
                         sep                = ';',
                         decimal            = ',',
                         thousands          = '.',
                         encoding           = 'unicode_escape')

In [None]:
cut_off_generation = 3000
df_decentral_generation_specified = df_decentral_generation[
    (df_decentral_generation['Opgesteld vermogen'].isna() == False)   
    & (df_decentral_generation['Opgesteld vermogen'] < cut_off_generation)]
df_decentral_generation_specified['Opgesteld vermogen'].hist(bins=53, figsize=(8, 6))

In [None]:
df_decentral_generation_specified_no_buurtcode_duplicates =  df_decentral_generation_specified[df_decentral_generation['CBS Buurtcode'].duplicated(keep=False)] #removes the NaN's in the buurt code

In [29]:
df_temp = df_decentral_generation_specified['CBS Buurtcode']
df_temp.head()

0    16800000.0
1    16800009.0
2    16800100.0
3    16800109.0
4    16800200.0
Name: CBS Buurtcode, dtype: float64

In [None]:
has_buurtcode_starting_with_BU = df_kerncijfers['Codering_3'].str.find('BU') == 0
buurtcodes_without_leading_BU = df_kerncijfers[has_buurtcode_starting_with_BU == False].sum()

All buurtcodes start with 'BU'. We can remove this so that we can match it with the other data frame.

In [32]:
df_kerncijfers['CBS Buurtcode'] = pd.to_numeric(df_kerncijfers['Codering_3'].apply(lambda s:s.replace("BU","")))

### Combine the demographic data with the generation data
Remove the duplicates from the buurtcodes, because duplicates mean 'NaN' is filled in.

In [33]:
df_temp_deduplicated_on_buurtcode = df_decentral_generation_specified[df_decentral_generation_specified['CBS Buurtcode'].duplicated(keep=False)]

df_average_owned_property_percentage_specified.rename(columns={"Codering_3":"CBS Buurtcode"})
# df_average_owned_property_percentage_specified_deduped = df_average_owned_property_percentage_specified[df_average_owned_property_percentage_specified['CBS Buurtcode'].duplicated(keep=False)]

Unnamed: 0,ID,WijkenEnBuurten,Gemeentenaam_1,SoortRegio_2,CBS Buurtcode,IndelingswijzigingWijkenEnBuurten_4,AantalInwoners_5,Mannen_6,Vrouwen_7,k_0Tot15Jaar_8,...,AfstandTotKinderdagverblijf_107,AfstandTotSchool_108,ScholenBinnen3Km_109,OppervlakteTotaal_110,OppervlakteLand_111,OppervlakteWater_112,MeestVoorkomendePostcode_113,Dekkingspercentage_114,MateVanStedelijkheid_115,Omgevingsadressendichtheid_116
3,3,Annen,Aa en Hunze,Buurt,BU16800000,1,3460,1685,1775,540,...,0.5,0.8,2.6,252,252,0,9468,1.0,5.0,411.0
4,4,Verspreide huizen Annen,Aa en Hunze,Buurt,BU16800009,1,155,80,80,15,...,1.7,2.0,1.2,1226,1213,13,9468,1.0,5.0,60.0
6,6,Eext,Aa en Hunze,Buurt,BU16800100,1,1255,640,615,185,...,1.2,0.9,1.0,171,170,1,9463,1.0,5.0,144.0
7,7,Verspreide huizen Eext,Aa en Hunze,Buurt,BU16800109,1,175,90,85,10,...,2.0,2.1,1.3,1618,1611,6,9463,1.0,5.0,36.0
9,9,Anloo,Aa en Hunze,Buurt,BU16800200,1,320,150,170,35,...,2.2,0.4,2.0,57,57,0,9467,1.0,5.0,126.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17335,17335,Gerenlanden,Zwolle,Buurt,BU01935140,1,3065,1520,1545,575,...,0.5,0.5,12.1,58,55,2,8014,1.0,2.0,1940.0
17336,17336,Gerenbroek,Zwolle,Buurt,BU01935150,1,3095,1565,1530,610,...,0.5,0.5,11.7,49,47,2,8014,1.0,2.0,1588.0
17337,17337,Oude Mars,Zwolle,Buurt,BU01935160,1,130,65,65,30,...,1.6,1.6,8.8,45,43,2,8015,2.0,4.0,918.0
17339,17339,Windesheim,Zwolle,Buurt,BU01935200,1,390,195,200,80,...,0.3,0.3,1.0,53,53,0,8015,1.0,5.0,62.0
