### Basic model:
Features: Income, Household composition, Property type
Predict the installed PV capacity per household (aggregation level: buurt, no time dependency).

Income: CBS data '84799NED' (Kerncijfers wijken en buurten 2020)

In [None]:
# pip install cbsodata

In [None]:
import cbsodata
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
import os.path

In [None]:
# Read in the CBS table 'Kerncijfers wijken en buurten 2020' (84799NED).
# The table is downloaded once and cached as a CSV file; later runs read the cache.
kerncijfers_2020 = '84799NED'
path_kerncijfers_2020 = Path('../Data/CBS/84799NED.csv')
if not path_kerncijfers_2020.exists():
    print("File does not exist. Downloading from CBS....")
    df_temp = pd.DataFrame(cbsodata.get_data(kerncijfers_2020))
    print("File downloaded.")
    # to_csv does not create missing folders; without this the download would be
    # lost on a fresh checkout where ../Data/CBS does not exist yet.
    path_kerncijfers_2020.parent.mkdir(parents=True, exist_ok=True)
    # index=False: do not persist the RangeIndex, otherwise it comes back from
    # read_csv as a meaningless 'Unnamed: 0' column.
    df_temp.to_csv(path_or_buf=path_kerncijfers_2020, index=False)
    print("File stored on local file stystem.")

df_kerncijfers = pd.read_csv(path_kerncijfers_2020)
df_kerncijfers.info()

Remove unusable items

In [None]:
# Keep only the rows that have a region code in 'Codering_3'.
has_region_code = df_kerncijfers['Codering_3'].notna()
df_kerncijfers = df_kerncijfers.loc[has_region_code]

Keep only the data on Buurt level

In [None]:
def _strip_text_column(column):
    """Return the column with surrounding whitespace stripped from its values if it is an object column, else unchanged."""
    if column.dtype == "object":
        return column.str.strip()
    return column


# Strip leading/trailing whitespace from the VALUES of all text columns
# (presumably the CBS strings are padded, which would break the equality test below).
df_kerncijfers = df_kerncijfers.apply(_strip_text_column)

# Keep only the rows whose region type is exactly 'Buurt'.
is_buurt = df_kerncijfers['SoortRegio_2'].eq('Buurt')
df_kerncijfers = df_kerncijfers.loc[is_buurt]

df_kerncijfers.info()


Filter out the non residential areas

In [None]:
# A buurt is treated as non-residential when its (lower-cased) name contains
# one of these keywords.
non_residential_keywords = [
    'bedrijventerrein',
    'bedrijvengebied',
    'landelijk',
    'kantoren',
    'industrieterrein',
    'industriegebied',
]

buurt_name_lower = df_kerncijfers['WijkenEnBuurten'].str.lower()

# One scan with an alternation pattern instead of six separate scans.
# na=False: a buurt without a name yields False (kept) instead of NaN.
is_non_residential = buurt_name_lower.str.contains(
    '|'.join(non_residential_keywords), regex=True, na=False
)

# assign() returns a new frame, so no columns are written onto a filtered slice
# (which raises SettingWithCopyWarning).
df_kerncijfers = df_kerncijfers.assign(
    WijkenEnBuurten_lower_case=buurt_name_lower,
    Is_non_residential=is_non_residential,
)

df_kerncijfers = df_kerncijfers[~df_kerncijfers['Is_non_residential']]
df_kerncijfers.info()

### Feature #1 - Income

Take a look at the number of buurten where the incomes are unknown

In [None]:
# Number of buurten for which no average income per inhabitant is reported.
# Count the missing values of the income column itself: calling .sum() on the
# filtered frame would add up (or concatenate) every column of those rows,
# which is not a count.
df_kerncijfers['GemiddeldInkomenPerInwoner_72'].isna().sum()

In [None]:
# Percentage of buurten without an average income: missing rows / all rows.
income_is_missing = df_kerncijfers['GemiddeldInkomenPerInwoner_72'].isna()
total_no_income = income_is_missing.sum()
fraction_unfilled_incomes = total_no_income / len(df_kerncijfers) * 100

print("Income not specified in: %.0f" % fraction_unfilled_incomes, "% of the buurten. Removing these entries.")

# Author's note: 85% of the data points are not usable because no income is specified.

In [None]:
cutoff_income = 60

# Remove the buurten without an income value from the working frame.
df_kerncijfers = df_kerncijfers.dropna(subset=['GemiddeldInkomenPerInwoner_72'])

# The cut-off only limits what is drawn in the histogram; df_kerncijfers keeps all remaining rows.
income_below_cutoff = df_kerncijfers['GemiddeldInkomenPerInwoner_72'].lt(cutoff_income)
df_income_specified = df_kerncijfers.loc[income_below_cutoff]
df_income_specified['GemiddeldInkomenPerInwoner_72'].hist(bins=54, figsize=(8, 6))

df_kerncijfers.info()

### Feature #2 - Household composition

In [None]:
# Percentage of buurten without an average household size: missing rows / all rows.
# The denominator must be the number of rows, not the sum of the household sizes.
fraction_unfilled_huishoudensgroottes = df_kerncijfers['GemiddeldeHuishoudensgrootte_32'].isna().sum() / len(df_kerncijfers) * 100

print("Average size of household not specified in: %.0f" % fraction_unfilled_huishoudensgroottes, "% of the buurten. Removing these entries.")

In [None]:
cut_off_household_size = 5

# Remove the buurten without an average household size from the working frame.
df_kerncijfers = df_kerncijfers.dropna(subset=['GemiddeldeHuishoudensgrootte_32'])

# The cut-off only limits what is drawn in the histogram; df_kerncijfers keeps all remaining rows.
household_size_below_cut_off = df_kerncijfers['GemiddeldeHuishoudensgrootte_32'].lt(cut_off_household_size)
df_average_household_size_specified = df_kerncijfers.loc[household_size_below_cut_off]
df_average_household_size_specified['GemiddeldeHuishoudensgrootte_32'].hist(bins=140, figsize=(8, 6))

df_kerncijfers.info()

### Feature #3 - Percentage owned property (koopwoningen)

In [None]:
# Percentage of buurten without a value for owned properties: missing rows / all rows.
# The denominator must be the number of rows, not the sum of the column values.
fraction_unfilled_owned_property_percentage = df_kerncijfers['Koopwoningen_40'].isna().sum() / len(df_kerncijfers) * 100

print("Average percentage of owned properties not specified in: %.0f" % fraction_unfilled_owned_property_percentage, "% of the buurten.")


Remove the 'BU' from the buurtcode and rename the column name so we can combine the datasets later

In [None]:
buurtcode_raw = df_kerncijfers['Codering_3']

# Report how many codes do not begin with the 'BU' prefix (position 0 of the string).
has_buurtcode_starting_with_BU = buurtcode_raw.str.find('BU') == 0
buurtcodes_without_leading_BU = df_kerncijfers[~has_buurtcode_starting_with_BU]
print("Number of buurten that don't start with 'BU': ", len(buurtcodes_without_leading_BU))

# Remove 'BU' from the code and store the remainder as a number under the
# column name used for the merge with the generation data.
buurtcode_digits = buurtcode_raw.map(lambda code: code.replace("BU", ""))
df_kerncijfers['CBS Buurtcode'] = pd.to_numeric(buurtcode_digits)

df_kerncijfers.info()

In [None]:
# Number of rows whose buurtcode already occurred in an earlier row.
print("Duplicate buurtcodes: %.0f" % df_kerncijfers.duplicated(['CBS Buurtcode']).sum())
# astype returns a new frame; assign it back, otherwise the cast is silently discarded.
df_kerncijfers = df_kerncijfers.astype({"CBS Buurtcode": int})
df_kerncijfers.head()

Take a look at the number of households per buurt. We need this to 'normalize' the installed PV capacity.

In [None]:
# Count the buurten that have no total number of households.
households_missing = df_kerncijfers['HuishoudensTotaal_28'].isna().sum()
print("Huishoudens totaal has: ", households_missing, " empty items.")

## Target variable - opgesteld vermogen

Load the data from the Enexis supplied data file.

In [None]:
decentral_generation_072020 = '../Data/Enexis_decentrale_opwek_kv_(zon_pv)_01072020.csv'
df_decentral_generation = pd.read_csv(decentral_generation_072020,
                         sep                = ';',
                         decimal            = ',',
                         thousands          = '.',
                         encoding           = 'unicode_escape')
df_decentral_generation.info()

Remove empty items

In [None]:
# Drop every row that lacks the installed capacity or the buurtcode.
df_decentral_generation = df_decentral_generation.dropna(
    subset=['Opgesteld vermogen', 'CBS Buurtcode']
)

# Note: 'Opgesteld vermogen' is expressed in kW.

In [None]:
cut_off_generation = 3000

# The cut-off only limits what is drawn in the histogram; df_decentral_generation is not changed.
generation_below_cut_off = df_decentral_generation['Opgesteld vermogen'].lt(cut_off_generation)
df_decentral_generation_specified = df_decentral_generation.loc[generation_below_cut_off]
df_decentral_generation_specified['Opgesteld vermogen'].hist(bins=53, figsize=(8, 6))

In [None]:
# Number of rows whose buurtcode already occurred in an earlier row.
print("Duplicate buurtcodes: %.0f" % df_decentral_generation.duplicated(['CBS Buurtcode']).sum())
# astype returns a new frame; assign it back, otherwise the cast is silently
# discarded and the merge key keeps whatever dtype read_csv inferred.
df_decentral_generation = df_decentral_generation.astype({"CBS Buurtcode": int})
df_decentral_generation.head()

Check if the deduplication was successful

### Combine the demographic data with the generation data

In [None]:
n_rows_kerncijfers = len(df_kerncijfers)
n_rows_generation = len(df_decentral_generation)
print("Number of rows in 'kerncijfers': %.0f" % n_rows_kerncijfers)
print("Number of rows in 'generation data': %.0f" % n_rows_generation)

# Join on the buurtcode; validate='one_to_one' makes pandas raise if the code
# is not unique in either frame.
df = df_kerncijfers.merge(
    df_decentral_generation,
    on="CBS Buurtcode",
    validate='one_to_one',
)
print("Number of rows in combined data set: %.0f" % len(df))


#### To do: investigate why more rows do not match on buurt code

### Introduce a normalized column. normalized_opgesteld_vermogen = opgesteld_vermogen / #households

In [None]:
# Installed capacity divided by the number of households in the buurt.
households_per_buurt = df["HuishoudensTotaal_28"]
df["normalized_opgesteld_vermogen"] = df["Opgesteld vermogen"].div(households_per_buurt)

## Create a first model - linear model

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

import altair as alt

In [None]:
opgesteld_vermogen = 'normalized_opgesteld_vermogen'
columns_to_keep = [opgesteld_vermogen, 'GemiddeldInkomenPerInwoner_72', 'GemiddeldeHuishoudensgrootte_32', 'Koopwoningen_40']

# Keep the target and the three features. Rows with a missing value in any of
# them are dropped: LinearRegression.fit raises on NaN input, and
# 'Koopwoningen_40' was only inspected, never cleaned, in the cells above.
df = df[columns_to_keep].dropna()

# Fixed random_state so the split, and therefore every score below, is the
# same on each Restart & Run All.
train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)

print(f"training set size: {len(train_set)}\ntest set size: {len(test_set)}")

# Separate the target (Y) from the features (X) for both subsets.
Y_train_set = train_set[opgesteld_vermogen]
X_train_set = train_set.drop(columns=opgesteld_vermogen)

Y_test_set = test_set[opgesteld_vermogen]
X_test_set = test_set.drop(columns=opgesteld_vermogen)

lin_reg = LinearRegression()
lin_reg.fit(X_train_set, Y_train_set)

Plot the fit with income

In [None]:
alt.renderers.enable('default')

income_column = 'GemiddeldInkomenPerInwoner_72'

# Scatter plot of the training data: income on the x-axis, target on the y-axis.
base = (
    alt.Chart(train_set)
    .mark_circle()
    .encode(
        x=alt.X(income_column, title='# income'),
        y=alt.Y(opgesteld_vermogen, title='Opgesteld vermogen'),
    )
)

# Regression line computed by Altair from these two columns only; it does not
# use the lin_reg model fitted on all features.
linear_fit = [
    base.transform_regression(income_column, opgesteld_vermogen, method="linear").mark_line()
]

graph = alt.layer(base, *linear_fit)
graph

#### Check the quality of the model

In [None]:
# Make predictions using the testing set
y_pred = lin_reg.predict(X_test_set)

# The coefficients
print("Coefficients: \n", lin_reg.coef_)
# The mean squared error
print("Mean squared error: %.2f" % mean_squared_error(Y_test_set, y_pred))
# The coefficient of determination: 1 is perfect prediction
print("Coefficient of determination: %.2f" % r2_score(Y_test_set, y_pred))

## Create a second model - tree model

In [None]:
from sklearn.tree import DecisionTreeRegressor

Create helper class

In [None]:
class Result:
    """Pair of evaluation scores for one fitted model."""

    def __init__(self, r_squared, mean_squared_error):
        """Store both scores unchanged.

        r_squared: value kept as the ``r_squared`` attribute.
        mean_squared_error: value kept as the ``mean_squared_error`` attribute.
        """
        self.r_squared, self.mean_squared_error = r_squared, mean_squared_error

In [None]:
max_range = 10


def _score_tree(depth):
    """Fit a decision tree of the given max_depth on the training set and return its test-set scores as a Result."""
    tree = DecisionTreeRegressor(max_depth=depth, random_state=3)
    tree.fit(X_train_set, Y_train_set)
    predictions = tree.predict(X_test_set)
    return Result(r2_score(Y_test_set, predictions), mean_squared_error(Y_test_set, predictions))


# One entry per depth 1 .. max_range - 1 (the upper bound of range is exclusive).
results = {depth: _score_tree(depth) for depth in range(1, max_range)}

#### Check the quality of the models

# Mark down

In [None]:
# Print the test-set scores of every tree depth that was fitted.
for depth in range(1, max_range):
    scores = results[depth]
    print("Depth  %2.f" % depth, ":R squared: %.2f" % scores.r_squared,
          "-- Mean squared error: %.0f" % scores.mean_squared_error)


Best performance was at depth = 2 in the recorded run (the best depth is not the same over runs &#9785; because it depends on the train/test split).

In [None]:
# Take the depth with the highest test-set R squared from `results` instead of
# hard-coding it, because the best depth depends on the train/test split.
# On a tie, max() returns the first maximum, i.e. the shallowest tree.
optimum_depth = max(results, key=lambda depth: results[depth].r_squared)
best_scores = results[optimum_depth]
print("Depth  %2.f" % optimum_depth, ":R squared: %.2f" % best_scores.r_squared,
    "-- Mean squared error: %.0f" % best_scores.mean_squared_error)