# ENEXIS Graduation Project

#### Creating auxiliary file with a directory of buurts, wijks, gemeenten and provinces in the Netherlands

# CRISP-DM Phase 2: Data Understanding - CBS Data

### Collect Initial Data

In [1]:
import cbsodata
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nbconvert
pd.set_option("max_rows", 120)
pd.options.mode.chained_assignment = None  # default='warn'

In [2]:
c_path = "./data/"
v_file = "brt2021" # CBS file with buurts 

In [3]:
df     = pd.read_csv(filepath_or_buffer = c_path + v_file + ".csv",
                         sep                = ';',
                         decimal            = ',',
                         thousands          = '.',
                         encoding           = 'UTF-8')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14175 entries, 0 to 14174
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   buurtcode2021  14175 non-null  int64 
 1   buurtnaam2021  14175 non-null  object
 2   GM_2021        14175 non-null  object
 3   GM2021         14175 non-null  int64 
 4   GM_NAAM        14175 non-null  object
 5   WK_2021        14175 non-null  object
 6   WK2021         14175 non-null  int64 
 7   WK_NAAM        14175 non-null  object
dtypes: int64(3), object(5)
memory usage: 886.1+ KB


In [5]:
df.head()

Unnamed: 0,buurtcode2021,buurtnaam2021,GM_2021,GM2021,GM_NAAM,WK_2021,WK2021,WK_NAAM
0,140000,Binnenstad-Noord,GM0014,14,Groningen,WK001400,1400,Centrum
1,140001,Binnenstad-Zuid,GM0014,14,Groningen,WK001400,1400,Centrum
2,140002,Binnenstad-Oost,GM0014,14,Groningen,WK001400,1400,Centrum
3,140003,Binnenstad-West,GM0014,14,Groningen,WK001400,1400,Centrum
4,140004,Noorderplantsoen,GM0014,14,Groningen,WK001400,1400,Centrum


In [6]:
df['BU_2021'] = df['buurtcode2021'].astype(str)

In [7]:
def makebuurt(code):
    if len(code) == 6:
        return 'BU00' + code
    elif len(code) == 7:
        return 'BU0' + code
    else:
        return 'BU' + code

In [8]:
df['BU_2021'] = df['BU_2021'].apply(makebuurt)

In [9]:
df = df[['BU_2021', 'buurtnaam2021', 'WK_2021', 'WK_NAAM', 'GM_2021', 'GM_NAAM']]

In [10]:
df = df.rename(columns={'buurtnaam2021': 'BU_NAAM'})

In [11]:
df.head()

Unnamed: 0,BU_2021,BU_NAAM,WK_2021,WK_NAAM,GM_2021,GM_NAAM
0,BU00140000,Binnenstad-Noord,WK001400,Centrum,GM0014,Groningen
1,BU00140001,Binnenstad-Zuid,WK001400,Centrum,GM0014,Groningen
2,BU00140002,Binnenstad-Oost,WK001400,Centrum,GM0014,Groningen
3,BU00140003,Binnenstad-West,WK001400,Centrum,GM0014,Groningen
4,BU00140004,Noorderplantsoen,WK001400,Centrum,GM0014,Groningen


In [12]:
v_file = "gemeenten-alfabetisch-2021"  # register of gemeentes per province
df1     = pd.read_csv(filepath_or_buffer = c_path + v_file + ".csv",
                         sep                = ';',
                         decimal            = ',',
                         thousands          = '.',
                         encoding           = 'UTF-8')

In [13]:
df1.head()

Unnamed: 0,Gemeentecode,GemeentecodeGM,Gemeentenaam,Provinciecode,ProvinciecodePV,Provincienaam
0,1680,GM1680,Aa en Hunze,22,PV22,Drenthe
1,358,GM0358,Aalsmeer,27,PV27,Noord-Holland
2,197,GM0197,Aalten,25,PV25,Gelderland
3,59,GM0059,Achtkarspelen,21,PV21,Fryslân
4,482,GM0482,Alblasserdam,28,PV28,Zuid-Holland


In [14]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 352 entries, 0 to 351
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Gemeentecode     352 non-null    int64 
 1   GemeentecodeGM   352 non-null    object
 2   Gemeentenaam     352 non-null    object
 3   Provinciecode    352 non-null    int64 
 4   ProvinciecodePV  352 non-null    object
 5   Provincienaam    352 non-null    object
dtypes: int64(2), object(4)
memory usage: 16.6+ KB


In [15]:
df = df.merge(df1, left_on = 'GM_2021', right_on = 'GemeentecodeGM')

In [16]:
df.columns

Index(['BU_2021', 'BU_NAAM', 'WK_2021', 'WK_NAAM', 'GM_2021', 'GM_NAAM',
       'Gemeentecode', 'GemeentecodeGM', 'Gemeentenaam', 'Provinciecode',
       'ProvinciecodePV', 'Provincienaam'],
      dtype='object')

In [17]:
df.drop(columns = ['Gemeentecode', 'GemeentecodeGM', 'Gemeentenaam', 'Provinciecode'], inplace = True)

In [18]:
df.head()

Unnamed: 0,BU_2021,BU_NAAM,WK_2021,WK_NAAM,GM_2021,GM_NAAM,ProvinciecodePV,Provincienaam
0,BU00140000,Binnenstad-Noord,WK001400,Centrum,GM0014,Groningen,PV20,Groningen
1,BU00140001,Binnenstad-Zuid,WK001400,Centrum,GM0014,Groningen,PV20,Groningen
2,BU00140002,Binnenstad-Oost,WK001400,Centrum,GM0014,Groningen,PV20,Groningen
3,BU00140003,Binnenstad-West,WK001400,Centrum,GM0014,Groningen,PV20,Groningen
4,BU00140004,Noorderplantsoen,WK001400,Centrum,GM0014,Groningen,PV20,Groningen


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14174 entries, 0 to 14173
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   BU_2021          14174 non-null  object
 1   BU_NAAM          14174 non-null  object
 2   WK_2021          14174 non-null  object
 3   WK_NAAM          14174 non-null  object
 4   GM_2021          14174 non-null  object
 5   GM_NAAM          14174 non-null  object
 6   ProvinciecodePV  14174 non-null  object
 7   Provincienaam    14174 non-null  object
dtypes: object(8)
memory usage: 996.6+ KB


In [20]:
df.describe()

Unnamed: 0,BU_2021,BU_NAAM,WK_2021,WK_NAAM,GM_2021,GM_NAAM,ProvinciecodePV,Provincienaam
count,14174,14174,14174,14174,14174,14174,14174,14174
unique,14174,13086,3330,3097,352,352,12,12
top,BU03630304,Groot binnenwater,WK029601,Wijk 00,GM0363,Amsterdam,PV28,Zuid-Holland
freq,1,51,39,112,481,481,2329,2329


In [22]:
df.to_csv(c_path + 'CSB Buurten Indeling.csv', encoding='utf-8', index_label= False)