## Loading and preprocessing ISTAT data

In [23]:
import numpy as np
import pandas as pd
import geopandas as gpd
from os import path, listdir

In [31]:
## Loading ISTAT data
cpaPath = '../data/dati-cpa_2011/Sezioni di Censimento/'
fileList = [f for f in listdir(cpaPath) if f.startswith('R')]
dfList = []
cityList = ['Milano', 'Torino']
dataDict = {}
for filename in fileList:
    regionData = pd.read_csv(path.join(cpaPath, filename), sep=';', encoding='latin').set_index('SEZ2011')
    # extract the councils we are intersted in
    for city in cityList:
        cityData = regionData[regionData.COMUNE==city]
        if cityData.size>0:
            dataDict[city] = cityData


In [32]:
fieldsMeaning = pd.read_csv(path.join(cpaPath, 'tracciato_2011_sezioni.csv'), sep=';',encoding='latin')
fieldsMeaning

Unnamed: 0,NOME_CAMPO,DEFINIZIONE
0,CODREG,Codice numerico che identifica univocamente la...
1,REGIONE,Denominazione della regione
2,CODPRO,Codice numerico che identifica univocamente la...
3,PROVINCIA,Denominazione della provincia
4,CODCOM,Codice numerico che identifica univocamente il...
5,COMUNE,Denominazione del comune
6,PROCOM,Codice numerico che identifica univocamente il...
7,SEZ2011,Codice numerico che identifica univocamente la...
8,NSEZ,Numero che identifica univocamente la sezione ...
9,ACE,Numero che identifica univocamente l'area di c...


In [203]:
# Join sez geofile for Milano
sezNameCol = 'SEZ2011'
shapeDataMilano = gpd.read_file('../data/milanoSezGeo')
print(shapeDataMilano[sezNameCol].isin(dataDict['Milano'].index).mean())
print(dataDict['Milano'].index.isin(shapeDataMilano[sezNameCol]).mean())
joinedData = pd.merge(shapeDataMilano, dataDict['Milano'], how='inner', right_index=True, left_on=sezNameCol)
joinedData.SEZ2011 = joinedData.SEZ2011.astype(int)

0.9888139496627735
0.9990028253282367


In [204]:
# collect quartiere label
quartiereLabels = pd.read_csv('../data/Milano_sezToQuartieri.csv')
# fix typo
quartiereLabels.NIL.replace(to_replace='MAGENTA - S.VITTORE', value='MAGENTA - S. VITTORE', inplace=True)

quartieriData = gpd.read_file('../data/Milano_quartieri.geojson')
# join
quartiereLabels = quartiereLabels.join(quartieriData[['NIL', 'ID_NIL']].set_index('NIL'), on='NIL')
quartiereLabels = quartiereLabels.set_index(sezNameCol)

assert not any(quartiereLabels.ID_NIL.isnull()), 'Typos in NIL field'

In [205]:
joinedData = joinedData.join(quartiereLabels, on=sezNameCol)
joinedData = joinedData[~joinedData.ID_NIL.isnull()]
joinedData.ID_NIL = joinedData.ID_NIL.astype(int)
#joinedData.to_csv('test.csv')

In [197]:
quartiereLabels

Unnamed: 0_level_0,NIL,ID_NIL
SEZ2011,Unnamed: 1_level_1,Unnamed: 2_level_1
151460000001,DUOMO,1
151460000002,DUOMO,1
151460000003,DUOMO,1
151460000004,DUOMO,1
151460000005,DUOMO,1
151460000006,DUOMO,1
151460000007,DUOMO,1
151460000008,DUOMO,1
151460000009,DUOMO,1
151460000010,DUOMO,1


In [213]:
# rename columns
joinedData.rename({'NIL':'quartiere', 'ID_NIL': 'IDquartiere'}, axis='columns', inplace=True)


In [214]:
joinedData.to_file('../final/Milano_sezioni.geojson', driver='GeoJSON')

In [207]:
zz = gpd.read_file('../final/Milano_sezioni.geojson')

In [208]:
zz

Unnamed: 0,OBJECTID,PRO_COM,SEZ,TIPO_LOC,SEZ2011,SHAPE_AREA,SHAPE_LEN,POP_2010,ACE_x,mappa2,...,E25,E26,E27,E28,E29,E30,E31,NIL,ID_NIL,geometry
0,49010,15146,236,1,2147483647,8061.468253,449.226698,70.0,1.0,3,...,2,0,51,5,2,0,0,DUOMO,1,"POLYGON ((1514122.149438965 5034191.777491422,..."
1,49011,15146,237,1,2147483647,5416.911543,344.834372,173.0,1.0,12,...,5,1,74,0,7,0,0,DUOMO,1,"POLYGON ((1514166.339374326 5034198.582483572,..."
2,49015,15146,241,1,2147483647,12107.858114,510.055011,160.0,1.0,13,...,3,1,105,14,0,0,0,DUOMO,1,"POLYGON ((1514365.509413606 5034211.737496569,..."
3,49018,15146,244,1,2147483647,11178.703560,421.080703,105.0,1.0,22,...,1,2,67,6,1,0,0,DUOMO,1,"POLYGON ((1514508.984403429 5034317.407504656,..."
4,49241,15146,151,1,2147483647,2727.769331,262.767785,6.0,1.0,1,...,0,0,3,1,0,0,0,DUOMO,1,"POLYGON ((1515504.809281959 5035162.942410328,..."
5,49253,15146,163,1,2147483647,7925.594736,380.231112,0.0,1.0,0,...,0,0,0,0,0,0,0,DUOMO,1,"POLYGON ((1515423.209294741 5034342.942450254,..."
6,50663,15146,257,1,2147483647,12029.853529,504.172316,40.0,1.0,7,...,2,0,39,6,0,0,0,DUOMO,1,"POLYGON ((1514314.974360594 5034574.617470345,..."
7,50669,15146,263,1,2147483647,7023.062636,338.646027,127.0,1.0,8,...,5,0,70,1,6,1,0,DUOMO,1,"POLYGON ((1513960.469447921 5034610.952478236,..."
8,50675,15146,270,1,2147483647,11837.226655,939.197794,178.0,1.0,29,...,5,2,111,7,3,0,0,DUOMO,1,"POLYGON ((1513671.81441165 5034319.432449303, ..."
9,51544,15146,3,1,2147483647,10964.756918,432.517441,22.0,1.0,1,...,0,2,45,2,0,0,0,DUOMO,1,"POLYGON ((1515049.884268219 5034427.202487691,..."


In [41]:
# export as csv to final folder
for city, cityData in dataDict.items():
    cityData.to_csv('../final/'+city+'_cpa_2011.csv', sep=';')