In [None]:
%load_ext autoreload
%autoreload 2

import pandas_profiling
%pylab inline

import pandas as pd
import geopandas as gp

import seaborn as sns
sns.set(color_codes=True)
import numpy as np

from pathlib import Path
import itertools
from tqdm import tqdm

from analysis.settings.settings import DevelopmentConfig
c = DevelopmentConfig

from shapely.geometry.multipolygon import MultiPolygon
from shapely.geometry.polygon import Polygon
import shapely

# Open data integration
Cleaning shape files delivered as open data from http://www.laerminfo.at/laermkarten/methoden/inspire.html

> an accompanying blog post can be found at: https://georgheiler.com/2019/02/08/noise-pollution-data-cleanup/

In [None]:
%ls ../data/

In [None]:
def iter_dirs(directory_in_str, glob):
    """Lazily yield, as strings, every path under a directory that matches a glob.

    Args:
        directory_in_str: Directory to search; it is wrapped in ``pathlib.Path``.
        glob: Pattern handed to ``Path.glob`` (e.g. ``'**/*.shp'``).

    Yields:
        str: One matching path per iteration, converted with ``str()``.
    """
    yield from map(str, Path(directory_in_str).glob(glob))
        
def parse_attributes_from_path(path):
    """Derive the data set attributes that are encoded in a shapefile's name.

    The file name (last path component) is split on ``'_'``; the first three
    tokens are read as year, kind and timing, and the last token, cut at its
    first ``'.'``, is read as the state.

    Args:
        path: Location of the file, as a ``str`` or any ``os.PathLike``.

    Returns:
        dict: Keys ``'year'``, ``'kind'``, ``'timing'`` and ``'state'``, all
        holding strings taken verbatim from the file name.

    Raises:
        IndexError: If the file name has fewer than three ``'_'``-separated tokens.
    """
    # Let pathlib find the last component instead of splitting on '/' by hand:
    # that honours the platform's separator and also accepts Path objects.
    file_name = Path(path).name
    elements = file_name.split('_')
    return {
        'year': elements[0],
        'kind': elements[1],
        'timing': elements[2],
        # Everything after the first dot of the last token is the extension.
        'state': elements[-1].split('.')[0],
    }

def add_columns_to_df(df, items):
    """Attach one column per entry of ``items`` to ``df`` and return ``df``.

    The frame is modified in place; the returned object is the same frame
    that was passed in.

    Args:
        df: Frame that receives the new columns.
        items: Mapping of column name to the value assigned to that column.

    Returns:
        The frame passed in as ``df``.
    """
    for column_name in items:
        df[column_name] = items[column_name]
    return df

In [None]:
# Find every shapefile below the configured base path (recursive glob).
paths = iter_dirs(c.BASE_PATH, '**/*.shp')

# Development shortcut, disabled for full runs: only process the first few files.
#items = 2
#print(f"WARN UNSET THIS FOR PRODUCTION. currently only {items} are processed")
#paths = itertools.islice(paths, items)

# One frame per shapefile, each tagged with the attributes parsed from its file name.
tmp_appended_data = []
for path in tqdm(paths):
    filename_attributes = parse_attributes_from_path(path)
    df = add_columns_to_df(gp.read_file(path), filename_attributes)
    tmp_appended_data.append(df)

In [None]:
# Stack the per-file frames row-wise and replace the index with a fresh 0..n-1 range.
df = pd.concat(tmp_appended_data, axis=0).reset_index(drop=True)
display(df.head())
df.shape

Convert all shapes to multipolygons

In [None]:
def convert_polygon_to_multipolygon(raw_geometry):
    """Return ``raw_geometry`` as a ``MultiPolygon``.

    A ``Polygon`` is wrapped in a single-member ``MultiPolygon``; every other
    value is handed back unchanged.
    """
    # The data currently contains only MULTIPOLYGON and POLYGON geometries,
    # so anything that is not a Polygon can simply be passed through.
    if not isinstance(raw_geometry, Polygon):
        return raw_geometry
    return MultiPolygon([raw_geometry])


# Normalise the geometry column so that every row holds the same geometry type.
df.geometry = df.geometry.apply(convert_polygon_to_multipolygon)
df.head()

## Write output files
GeoPackage is a nice standard and a bit more structured than CSV. CSV, however, is more versatile: because it carries neither a spatial index nor type information, polygons and multipolygons can be put into the same column. That is why the geometries had to be converted to a single type in the step above.

The coordinate system is `EPSG:31287` (see http://spatialreference.org/ref/epsg/mgi-austria-lambert-2/). It can be stored in the geodatabase for easier downstream processing.

In [None]:
# Tag the frame with its coordinate reference system (EPSG:31287) so that it
# is stored alongside the geometries when the frame is written out.
# NOTE(review): newer pyproj/geopandas releases deprecate the {'init': ...}
# form -- confirm against the installed versions before changing it.
df.crs = {'init': 'epsg:31287'}

In [None]:
# Alternative export, kept for reference: the gzip-compressed CSV is a bit
# smaller in file size but has no spatial index, which makes it less optimal.
#df.to_csv(c.BASE_PATH + 'noise_pollution.gzip.csv', index=False, compression='gzip')

# Build the target with pathlib instead of string concatenation so the file
# lands inside BASE_PATH whether or not that setting ends with a path separator
# (and whether it is a str or a Path). str() keeps the argument a plain string.
df.to_file(str(Path(c.BASE_PATH) / 'noise_pollution.gpkg'), driver="GPKG")