# Data preparation

### Example

In [1]:
# Load the first dataframe: the elderly homes published as JSON by the
# Barcelona open-data service
import pandas as pd
url_elderly_homes = (
    "https://www.bcn.cat/tercerlloc/files/serveis-socials/"
    "opendatabcn_serveis-socials_residencies-gent-gran-js.json"
)
elderly_homes = pd.read_json(url_elderly_homes)

In [2]:
# The dataset has many columns and only a few of them are used below,
# so list the column names to see what is available
elderly_homes.columns

Index(['register_id', 'prefix', 'suffix', 'name', 'created', 'modified',
       'status', 'status_name', 'core_type', 'core_type_name', 'body',
       'tickets_data', 'addresses', 'entity_types_data',
       'attribute_categories', 'values', 'from_relationships',
       'to_relationships', 'classifications_data', 'secondary_filters_data',
       'geo_epgs_23031', 'geo_epgs_4326', 'is_section_of_data',
       'sections_data', 'start_date', 'end_date', 'estimated_dates',
       'languages_data', 'type', 'type_name', 'period', 'period_name',
       'event_status_name', 'event_status', 'ical'],
      dtype='object')

In [3]:
# Inspecting the data shows that the "geo_epgs_4326" column stores, for each
# elderly home, a Python dictionary whose "x" entry is the latitude and whose
# "y" entry is the longitude
display(elderly_homes.loc[:, "geo_epgs_4326"])

0       {'x': 41.41119232456979, 'y': 2.156265475296702}
1      {'x': 41.399360209783595, 'y': 2.114771587797944}
2       {'x': 41.39358185075261, 'y': 2.151072435340858}
3      {'x': 41.416439264796225, 'y': 2.181497632059419}
4       {'x': 41.39375515186429, 'y': 2.173575787347303}
                             ...                        
261     {'x': 41.38588813619825, 'y': 2.166545365678093}
262    {'x': 41.404633443795575, 'y': 2.172549877531641}
263     {'x': 41.38120535101579, 'y': 2.134549650370459}
264     {'x': 41.39298027772813, 'y': 2.172542767714582}
265     {'x': 41.37477903173226, 'y': 2.169371824240612}
Name: geo_epgs_4326, Length: 266, dtype: object

In [4]:
# Split the coordinate dictionaries into two plain columns:
# "x" becomes "latitude" and "y" becomes "longitude"
coordinate_dicts = elderly_homes["geo_epgs_4326"]
elderly_homes["latitude"] = [point["x"] for point in coordinate_dicts]
elderly_homes["longitude"] = [point["y"] for point in coordinate_dicts]

In [5]:
# Sanity check: show the original dictionaries next to the two new columns
columns_to_check = ["geo_epgs_4326", "latitude", "longitude"]
display(elderly_homes[columns_to_check])

Unnamed: 0,geo_epgs_4326,latitude,longitude
0,"{'x': 41.41119232456979, 'y': 2.156265475296702}",41.411192,2.156265
1,"{'x': 41.399360209783595, 'y': 2.114771587797944}",41.399360,2.114772
2,"{'x': 41.39358185075261, 'y': 2.151072435340858}",41.393582,2.151072
3,"{'x': 41.416439264796225, 'y': 2.181497632059419}",41.416439,2.181498
4,"{'x': 41.39375515186429, 'y': 2.173575787347303}",41.393755,2.173576
...,...,...,...
261,"{'x': 41.38588813619825, 'y': 2.166545365678093}",41.385888,2.166545
262,"{'x': 41.404633443795575, 'y': 2.172549877531641}",41.404633,2.172550
263,"{'x': 41.38120535101579, 'y': 2.134549650370459}",41.381205,2.134550
264,"{'x': 41.39298027772813, 'y': 2.172542767714582}",41.392980,2.172543


In [6]:
# Keep only the columns that go into the final dataset
cols = ["name", "latitude", "longitude"]
df = elderly_homes.loc[:, cols]
display(df)

Unnamed: 0,name,latitude,longitude
0,Residència Assistencial per a Gent Gran Parc G...,41.411192,2.156265
1,Residència Assistida per a Gent Gran Pedralbes...,41.399360,2.114772
2,Residència Assistida per a Gent Gran Sant Dani...,41.393582,2.151072
3,Residència per a Gent Gran Alchemika,41.416439,2.181498
4,Residència Assistida per a Gent Gran Rosben,41.393755,2.173576
...,...,...,...
261,Residència Geros 4,41.385888,2.166545
262,Residència Assistida ORS,41.404633,2.172550
263,Residència Assistida per a Gent Gran Centre Parc,41.381205,2.134550
264,Residència Com a Casa,41.392980,2.172543


In [7]:
# Tag every row with a short label saying what kind of place it is
source_label = "elderly_homes"
df = df.assign(description=source_label)

In [8]:
# Elderly-homes table with the new "description" column; the other
# facility types are appended to it further down
display(df)

Unnamed: 0,name,latitude,longitude,description
0,Residència Assistencial per a Gent Gran Parc G...,41.411192,2.156265,elderly_homes
1,Residència Assistida per a Gent Gran Pedralbes...,41.399360,2.114772,elderly_homes
2,Residència Assistida per a Gent Gran Sant Dani...,41.393582,2.151072,elderly_homes
3,Residència per a Gent Gran Alchemika,41.416439,2.181498,elderly_homes
4,Residència Assistida per a Gent Gran Rosben,41.393755,2.173576,elderly_homes
...,...,...,...,...
261,Residència Geros 4,41.385888,2.166545,elderly_homes
262,Residència Assistida ORS,41.404633,2.172550,elderly_homes
263,Residència Assistida per a Gent Gran Centre Parc,41.381205,2.134550,elderly_homes
264,Residència Com a Casa,41.392980,2.172543,elderly_homes


### Let's now build the big dataset!

In [9]:
# Remaining open-data sources, keyed by the label that is stored in the
# "description" column for the rows loaded from each URL
urls = {
    "day_centers": "https://www.bcn.cat/tercerlloc/files/serveis-socials/opendatabcn_serveis-socials_centres-dia-gent-gran-js.json",
    "hospitals": "https://www.bcn.cat/tercerlloc/files/sanitat/opendatabcn_sanitat_hospitals-i-centres-atencio-primaria-js.json",
    "pharmacies": "https://www.bcn.cat/tercerlloc/files/sanitat/opendatabcn_sanitat_farmacies-js.json"
}


def load_places(url, description):
    """Load one JSON source and reduce it to the columns of the final dataset.

    Parameters
    ----------
    url : str
        Location of a JSON file readable by ``pd.read_json``. It must provide
        a "name" column and a "geo_epgs_4326" column of dictionaries with
        "x" and "y" entries.
    description : str
        Label written to the "description" column of every returned row.

    Returns
    -------
    pandas.DataFrame
        Columns "name", "latitude" (from "x"), "longitude" (from "y") and
        "description", in that order.
    """
    places = pd.read_json(url)
    coordinates = places["geo_epgs_4326"]
    return places[["name"]].assign(
        latitude=[point["x"] for point in coordinates],
        longitude=[point["y"] for point in coordinates],
        description=description,
    )


# Start from the rows already in df, dropping any that carry one of the
# labels loaded below, so that re-running this cell does not append the same
# sources a second time (on the first run nothing is dropped).
frames = [df[~df["description"].isin(list(urls))]]

for description, url in urls.items():
    dff = load_places(url, description)
    frames.append(dff)

# Concatenate once, outside the loop. ignore_index=True gives the combined
# frame a fresh 0..n-1 index instead of repeating each source's own 0-based
# row labels.
df = pd.concat(frames, ignore_index=True)

In [None]:
# Write the combined dataset to a CSV file in the current directory
output_path = "./dataset.csv"
df.to_csv(output_path)