# BCN open data SQL

In [1]:
import requests
import urllib.parse
import pandas as pd
from database import database
from pyproj import Transformer, CRS
import json
from tqdm.notebook import tqdm

url_base = 'https://opendata-ajuntament.barcelona.cat/data/api/action/datastore_search_sql?sql='
# Write SQL queries against the generic placeholder "table"; it is swapped for the
# real table id when the request URL is built, which keeps the queries readable.

def convert_sql_query_to_bcnod_url (base, query, table_id):
    """Build the request URL for an SQL query written against the "table" placeholder.

    Args:
        base: URL prefix ending in ``sql=``, to which the encoded query is appended.
        query: SQL text that uses the standalone word ``table`` as a placeholder.
        table_id: Identifier substituted for every standalone ``table`` word.

    Returns:
        ``base`` followed by the percent-encoded query.
    """
    import re

    # Match whole words only, so identifiers that merely contain "table"
    # (e.g. "timetable") are left intact. The callable replacement inserts
    # table_id literally, without interpreting backslashes as group references.
    q = re.sub(r'\btable\b', lambda _match: table_id, query)
    return base + urllib.parse.quote(q)

In [2]:
# Identifier substituted for the "table" placeholder in the SQL query below.
table_id = '31431b23-d5b9-42b8-bcd0-a84da9d8c7fa'
sql_query = """
SELECT * from "table" 
"""

api_url = convert_sql_query_to_bcnod_url (url_base, sql_query, table_id)
# A timeout keeps the kernel from hanging on a stalled connection, and
# raise_for_status() reports HTTP errors directly instead of letting them
# surface later as a JSON decoding error or a KeyError.
http_response = requests.get(api_url, timeout=30)
http_response.raise_for_status()
response = http_response.json()
df = pd.DataFrame(response['result']['records'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 871 entries, 0 to 870
Data columns (total 33 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   addresses_roadtype_name        871 non-null    object
 1   addresses_end_street_number    49 non-null     object
 2   institution_name               871 non-null    object
 3   values_attribute_name          871 non-null    object
 4   addresses_road_name            871 non-null    object
 5   values_category                871 non-null    object
 6   addresses_zip_code             871 non-null    object
 7   values_value                   871 non-null    object
 8   addresses_town                 871 non-null    object
 9   geo_epgs_4326_y                871 non-null    object
 10  geo_epgs_4326_x                871 non-null    object
 11  addresses_district_name        871 non-null    object
 12  geo_epgs_25831_x               871 non-null    object
 13  addre

In [4]:
# Length of the longest entry in the 'name' column
# (presumably used to size the NAME VARCHAR column — confirm).
max(len(name) for name in df['name'])

81

# Data cleaning and formatting

In [15]:
#df = df.dropna()

# Keep only the columns that are needed downstream. Selecting with a boolean mask
# (instead of dropping the columns one by one, or indexing with the list) does the
# work in a single pass, preserves the original column order, and tolerates a
# listed column being absent from the frame.
columns_to_keep = ['geo_epgs_25831_y', 'geo_epgs_25831_x', 'name', 'secondary_filters_name']
df = df.loc[:, df.columns.isin(columns_to_keep)].copy()

# Convert column values to numeric if possible, otherwise make sure they are stored as strings.
# Only the kept columns are converted; converting columns that are discarded anyway is wasted work.
columns_to_num = df.columns.tolist()
for col in columns_to_num:
    as_text = df[col].astype(str)
    try:
        df[col] = pd.to_numeric(as_text)
    except ValueError:
        # At least one value is not a number: store the whole column as text.
        df[col] = as_text
df.info()

# Add local coordinates (currently disabled)
#coord_transformer = Transformer.from_crs("WGS84", "EPSG:25831") # Coordinate transformer
#df['EPSG_25831_X'] = coord_transformer.transform(df['LATITUD'].to_list(),df['LONGITUD'].to_list())[0]
#df['EPSG_25831_Y'] = coord_transformer.transform(df['LATITUD'].to_list(),df['LONGITUD'].to_list())[1]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 141 entries, 0 to 140
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   secondary_filters_name  141 non-null    object 
 1   geo_epgs_25831_x        141 non-null    float64
 2   name                    141 non-null    object 
 3   geo_epgs_25831_y        141 non-null    float64
dtypes: float64(2), object(2)
memory usage: 4.5+ KB


# Insert into our database

In [16]:
# Load the connection settings. The context manager closes the file as soon as
# it has been parsed instead of leaving the handle open for the kernel's lifetime.
with open('config_file.json') as f:
    config = json.load(f)
table = "medical_centers"
my_db = database(config['host_name'], config['user_name'], config['password'], config['db'])

# Create table
# NOTE(review): EPSG_25831_X / EPSG_25831_Y are declared INT while the DataFrame
# holds these coordinates as float64, so the fractional part cannot be stored —
# confirm that this precision is acceptable.
# NOTE(review): plain CREATE TABLE (no IF NOT EXISTS); re-running this cell against
# a database that already contains the table presumably fails — confirm intended.
my_db.execute_query(""" CREATE TABLE {table}(
id INT AUTO_INCREMENT,
NAME VARCHAR(85) NOT NULL,
TYPE VARCHAR(60) NOT NULL,
EPSG_25831_X INT NOT NULL,
EPSG_25831_Y INT NOT NULL,
PRIMARY KEY (id)
)
""".format(table = table))


MySQL Server connection successful
MySQL Database connection test successful
Query successful


In [17]:
# Insert all data
# Explicit DataFrame-column -> SQL-column mapping, so the insert no longer depends
# on the column order the DataFrame happens to have.
column_map = {
    'secondary_filters_name': 'TYPE',
    'geo_epgs_25831_x': 'EPSG_25831_X',
    'name': 'NAME',
    'geo_epgs_25831_y': 'EPSG_25831_Y',
}
col_names = list(column_map.values())
columns = ", ".join(col_names)


def _sql_literal(value):
    """Render one cell value as an SQL literal for the VALUES list.

    Strings are single-quoted with backslashes and single quotes doubled,
    missing values (None / NaN) become NULL, anything else is passed through str().
    """
    if isinstance(value, str):
        # Backslash is an escape character in MySQL string literals by default,
        # so it has to be doubled as well as the quote character.
        escaped = value.replace("\\", "\\\\").replace("'", "''")
        return "'{s}'".format(s=escaped)
    if pd.isna(value):
        return "NULL"
    return str(value)


# NOTE(review): the values come from an external API and are still interpolated
# into the SQL text; switch to a parameterized query if the `database` wrapper
# supports one.
rows_to_insert = df[list(column_map)]
value_groups = []
for record in tqdm(rows_to_insert.itertuples(index=False, name=None), total=len(rows_to_insert)):
    value_groups.append("(" + ", ".join(_sql_literal(s) for s in record) + ")")

# Insert all rows at once; skip the statement entirely when there is nothing to
# insert, because an INSERT with an empty VALUES list is not valid SQL.
if value_groups:
    query = 'INSERT INTO {table} ({columns}) VALUES '.format(columns = columns, table = table)
    query = query + ", ".join(value_groups) + ';'
    my_db.execute_query(query)

  0%|          | 0/141 [00:00<?, ?it/s]

Query successful
