# BCN open data SQL

In [2]:
import json
import re
import urllib.parse

import pandas as pd
import requests
from pyproj import Transformer, CRS
from tqdm.notebook import tqdm

from database import database

url_base = 'https://opendata-ajuntament.barcelona.cat/data/api/action/datastore_search_sql?sql='
# Write SQL queries against the generic placeholder "table"; it is replaced by the real resource id when the request URL is built.

def convert_sql_query_to_bcnod_url(base, query, table_id):
    """Build the request URL for a SQL query written against the "table" placeholder.

    Parameters
    ----------
    base : str
        URL prefix that the encoded query is appended to.
    query : str
        SQL text that uses the word ``table`` as a stand-in for the resource id.
    table_id : str
        Identifier substituted for the placeholder.

    Returns
    -------
    str
        ``base`` followed by the percent-encoded query.
    """
    # Replace only the whole word "table": a plain substring replace would also
    # rewrite identifiers that merely contain it (e.g. a column named "timetable").
    # A callable replacement keeps any backslash in table_id literal.
    q = re.sub(r'\btable\b', lambda _match: table_id, query)
    return base + urllib.parse.quote(q)

In [9]:
# Resource to query and the SQL to run against it ("table" is the placeholder
# that convert_sql_query_to_bcnod_url swaps for table_id).
table_id = 'b32fa7f6-d464-403b-8a02-0292a64883bf'
sql_query = """
SELECT * from "table" 
"""

api_url = convert_sql_query_to_bcnod_url(url_base, sql_query, table_id)

# A timeout keeps a stalled server from hanging the kernel, and
# raise_for_status() surfaces HTTP errors here instead of as an opaque
# JSON-decoding or KeyError failure a few lines later.
http_response = requests.get(api_url, timeout=30)
http_response.raise_for_status()
response = http_response.json()

# One DataFrame row per record returned by the API.
df = pd.DataFrame(response['result']['records'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 23 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   CODI_DISTRICTE               1500 non-null   object
 1   LONGITUD_X                   1500 non-null   object
 2   DISTRICTE                    1500 non-null   object
 3   ESCALA                       1500 non-null   object
 4   TIPUS_NUM                    1500 non-null   object
 5   PORTAL                       1500 non-null   object
 6   BARRI                        1500 non-null   object
 7   TIPUS_CARRER                 1500 non-null   object
 8   NUM1                         1500 non-null   object
 9   NUM2                         111 non-null    object
 10  NUMERO_REGISTRE_GENERALITAT  1500 non-null   object
 11  PIS                          1500 non-null   object
 12  PORTA                        1500 non-null   object
 13  LLETRA2                      1500

# Data cleaning and formatting

In [10]:
# Convert each column to numeric when every value parses as a number;
# otherwise keep the column as strings.
for col in list(df.columns):
    as_text = df[col].astype(str)
    try:
        df[col] = pd.to_numeric(as_text)
    except ValueError:
        # At least one value is not numeric: store the column as text.
        df[col] = as_text

# Keep only the columns needed downstream (original column order is preserved).
columns_to_keep = ['LATITUD_Y', 'LONGITUD_X', 'NUMERO_PLACES']
df = df.drop(columns=[c for c in df.columns if c not in columns_to_keep])
df.info()

# Add local coordinates: transform once and unpack both output axes,
# instead of running the same transformation twice.
coord_transformer = Transformer.from_crs("WGS84", "EPSG:25831")  # Coordinate transformer
epsg_x, epsg_y = coord_transformer.transform(
    df['LATITUD_Y'].to_list(),
    df['LONGITUD_X'].to_list(),
)
df['EPSG_25831_X'] = epsg_x
df['EPSG_25831_Y'] = epsg_y
df = df.drop(columns=['LATITUD_Y', 'LONGITUD_X'])

df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   LONGITUD_X     1500 non-null   float64
 1   NUMERO_PLACES  1500 non-null   int64  
 2   LATITUD_Y      1500 non-null   float64
dtypes: float64(2), int64(1)
memory usage: 35.3 KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   NUMERO_PLACES  1500 non-null   int64  
 1   EPSG_25831_X   1500 non-null   float64
 2   EPSG_25831_Y   1500 non-null   float64
dtypes: float64(2), int64(1)
memory usage: 35.3 KB


# Insert into our database

In [11]:
# Load the connection settings; the context manager closes the file handle
# as soon as the JSON has been parsed.
with open('config_file.json') as f:
    config = json.load(f)
table = "tourist_housing"
my_db = database(config['host_name'], config['user_name'], config['password'], config['db'])

# Create table
# NOTE(review): the EPSG_25831_* columns are declared INT while the DataFrame
# holds float64 coordinates, so the fractional part will not be preserved -
# confirm this is intended.
my_db.execute_query(""" CREATE TABLE {table}(
id INT AUTO_INCREMENT,
CAPACITY INT NOT NULL,
EPSG_25831_X INT NOT NULL,
EPSG_25831_Y INT NOT NULL,
PRIMARY KEY (id)
)
""".format(table = table))


MySQL Server connection successful
MySQL Database connection test successful
Query successful


In [12]:
# Insert all data with a single multi-row INSERT statement.
col_names = ['CAPACITY', 'EPSG_25831_X', 'EPSG_25831_Y']
columns = ", ".join(col_names)

# Values are written positionally, so the DataFrame must have exactly one
# column per target column.
if len(df.columns) != len(col_names):
    raise ValueError(
        "DataFrame has {n_df} columns but {n_sql} target columns were given".format(
            n_df=len(df.columns), n_sql=len(col_names)
        )
    )


def _sql_literal(value):
    """Render one cell as a SQL literal: quoted and escaped if text, else str()."""
    if isinstance(value, str):
        return "'{s}'".format(s=value.replace("'", "''"))
    return str(value)


# itertuples keeps each column's own dtype (integers are not upcast to float
# the way a mixed-dtype row Series would be), and collecting the rows in a list
# avoids rebuilding an ever-growing string on every iteration.
value_rows = [
    "(" + ", ".join(_sql_literal(v) for v in record) + ")"
    for record in tqdm(df.itertuples(index=False, name=None), total=len(df))
]

if value_rows:
    query = (
        'INSERT INTO {table} ({columns}) VALUES '.format(columns=columns, table=table)
        + ", ".join(value_rows)
        + ';'
    )
    # Insert all rows at once
    my_db.execute_query(query)
else:
    # An INSERT with no value tuples is invalid SQL, so skip the statement.
    print("No rows to insert")

  0%|          | 0/1500 [00:00<?, ?it/s]

Query successful
