# BCN open data SQL

In [1]:
import requests
import urllib.parse
import pandas as pd
from database import database
from pyproj import Transformer, CRS
import json
from tqdm.notebook import tqdm

url_base = 'https://opendata-ajuntament.barcelona.cat/data/api/action/datastore_search_sql?sql='
# Write the SQL query against the generic placeholder "table"; it is replaced with the real resource id when the request URL is built.

def convert_sql_query_to_bcnod_url (base, query, table_id):
    """Build the request URL for a SQL query written against the "table" placeholder.

    Args:
        base: URL prefix the encoded query is appended to.
        query: SQL text that uses the word ``table`` as a stand-in for the
            resource id.
        table_id: Identifier substituted for every stand-alone ``table``.

    Returns:
        ``base`` followed by the percent-encoded query.
    """
    import re  # local import so this cell does not depend on a new top-level import

    # Match "table" only as a whole word, so identifiers that merely contain
    # it (e.g. "tablet", "my_table") are left untouched. The callable
    # replacement inserts table_id verbatim, without interpreting
    # backslashes as regex group references.
    q = re.sub(r'\btable\b', lambda _match: table_id, query)
    q = urllib.parse.quote(q)
    return base + q

In [17]:
# Download every record of the selected resource into a DataFrame.
table_id = 'e07dec0d-4aeb-40f3-b987-e1f35e088ce2'
sql_query = """
SELECT * from "table" 
"""

api_url = convert_sql_query_to_bcnod_url (url_base, sql_query, table_id)
# A timeout keeps the cell from hanging forever on an unresponsive server,
# and raise_for_status() turns an HTTP error into an explicit exception
# instead of a confusing JSON/KeyError failure further down.
http_response = requests.get(api_url, timeout=30)
http_response.raise_for_status()
# `response` still holds the decoded JSON payload, as before.
response = http_response.json()
df = pd.DataFrame(response['result']['records'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 684 entries, 0 to 683
Data columns (total 18 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   LATITUD         684 non-null    object
 1   CODI_CAPA       684 non-null    object
 2   _id             684 non-null    int64 
 3   DISTRICTE       488 non-null    object
 4   ETRS89_COORD_X  684 non-null    object
 5   ETRS89_COORD_Y  684 non-null    object
 6   NOM_DISTRICTE   684 non-null    object
 7   _full_text      684 non-null    object
 8   TELEFON         684 non-null    object
 9   EQUIPAMENT      684 non-null    object
 10  NOM_BARRI       684 non-null    object
 11  ADRECA          684 non-null    object
 12  NOM_CAPA        684 non-null    object
 13  BARRI           488 non-null    object
 14  ED50_COORD_Y    684 non-null    object
 15  ED50_COORD_X    684 non-null    object
 16  CAPA_GENERICA   684 non-null    object
 17  LONGITUD        684 non-null    object
dtypes: int64(1

# Data cleaning and formatting

In [18]:
# Clean the downloaded records: keep only the columns needed downstream,
# convert them to numeric where possible, and replace latitude/longitude
# with EPSG:25831 coordinates.
# NOTE: this cell overwrites `df` and drops LATITUD/LONGITUD at the end, so
# the data-loading cell must be re-run before running this cell again.

# Keep only the columns used below. Selecting them in one step replaces the
# per-column drop loop; .copy() gives an independent frame so the
# assignments below do not write into a view of the raw data.
columns_to_keep = ['LATITUD', 'LONGITUD', 'NOM_CAPA', 'EQUIPAMENT']
df = df[columns_to_keep].copy()

# Convert column values to numeric if possible, otherwise make sure they are
# stored as strings.
columns_to_num = df.columns.tolist()
for col in columns_to_num:
    as_text = df[col].astype(str)
    try:
        df[col] = pd.to_numeric(as_text)
    except ValueError:
        # At least one value is not a number: keep the column as text.
        df[col] = as_text

# Add local coordinates
coord_transformer = Transformer.from_crs("WGS84", "EPSG:25831") # Coordinate transformer
# Transform once and unpack both axes instead of running the same
# transformation twice. Latitude is passed first, as in the original call.
epsg_25831_x, epsg_25831_y = coord_transformer.transform(df['LATITUD'].to_list(), df['LONGITUD'].to_list())
df['EPSG_25831_X'] = epsg_25831_x
df['EPSG_25831_Y'] = epsg_25831_y

# Fix the final column order explicitly: the insert step maps DataFrame
# columns to table columns by position, so the order must not depend on the
# order in which the API happened to return the fields.
df = df[['EQUIPAMENT', 'NOM_CAPA', 'EPSG_25831_X', 'EPSG_25831_Y']]
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 684 entries, 0 to 683
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   EQUIPAMENT    684 non-null    object 
 1   NOM_CAPA      684 non-null    object 
 2   EPSG_25831_X  684 non-null    float64
 3   EPSG_25831_Y  684 non-null    float64
dtypes: float64(2), object(2)
memory usage: 21.5+ KB


# Insert into our database

In [19]:
# Read the connection settings. The context manager closes the file as soon
# as the JSON has been parsed instead of leaving the handle open.
with open('config_file.json') as f:
    config = json.load(f)
table = "public_transport"
my_db = database(config['host_name'], config['user_name'], config['password'], config['db'])

# Create table. IF NOT EXISTS lets the notebook be re-run without the
# statement failing because the table was already created by an earlier run.
# NOTE(review): the coordinate columns are INT while the DataFrame holds
# floats; presumably the fractional part is not needed - confirm.
my_db.execute_query(""" CREATE TABLE IF NOT EXISTS {table}(
id INT AUTO_INCREMENT,
NAME VARCHAR(65) NOT NULL,
TYPE VARCHAR(35) NOT NULL,
EPSG_25831_X INT NOT NULL,
EPSG_25831_Y INT NOT NULL,
PRIMARY KEY (id)
)
""".format(table = table))


MySQL Server connection successful
MySQL Database connection test successful
Query successful


In [20]:
#Insert all data
col_names = [ 'NAME', 'TYPE', 'EPSG_25831_X', 'EPSG_25831_Y']
columns = ", ".join(col_names)

query = 'INSERT INTO {table} ({columns}) VALUES '.format(columns = columns, table = table)
for row in tqdm(df.index):
    vals = ", ".join("'{s}'".format(s=s.replace("'", "''")) if isinstance(s, str) else str(s) for s in df.loc[row])
    query_line = "(" + vals + ") , "
    query = query + query_line
    
query = query[:-2] + ';'
#Insert all rows at once
my_db.execute_query(query)

  0%|          | 0/684 [00:00<?, ?it/s]

Query successful
