# BCN open data SQL

In [1]:
import requests
import urllib.parse
import pandas as pd
from database import database
from pyproj import Transformer, CRS
import json
from tqdm.notebook import tqdm

# Endpoint of the Barcelona open-data datastore SQL API; the URL-encoded SQL
# statement is appended after "sql=".
url_base = 'https://opendata-ajuntament.barcelona.cat/data/api/action/datastore_search_sql?sql='
# Use the generic placeholder "table" when writing the SQL query so it stays
# readable; the real resource id is substituted just before the request.

def convert_sql_query_to_bcnod_url (base, query, table_id):
    """Build the request URL for a SQL query against one datastore table.

    Parameters
    ----------
    base : str
        API endpoint ending in ``sql=``.
    query : str
        SQL text that refers to the target table with the placeholder word
        ``table`` (quoted or not).
    table_id : str
        Resource id that replaces the placeholder.

    Returns
    -------
    str
        ``base`` followed by the percent-encoded query.
    """
    import re

    # Replace only the stand-alone word "table"; a plain substring replace
    # would also corrupt identifiers such as "table_name" or "timetable".
    # A callable replacement keeps any backslashes in table_id literal.
    q = re.sub(r'\btable\b', lambda _match: table_id, query)
    q = urllib.parse.quote(q)
    return base + q

In [37]:
# Resource id of the open-data table queried in this notebook.
table_id = 'f1d9d5aa-61d7-460e-b423-1bbfff96fab3'
sql_query = """
SELECT * from "table" 
"""

api_url = convert_sql_query_to_bcnod_url (url_base, sql_query, table_id)

# Fail fast on network problems or HTTP errors instead of hanging forever or
# hitting a confusing KeyError/JSON error further down.
http_response = requests.get(api_url, timeout=30)
http_response.raise_for_status()
response = http_response.json()

# The returned rows are read from result -> records in the JSON payload.
df = pd.DataFrame(response['result']['records'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1068 entries, 0 to 1067
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   HOMES          1068 non-null   object
 1   EDAT_0_A_14    1068 non-null   object
 2   EDAT_65_A_MES  1068 non-null   object
 3   DONES          1068 non-null   object
 4   EDAT_25_A_64   1068 non-null   object
 5   COMUNITARIS    1068 non-null   object
 6   _full_text     1068 non-null   object
 7   NACIONALS      1068 non-null   object
 8   DATA_DADES     1068 non-null   object
 9   EDAT_15_A_24   1068 non-null   object
 10  SECCIO_CENSAL  1068 non-null   object
 11  _id            1068 non-null   int64 
 12  ESTRANGERS     1068 non-null   object
dtypes: int64(1), object(12)
memory usage: 108.6+ KB


# Data cleaning and formatting

In [38]:
# Work on an explicit copy so the column assignments below never write into a
# view of the raw frame (avoids SettingWithCopyWarning).
df = df.dropna().copy()

# SECCIO_CENSAL packs district and censal section into one code in this table:
# the last three characters are the section, the two before them the district.
df['CENSAL_SECTION'] = df['SECCIO_CENSAL'].str[-3:]
df['DISTRICT'] = df['SECCIO_CENSAL'].str[-5:-3]

# Drop the columns that are not stored in the database. They are selected by
# name: positional indices would silently remove the wrong columns if the API
# ever returned the fields in a different order.
columns_to_delete = ['_full_text', 'DATA_DADES', 'SECCIO_CENSAL', '_id']
df = df.drop(columns = columns_to_delete)

# Convert column values to numeric if possible, otherwise make sure they are
# stored as strings.
columns_to_num = df.columns.tolist()
for col in columns_to_num:
    as_text = df[col].astype(str)
    try:
        df[col] = pd.to_numeric(as_text)
    except ValueError:
        # Not a numeric column: keep the string representation.
        df[col] = as_text

df.info()

# Add local coordinates (disabled: this table has no LATITUD/LONGITUD columns)
#coord_transformer = Transformer.from_crs("WGS84", "EPSG:25831") # Coordinate transformer
#df['EPSG_25831_X'] = coord_transformer.transform(df['LATITUD'].to_list(),df['LONGITUD'].to_list())[0]
#df['EPSG_25831_Y'] = coord_transformer.transform(df['LATITUD'].to_list(),df['LONGITUD'].to_list())[1]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1068 entries, 0 to 1067
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   HOMES           1068 non-null   int64
 1   EDAT_0_A_14     1068 non-null   int64
 2   EDAT_65_A_MES   1068 non-null   int64
 3   DONES           1068 non-null   int64
 4   EDAT_25_A_64    1068 non-null   int64
 5   COMUNITARIS     1068 non-null   int64
 6   NACIONALS       1068 non-null   int64
 7   EDAT_15_A_24    1068 non-null   int64
 8   ESTRANGERS      1068 non-null   int64
 9   CENSAL_SECTION  1068 non-null   int64
 10  DISTRICT        1068 non-null   int64
dtypes: int64(11)
memory usage: 91.9 KB


# Insert into our database

In [40]:
# Read the connection settings; the context manager closes the file handle
# even if parsing the JSON fails.
with open('config_file.json') as f:
    config = json.load(f)
table = "demographics"
my_db = database(config['host_name'], config['user_name'], config['password'], config['db'])

# Create the destination table: one row per censal section with the
# population broken down by sex, age band and nationality group.
my_db.execute_query(""" CREATE TABLE {table}(
id INT AUTO_INCREMENT,
DISTRICT INT NOT NULL,
CENSAL_SECTION INT NOT NULL,
MEN INT NOT NULL,
WOMEN INT NOT NULL,
AGED_0_TO_14 INT NOT NULL,
AGED_15_TO_24 INT NOT NULL,
AGED_25_TO_64 INT NOT NULL,
AGED_ABOVE_65 INT NOT NULL,
NATIONALS INT NOT NULL,
EU INT NOT NULL,
NON_EU INT NOT NULL,
PRIMARY KEY (id)
)
""".format(table = table))
# Coordinate columns currently left out of the schema:
#LATITUDE DECIMAL(8,6) NOT NULL,
#LONGITUDE DECIMAL(8,6) NOT NULL,
#EPSG_25831_X INT NOT NULL,
#EPSG_25831_Y INT NOT NULL,

MySQL Server connection successful
MySQL Database connection test successful
Query successful


In [41]:
# Insert all data
# Explicit mapping from the source (Catalan) column names to the database
# columns. Selecting the frame's columns by name keeps every value in the
# right database column regardless of the column order inside df.
source_to_db_column = {
    'HOMES': 'MEN',
    'EDAT_0_A_14': 'AGED_0_TO_14',
    'EDAT_65_A_MES': 'AGED_ABOVE_65',
    'DONES': 'WOMEN',
    'EDAT_25_A_64': 'AGED_25_TO_64',
    'COMUNITARIS': 'EU',
    'NACIONALS': 'NATIONALS',
    'EDAT_15_A_24': 'AGED_15_TO_24',
    'ESTRANGERS': 'NON_EU',
    'CENSAL_SECTION': 'CENSAL_SECTION',
    'DISTRICT': 'DISTRICT',
}
col_names = list(source_to_db_column.values())
columns = ", ".join(col_names)


def _sql_literal(value):
    """Render one cell as a SQL literal: strings are quoted with embedded
    single quotes doubled, anything else is written via str()."""
    if isinstance(value, str):
        return "'{s}'".format(s=value.replace("'", "''"))
    return str(value)


# NOTE(review): the statement is still assembled as text; if the database
# helper supports parameterized queries, prefer those over manual quoting.
rows_to_insert = df[list(source_to_db_column)]
value_groups = [
    "(" + ", ".join(_sql_literal(value) for value in record) + ")"
    for record in tqdm(rows_to_insert.itertuples(index=False, name=None),
                       total=len(rows_to_insert))
]

if value_groups:
    # Join once instead of growing the string row by row, then insert all
    # rows with a single statement.
    query = 'INSERT INTO {table} ({columns}) VALUES '.format(columns = columns, table = table)
    query = query + ", ".join(value_groups) + ';'
    my_db.execute_query(query)
else:
    # An INSERT without any value group is invalid SQL, so skip it.
    print('No rows to insert')

  0%|          | 0/1068 [00:00<?, ?it/s]

Query successful
