In [1]:
import gc
import os
import time
import logging
import pandas as pd
from sqlalchemy import create_engine
from datetime import datetime, timedelta
from trino.dbapi import connect
from pathlib import Path

In [2]:
# SQL engine class
class TrinoEngine():
    """
    Convenience wrapper bundling a Trino DB-API cursor with a SQLAlchemy engine.

    The cursor (``self.cur``) serves statements run through
    ``execute_statement``; the SQLAlchemy engine (``self.engine``) is the
    connection pandas reads through in ``read_sql`` / ``read_sql_chunked``.
    """

    def __init__(self, host: str = "localhost", port: int = 9090, catalog: str = "cuebiq"):
        """
        Open the DB-API connection and build the SQLAlchemy engine.

        The defaults reproduce the endpoint that used to be hard-coded, so
        ``TrinoEngine()`` connects exactly as before.

        Parameters
        ----------
        host : str
            Server host name.
        port : int
            Server port.
        catalog : str
            Catalog used for both the cursor and the engine URL.
        """
        # Keep the connection on the instance so the caller is able to close it
        # explicitly; previously only the cursor was retained.
        self.conn = connect(
            host=host,
            port=port,
            catalog=catalog
        )
        self.cur = self.conn.cursor()
        self.engine = create_engine(f"trino://{host}:{port}/{catalog}/")
    
    def execute_statement(self, query:str) -> list:
        """
        Create and drop statements.

        Runs ``query`` on the DB-API cursor and returns every row the cursor
        yields via ``fetchall()``.
        """
        self.cur.execute(query)
        return self.cur.fetchall()
    
    def read_sql(self, query:str) -> pd.DataFrame: 
        """
        Select and insert into operations.

        Runs ``query`` through the SQLAlchemy engine and returns the complete
        result as a single DataFrame.
        """
        return pd.read_sql(query, self.engine)

    def read_sql_chunked(self, query: str, chunksize: int = 10000):
        """
        Same as ``read_sql`` but passes ``chunksize`` to ``pd.read_sql``, which
        then returns an iterator of DataFrames with at most ``chunksize`` rows each.
        """
        return pd.read_sql(query, self.engine, chunksize=chunksize)

# Module-level engine instance; the monthly query cells call sql_engine.read_sql(...)
sql_engine = TrinoEngine()

In [3]:
# Logging: INFO level on the root logger, timestamp - level - message layout
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

# SQLAlchemy engine pointing at the output schema
output_schema_name = 'pop_density'
con = create_engine("trino://localhost:9090/dedicated/" + output_schema_name)

# Wall-clock timestamp captured when this cell runs
start_time = time.time()

# Input parameters: the country to process (alternatives left commented out)
country_code = 'CO'
# country_code = 'MX'
# country_code = 'ID'
# country_code = 'IN'
#start_date = 20190101
#end_date = 20190131

# Fully qualified name of the input table, built from the schema lookup
schema_name = dict(cda='cuebiq.paas_cda_pe_v3')
pe_dl_table = "{}.device_location_uplevelled".format(schema_name['cda'])

# Convert integer dates to datetime objects
#start_date_dt = datetime.strptime(str(start_date), '%Y%m%d')
#end_date_dt = datetime.strptime(str(end_date), '%Y%m%d')


In [4]:
#current_date = start_date_dt
# Minimum value of no_of_unique_users a row needs to be kept by kanonimization()
popDensity_threshold = 10
# Output directory for the CSV exports. The literal contains no replacement field,
# so calling .format(country_code) on it had no effect; the plain literal yields the
# same path. The country code is already part of every exported file name.
pathFolder = "/home/jovyan/Data/daily_pd5/"
# Create the directory (and any missing parents); do nothing if it already exists
Path(pathFolder).mkdir(parents=True, exist_ok=True)


#formatted_current_date = current_date.strftime('%Y%m%d')



In [12]:
def kanonimization(fileName, popDensity_threshold):
    """
    Write a thresholded copy of an aggregated CSV.

    Reads the CSV at ``fileName``, keeps only the rows whose
    ``no_of_unique_users`` is at least ``popDensity_threshold``, and writes the
    columns geohash_5, no_of_points, no_of_unique_users, grt, day, month
    (in that order, without the index) to ``fileName + '_kanonnimized.csv'``.
    The row count is printed before and after filtering. Returns None.
    """
    aggregated = pd.read_csv(fileName)
    print("Before k-anonimity {}".format(len(aggregated)))

    # Boolean mask of the rows that meet the minimum-users requirement
    enough_users = aggregated['no_of_unique_users'] >= popDensity_threshold
    released = aggregated.loc[enough_users]
    print("After k-anonimity {}".format(len(released)))

    export_columns = [
        'geohash_5',
        'no_of_points',
        'no_of_unique_users',
        'grt',
        'day',
        'month',
    ]
    target = fileName + '_kanonnimized.csv'
    released[export_columns].to_csv(target, index=False)


# Convert the generator to a DataFrame

# January

In [None]:
# January 2019 export for `country_code`.
# Inner SELECT: for rows of `pe_dl_table` whose first 19 timestamp characters parse,
# derive day, month, geohash_5 (geohash_encode(lat, lng, 5)) and grt (= hour / 3).
# Outer SELECT: count rows and distinct cuebiq_id per (grt, day, month, geohash_5).
# NOTE(review): grt presumably relies on integer division (3-hour buckets) - confirm.
query = f"""
SELECT
    COUNT(cuebiq_id)  no_of_points,
    COUNT(DISTINCT(cuebiq_id))  no_of_unique_users,
    grt,
    day,
    month,
    geohash_5 
FROM
(SELECT 
    cuebiq_id,
    event_zoned_datetime,
    extract(HOUR FROM date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))  hour,
    extract(DAY FROM date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))  day,
    extract(MONTH FROM date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))  month,
    geohash_encode(lat, lng, 5) AS geohash_5, 
    extract(HOUR FROM date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))/3  grt
FROM {pe_dl_table}
WHERE 
    country_code = '{country_code}' 
    AND event_zoned_datetime IS NOT NULL
    AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) IS NOT NULL
    AND DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) BETWEEN date_parse('20190101', '%Y%m%d') AND date_parse('20190131', '%Y%m%d')   
)
GROUP BY grt, day, month, geohash_5
"""

# Run the query, order rows by grt then day, write the full aggregate to CSV,
# then write the thresholded copy via kanonimization().
pe_dl_table_gen = sql_engine.read_sql(query)
fileName = f"{pathFolder}{country_code}_daily_agg5_hour_jan.csv"
pe_dl_table_gen = pe_dl_table_gen.sort_values(by=['grt', 'day'])
pe_dl_table_gen.to_csv(fileName, index=False, sep=',')
kanonimization(fileName, popDensity_threshold)

In [None]:
# Re-runs the threshold filter on whatever `fileName` currently holds (it is assigned
# by the monthly export cells), rewriting that file's '_kanonnimized.csv' copy.
kanonimization (fileName,popDensity_threshold)

# February

In [11]:
# February 2020 export for `country_code`.
# NOTE(review): the sibling month cells query 2019; this one queries 2020-02-01..2020-02-29
# and the output file name carries `_2020`, so it looks intentional - confirm.
# Inner SELECT: for rows of `pe_dl_table` whose first 19 timestamp characters parse,
# derive day, month, geohash_5 (geohash_encode(lat, lng, 5)) and grt (= hour / 3).
# Outer SELECT: count rows and distinct cuebiq_id per (grt, day, month, geohash_5).
query = f"""
SELECT
    COUNT(cuebiq_id)  no_of_points,
    COUNT(DISTINCT(cuebiq_id))  no_of_unique_users,
    grt,
    day,
    month,
    geohash_5 
FROM
(SELECT 
    cuebiq_id,
    event_zoned_datetime,
    extract(HOUR FROM date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))  hour,
    extract(DAY FROM date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))  day,
    extract(MONTH FROM date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))  month,
    geohash_encode(lat, lng, 5) AS geohash_5, 
    extract(HOUR FROM date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))/3  grt
FROM {pe_dl_table}
WHERE 
    country_code = '{country_code}' 
    AND event_zoned_datetime IS NOT NULL
    AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) IS NOT NULL
    AND DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) BETWEEN date_parse('20200201', '%Y%m%d') AND date_parse('20200229', '%Y%m%d') 
)
GROUP BY grt, day, month, geohash_5
"""

# Run the query, order rows by grt then day, write the full aggregate to CSV,
# then write the thresholded copy via kanonimization().
pe_dl_table_gen = sql_engine.read_sql(query)
fileName = f"{pathFolder}{country_code}_daily_agg5_hour_feb_2020.csv"
pe_dl_table_gen = pe_dl_table_gen.sort_values(by=['grt', 'day'])
pe_dl_table_gen.to_csv(fileName, index=False, sep=',')
kanonimization(fileName, popDensity_threshold)

Before k-anonimity 1971767
After k-anonimity 204845


# March

In [None]:
# March 2019 export for `country_code`.
# Inner SELECT: for rows of `pe_dl_table` whose first 19 timestamp characters parse,
# derive day, month, geohash_5 (geohash_encode(lat, lng, 5)) and grt (= hour / 3).
# Outer SELECT: count rows and distinct cuebiq_id per (grt, day, month, geohash_5).
# NOTE(review): grt presumably relies on integer division (3-hour buckets) - confirm.
query = f"""
SELECT
    COUNT(cuebiq_id)  no_of_points,
    COUNT(DISTINCT(cuebiq_id))  no_of_unique_users,
    grt,
    day,
    month,
    geohash_5 
FROM
(SELECT 
    cuebiq_id,
    event_zoned_datetime,
    extract(HOUR FROM date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))  hour,
    extract(DAY FROM date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))  day,
    extract(MONTH FROM date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))  month,
    geohash_encode(lat, lng, 5) AS geohash_5, 
    extract(HOUR FROM date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))/3  grt
FROM {pe_dl_table}
WHERE 
    country_code = '{country_code}' 
    AND event_zoned_datetime IS NOT NULL
    AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) IS NOT NULL
    AND DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) BETWEEN date_parse('20190301', '%Y%m%d') AND date_parse('20190331', '%Y%m%d') 
)
GROUP BY grt, day, month, geohash_5
"""

# Run the query, order rows by grt then day, write the full aggregate to CSV,
# then write the thresholded copy via kanonimization().
pe_dl_table_gen = sql_engine.read_sql(query)
fileName = f"{pathFolder}{country_code}_daily_agg5_hour_mar.csv"
pe_dl_table_gen = pe_dl_table_gen.sort_values(by=['grt', 'day'])
pe_dl_table_gen.to_csv(fileName, index=False, sep=',')
kanonimization(fileName, popDensity_threshold)

# April

In [None]:
# April 2019 export for `country_code`.
# Inner SELECT: for rows of `pe_dl_table` whose first 19 timestamp characters parse,
# derive day, month, geohash_5 (geohash_encode(lat, lng, 5)) and grt (= hour / 3).
# Outer SELECT: count rows and distinct cuebiq_id per (grt, day, month, geohash_5).
# NOTE(review): grt presumably relies on integer division (3-hour buckets) - confirm.
query = f"""
SELECT
    COUNT(cuebiq_id)  no_of_points,
    COUNT(DISTINCT(cuebiq_id))  no_of_unique_users,
    grt,
    day,
    month,
    geohash_5 
FROM
(SELECT 
    cuebiq_id,
    event_zoned_datetime,
    extract(HOUR FROM date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))  hour,
    extract(DAY FROM date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))  day,
    extract(MONTH FROM date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))  month,
    geohash_encode(lat, lng, 5) AS geohash_5, 
    extract(HOUR FROM date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))/3  grt
FROM {pe_dl_table}
WHERE 
    country_code = '{country_code}' 
    AND event_zoned_datetime IS NOT NULL
    AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) IS NOT NULL
    AND DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) BETWEEN date_parse('20190401', '%Y%m%d') AND date_parse('20190430', '%Y%m%d') 
)
GROUP BY grt, day, month, geohash_5
"""

# Run the query, order rows by grt then day, write the full aggregate to CSV,
# then write the thresholded copy via kanonimization().
pe_dl_table_gen = sql_engine.read_sql(query)
fileName = f"{pathFolder}{country_code}_daily_agg5_hour_apr.csv"
pe_dl_table_gen = pe_dl_table_gen.sort_values(by=['grt', 'day'])
pe_dl_table_gen.to_csv(fileName, index=False, sep=',')
kanonimization(fileName, popDensity_threshold)


# May

In [None]:
# May 2019 export for `country_code`.
# Inner SELECT: for rows of `pe_dl_table` whose first 19 timestamp characters parse,
# derive day, month, geohash_5 (geohash_encode(lat, lng, 5)) and grt (= hour / 3).
# Outer SELECT: count rows and distinct cuebiq_id per (grt, day, month, geohash_5).
# NOTE(review): grt presumably relies on integer division (3-hour buckets) - confirm.
query = f"""
SELECT
    COUNT(cuebiq_id)  no_of_points,
    COUNT(DISTINCT(cuebiq_id))  no_of_unique_users,
    grt,
    day,
    month,
    geohash_5 
FROM
(SELECT 
    cuebiq_id,
    event_zoned_datetime,
    extract(HOUR FROM date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))  hour,
    extract(DAY FROM date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))  day,
    extract(MONTH FROM date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))  month,
    geohash_encode(lat, lng, 5) AS geohash_5, 
    extract(HOUR FROM date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))/3  grt
FROM {pe_dl_table}
WHERE 
    country_code = '{country_code}' 
    AND event_zoned_datetime IS NOT NULL
    AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) IS NOT NULL
    AND DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) BETWEEN date_parse('20190501', '%Y%m%d') AND date_parse('20190531', '%Y%m%d') 
)
GROUP BY grt, day, month, geohash_5
"""

# Run the query, order rows by grt then day, write the full aggregate to CSV,
# then write the thresholded copy via kanonimization().
pe_dl_table_gen = sql_engine.read_sql(query)
fileName = f"{pathFolder}{country_code}_daily_agg5_hour_may.csv"
pe_dl_table_gen = pe_dl_table_gen.sort_values(by=['grt', 'day'])
pe_dl_table_gen.to_csv(fileName, index=False, sep=',')
kanonimization(fileName, popDensity_threshold)


# June

In [None]:
# June 2019 export for `country_code`.
# Inner SELECT: for rows of `pe_dl_table` whose first 19 timestamp characters parse,
# derive day, month, geohash_5 (geohash_encode(lat, lng, 5)) and grt (= hour / 3).
# Outer SELECT: count rows and distinct cuebiq_id per (grt, day, month, geohash_5).
# NOTE(review): grt presumably relies on integer division (3-hour buckets) - confirm.
query = f"""
SELECT
    COUNT(cuebiq_id)  no_of_points,
    COUNT(DISTINCT(cuebiq_id))  no_of_unique_users,
    grt,
    day,
    month,
    geohash_5 
FROM
(SELECT 
    cuebiq_id,
    event_zoned_datetime,
    extract(HOUR FROM date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))  hour,
    extract(DAY FROM date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))  day,
    extract(MONTH FROM date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))  month,
    geohash_encode(lat, lng, 5) AS geohash_5, 
    extract(HOUR FROM date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))/3  grt
FROM {pe_dl_table}
WHERE 
    country_code = '{country_code}' 
    AND event_zoned_datetime IS NOT NULL
    AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) IS NOT NULL
    AND DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) BETWEEN date_parse('20190601', '%Y%m%d') AND date_parse('20190630', '%Y%m%d') 
)
GROUP BY grt, day, month, geohash_5
"""

# Run the query, order rows by grt then day, write the full aggregate to CSV,
# then write the thresholded copy via kanonimization().
pe_dl_table_gen = sql_engine.read_sql(query)
fileName = f"{pathFolder}{country_code}_daily_agg5_hour_jun.csv"
pe_dl_table_gen = pe_dl_table_gen.sort_values(by=['grt', 'day'])
pe_dl_table_gen.to_csv(fileName, index=False, sep=',')
kanonimization(fileName, popDensity_threshold)



# July

In [None]:
# July 2019 export for `country_code`.
# Inner SELECT: for rows of `pe_dl_table` whose first 19 timestamp characters parse,
# derive day, month, geohash_5 (geohash_encode(lat, lng, 5)) and grt (= hour / 3).
# Outer SELECT: count rows and distinct cuebiq_id per (grt, day, month, geohash_5).
# NOTE(review): grt presumably relies on integer division (3-hour buckets) - confirm.
query = f"""
SELECT
    COUNT(cuebiq_id)  no_of_points,
    COUNT(DISTINCT(cuebiq_id))  no_of_unique_users,
    grt,
    day,
    month,
    geohash_5 
FROM
(SELECT 
    cuebiq_id,
    event_zoned_datetime,
    extract(HOUR FROM date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))  hour,
    extract(DAY FROM date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))  day,
    extract(MONTH FROM date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))  month,
    geohash_encode(lat, lng, 5) AS geohash_5, 
    extract(HOUR FROM date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))/3  grt
FROM {pe_dl_table}
WHERE 
    country_code = '{country_code}' 
    AND event_zoned_datetime IS NOT NULL
    AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) IS NOT NULL
    AND DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) BETWEEN date_parse('20190701', '%Y%m%d') AND date_parse('20190731', '%Y%m%d') 
)
GROUP BY grt, day, month, geohash_5
"""

# Run the query, order rows by grt then day, write the full aggregate to CSV,
# then write the thresholded copy via kanonimization().
pe_dl_table_gen = sql_engine.read_sql(query)
fileName = f"{pathFolder}{country_code}_daily_agg5_hour_jul.csv"
pe_dl_table_gen = pe_dl_table_gen.sort_values(by=['grt', 'day'])
pe_dl_table_gen.to_csv(fileName, index=False, sep=',')
kanonimization(fileName, popDensity_threshold)


# August

In [None]:
# August 2019 export for `country_code`.
# Inner SELECT: for rows of `pe_dl_table` whose first 19 timestamp characters parse,
# derive day, month, geohash_5 (geohash_encode(lat, lng, 5)) and grt (= hour / 3).
# Outer SELECT: count rows and distinct cuebiq_id per (grt, day, month, geohash_5).
# NOTE(review): grt presumably relies on integer division (3-hour buckets) - confirm.
query = f"""
SELECT
    COUNT(cuebiq_id)  no_of_points,
    COUNT(DISTINCT(cuebiq_id))  no_of_unique_users,
    grt,
    day,
    month,
    geohash_5 
FROM
(SELECT 
    cuebiq_id,
    event_zoned_datetime,
    extract(HOUR FROM date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))  hour,
    extract(DAY FROM date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))  day,
    extract(MONTH FROM date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))  month,
    geohash_encode(lat, lng, 5) AS geohash_5, 
    extract(HOUR FROM date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))/3  grt
FROM {pe_dl_table}
WHERE 
    country_code = '{country_code}' 
    AND event_zoned_datetime IS NOT NULL
    AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) IS NOT NULL
    AND DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) BETWEEN date_parse('20190801', '%Y%m%d') AND date_parse('20190831', '%Y%m%d') 
)
GROUP BY grt, day, month, geohash_5
"""

# Run the query, order rows by grt then day, write the full aggregate to CSV,
# then write the thresholded copy via kanonimization().
pe_dl_table_gen = sql_engine.read_sql(query)
fileName = f"{pathFolder}{country_code}_daily_agg5_hour_aug.csv"
pe_dl_table_gen = pe_dl_table_gen.sort_values(by=['grt', 'day'])
pe_dl_table_gen.to_csv(fileName, index=False, sep=',')
kanonimization(fileName, popDensity_threshold)



# September

In [None]:
# September 2019 export for `country_code`.
# Inner SELECT: for rows of `pe_dl_table` whose first 19 timestamp characters parse,
# derive day, month, geohash_5 (geohash_encode(lat, lng, 5)) and grt (= hour / 3).
# Outer SELECT: count rows and distinct cuebiq_id per (grt, day, month, geohash_5).
# NOTE(review): grt presumably relies on integer division (3-hour buckets) - confirm.
query = f"""
SELECT
    COUNT(cuebiq_id)  no_of_points,
    COUNT(DISTINCT(cuebiq_id))  no_of_unique_users,
    grt,
    day,
    month,
    geohash_5 
FROM
(SELECT 
    cuebiq_id,
    event_zoned_datetime,
    extract(HOUR FROM date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))  hour,
    extract(DAY FROM date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))  day,
    extract(MONTH FROM date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))  month,
    geohash_encode(lat, lng, 5) AS geohash_5, 
    extract(HOUR FROM date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))/3  grt
FROM {pe_dl_table}
WHERE 
    country_code = '{country_code}' 
    AND event_zoned_datetime IS NOT NULL
    AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) IS NOT NULL
    AND DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) BETWEEN date_parse('20190901', '%Y%m%d') AND date_parse('20190930', '%Y%m%d') 
)
GROUP BY grt, day, month, geohash_5
"""

# Run the query, order rows by grt then day, write the full aggregate to CSV,
# then write the thresholded copy via kanonimization().
# NOTE(review): the file suffix is `set`, not `sep`; left unchanged because the
# file name is an output other tooling may depend on - confirm before renaming.
pe_dl_table_gen = sql_engine.read_sql(query)
fileName = f"{pathFolder}{country_code}_daily_agg5_hour_set.csv"
pe_dl_table_gen = pe_dl_table_gen.sort_values(by=['grt', 'day'])
pe_dl_table_gen.to_csv(fileName, index=False, sep=',')
kanonimization(fileName, popDensity_threshold)



# October

In [None]:
# October 2019 export for `country_code`.
# Inner SELECT: for rows of `pe_dl_table` whose first 19 timestamp characters parse,
# derive day, month, geohash_5 (geohash_encode(lat, lng, 5)) and grt (= hour / 3).
# Outer SELECT: count rows and distinct cuebiq_id per (grt, day, month, geohash_5).
# NOTE(review): grt presumably relies on integer division (3-hour buckets) - confirm.
query = f"""
SELECT
    COUNT(cuebiq_id)  no_of_points,
    COUNT(DISTINCT(cuebiq_id))  no_of_unique_users,
    grt,
    day,
    month,
    geohash_5 
FROM
(SELECT 
    cuebiq_id,
    event_zoned_datetime,
    extract(HOUR FROM date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))  hour,
    extract(DAY FROM date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))  day,
    extract(MONTH FROM date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))  month,
    geohash_encode(lat, lng, 5) AS geohash_5, 
    extract(HOUR FROM date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))/3  grt
FROM {pe_dl_table}
WHERE 
    country_code = '{country_code}' 
    AND event_zoned_datetime IS NOT NULL
    AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) IS NOT NULL
    AND DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) BETWEEN date_parse('20191001', '%Y%m%d') AND date_parse('20191031', '%Y%m%d') 
)
GROUP BY grt, day, month, geohash_5
"""

# Run the query, order rows by grt then day, write the full aggregate to CSV,
# then write the thresholded copy via kanonimization().
pe_dl_table_gen = sql_engine.read_sql(query)
fileName = f"{pathFolder}{country_code}_daily_agg5_hour_oct.csv"
pe_dl_table_gen = pe_dl_table_gen.sort_values(by=['grt', 'day'])
pe_dl_table_gen.to_csv(fileName, index=False, sep=',')
kanonimization(fileName, popDensity_threshold)


# November

In [7]:
# November 2019 export for `country_code`.
# Inner SELECT: for rows of `pe_dl_table` whose first 19 timestamp characters parse,
# derive day, month, geohash_5 (geohash_encode(lat, lng, 5)) and grt (= hour / 3).
# Rows with lat = 0 or lng = 0 are excluded here; the January-October cells carry
# no such filter (NOTE(review): confirm which behaviour is intended).
# Outer SELECT: count rows and distinct cuebiq_id per (grt, day, month, geohash_5).
query = f"""
SELECT
    COUNT(cuebiq_id)  no_of_points,
    COUNT(DISTINCT(cuebiq_id))  no_of_unique_users,
    grt,
    day,
    month,
    geohash_5 
FROM
(SELECT 
    cuebiq_id,
    event_zoned_datetime,
    extract(HOUR FROM date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))  hour,
    extract(DAY FROM date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))  day,
    extract(MONTH FROM date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))  month,
    geohash_encode(lat, lng, 5) AS geohash_5, 
    extract(HOUR FROM date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))/3  grt
FROM {pe_dl_table}
WHERE 
    country_code = '{country_code}' 
    AND event_zoned_datetime IS NOT NULL
    AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) IS NOT NULL
    AND DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) BETWEEN date_parse('20191101', '%Y%m%d') AND date_parse('20191130', '%Y%m%d') 
    AND lat <> 0
    AND lng <> 0
)
GROUP BY grt, day, month, geohash_5
"""

# Run the query, order rows by grt then day, write the full aggregate to CSV,
# then write the thresholded copy via kanonimization().
pe_dl_table_gen = sql_engine.read_sql(query)
fileName = f"{pathFolder}{country_code}_daily_agg5_hour_nov.csv"
pe_dl_table_gen = pe_dl_table_gen.sort_values(by=['grt', 'day'])
pe_dl_table_gen.to_csv(fileName, index=False, sep=',')
kanonimization(fileName, popDensity_threshold)

Before k-anonimity 816586
After k-anonimity 121606


# December

In [13]:
# December 2019 export for `country_code`.
# Inner SELECT: for rows of `pe_dl_table` whose first 19 timestamp characters parse,
# derive day, month, geohash_5 (geohash_encode(lat, lng, 5)) and grt (= hour / 3).
# Rows with lat = 0 or lng = 0 are excluded here; the January-October cells carry
# no such filter (NOTE(review): confirm which behaviour is intended).
# Outer SELECT: count rows and distinct cuebiq_id per (grt, day, month, geohash_5).
query = f"""
SELECT
    COUNT(cuebiq_id)  no_of_points,
    COUNT(DISTINCT(cuebiq_id))  no_of_unique_users,
    grt,
    day,
    month,
    geohash_5 
FROM
(SELECT 
    cuebiq_id,
    event_zoned_datetime,
    extract(HOUR FROM date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))  hour,
    extract(DAY FROM date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))  day,
    extract(MONTH FROM date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))  month,
    geohash_encode(lat, lng, 5) AS geohash_5, 
    extract(HOUR FROM date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))/3  grt
FROM {pe_dl_table}
WHERE 
    country_code = '{country_code}' 
    AND event_zoned_datetime IS NOT NULL
    AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) IS NOT NULL
    AND DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) BETWEEN date_parse('20191201', '%Y%m%d') AND date_parse('20191231', '%Y%m%d') 
    AND lat <> 0
    AND lng <> 0
)
GROUP BY grt, day, month, geohash_5
"""

# Run the query, order rows by grt then day, write the full aggregate to CSV,
# then write the thresholded copy via kanonimization().
pe_dl_table_gen = sql_engine.read_sql(query)
fileName = f"{pathFolder}{country_code}_daily_agg5_hour_dec.csv"
pe_dl_table_gen = pe_dl_table_gen.sort_values(by=['grt', 'day'])
pe_dl_table_gen.to_csv(fileName, index=False, sep=',')
kanonimization(fileName, popDensity_threshold)

Before k-anonimity 747384
After k-anonimity 95979
