In [10]:
import gc
import os
import time
import logging
import pandas as pd
from sqlalchemy import create_engine
from datetime import datetime, timedelta
from trino.dbapi import connect
from pathlib import Path

In [11]:
# SQL engine class
class TrinoEngine():
    def __init__(self):
        conn = connect(
            host="localhost",
            port=9090,
            catalog="cuebiq"
        )
        self.cur = conn.cursor()
        self.engine = create_engine("trino://localhost:9090/cuebiq/")
    
    def execute_statement(self, query:str) -> list:
        """
        Create and drop statements.
        """
        self.cur.execute(query)
        return self.cur.fetchall()
    
    def read_sql(self, query:str) -> pd.DataFrame: 
        """
        Select and insert into operations.
        """
        return pd.read_sql(query, self.engine)

    def read_sql_chunked(self, query: str, chunksize: int = 10000):
        return pd.read_sql(query, self.engine, chunksize=chunksize)

sql_engine = TrinoEngine()

In [12]:
# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Define the input parameters
country_code = 'CO'
# country_code = 'ID'
# country_code = 'IN'
# country_code = 'MX'

# Define the input schema and table name
schema_name = {'cda': 'cuebiq.paas_cda_pe_v3'}
pe_dl_table = f"{schema_name['cda']}.device_location_uplevelled"

In [13]:
pathFolder = "/home/jovyan/Data/3h_pd5/".format(country_code)
Path(pathFolder).mkdir(parents=True, exist_ok=True)

# January to March

In [14]:
query = f"""
SELECT 
    COUNT(*) AS no_of_points,
    COUNT(DISTINCT(cuebiq_id)) AS no_of_unique_users,
    grt,
    local_date,
    geohash_5
FROM
    (SELECT 
        cuebiq_id,
        event_zoned_datetime,
        geohash_encode(lat, lng, 5) AS geohash_5, 
        extract(HOUR FROM date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))/3  grt, 
        DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS local_date
    FROM {pe_dl_table}
    WHERE 
        country_code = '{country_code}' 
        AND event_zoned_datetime IS NOT NULL
        AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) IS NOT NULL
        AND DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) BETWEEN date_parse('20200101', '%Y%m%d') AND date_parse('20200331', '%Y%m%d')
        AND lat <> 0
        AND lng <> 0
    )
GROUP BY grt, local_date, geohash_5
"""

pe_dl = sql_engine.read_sql(query)

# Convert 'local_date' to 'yyyymmdd' format
pe_dl['local_date'] = pd.to_datetime(pe_dl['local_date']).dt.strftime('%Y%m%d').astype(int)

# pe_dl

fileName = "{}{}13_2020_3h_agg5.csv".format(pathFolder,country_code)
pe_dl.to_csv(fileName,  sep=',', index=False)

In [15]:
query = f"""
SELECT 
    COUNT(*) AS no_of_points,
    COUNT(DISTINCT(cuebiq_id)) AS no_of_unique_users,
    grt,
    local_date,
    geohash_5
FROM
    (SELECT 
        cuebiq_id,
        event_zoned_datetime,
        geohash_encode(lat, lng, 5) AS geohash_5, 
        extract(HOUR FROM date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))/3  grt, 
        DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS local_date
    FROM {pe_dl_table}
    WHERE 
        country_code = '{country_code}' 
        AND event_zoned_datetime IS NOT NULL
        AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) IS NOT NULL
        AND DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) BETWEEN date_parse('20200401', '%Y%m%d') AND date_parse('20200630', '%Y%m%d')
        AND lat <> 0
        AND lng <> 0
    )
GROUP BY grt, local_date, geohash_5
"""

pe_dl = sql_engine.read_sql(query)

# Convert 'local_date' to 'yyyymmdd' format
pe_dl['local_date'] = pd.to_datetime(pe_dl['local_date']).dt.strftime('%Y%m%d').astype(int)

# pe_dl

fileName = "{}{}46_2020_3h_agg5.csv".format(pathFolder,country_code)
pe_dl.to_csv(fileName,  sep=',', index=False)

In [16]:
query = f"""
SELECT 
    COUNT(*) AS no_of_points,
    COUNT(DISTINCT(cuebiq_id)) AS no_of_unique_users,
    grt,
    local_date,
    geohash_5
FROM
    (SELECT 
        cuebiq_id,
        event_zoned_datetime,
        geohash_encode(lat, lng, 5) AS geohash_5, 
        extract(HOUR FROM date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))/3  grt, 
        DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS local_date
    FROM {pe_dl_table}
    WHERE 
        country_code = '{country_code}' 
        AND event_zoned_datetime IS NOT NULL
        AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) IS NOT NULL
        AND DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) BETWEEN date_parse('20200701', '%Y%m%d') AND date_parse('20201231', '%Y%m%d')
        AND lat <> 0
        AND lng <> 0
    )
GROUP BY grt, local_date, geohash_5
"""

pe_dl = sql_engine.read_sql(query)

# Convert 'local_date' to 'yyyymmdd' format
pe_dl['local_date'] = pd.to_datetime(pe_dl['local_date']).dt.strftime('%Y%m%d').astype(int)

# pe_dl

fileName = "{}{}712_2020_3h_agg5.csv".format(pathFolder,country_code)
pe_dl.to_csv(fileName,  sep=',', index=False)

In [None]:
query = f"""
SELECT 
    COUNT(*) AS no_of_points,
    COUNT(DISTINCT(cuebiq_id)) AS no_of_unique_users,
    grt,
    local_date,
    geohash_5
FROM
    (SELECT 
        cuebiq_id,
        event_zoned_datetime,
        geohash_encode(lat, lng, 5) AS geohash_5, 
        extract(HOUR FROM date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))/3  grt, 
        DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS local_date
    FROM {pe_dl_table}
    WHERE 
        country_code = '{country_code}' 
        AND event_zoned_datetime IS NOT NULL
        AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) IS NOT NULL
        AND DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) BETWEEN date_parse('20190801', '%Y%m%d') AND date_parse('20190831', '%Y%m%d')
        AND lat <> 0
        AND lng <> 0
    )
GROUP BY grt, local_date, geohash_5
"""

pe_dl = sql_engine.read_sql(query)

# Convert 'local_date' to 'yyyymmdd' format
pe_dl['local_date'] = pd.to_datetime(pe_dl['local_date']).dt.strftime('%Y%m%d').astype(int)

# pe_dl

fileName = "{}{}8_2019_3h_agg5.csv".format(pathFolder,country_code)
pe_dl.to_csv(fileName,  sep=',', index=False)

In [None]:
query = f"""
SELECT 
    COUNT(*) AS no_of_points,
    COUNT(DISTINCT(cuebiq_id)) AS no_of_unique_users,
    grt,
    local_date,
    geohash_5
FROM
    (SELECT 
        cuebiq_id,
        event_zoned_datetime,
        geohash_encode(lat, lng, 5) AS geohash_5, 
        extract(HOUR FROM date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))/3  grt, 
        DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS local_date
    FROM {pe_dl_table}
    WHERE 
        country_code = '{country_code}' 
        AND event_zoned_datetime IS NOT NULL
        AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) IS NOT NULL
        AND DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) BETWEEN date_parse('20190901', '%Y%m%d') AND date_parse('20190930', '%Y%m%d')
        AND lat <> 0
        AND lng <> 0
    )
GROUP BY grt, local_date, geohash_5
"""

pe_dl = sql_engine.read_sql(query)

# Convert 'local_date' to 'yyyymmdd' format
pe_dl['local_date'] = pd.to_datetime(pe_dl['local_date']).dt.strftime('%Y%m%d').astype(int)

# pe_dl

fileName = "{}{}9_2019_3h_agg5.csv".format(pathFolder,country_code)
pe_dl.to_csv(fileName,  sep=',', index=False)

# Check

In [None]:
pip install seaborn

In [2]:
import glob
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
path_pattern = '/home/jovyan/Data/3h_pd5/MX*_2019_3h_agg5.csv'
file_list = glob.glob(path_pattern)
file_sub_list = [path for path in file_list]
file_sub_list

['/home/jovyan/Data/3h_pd5/MX46_2019_3h_agg5.csv',
 '/home/jovyan/Data/3h_pd5/MX13_2019_3h_agg5.csv',
 '/home/jovyan/Data/3h_pd5/MX1012_2019_3h_agg5.csv',
 '/home/jovyan/Data/3h_pd5/MX79_2019_3h_agg5.csv']

In [4]:
dataframes = []
for file in file_list:
    df = pd.read_csv(file)
    dataframes.append(df)
combined_df = pd.concat(dataframes, ignore_index=True)
combined_df

Unnamed: 0,no_of_points,no_of_unique_users,grt,local_date,geohash_5
0,6531,806,6,20190410,9mubu
1,45,5,4,20190503,9eyb4
2,25670,3866,2,20190503,9g3qw
3,6409,893,3,20190503,9gbnn
4,9095,904,5,20190409,9mvct
...,...,...,...,...,...
45577299,1,1,2,20190711,9t7xg
45577300,1,1,7,20190710,9ss9d
45577301,1,1,3,20190709,9te1h
45577302,2,1,5,20190708,9u94u


## Check Duplicates

In [5]:
combined_df['geohash_5'] = combined_df['geohash_5'].astype(str)
combined_df['no_of_points'] = combined_df['no_of_points'].astype(int)
combined_df['no_of_unique_users'] = combined_df['no_of_unique_users'].astype(int)
combined_df['local_date'] = combined_df['local_date'].astype(int)

combined_df = combined_df.drop_duplicates()
combined_df

Unnamed: 0,no_of_points,no_of_unique_users,grt,local_date,geohash_5
0,6531,806,6,20190410,9mubu
1,45,5,4,20190503,9eyb4
2,25670,3866,2,20190503,9g3qw
3,6409,893,3,20190503,9gbnn
4,9095,904,5,20190409,9mvct
...,...,...,...,...,...
45577299,1,1,2,20190711,9t7xg
45577300,1,1,7,20190710,9ss9d
45577301,1,1,3,20190709,9te1h
45577302,2,1,5,20190708,9u94u


## Check missing date 

In [6]:
unique_values_count = combined_df["local_date"].unique()
all_dates = pd.date_range(start='2019-01-01', end='2019-12-31').strftime('%Y%m%d').astype(int)
all_dates = np.array(all_dates)
missing_dates = np.setdiff1d(all_dates, unique_values_count)

missing_dates

array([], dtype=int64)

## Filter 

In [7]:
combined_df_10 = combined_df[combined_df['no_of_unique_users'] >= 10]
combined_df_10

Unnamed: 0,no_of_points,no_of_unique_users,grt,local_date,geohash_5
0,6531,806,6,20190410,9mubu
2,25670,3866,2,20190503,9g3qw
3,6409,893,3,20190503,9gbnn
4,9095,904,5,20190409,9mvct
5,241,46,5,20190503,9u2pe
...,...,...,...,...,...
45427994,27,10,3,20190925,9mv1n
45437107,88,10,4,20190817,d59ju
45463380,69,10,7,20190706,9mx3g
45521494,113,11,7,20190707,9mu8w


## Check with figure

## Export 

In [8]:
# Function to calculate local_time based on grt
def calculate_local_time(local_date, grt):
    start_hour = grt * 3
    end_hour = start_hour + 3
    return f"{str(local_date)} {str(start_hour).zfill(2)}:00:00 - {str(end_hour).zfill(2)}:00:00"

In [9]:
combined_df_10.loc[:, 'local_time'] = combined_df_10.apply(lambda row: calculate_local_time(row['local_date'], row['grt']), axis=1)
combined_df_10 = combined_df_10.drop(columns=['grt'])

combined_df_10

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined_df_10.loc[:, 'local_time'] = combined_df_10.apply(lambda row: calculate_local_time(row['local_date'], row['grt']), axis=1)


Unnamed: 0,no_of_points,no_of_unique_users,local_date,geohash_5,local_time
0,6531,806,20190410,9mubu,20190410 18:00:00 - 21:00:00
2,25670,3866,20190503,9g3qw,20190503 06:00:00 - 09:00:00
3,6409,893,20190503,9gbnn,20190503 09:00:00 - 12:00:00
4,9095,904,20190409,9mvct,20190409 15:00:00 - 18:00:00
5,241,46,20190503,9u2pe,20190503 15:00:00 - 18:00:00
...,...,...,...,...,...
45427994,27,10,20190925,9mv1n,20190925 09:00:00 - 12:00:00
45437107,88,10,20190817,d59ju,20190817 12:00:00 - 15:00:00
45463380,69,10,20190706,9mx3g,20190706 21:00:00 - 24:00:00
45521494,113,11,20190707,9mu8w,20190707 21:00:00 - 24:00:00


In [10]:
# Save the combined DataFrame to a new CSV file
folder_path = '/home/jovyan/Data/3h_pd5/Clean/'
combined_df_10.to_csv(folder_path + 'pd_3h_gh5_mx_2019.csv', index=False)

In [None]:
# Construct the SQL query
query = f"""
SELECT 
    COUNT(*) AS no_of_points,
    COUNT(DISTINCT(cuebiq_id)) AS no_of_unique_users,
    grt,
    day,
    month,
    geohash5
FROM
(SELECT 
    cuebiq_id,
    event_zoned_datetime,
    extract(HOUR FROM date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))  hour,
    extract(DAY FROM date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))  day,
    extract(MONTH FROM date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))  month,
    geohash_encode(lat, lng, 5) AS geohash5, 
    extract(HOUR FROM date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))/3  grt
FROM {pe_dl_table}
WHERE 
    country_code = '{country_code}' 
    AND event_zoned_datetime IS NOT NULL
    AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) IS NOT NULL
    AND DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) BETWEEN date_parse('20201101', '%Y%m%d') AND date_parse('20201130', '%Y%m%d') 
)
GROUP BY grt, day, month, geohash5
"""

pe_dl_table_gen = sql_engine.read_sql(query)
fileName = "{}{}_daily_agg5_hour_nov.csv".format(pathFolder,country_code)
pe_dl_table_gen.sort_values(['grt','day'], inplace=True)
pe_dl_table_gen.to_csv(fileName,  sep=',', index=False)
kanonimization (fileName,popDensity_threshold)


In [None]:
# Construct the SQL query
query = f"""
SELECT 
    COUNT(*) AS no_of_points,
    COUNT(DISTINCT(cuebiq_id)) AS no_of_unique_users,
    grt,
    day,
    month,
    geohash5
FROM
(SELECT 
    cuebiq_id,
    event_zoned_datetime,
    extract(HOUR FROM date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))  hour,
    extract(DAY FROM date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))  day,
    extract(MONTH FROM date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))  month,
    geohash_encode(lat, lng, 5) AS geohash5, 
    extract(HOUR FROM date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))/3  grt
FROM {pe_dl_table}
WHERE 
    country_code = '{country_code}' 
    AND event_zoned_datetime IS NOT NULL
    AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) IS NOT NULL
    AND DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) BETWEEN date_parse('20201201', '%Y%m%d') AND date_parse('20201231', '%Y%m%d') 
)
GROUP BY grt, day, month, geohash5
"""

pe_dl_table_gen = sql_engine.read_sql(query)
fileName = "{}{}_daily_agg5_hour_dec.csv".format(pathFolder,country_code)
pe_dl_table_gen.sort_values(['grt','day'], inplace=True)
pe_dl_table_gen.to_csv(fileName,  sep=',', index=False)
kanonimization (fileName,popDensity_threshold)

