Download device-location (DL) data

In [1]:
import gc
import os
import time
import pandas as pd
from sqlalchemy import create_engine
from datetime import datetime, timedelta

In [2]:
# SQL engine
from trino.dbapi import connect 
from sqlalchemy import create_engine
import pandas as pd
import time

class TrinoEngine():
    """Small helper bundling a Trino DB-API cursor with a SQLAlchemy engine.

    The cursor is used for statements executed directly through the DB-API,
    the engine is handed to pandas for queries that should come back as
    DataFrames.

    Parameters
    ----------
    host, port, catalog:
        Connection settings. The defaults reproduce the values that were
        previously hard-coded (``localhost:9090``, catalog ``cuebiq``).
    """

    def __init__(self, host: str = "localhost", port: int = 9090, catalog: str = "cuebiq"):
        # Keep the connection on the instance so it stays reachable
        # (for example to close it explicitly) instead of dropping the handle.
        self.conn = connect(
            host=host,
            port=port,
            catalog=catalog
        )
        self.cur = self.conn.cursor()
        self.engine = create_engine(f"trino://{host}:{port}/{catalog}/")

    def execute_statement(self, query: str) -> list:
        """
        Create and drop statements.

        Executes ``query`` on the DB-API cursor and returns every fetched row.
        """
        self.cur.execute(query)
        return self.cur.fetchall()

    def read_sql(self, query: str) -> pd.DataFrame:
        """
        Select and insert into operations.

        Runs ``query`` through pandas and returns the full result as a single
        DataFrame.
        """
        return pd.read_sql(query, self.engine)

    def read_sql_chunked(self, query: str, chunksize: int = 10000):
        """
        Run ``query`` through pandas and return an iterator of DataFrames,
        each holding at most ``chunksize`` rows.
        """
        return pd.read_sql(query, self.engine, chunksize=chunksize)

# Module-level engine instance; constructing it creates the Trino connection,
# its cursor and the SQLAlchemy engine. A later cell assigns `sql_engine` again.
sql_engine = TrinoEngine()

In [3]:
# Catalog-qualified schema names, keyed by a short alias.
schema_name = {'cda': 'cuebiq.paas_cda_pe_v3'}

# Fully qualified table names built from the 'cda' schema.
# Commented-out entries are tables that are currently not used.

# dl_table = f"{schema_name['cda']}.device_location"
pe_dl_table = f"{schema_name['cda']}.device_location_uplevelled"

tj_table = f"{schema_name['cda']}.trajectory"
pe_tj_table = f"{schema_name['cda']}.trajectory_uplevelled"

# stop_table = f"{schema_name['cda']}.stop"
pe_stop_table = f"{schema_name['cda']}.stop_uplevelled"

# The identifier must not carry trailing whitespace: it is interpolated into
# SQL text and may be compared or concatenated with other name parts.
visit_table = f"{schema_name['cda']}.visit"

In [4]:
class TrinoEngine:
    """Small helper bundling a Trino DB-API cursor with a SQLAlchemy engine.

    The cursor serves statements executed directly through the DB-API; the
    engine is passed to pandas for queries returned as DataFrames.

    Parameters
    ----------
    host, port, catalog:
        Connection settings. The defaults reproduce the values that were
        previously hard-coded (``localhost:9090``, catalog ``cuebiq``).
    """

    def __init__(self, host: str = "localhost", port: int = 9090, catalog: str = "cuebiq"):
        self.conn = connect(
            host=host,
            port=port,
            catalog=catalog
        )
        self.cur = self.conn.cursor()
        self.engine = create_engine(f"trino://{host}:{port}/{catalog}/")

    def execute_statement(self, query: str) -> list:
        """Execute ``query`` on the DB-API cursor and return every fetched row."""
        self.cur.execute(query)
        return self.cur.fetchall()

    def read_sql(self, query: str) -> pd.DataFrame:
        """Run ``query`` through pandas and return the full result as one DataFrame.

        Kept alongside ``read_sql_chunked`` so that code written against the
        non-chunked reader keeps working with this class.
        """
        return pd.read_sql(query, self.engine)

    def read_sql_chunked(self, query: str, chunksize: int = 10000):
        """Run ``query`` through pandas and return an iterator of DataFrames.

        Each yielded DataFrame holds at most ``chunksize`` rows.
        """
        return pd.read_sql(query, self.engine, chunksize=chunksize)

# Bind `sql_engine` to an instance of the TrinoEngine class defined in this
# cell; this is the object whose `read_sql_chunked` the download loop calls.
sql_engine = TrinoEngine()


In [5]:
# --- Run parameters ---
countries = ['MX']
start_date = 20190129  # first processing_date to download (YYYYMMDD)
end_date = 20190131    # last processing_date to download, inclusive (YYYYMMDD)

# Folder that receives one sub-folder per country code.
output_root = '/home/jovyan/Data/DL'

# Convert integer dates to datetime objects
start_date_dt = datetime.strptime(str(start_date), '%Y%m%d')
end_date_dt = datetime.strptime(str(end_date), '%Y%m%d')


def _build_dl_query(table, country_code, processing_day, first_day):
    """Return the SQL text selecting one processing date of one country.

    Parameters
    ----------
    table : str
        Fully qualified name of the table to read from.
    country_code : str
        Value matched against the ``country_code`` column.
    processing_day : datetime
        Day matched against ``processing_date`` (formatted as YYYYMMDD).
    first_day : datetime
        Lower bound for the parsed local event timestamp.

    The parsed local event timestamp must lie between ``first_day`` and
    midnight of the day after ``processing_day`` (both bounds inclusive).
    """
    processing_key = processing_day.strftime('%Y%m%d')
    next_day = (processing_day + timedelta(days=1)).strftime('%Y-%m-%d')
    # Trino expression parsing the first 19 characters of the zoned timestamp
    # (the part before any fractional seconds / offset suffix).
    parsed_ts = "date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')"
    # NOTE(review): the offset is ADDED to the local timestamp to obtain
    # event_datetime_utc. That is only UTC if timezoneoffset_secs is stored as
    # (UTC - local); with the usual (local - UTC) convention the sign would
    # have to be negative. Left unchanged - confirm against the table docs.
    return f"""
            SELECT 
                cuebiq_id, 
                event_zoned_datetime, 
                processing_date,
                timezoneoffset_secs,
                lat,
                lng, 
                TRY({parsed_ts} +
                interval '1' second * timezoneoffset_secs) AS event_datetime_utc
            FROM {table}
            WHERE 
                processing_date = {processing_key} 
                AND country_code = '{country_code}' 
                AND event_zoned_datetime IS NOT NULL
                AND TRY({parsed_ts}) IS NOT NULL
                AND TRY({parsed_ts}) >= date_parse('{first_day.strftime('%Y%m%d')}', '%Y%m%d')
                AND TRY({parsed_ts}) <= date_parse('{next_day}', '%Y-%m-%d')
        """


def _append_daily_csvs(chunk, root, country_code, first_day, processing_day):
    """Split one result chunk by local event date and append each part to its CSV.

    Rows whose local event date falls outside ``[first_day, processing_day]``
    are discarded. Every remaining date goes to
    ``{root}/{country_code}/{YYYYMMDD}_{country_code}_pe_dl.csv``; the header
    is written only when the file does not exist yet.

    Duplicates are removed within each (chunk, date) group only.
    NOTE: files are opened in append mode, so running the download again for
    the same dates appends the same rows a second time.
    """
    # The first 10 characters of the zoned timestamp are the local date
    # (YYYY-MM-DD); slice them in one vectorized step instead of row by row.
    chunk = chunk.assign(
        event_zoned_date=pd.to_datetime(chunk['event_zoned_datetime'].str[:10])
    )
    in_window = chunk['event_zoned_date'].between(first_day, processing_day)

    for event_day, day_df in chunk[in_window].groupby('event_zoned_date'):
        day_df = day_df.drop_duplicates()
        day_key = event_day.strftime('%Y%m%d')
        file_path = f'{root}/{country_code}/{day_key}_{country_code}_pe_dl.csv'
        # Append mode creates the file when missing, so a single call covers
        # both the "new file" and the "existing file" case.
        day_df.to_csv(
            file_path,
            mode='a',
            header=not os.path.exists(file_path),
            index=False,
        )


# Start timing
start_time = time.time()

for country_code in countries:
    print(f"Processing country: {country_code}")
    # Make sure the country folder exists before any chunk is written.
    os.makedirs(os.path.join(output_root, country_code), exist_ok=True)

    current_date = start_date_dt
    while current_date <= end_date_dt:
        formatted_date = current_date.strftime('%Y%m%d')
        print(f"Processing date: {formatted_date} for country: {country_code}")

        query = _build_dl_query(pe_dl_table, country_code, current_date, start_date_dt)
        for chunk in sql_engine.read_sql_chunked(query):
            _append_daily_csvs(chunk, output_root, country_code, start_date_dt, current_date)

        current_date += timedelta(days=1)

    print(f"Finished processing country: {country_code}")

# End timing
end_time = time.time()

# Calculate and print the total time taken
total_time = end_time - start_time
print(f"Total time taken: {total_time} seconds")


# Processing date: 20190114 for country: IN
# Processing date: 20190430 for country: ID


Processing country: MX
Processing date: 20190129 for country: MX
Processing date: 20190130 for country: MX
Processing date: 20190131 for country: MX
Finished processing country: MX
Total time taken: 1595.3800954818726 seconds
