In [7]:
import pandas as pd

csv_path = "/Users/darshil/projects/llm-cfg-demo/flights-delay/flights.csv"

# Keep only core analytical columns plus a single date column
columns_to_keep = [
    "AIRLINE",
    "ORIGIN_AIRPORT",
    "DESTINATION_AIRPORT",
    "SCHEDULED_DEPARTURE",
    "SCHEDULED_ARRIVAL",
    "DEPARTURE_DELAY",
    "ARRIVAL_DELAY",
    "DISTANCE",
    "DAY_OF_WEEK",
    "AIR_TIME",
    "MONTH",
    "DAY",
    "ELAPSED_TIME",
    "SCHEDULED_TIME",
]

flights_df = pd.read_csv(csv_path, low_memory=False)

# Create a single date column if not present, prefer YEAR/MONTH/DAY
single_date_column = "FLIGHT_DATE"
if single_date_column not in flights_df.columns:
    if all(col in flights_df.columns for col in ["YEAR", "MONTH", "DAY"]):
        flights_df[single_date_column] = pd.to_datetime(
            flights_df[["YEAR", "MONTH", "DAY"]].rename(
                columns={"YEAR": "year", "MONTH": "month", "DAY": "day"}
            ),
            errors="coerce",
        )
    elif "DATE" in flights_df.columns:
        flights_df[single_date_column] = pd.to_datetime(
            flights_df["DATE"], errors="coerce"
        )
    else:
        flights_df[single_date_column] = pd.NaT

final_columns = [
    column_name
    for column_name in columns_to_keep + [single_date_column]
    if column_name in flights_df.columns
]

flights_df = flights_df[final_columns]

flights_df = flights_df.drop(columns=["MONTH", "DAY"], errors="ignore")

# Preview resulting dataframe
flights_df.head()

Unnamed: 0,AIRLINE,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,SCHEDULED_ARRIVAL,DEPARTURE_DELAY,ARRIVAL_DELAY,DISTANCE,DAY_OF_WEEK,AIR_TIME,ELAPSED_TIME,SCHEDULED_TIME,FLIGHT_DATE
0,AS,ANC,SEA,5,430,-11.0,-22.0,1448,4,169.0,194.0,205.0,2015-01-01
1,AA,LAX,PBI,10,750,-8.0,-9.0,2330,4,263.0,279.0,280.0,2015-01-01
2,US,SFO,CLT,20,806,-2.0,5.0,2296,4,266.0,293.0,286.0,2015-01-01
3,AA,LAX,MIA,20,805,-5.0,-9.0,2342,4,258.0,281.0,285.0,2015-01-01
4,AS,SEA,ANC,25,320,-1.0,-21.0,1448,4,199.0,215.0,235.0,2015-01-01


In [8]:
flights_df.to_csv("flights_df.csv", index=False)

In [1]:
import os
import requests
import pandas as pd
from io import StringIO

# ClickHouse Cloud API credentials
key_id = os.environ.get('CH_KEY_ID')
key_secret = os.environ.get('CH_KEY_SECRET')
service_id = '88ac54be-2166-445d-9172-dc3173309069'

def run_clickhouse_query(sql_query, format_type='CSV'):
    """Execute SQL query against ClickHouse Cloud using REST API"""
    url = f'https://queries.clickhouse.cloud/service/{service_id}/run'
    
    response = requests.post(
        url,
        auth=(key_id, key_secret),
        headers={'Content-Type': 'application/json'},
        params={'format': format_type},
        json={'sql': sql_query},
        timeout=30
    )
    
    response.raise_for_status()
    return response.text

# sanity check
version_result = run_clickhouse_query("SELECT version()")
print("ClickHouse version:", version_result.strip())

# example query with results
sample_query = """
SELECT * FROM flights_df
LIMIT 10
"""

csv_result = run_clickhouse_query(sample_query)
sample_df = pd.read_csv(StringIO(csv_result))
print("\nSample query results:")
print(sample_df.head())

# # larger dataset query for analysis
# events_query = "SELECT * FROM mydb.events WHERE event_date >= today() - 7"
# events_csv = run_clickhouse_query(events_query)
# events_df = pd.read_csv(StringIO(events_csv))
# print("\nEvents data:")
# print(events_df.head())

ClickHouse version: "25.4.1.37654"

Sample query results:
   AS  ANC  SEA   5  430  -11  -22  1448  4  169  194  205  2015-01-01
0  AA  LAX  PBI  10  750   -8   -9  2330  4  263  279  280  2015-01-01
1  US  SFO  CLT  20  806   -2    5  2296  4  266  293  286  2015-01-01
2  AA  LAX  MIA  20  805   -5   -9  2342  4  258  281  285  2015-01-01
3  AS  SEA  ANC  25  320   -1  -21  1448  4  199  215  235  2015-01-01
4  DL  SFO  MSP  25  602   -5    8  1589  4  206  230  217  2015-01-01
