In [None]:
# Sanity-check cell: prints a fixed greeting (presumably just to confirm the kernel runs).
print("Hello World!")

In [None]:
import snowflake.connector
import os
import pandas as pd
import dotenv as dot
from sqlalchemy import create_engine
# from dotenv import load_dotenv

# Prefix placed in front of table names when the cells below build their
# SELECT / ALTER statements (looks like "<database>.<schema>" -- confirm).
DATABASE_SCHEMA = "EVENT.DATATHON_2025_TEAM_ETA"

def get_snowflake_connection():
    """
    Open a Snowflake connection configured from SNOWFLAKE_* environment variables.

    The .env file is loaded first so that values defined there are visible
    through the environment. A variable that is not set is passed to the
    connector as None.

    Returns:
        The connection object returned by ``snowflake.connector.connect``.
    """
    dot.load_dotenv()

    # connect() keyword -> name of the environment variable that supplies it
    env_by_option = {
        "account": "SNOWFLAKE_ACCOUNT",
        "user": "SNOWFLAKE_USER",
        "password": "SNOWFLAKE_PASSWORD",
        "role": "SNOWFLAKE_ROLE",
        "warehouse": "SNOWFLAKE_WAREHOUSE",
        "database": "SNOWFLAKE_DATABASE",
        "schema": "SNOWFLAKE_SCHEMA",
    }
    options = {option: os.getenv(variable) for option, variable in env_by_option.items()}

    return snowflake.connector.connect(**options)

def query_to_df(query):
    """
    Execute a query and return the results as a pandas DataFrame.

    A fresh connection is opened for the call and is always closed again,
    together with its cursor, even when executing or fetching raises.

    Args:
        query: SQL text passed unchanged to ``cursor.execute``.

    Returns:
        A DataFrame with one row per fetched row. Column labels are taken from
        ``cursor.description``, so a query that returns no rows still yields a
        DataFrame carrying the expected columns.
    """
    conn = get_snowflake_connection()
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(query)

            # First field of each description entry is the column name
            columns = [col[0] for col in cursor.description]
            rows = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        conn.close()

    # Passing the labels explicitly (instead of building per-row dicts) keeps
    # the columns when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)

def upload_csv_to_snowflake(dataframe, table_name):
    """
    Write a DataFrame to a Snowflake table and return the table's row count.

    The frame is written through a SQLAlchemy engine with
    ``if_exists="replace"``, so an existing table of the same name in the
    connection's schema is replaced. The row count is then read back with
    ``SELECT COUNT(*)`` over the connector connection. The engine, the cursor
    and the connection are released even when the write or the count fails.

    Args:
        dataframe: Object exposing pandas' ``DataFrame.to_sql`` interface.
        table_name: Target table name. It is interpolated as-is into the
            ``SELECT COUNT(*)`` statement, so it must be a trusted identifier.

    Returns:
        The first field of the ``SELECT COUNT(*)`` result row.
    """
    # Local import so the function does not depend on the notebook's import cell.
    from urllib.parse import quote

    conn = get_snowflake_connection()
    engine = None
    try:
        # Percent-encode the credentials: characters such as '@', '/', ':' or
        # '%' would otherwise be parsed as URL structure. quote(safe="") is used
        # rather than quote_plus so a space becomes %20, never '+'.
        user = quote(str(conn.user), safe="")
        password = quote(os.getenv("SNOWFLAKE_PASSWORD", ""), safe="")

        # NOTE(review): the URL carries no role, while get_snowflake_connection()
        # passes SNOWFLAKE_ROLE -- confirm the engine's default role is intended.
        engine = create_engine(
            f'snowflake://{user}:{password}@{conn.account}/{conn.database}/{conn.schema}?warehouse={conn.warehouse}'
        )
        dataframe.to_sql(
            name=table_name,
            con=engine,
            schema=conn.schema,
            if_exists="replace",
            index=False,
            method='multi'
        )

        # Get row count
        cursor = conn.cursor()
        try:
            cursor.execute(f"SELECT COUNT(*) FROM {conn.database}.{conn.schema}.{table_name}")
            row_count = cursor.fetchone()[0]
        finally:
            cursor.close()
    finally:
        try:
            if engine is not None:
                engine.dispose()
        finally:
            conn.close()

    return row_count
    

# Example: load the `status` table into a DataFrame and preview its first rows.
# NOTE: `status` is reused by the master-dataset merge in a later cell, so this
# cell has to run before that one.
status = query_to_df(f"SELECT * FROM {DATABASE_SCHEMA}.status")
print(status.head())

TASK A

In [39]:
# Build the master dataset: RESULTS joined to RACES, DRIVERS, CONSTRUCTORS and
# the `status` frame loaded in the earlier cell.

# Source tables; generic NAME / NATIONALITY columns get table-specific names.
drivers = query_to_df(f"SELECT * FROM {DATABASE_SCHEMA}.DRIVERS")
races = query_to_df(f"SELECT * FROM {DATABASE_SCHEMA}.RACES").rename(
    columns={'NAME': 'RACE_NAME'}
)
constructors = query_to_df(f"SELECT * FROM {DATABASE_SCHEMA}.CONSTRUCTORS").rename(
    columns={
        'NAME': 'CONSTRUCTOR_NAME',
        'NATIONALITY': 'CONSTRUCTOR_NATIONALITY',
    }
)
results = query_to_df(f"SELECT * FROM {DATABASE_SCHEMA}.RESULTS")

# Each step is pandas' default (inner) merge on the shared id column. Columns
# from `results` keep their names; clashing columns from the right-hand table
# receive that table's suffix.
main_df = (
    results
    .merge(races, on="RACEID", suffixes=('', '_race'))
    .merge(drivers, on="DRIVERID", suffixes=('', '_driver'))
    .merge(constructors, on="CONSTRUCTORID", suffixes=('', '_constructor'))
    .merge(status, on="STATUSID", suffixes=('', '_status'))
)

print(main_df.head())

# Persist the joined frame to Snowflake (replaces any existing table of that name)
row_count = upload_csv_to_snowflake(main_df, "CONSTRUCTOR_DRIVER_RACE_STATUS")

   RESULTID  RACEID  DRIVERID  CONSTRUCTORID  NUMBER  GRID  POSITION  \
0         1      18         1              1    22.0     1       1.0   
1         2      18         2              2     3.0     5       2.0   
2         3      18         3              3     7.0     7       3.0   
3         4      18         4              4     5.0    11       4.0   
4         5      18         5              1    23.0     3       5.0   

  POSITIONTEXT  POSITIONORDER POINTS  ...  QUALI_TIME SPRINT_DATE  \
0            1              1   10.0  ...        None        None   
1            2              2    8.0  ...        None        None   
2            3              3    6.0  ...        None        None   
3            4              4    5.0  ...        None        None   
4            5              5    4.0  ...        None        None   

   SPRINT_TIME          FULL_NAME         DOB NATIONALITY WINS  \
0         None     Lewis Hamilton  1985-01-07     British  105   
1         None      

  dataframe.to_sql(


In [42]:
# Now that the big dataset exists, drop the columns we don't need -- both from
# the local DataFrame and from the uploaded Snowflake table.

columns_to_drop = ['POSITION', 'POSITIONTEXT','TIME', 'WINS']

# Drop columns from the DataFrame. errors='ignore' skips names that are absent,
# so report how many columns were actually removed rather than how many were
# requested.
main_df_cleaned = main_df.drop(columns=columns_to_drop, errors='ignore')
print(f"Dropped {main_df.shape[1] - main_df_cleaned.shape[1]} columns. New shape: {main_df_cleaned.shape}")

conn = get_snowflake_connection()
try:
    cursor = conn.cursor()
    try:
        # Drop columns from the existing table. Each column is attempted
        # independently so one failure does not stop the remaining drops.
        for column in columns_to_drop:
            try:
                cursor.execute(f"ALTER TABLE {DATABASE_SCHEMA}.CONSTRUCTOR_DRIVER_RACE_STATUS DROP COLUMN IF EXISTS {column}")
                print(f"Dropped column {column} from table")
            except Exception as exc:
                # `Exception` (not a bare except) keeps this best-effort while
                # letting KeyboardInterrupt / SystemExit propagate; the cause
                # is reported instead of being discarded.
                print(f"Column {column} not found or could not be dropped: {exc}")
    finally:
        cursor.close()
finally:
    conn.close()

# Query the updated table to refresh the DataFrame
main_df_updated = query_to_df(f"SELECT * FROM {DATABASE_SCHEMA}.CONSTRUCTOR_DRIVER_RACE_STATUS")
print(f"Updated DataFrame shape: {main_df_updated.shape}")
print(main_df_updated.head())

Dropped 4 columns. New shape: (26759, 37)
Dropped column POSITION from table
Dropped column POSITIONTEXT from table
Dropped column TIME from table
Dropped column WINS from table
Updated DataFrame shape: (26759, 37)
   RESULTID  RACEID  DRIVERID  CONSTRUCTORID  NUMBER  GRID  POSITIONORDER  \
0         1      18         1              1    22.0     1              1   
1         2      18         2              2     3.0     5              2   
2         3      18         3              3     7.0     7              3   
3         4      18         4              4     5.0    11              4   
4         5      18         5              1    23.0     3              5   

  POINTS  LAPS  MILLISECONDS  ...  QUALI_DATE  QUALI_TIME SPRINT_DATE  \
0   10.0    58     5690616.0  ...        None        None        None   
1    8.0    58     5696094.0  ...        None        None        None   
2    6.0    58     5698779.0  ...        None        None        None   
3    5.0    58     5707797.0  