In [1]:
import snowflake.connector
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import dotenv as dot
from sqlalchemy import create_engine
# from dotenv import load_dotenv

DATABASE_SCHEMA = 'EVENT.DATATHON_2025_TEAM_ETA'

def get_snowflake_connection():
    '''
    Create a connection to Snowflake using credentials from .env file
    '''
    # Load environment variables
    dot.load_dotenv()
    
    # Get connection parameters from environment variables
    conn = snowflake.connector.connect(
        account=os.getenv('SNOWFLAKE_ACCOUNT'),
        user=os.getenv('SNOWFLAKE_USER'),
        password=os.getenv('SNOWFLAKE_PASSWORD'),
        role=os.getenv('SNOWFLAKE_ROLE'),
        warehouse=os.getenv('SNOWFLAKE_WAREHOUSE'),
        database=os.getenv('SNOWFLAKE_DATABASE'),
        schema=os.getenv('SNOWFLAKE_SCHEMA')
    )
    
    return conn

def query_to_df(query):
    '''
    Execute a query and return the results as a pandas DataFrame
    '''
    conn = get_snowflake_connection()
    cursor = conn.cursor()
    cursor.execute(query)

    # Get column names
    columns = [col[0] for col in cursor.description]

    # Fetch all rows and convert to list of dictionaries
    results = [dict(zip(columns, row)) for row in cursor.fetchall()]

    cursor.close()
    conn.close()

    return pd.DataFrame(results)

def upload_csv_to_snowflake(dataframe, table_name):
    
    conn = get_snowflake_connection()
    cursor = conn.cursor()
    
    engine = create_engine(
        f'snowflake://{conn.user}:{os.getenv('SNOWFLAKE_PASSWORD')}@{conn.account}/{conn.database}/{conn.schema}?warehouse={conn.warehouse}'
    )
    dataframe.to_sql(
        name=table_name,
        con=engine,
        schema=conn.schema,
        if_exists='replace',
        index=False,
        method='multi'
    )
    
    # Get row count
    cursor = conn.cursor()
    cursor.execute(f'SELECT COUNT(*) FROM {conn.database}.{conn.schema}.{table_name}')
    row_count = cursor.fetchone()[0]
    cursor.close()
    conn.close()
    
    return row_count
    

# Example status_df 
status = query_to_df(f'SELECT * FROM {DATABASE_SCHEMA}.status')
print(status.head())

   STATUSID        STATUS
0         1      Finished
1         2  Disqualified
2         3      Accident
3         4     Collision
4         5        Engine


In [None]:
maindf = query_to_df(f'SELECT * FROM {DATABASE_SCHEMA}.CIRCUIT_CONSTRUCTOR_DRIVER_RACE_STATUS')
pitstops = query_to_df(f'SELECT * FROM {DATABASE_SCHEMA}.PIT_STOPS')

newPitstops = pd.merge(pitstops, maindf,left_on='RACEID',right_index=True,how='left')


In [9]:
df = newPitstops[newPitstops['YEAR'] >=2014].copy()

print(df.shape)
print(df.columns)
print(df.info())

(11371, 46)
Index(['RACEID', 'RACEID_x', 'DRIVERID_x', 'STOP', 'LAP', 'TIME', 'DURATION',
       'MILLISECONDS_x', 'RESULTID', 'RACEID_y', 'DRIVERID_y', 'CONSTRUCTORID',
       'GRID', 'POSITIONORDER', 'POINTS', 'LAPS', 'MILLISECONDS_y',
       'FASTESTLAP', 'RANK', 'FASTESTLAPTIME', 'FASTESTLAPSPEED', 'STATUSID',
       'YEAR', 'ROUND', 'CIRCUITID', 'RACE_NAME', 'DATE', 'TIME_race',
       'FULL_NAME', 'DOB', 'NATIONALITY', 'WINS', 'CONSTRUCTOR_NAME',
       'CONSTRUCTOR_NATIONALITY', 'STATUS', 'NAME', 'LOCATION', 'COUNTRY',
       'LAT', 'LNG', 'ALT', 'ALTITUDE_CATEGORY', 'ESTIMATED_LENGTH_KM',
       'AVG_RACE_LAPS', 'ESTIMATED_RACE_DISTANCE_KM', 'DNF'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11371 entries, 0 to 11370
Data columns (total 46 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   RACEID                      11371 non-null  int64         
 1   RACEID

In [None]:

if 'PIT_STOPS' not in df.columns:
    df['ESTIMATED_PIT_STOPS'] = np.ceil(df['LAPS'] / 20) - 1
    df['ESTIMATED_PIT_STOPS'] = df['ESTIMATED_PIT_STOPS'].clip(lower=0)

# Analyze Mercedes performance by estimated pit stop count
pitstop_performance = df[
    df['CONSTRUCTOR_NAME'].isin(top_teams)
].groupby(['ESTIMATED_PIT_STOPS', 'CONSTRUCTOR_NAME']).agg({
    'WIN': 'sum',
    'PODIUM': 'sum',
    'RESULTID': 'count'
}).reset_index()

pitstop_performance['WIN_RATE'] = pitstop_performance['WIN'] / pitstop_performance['RESULTID']
pitstop_pivot = pitstop_performance.pivot(
    index='ESTIMATED_PIT_STOPS', 
    columns='CONSTRUCTOR_NAME', 
    values='WIN_RATE'
).fillna(0)

print("\nWin Rate by Estimated Pit Stop Count:")
print(pitstop_pivot)

# Visualize pit stop performance
plt.figure(figsize=(10, 6))
pitstop_pivot.plot(kind='bar')
plt.title('Win Rate by Estimated Pit Stop Count (2014 onwards)')
plt.xlabel('Estimated Pit Stops')
plt.ylabel('Win Rate')
plt

