In [1]:
import sqlite3
import pandas as pd
import nbimporter

Importing custom functions from other files

In [2]:
from final_scores import calculate_final_driver_places 
from lap_times_functions import get_lap_times_stats
from lap_times_functions import get_median_lap_position
from lap_times_functions import get_total_laps_count
from lap_times_functions import categorize_total_laps
from pit_stop_functions import get_pit_stop_stats_in_miliseconds
from pit_stop_functions import get_most_common_pit_stop_lap
from pit_stop_functions import categorize_pitstop_lap
from constructor_table import get_drivers_with_constructor_relationship
from constructor_table import get_constructor_points
from drivers_functions import get_age_at_first_race
from drivers_functions import get_age_at_each_race
from circuits_functions import get_best_performing_circuits
from results_functions import get_most_common_positions
from utils import number_to_words
from utils import categorize_position

ModuleNotFoundError: No module named 'final_scores'

Connecting to the database

In [None]:
# Open the SQLite database that every helper call below receives.
# NOTE(review): sqlite3.connect() creates an empty database when the file does not
# exist, and this connection is never closed in the visible cells — confirm the
# relative path resolves against the notebook's working directory.
conn = sqlite3.connect('formula1.db')

# Data Aggregation: races -> driver per year
Calculating final driver positions per year

In [None]:
# Base table for the rest of the notebook; later cells join onto it by driverId/year.
drivers_final_positions = calculate_final_driver_places(conn)

In [None]:
# Left-join the output of get_age_at_first_race onto the base table by driverId,
# keeping every row of drivers_final_positions.
driver_starting_year = get_age_at_first_race(conn)
drivers_final_positions = drivers_final_positions.merge(
    driver_starting_year,
    how='left',
    on=['driverId'],
)

In [None]:
# Left-join the output of get_age_at_each_race by (driverId, year).
driver_years = get_age_at_each_race(conn)
drivers_final_positions = drivers_final_positions.merge(
    driver_years,
    how='left',
    on=['driverId', 'year'],
)

In [None]:
# Inspect the base table after the two age-related joins.
drivers_final_positions

In [None]:
# Lap-time statistics; the following cells join more lap data onto this by (driverId, year).
lap_times_stats = get_lap_times_stats(conn)

In [None]:
# Left-join the output of get_median_lap_position by (driverId, year).
lap_positions = get_median_lap_position(conn)
lap_times_stats = lap_times_stats.merge(
    lap_positions,
    how='left',
    on=['driverId', 'year'],
)

In [None]:
# Left-join the output of get_total_laps_count by (driverId, year).
lap_count = get_total_laps_count(conn)
lap_times_stats = lap_times_stats.merge(
    lap_count,
    how='left',
    on=['driverId', 'year'],
)

In [None]:
# Inspect the lap statistics after the position and lap-count joins.
lap_times_stats

In [None]:
# Pit-stop statistics; joined with the lap statistics by (driverId, year) further down.
pit_stop_stats = get_pit_stop_stats_in_miliseconds(conn)

In [None]:
# Left-join the output of get_most_common_pit_stop_lap by (driverId, year).
most_common_pit_stop_lap = get_most_common_pit_stop_lap(conn)
pit_stop_stats = pit_stop_stats.merge(
    most_common_pit_stop_lap,
    how='left',
    on=['driverId', 'year'],
)

In [None]:
# Inspect the pit-stop statistics after the join above.
pit_stop_stats

In [None]:
# Driver-to-constructor mapping; joined onto the base table by (driverId, year) in the next cell.
constructor_driver_connection = get_drivers_with_constructor_relationship(conn)

In [None]:
# Start the driver+constructor table: the base table left-joined with the
# driver/constructor mapping by (driverId, year).
drivers_with_constructors = drivers_final_positions.merge(
    constructor_driver_connection,
    how='left',
    on=['driverId', 'year'],
)

In [None]:
# Constructor results; joined by (constructorId, year) in the next cell.
# (The variable name is spelled "connstructor" and is referenced with that spelling below.)
connstructor_points = get_constructor_points(conn)

In [None]:
# Left-join the constructor results by (constructorId, year).
drivers_with_constructors = drivers_with_constructors.merge(
    connstructor_points,
    how='left',
    on=['constructorId', 'year'],
)

In [None]:
# Left-join the output of get_best_performing_circuits by (year, driverId).
best_performing_circuits = get_best_performing_circuits(conn)
drivers_with_constructors = drivers_with_constructors.merge(
    best_performing_circuits,
    how='left',
    on=['year', 'driverId'],
)

In [None]:
# Inspect the driver table after the constructor and circuit joins.
drivers_with_constructors

In [None]:
# Combine lap and pit-stop statistics by (driverId, year). Any other column name
# present in both frames is disambiguated with the '_lap' / '_pit' suffixes.
lap_pit_df = lap_times_stats.merge(
    pit_stop_stats,
    how='left',
    on=['driverId', 'year'],
    suffixes=('_lap', '_pit'),
)

In [None]:
# Inspect the combined lap + pit-stop statistics.
lap_pit_df

In [None]:
# Build the main per-driver, per-year table: driver/constructor data left-joined
# with the lap and pit-stop statistics by (driverId, year).
final_df = drivers_with_constructors.merge(
    lap_pit_df,
    how='left',
    on=['driverId', 'year'],
)

In [None]:
# Most common positions; joined onto final_df by (driverId, year) in the next cell.
positions = get_most_common_positions(conn)

In [None]:
# Left-join the most-common-position columns by (driverId, year).
final_df = final_df.merge(
    positions,
    how='left',
    on=['driverId', 'year'],
)

In [None]:
# Keep a single row per (driverId, year); when the joins above produced several,
# the first one in the current row order wins.
final_df = final_df.drop_duplicates(subset=['driverId', 'year'], keep='first')

In [None]:
# Inspect the de-duplicated per-driver, per-year table.
final_df

# Feature Creation
Calculating various counts based on historical data

In [None]:
final_df = final_df.sort_values(by=['year', 'driverId'])

# Running totals per driver: for every row, how many of that driver's rows with
# year <= this row's year ended 1st / 2nd / 3rd / in the top 5.
# Computed with groupby + cumsum instead of filtering the whole frame once per row.

# Flag which finishing bucket each row falls in. NaN positions compare False,
# so they never contribute to a count.
driver_place = final_df['driver_end_position']
driver_flags = pd.DataFrame({
    'driver_first_place_count': driver_place.eq(1),
    'driver_second_place_count': driver_place.eq(2),
    'driver_third_place_count': driver_place.eq(3),
    'driver_top_5_place_count': driver_place.le(5),
}).astype(int)
driver_count_columns = list(driver_flags.columns)

# Total the flags per (driver, year), then accumulate over years within each driver.
# groupby sorts its keys, so the cumulative sum runs in ascending year order.
driver_running_counts = (
    pd.concat([final_df[['driverId', 'year']], driver_flags], axis=1)
    .groupby(['driverId', 'year'])[driver_count_columns]
    .sum()
    .groupby(level='driverId')
    .cumsum()
    .reset_index()
)

# Bring the totals back to row level. A left merge against unique keys keeps the
# left row order, so the values can be assigned positionally without touching
# final_df's index. Rows without a match (NaN keys) get 0.
driver_counts_by_row = final_df[['driverId', 'year']].merge(
    driver_running_counts, on=['driverId', 'year'], how='left'
)
for count_column in driver_count_columns:
    final_df[count_column] = (
        driver_counts_by_row[count_column].fillna(0).astype(int).to_numpy()
    )


In [None]:
final_df = final_df.sort_values(by=['year', 'constructorId'])

# Running totals per constructor: for every row, how many of that constructor's
# rows with year <= this row's year have constructor_place 1 / 2 / 3 / <= 5.
# Computed with groupby + cumsum instead of filtering the whole frame once per row.
# NOTE(review): totals count rows of final_df, so a constructor with several
# driver rows in one year is counted once per row — confirm this is intended.

# NaN places compare False, so they never contribute to a count.
constructor_place = final_df['constructor_place']
constructor_flags = pd.DataFrame({
    'constructor_first_place_count': constructor_place.eq(1),
    'constructor_second_place_count': constructor_place.eq(2),
    'constructor_third_place_count': constructor_place.eq(3),
    'constructor_top_5_place_count': constructor_place.le(5),
}).astype(int)
constructor_count_columns = list(constructor_flags.columns)

# Total the flags per (constructor, year), then accumulate over years within each
# constructor. groupby sorts its keys (ascending year) and drops NaN constructorId.
constructor_running_counts = (
    pd.concat([final_df[['constructorId', 'year']], constructor_flags], axis=1)
    .groupby(['constructorId', 'year'])[constructor_count_columns]
    .sum()
    .groupby(level='constructorId')
    .cumsum()
    .reset_index()
)

# Bring the totals back to row level. The left merge keeps the left row order, so
# positional assignment is safe. Rows with no constructor find no match and get 0.
constructor_counts_by_row = final_df[['constructorId', 'year']].merge(
    constructor_running_counts, on=['constructorId', 'year'], how='left'
)
for count_column in constructor_count_columns:
    final_df[count_column] = (
        constructor_counts_by_row[count_column].fillna(0).astype(int).to_numpy()
    )

In [None]:
# Inspect final_df with the driver and constructor running-count columns added.
final_df

# Discretization

In [None]:
# Bin the three lap-time-difference columns into three labelled intervals:
# [min, first cut], (first cut, median], (median, max].
columns_to_discretize = ['min_lap_time_diff', 'avg_lap_time_diff', 'max_lap_time_diff']
labels = ['0-25%', '25-50%', '50% and more']

for col in columns_to_discretize:
    values = final_df[col]
    # The minimum-diff column uses a fixed 0.1 as its first cut point; the other two
    # use the first quartile. The same three labels are applied in both cases.
    first_cut = 0.1 if col == 'min_lap_time_diff' else values.quantile(0.25)
    bin_edges = [values.min(), first_cut, values.quantile(0.5), values.max()]
    final_df[col] = pd.cut(
        values,
        bins=bin_edges,
        labels=labels,
        include_lowest=True,
        duplicates='drop',
    )

In [None]:
# Give the binned lap-time columns an explicit 'Unknown' category and use it for
# rows that have no bin (NaN).
categorical_columns = ['min_lap_time_diff', 'avg_lap_time_diff', 'max_lap_time_diff']

for col in categorical_columns:
    binned = final_df[col].astype('category')
    # add_categories raises if the category already exists, e.g. when this cell is re-run.
    if 'Unknown' not in binned.cat.categories:
        binned = binned.cat.add_categories('Unknown')
    # Assign the filled result back: fillna(inplace=True) on final_df[col] is chained
    # assignment and does not update final_df under pandas copy-on-write.
    final_df[col] = binned.fillna('Unknown')

In [None]:
# Bin the pit-stop time columns into three labelled intervals:
# [min, Q1], (Q1, median], (median, max].
columns_to_discretize = ['min_pit_stop_time', 'avg_pit_time', 'max_pit_stop_time']
labels = ['0-25%', '25-50%', '50% and more']

for col in columns_to_discretize:
    # Columns that pit_stop_stats does not provide are skipped.
    # NOTE(review): the guard inspects pit_stop_stats while the data is read from
    # final_df — confirm the two always agree on these column names.
    if col not in pit_stop_stats.columns:
        continue
    values = final_df[col]
    bin_edges = [values.min(), values.quantile(0.25), values.quantile(0.5), values.max()]
    final_df[col] = pd.cut(
        values,
        bins=bin_edges,
        labels=labels,
        include_lowest=True,
        duplicates='drop',
    )

In [None]:
# Give the pit-stop time columns an explicit 'Unknown' category and use it for
# rows that have no value (NaN).
categorical_columns = ['min_pit_stop_time', 'avg_pit_time', 'max_pit_stop_time']

for col in categorical_columns:
    binned = final_df[col].astype('category')
    # add_categories raises if the category already exists, e.g. when this cell is re-run.
    if 'Unknown' not in binned.cat.categories:
        binned = binned.cat.add_categories('Unknown')
    # Assign the filled result back: fillna(inplace=True) on final_df[col] is chained
    # assignment and does not update final_df under pandas copy-on-write.
    final_df[col] = binned.fillna('Unknown')

In [None]:
# Replace each total_laps value with whatever categorize_total_laps returns for it.
# NOTE(review): a later cell lists 'total_laps' among columns passed to
# pd.to_numeric(errors='coerce'); if the categories are non-numeric they become NaN there.
final_df['total_laps'] = final_df['total_laps'].apply(categorize_total_laps)

In [None]:
# Inspect final_df after the lap/pit columns were binned and total_laps categorized.
final_df

In [None]:
# Relabel the literal string \N in the ending-position column as 'Did Not Finish'.
# NOTE(review): a later cell rebuilds this column through
# pd.to_numeric(..., errors='coerce'), so this text label does not survive to the output.
final_df['most_common_ending_position'] = final_df['most_common_ending_position'].replace({'\\N': 'Did Not Finish'})

In [None]:
# Relabel a starting position of 0 as 'Unknown'.
# NOTE(review): a later cell rebuilds this column through
# pd.to_numeric(..., errors='coerce'), so this text label does not survive to the output.
final_df['most_common_starting_position'] = final_df['most_common_starting_position'].replace(0, 'Unknown')

In [None]:
# Bucket the most common pit-stop lap with the project helper.
final_df['most_common_pitstop_lap'] = final_df['most_common_pitstop_lap'].apply(categorize_pitstop_lap)

# Coerce the position columns to numbers (non-numeric entries become NaN) and bucket them.
# The values are read from final_df itself: `positions` has its own index, so assigning
# its columns here would align on unrelated index labels and attach positions to the
# wrong driver/year rows.
for position_column in ('most_common_starting_position', 'most_common_ending_position'):
    numeric_positions = pd.to_numeric(final_df[position_column], errors='coerce')
    final_df[position_column] = numeric_positions.apply(categorize_position)

In [None]:
# Inspect final_df after the pit-stop lap and position columns were categorized.
final_df

# Dealing with NaN Values
Converting NaN values in categorical columns to 'Unknown'

In [None]:
# Print dtypes and non-null counts to see which columns still contain NaN.
final_df.info()

In [None]:
# Missing constructor results and lap positions are represented as 0 from here on.
columns_to_fill_with_zero = ['constructor_points', 'constructor_place', 'median_lap_position']
for zero_filled_column in columns_to_fill_with_zero:
    final_df[zero_filled_column] = final_df[zero_filled_column].fillna(0)

In [None]:
# Cast the count, points, place and age columns to int.
# Every column is cast from itself. Previously driver_first_place_count was overwritten
# with the second-place counts and driver_third_place_count with the top-5 counts, and
# the constructor columns were cast twice.
# astype(int) raises if a column still contains NaN at this point.
integer_columns = [
    'driver_first_place_count',
    'driver_second_place_count',
    'driver_third_place_count',
    'driver_top_5_place_count',
    'driver_points',
    'driver_end_position',
    'age_at_first_race',
    'age_at_race',
    'median_lap_position',
    'constructor_points',
    'constructor_place',
    'constructor_first_place_count',
    'constructor_second_place_count',
    'constructor_third_place_count',
    'constructor_top_5_place_count',
]

for col in integer_columns:
    final_df[col] = final_df[col].astype(int)

In [None]:
# Relabel a median lap position of 0 as 'Unknown' in lap_times_stats.
# NOTE(review): lap_times_stats was already merged into lap_pit_df / final_df in earlier
# cells and is not read again in the visible cells, so this does not change final_df —
# confirm whether final_df was the intended target.
lap_times_stats['median_lap_position'] = lap_times_stats['median_lap_position'].replace(0, 'Unknown')

# Aggregation: driver per year -> driver

In [None]:
def _mode_or_unknown(values):
    """Return the most frequent non-null value of a Series, or 'Unknown' if it has none.

    Series.mode() ignores NaN, so an all-NaN group yields an empty result and
    indexing it with .iloc[0] would raise IndexError.
    """
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else 'Unknown'


# Collapse the per-driver, per-year rows into one row per driver.
# Rows are ordered newest year first, so each 'first' aggregation returns the first
# non-null value per driver in that order.
df = final_df.sort_values(by='year', ascending=False)

# Force these columns to numeric; anything that cannot be parsed becomes NaN.
numeric_columns = ['year','driver_points', 'driver_end_position', 'constructor_points', 'constructor_place','median_lap_position', 'total_laps']
df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors='coerce')

aggregation_functions = {
    'year': 'max',
    'driver_nationality': 'first',
    'age_at_first_race': 'first',
    'driver_first_place_count': 'first',
    'driver_second_place_count': 'first',
    'driver_third_place_count': 'first',
    'driver_top_5_place_count': 'first',
    'constructorName': 'first',
    'constructor_nationality': 'first',
    'constructor_first_place_count': 'first',
    'constructor_second_place_count': 'first',
    'constructor_third_place_count': 'first',
    'constructor_top_5_place_count': 'first',
    # Categorical features: most frequent value across the driver's rows.
    'best_performing_circuit_name': _mode_or_unknown,
    'min_lap_time_diff': _mode_or_unknown,
    'avg_lap_time_diff': _mode_or_unknown,
    'max_lap_time_diff': _mode_or_unknown,
    # Rounded mean over the driver's rows, or 'Unknown' when every value is NaN.
    'median_lap_position': lambda x: round(x.mean()) if x.notna().any() else 'Unknown',
    'min_pit_stop_time': _mode_or_unknown,
    'avg_pit_time': _mode_or_unknown,
    'max_pit_stop_time': _mode_or_unknown,
    'most_common_pitstop_lap': _mode_or_unknown,
    'most_common_starting_position': _mode_or_unknown,
    'most_common_ending_position': _mode_or_unknown,
}

aggregated_df = df.groupby('driverId').agg(aggregation_functions)

# Turn driverId back into a regular column.
aggregated_df = aggregated_df.reset_index()

In [None]:
# Remove the identifier columns so only feature columns remain.
aggregated_df = aggregated_df.drop(columns=['year', 'driverId'])

In [None]:
# Split drivers by their top-5 count: zero -> drivers_lost, anything else -> drivers_won.
# .copy() makes each subset an independent frame; later cells assign columns into
# them, which on a boolean-mask slice triggers SettingWithCopyWarning.
drivers_lost = aggregated_df[aggregated_df['driver_top_5_place_count'] == 0].copy()
drivers_won = aggregated_df[aggregated_df['driver_top_5_place_count'] != 0].copy()

In [None]:
# Inspect the drivers whose top-5 count is zero.
drivers_lost

In [None]:
# Inspect the drivers whose top-5 count is non-zero.
drivers_won

In [None]:
# Convert the numeric feature columns of drivers_won to words via number_to_words.
# NaN is treated as 0 before the integer cast.
columns_to_convert = [
    'driver_first_place_count',
    'driver_second_place_count',
    'driver_third_place_count',
    'driver_top_5_place_count',
    'age_at_first_race',
    'constructor_first_place_count',
    'constructor_second_place_count',
    'constructor_third_place_count',
    'constructor_top_5_place_count',
    'median_lap_position',
]

# assign() builds a new frame instead of writing into what may be a slice of
# aggregated_df. Series.map replaces the deprecated DataFrame.applymap, and the former
# pd.to_numeric pass is dropped because the values are already int after astype(int).
drivers_won = drivers_won.assign(**{
    column: drivers_won[column].fillna(0).astype(int).map(number_to_words)
    for column in columns_to_convert
})

drivers_won


In [None]:
# Convert the numeric feature columns of drivers_lost to words via number_to_words.
# NaN is treated as 0 before the integer cast.
columns_to_convert = [
    'driver_first_place_count',
    'driver_second_place_count',
    'driver_third_place_count',
    'driver_top_5_place_count',
    'age_at_first_race',
    'constructor_first_place_count',
    'constructor_second_place_count',
    'constructor_third_place_count',
    'constructor_top_5_place_count',
    'median_lap_position',
]

# assign() builds a new frame instead of writing into what may be a slice of
# aggregated_df. Series.map replaces the deprecated DataFrame.applymap, and the former
# pd.to_numeric pass is dropped because the values are already int after astype(int).
drivers_lost = drivers_lost.assign(**{
    column: drivers_lost[column].fillna(0).astype(int).map(number_to_words)
    for column in columns_to_convert
})

drivers_lost



In [None]:
# Convert the numeric feature columns of aggregated_df to words via number_to_words.
# NaN is treated as 0 before the integer cast.
columns_to_convert = [
    'driver_first_place_count',
    'driver_second_place_count',
    'driver_third_place_count',
    'driver_top_5_place_count',
    'age_at_first_race',
    'constructor_first_place_count',
    'constructor_second_place_count',
    'constructor_third_place_count',
    'constructor_top_5_place_count',
    'median_lap_position',
]

# Series.map replaces the deprecated DataFrame.applymap, and the former pd.to_numeric
# pass is dropped because the values are already int after astype(int).
aggregated_df = aggregated_df.assign(**{
    column: aggregated_df[column].fillna(0).astype(int).map(number_to_words)
    for column in columns_to_convert
})

aggregated_df


In [None]:
# Export every aggregated driver row, including rows that contain 'Unknown'.
# The ../outputs directory must already exist; to_csv does not create it.
aggregated_df.to_csv("../outputs/all_drivers.csv", index=False, encoding='utf-8')

In [None]:
# Keep only the rows in which no column equals the string 'Unknown'.
rows_without_unknown = aggregated_df.ne('Unknown').all(axis=1)
aggregated_df = aggregated_df[rows_without_unknown]

In [None]:
# Export the aggregated rows that remain after the 'Unknown' filter.
# NOTE(review): the file name is spelled "withiut" while the other exports use
# "without" — confirm no consumer depends on this spelling before renaming.
aggregated_df.to_csv("../outputs/all_drivers_withiut_unknown.csv", index=False, encoding='utf-8')

In [None]:
# Export all drivers_won rows, including rows that contain 'Unknown'.
drivers_won.to_csv("../outputs/winning_drivers.csv", index=False, encoding='utf-8')

In [None]:
# Keep only the rows in which no column equals the string 'Unknown'.
won_rows_without_unknown = drivers_won.ne('Unknown').all(axis=1)
drivers_won = drivers_won[won_rows_without_unknown]

In [None]:
# Export the drivers_won rows that remain after the 'Unknown' filter.
drivers_won.to_csv("../outputs/winning_drivers_without_unknown.csv", index=False, encoding='utf-8')

In [None]:
# Export all drivers_lost rows, including rows that contain 'Unknown'.
drivers_lost.to_csv("../outputs/losing_drivers.csv", index=False, encoding='utf-8')

In [None]:
# Keep only the rows in which no column equals the string 'Unknown'.
lost_rows_without_unknown = drivers_lost.ne('Unknown').all(axis=1)
drivers_lost = drivers_lost[lost_rows_without_unknown]

In [None]:
# Export the drivers_lost rows that remain after the 'Unknown' filter.
drivers_lost.to_csv("../outputs/losing_drivers_without_unknown.csv", index=False, encoding='utf-8')