In [1]:
import sqlite3
import pandas as pd
import nbimporter

In [2]:
from final_scores import calculate_final_driver_places 
from lap_times_functions import get_lap_times_stats
from lap_times_functions import get_median_lap_position
from lap_times_functions import get_total_laps_count
from pit_stop_functions import get_pit_stop_stats_in_miliseconds
from pit_stop_functions import get_most_common_pit_stop_lap
from constructor_table import get_drivers_with_constructor_relationship
from constructor_table import get_constructor_points
from drivers_functions import get_age_at_first_race
from drivers_functions import get_age_at_each_race
from circuits_functions import get_best_performing_circuits
from results_functions import get_most_common_positions

In [3]:
conn = sqlite3.connect('formula1.db')

# Drivers and their positions at the end of each year

In [4]:
drivers_final_positions = calculate_final_driver_places(conn)
print(drivers_final_positions)

      year  driverId driver_nationality  driver_points  driver_end_position
73    1950       642            Italian           30.0                    1
66    1950       579          Argentine           27.0                    2
72    1950       786            Italian           24.0                    3
70    1950       627             French           13.0                    4
56    1950       647            Italian           11.0                    5
...    ...       ...                ...            ...                  ...
3133  2022       855            Chinese            6.0                   18
3136  2022       848               Thai            4.0                   19
3139  2022       849           Canadian            2.0                   20
3145  2022       856              Dutch            2.0                   20
3140  2022       807             German            0.0                   22

[3146 rows x 5 columns]


In [5]:
drivers_final_positions

Unnamed: 0,year,driverId,driver_nationality,driver_points,driver_end_position
73,1950,642,Italian,30.0,1
66,1950,579,Argentine,27.0,2
72,1950,786,Italian,24.0,3
70,1950,627,French,13.0,4
56,1950,647,Italian,11.0,5
...,...,...,...,...,...
3133,2022,855,Chinese,6.0,18
3136,2022,848,Thai,4.0,19
3139,2022,849,Canadian,2.0,20
3145,2022,856,Dutch,2.0,20


In [6]:
drivers_final_positions = drivers_final_positions.sort_values(by=['year', 'driverId'])

for index, row in drivers_final_positions.iterrows():
    temp_df = drivers_final_positions[(drivers_final_positions['driverId'] == row['driverId']) & (drivers_final_positions['year'] <= row['year'])]
    
    first_place_count = temp_df[temp_df['driver_end_position'] == 1].shape[0]
    second_place_count = temp_df[temp_df['driver_end_position'] == 2].shape[0]
    third_place_count = temp_df[temp_df['driver_end_position'] == 3].shape[0]
    top_5_count = temp_df[temp_df['driver_end_position'] <= 5].shape[0]

    drivers_final_positions.at[index, 'first_place_count'] = first_place_count
    drivers_final_positions.at[index, 'second_place_count'] = second_place_count
    drivers_final_positions.at[index, 'third_place_count'] = third_place_count
    drivers_final_positions.at[index, 'top_5_count'] = top_5_count

print(drivers_final_positions)



KeyboardInterrupt



In [None]:
drivers_final_positions['first_place_count'] = drivers_final_positions['first_place_count'].astype(int)
drivers_final_positions['second_place_count'] = drivers_final_positions['second_place_count'].astype(int)
drivers_final_positions['third_place_count'] = drivers_final_positions['third_place_count'].astype(int)
drivers_final_positions['top_5_count'] = drivers_final_positions['top_5_count'].astype(int)
drivers_final_positions['driver_points'] = drivers_final_positions['driver_points'].astype(int)
drivers_final_positions['driver_end_position'] = drivers_final_positions['driver_end_position'].astype(int)

In [None]:
drivers_final_positions = drivers_final_positions.sort_values(by='top_5_count', ascending=False)

In [None]:
drivers_final_positions

In [None]:
driver_starting_year = get_age_at_first_race(conn)
drivers_final_positions = pd.merge(drivers_final_positions, driver_starting_year, on=['driverId'], how='left')

In [None]:
drivers_final_positions

In [None]:
drivers_final_positions['age_at_first_race'] = drivers_final_positions['age_at_first_race'].astype(int)

In [None]:
drivers_final_positions

In [None]:
driver_years = get_age_at_each_race(conn)
drivers_final_positions = pd.merge(drivers_final_positions, driver_years, on=['driverId','year'], how='left')

In [None]:
drivers_final_positions

In [None]:
drivers_final_positions['age_at_race'] = drivers_final_positions['age_at_race'].astype(int)

In [None]:
drivers_final_positions

# Lap times 

In [None]:
lap_times_stats = get_lap_times_stats(conn)
print(lap_times_stats)

In [None]:
columns_to_discretize = ['min_lap_time_diff', 'avg_lap_time_diff', 'max_lap_time_diff']
labels = ['0-25%', '25-50%', '50% and more']

for col in columns_to_discretize:
    if col == 'min_lap_time_diff':
        custom_bin_edges = [lap_times_stats[col].min(), 0.1, lap_times_stats[col].quantile(0.5), lap_times_stats[col].max()]
        lap_times_stats[col] = pd.cut(lap_times_stats[col], bins=custom_bin_edges, include_lowest=True, duplicates='drop', labels=labels)
    else:
        bin_edges = [lap_times_stats[col].min(), lap_times_stats[col].quantile(0.25), lap_times_stats[col].quantile(0.5), lap_times_stats[col].max()]
        lap_times_stats[col] = pd.cut(lap_times_stats[col], bins=bin_edges, include_lowest=True, duplicates='drop', labels=labels)

print(lap_times_stats)


In [None]:
lap_positions = get_median_lap_position(conn)
lap_times_stats = pd.merge(lap_times_stats, lap_positions, on=['driverId', 'year'], how='left')
lap_times_stats['median_lap_position'] = lap_times_stats['median_lap_position'].astype(int)
print(lap_times_stats)

In [None]:
lap_count = get_total_laps_count(conn)
lap_times_stats = pd.merge(lap_times_stats, lap_count, on=['driverId', 'year'], how='left')
lap_times_stats['total_laps'] = lap_times_stats['total_laps'].astype(int)
print(lap_times_stats)

# Pit Stops

In [None]:
pit_stop_stats = get_pit_stop_stats_in_miliseconds(conn)
print(pit_stop_stats)

In [None]:
columns_to_discretize = ['min_pit_stop_time', 'avg_pit_time', 'max_pit_stop_time']
labels = ['0-25%', '25-50%', '50% and more']

for col in columns_to_discretize:
    if col in pit_stop_stats.columns:
        bin_edges = [pit_stop_stats[col].min(), pit_stop_stats[col].quantile(0.25), pit_stop_stats[col].quantile(0.5), pit_stop_stats[col].max()]
        pit_stop_stats[col] = pd.cut(pit_stop_stats[col], bins=bin_edges, include_lowest=True, duplicates='drop', labels=labels)

print(pit_stop_stats)


In [None]:
most_common_pit_stop_lap = get_most_common_pit_stop_lap(conn)
pit_stop_stats = pd.merge(pit_stop_stats, most_common_pit_stop_lap, on=['driverId', 'year'], how='left')
print(pit_stop_stats)

In [None]:
pit_stop_stats['most_common_pitstop_lap'] = pit_stop_stats['most_common_pitstop_lap'].astype(int)

In [None]:
pit_stop_stats

# Constructor

In [None]:
constructor_driver_connection = get_drivers_with_constructor_relationship(conn)

In [None]:
constructor_driver_connection

# Drivers with Races

In [None]:
drivers_final_positions

In [None]:
constructor_driver_connection

In [None]:
drivers_final_positions = drivers_final_positions.drop_duplicates(['driverId', 'year'])
constructor_driver_connection = constructor_driver_connection.drop_duplicates(['driverId', 'year'])

In [None]:
drivers_final_positions

In [None]:
constructor_driver_connection

In [None]:
drivers_with_constructors = pd.merge(drivers_final_positions, constructor_driver_connection, on=['driverId', 'year'], how='left')
print(drivers_with_constructors)

In [None]:
connstructor_points = get_constructor_points(conn)
print(connstructor_points)

In [None]:
connstructor_points['constructor_points'] = connstructor_points['constructor_points'].astype(int)
connstructor_points['constructor_place'] = connstructor_points['constructor_place'].astype(int)

In [None]:
connstructor_points

In [None]:
connstructor_points = connstructor_points.sort_values(by=['year', 'constructorId'])

for index, row in connstructor_points.iterrows():
    temp_df = connstructor_points[(connstructor_points['constructorId'] == row['constructorId']) & (connstructor_points['year'] <= row['year'])]
    
    temp_first_place_count = temp_df[temp_df['constructor_place'] == 1].shape[0]
    temp_second_place_count = temp_df[temp_df['constructor_place'] == 2].shape[0]
    temp_third_place_count = temp_df[temp_df['constructor_place'] == 3].shape[0]
    temp_top_5_count = temp_df[temp_df['constructor_place'] <= 5].shape[0]
    
    connstructor_points.at[index, 'constructor_first_place_count'] = temp_first_place_count
    connstructor_points.at[index, 'constructor_second_place_count'] = temp_second_place_count
    connstructor_points.at[index, 'constructor_third_place_count'] = temp_third_place_count
    connstructor_points.at[index, 'constructor_top_5_count'] = temp_top_5_count

print(connstructor_points)


In [None]:
connstructor_points['constructor_first_place_count'] = connstructor_points['constructor_first_place_count'].astype(int)
connstructor_points['constructor_second_place_count'] = connstructor_points['constructor_second_place_count'].astype(int)
connstructor_points['constructor_third_place_count'] = connstructor_points['constructor_third_place_count'].astype(int)
connstructor_points['constructor_top_5_count'] = connstructor_points['constructor_top_5_count'].astype(int)

In [None]:
connstructor_points

In [None]:
drivers_with_constructors = pd.merge(drivers_with_constructors, connstructor_points, on=['constructorId', 'year'], how='left')
print(drivers_with_constructors)

In [None]:
drivers_with_constructors.fillna(0)

In [None]:
best_performing_circuits = get_best_performing_circuits(conn)
drivers_with_constructors = pd.merge(drivers_with_constructors, best_performing_circuits, on=['year','driverId' ], how='left')
print(drivers_with_constructors)

In [None]:
drivers_with_constructors.fillna(0, inplace=True)

In [None]:
drivers_with_constructors['constructor_first_place_count'] = drivers_with_constructors['constructor_first_place_count'].astype(int)
drivers_with_constructors['constructor_second_place_count'] = drivers_with_constructors['constructor_second_place_count'].astype(int)
drivers_with_constructors['constructor_third_place_count'] = drivers_with_constructors['constructor_third_place_count'].astype(int)
drivers_with_constructors['constructor_top_5_count'] = drivers_with_constructors['constructor_top_5_count'].astype(int)
drivers_with_constructors['constructor_points'] = drivers_with_constructors['constructor_points'].astype(int)
drivers_with_constructors['constructor_place'] = drivers_with_constructors['constructor_place'].astype(int)

In [None]:
drivers_with_constructors

# Connecting Laps with Pitstops

In [None]:
lap_pit_df = pd.merge(lap_times_stats, pit_stop_stats, on=['driverId', 'year'], how='left', suffixes=('_lap', '_pit'))

print(lap_pit_df)

In [None]:
def categorize_total_laps(laps):
    if laps <= 1000:
        return '0-1000 Laps'
    elif laps <= 3000:
        return '1001-3000 Laps'
    elif laps <= 6000:
        return '3001-6000 Laps'
    elif laps <= 9000:
        return '6001-9000 Laps'
    elif laps <= 12000:
        return '9001-12000 Laps'
    else:
        return 'More than 12000 Laps'

lap_pit_df['total_laps'] = lap_pit_df['total_laps'].apply(categorize_total_laps)


In [None]:
lap_pit_df

In [None]:
categorical_columns = ['min_pit_stop_time', 'avg_pit_time', 'max_pit_stop_time', 'most_common_pitstop_lap']

for col in categorical_columns:
    lap_pit_df[col] = lap_pit_df[col].astype('category').cat.add_categories('Unknown')
    lap_pit_df[col].fillna('Unknown', inplace=True)

print(lap_pit_df)


In [None]:
final_df = pd.merge(drivers_with_constructors, lap_pit_df, on=['driverId', 'year'], how='left')

In [None]:
final_df

In [None]:
columns_to_check = ['min_lap_time_diff', 'avg_lap_time_diff', 'max_lap_time_diff']

final_df.dropna(subset=columns_to_check, inplace=True)

print(final_df)


In [None]:
positions = get_most_common_positions(conn)

In [None]:
positions['most_common_ending_position'] = positions['most_common_ending_position'].replace({'\\N': 'Did Not Finish'})

In [None]:
positions['most_common_starting_position'] = positions['most_common_starting_position'].replace(0, 'Unknown')

In [None]:
def categorize_pitstop_lap(lap):
    if isinstance(lap, str):
        return 'Unknown'
    if lap <= 10:
        return 'Early Pitstop'
    elif lap <= 35:
        return 'Mid-race Pitstop'
    else:
        return 'Late Pitstop'

final_df['most_common_pitstop_lap'] = final_df['most_common_pitstop_lap'].apply(categorize_pitstop_lap)

positions['most_common_starting_position'] = pd.to_numeric(positions['most_common_starting_position'], errors='coerce')
positions['most_common_ending_position'] = pd.to_numeric(positions['most_common_ending_position'], errors='coerce')

def categorize_position(position):
    if isinstance(position, str):
        return 'Unknown'
    if position <= 5:
        return 'Top 5'
    elif position <= 10:
        return 'Top 10'
    else:
        return 'Beyond Top 10'

positions['most_common_starting_position'] = positions['most_common_starting_position'].apply(categorize_position)
positions['most_common_ending_position'] = positions['most_common_ending_position'].apply(categorize_position)


print(final_df)

print(positions)

In [None]:
final_df = pd.merge(final_df, positions, on=['driverId', 'year'], how='left')

In [None]:
final_df

In [None]:
final_df['median_lap_position'] = final_df['median_lap_position'].astype(int)

In [None]:
final_df

In [None]:
drivers_per_year = final_df.copy()

In [None]:
drivers_per_year

In [None]:
drivers_per_year = drivers_per_year.drop('driverId', axis=1)
drivers_per_year = drivers_per_year.drop('year', axis=1)
drivers_per_year = drivers_per_year.drop('constructorId', axis=1)

In [None]:
drivers_per_year

In [None]:
drivers_per_year.to_csv("drivers_per_year.csv", index=False, encoding='utf-8')

In [None]:
df = final_df.sort_values(by='year', ascending=False)
df = df.astype(str)

numeric_columns = ['year','driver_points', 'driver_end_position', 'constructor_points', 'constructor_place','median_lap_position', 'total_laps']
df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors='coerce')

aggregation_functions = {
    'year': 'max',
    'driver_nationality': 'first',
    'age_at_first_race': 'first',
    'first_place_count': 'first',
    'second_place_count': 'first',
    'third_place_count': 'first',
    'top_5_count': 'first',
    'constructorName': 'first',
    'constructor_nationality': 'first',
    'constructor_first_place_count': 'first',
    'constructor_second_place_count': 'first',
    'constructor_third_place_count': 'first',
    'constructor_top_5_count': 'first',
    'best_performing_circuit_name': lambda x: x.mode().iloc[0], 
    'min_lap_time_diff': lambda x: x.mode().iloc[0],
    'avg_lap_time_diff': lambda x: x.mode().iloc[0],
    'max_lap_time_diff': lambda x: x.mode().iloc[0],
    'median_lap_position':lambda x: round(x.mean()) if x.notna().any() else 'Unknown',
    'min_pit_stop_time': lambda x: x.mode().iloc[0],
    'avg_pit_time': lambda x: x.mode().iloc[0],
    'max_pit_stop_time': lambda x: x.mode().iloc[0],
    'most_common_pitstop_lap': lambda x: x.mode().iloc[0],
    'most_common_starting_position': lambda x: x.mode().iloc[0] if x.notna().any() else 'Unknown',
    'most_common_ending_position': lambda x: x.mode().iloc[0] 
}

aggregated_df = df.groupby('driverId').agg(aggregation_functions)

aggregated_df = aggregated_df.reset_index()

In [None]:
aggregated_df

In [None]:
aggregated_df = aggregated_df.drop(['year', 'driverId'], axis=1)

In [None]:
aggregated_df

In [None]:
aggregated_df['top_5_count'] = aggregated_df['top_5_count'].astype(int)

In [None]:
drivers_lost = aggregated_df[aggregated_df['top_5_count'] == 0]
drivers_won = aggregated_df[aggregated_df['top_5_count'] != 0]

In [None]:
drivers_lost

In [None]:
drivers_won

In [None]:
import inflect

def number_to_words(number):
    p = inflect.engine()
    return p.number_to_words(number)

columns_to_convert = ['first_place_count','second_place_count','third_place_count','top_5_count','age_at_first_race','constructor_first_place_count','constructor_second_place_count','constructor_third_place_count','constructor_top_5_count', 'median_lap_position']
drivers_won[columns_to_convert] = drivers_won[columns_to_convert].fillna(0).astype(int)

drivers_won[columns_to_convert] = drivers_won[columns_to_convert].apply(pd.to_numeric, errors='coerce')

drivers_won[columns_to_convert] = drivers_won[columns_to_convert].applymap(number_to_words)

print(drivers_won)



In [None]:
import inflect

def number_to_words(number):
    p = inflect.engine()
    return p.number_to_words(number)

columns_to_convert = ['first_place_count','second_place_count','third_place_count','top_5_count','age_at_first_race','constructor_first_place_count','constructor_second_place_count','constructor_third_place_count','constructor_top_5_count', 'median_lap_position']
drivers_lost[columns_to_convert] = drivers_lost[columns_to_convert].fillna(0).astype(int)

drivers_lost[columns_to_convert] = drivers_lost[columns_to_convert].apply(pd.to_numeric, errors='coerce')

drivers_lost[columns_to_convert] = drivers_lost[columns_to_convert].applymap(number_to_words)

print(drivers_lost)



In [None]:
import inflect

def number_to_words(number):
    p = inflect.engine()
    return p.number_to_words(number)

columns_to_convert = ['first_place_count','second_place_count','third_place_count','top_5_count','age_at_first_race','constructor_first_place_count','constructor_second_place_count','constructor_third_place_count','constructor_top_5_count', 'median_lap_position']
aggregated_df[columns_to_convert] = aggregated_df[columns_to_convert].fillna(0).astype(int)

aggregated_df[columns_to_convert] = aggregated_df[columns_to_convert].apply(pd.to_numeric, errors='coerce')

aggregated_df[columns_to_convert] = aggregated_df[columns_to_convert].applymap(number_to_words)

print(aggregated_df)


In [None]:
aggregated_df.to_csv("all_drivers.csv", index=False, encoding='utf-8')

In [None]:
aggregated_df = aggregated_df[aggregated_df.ne('Unknown').all(axis=1)]

In [None]:
aggregated_df.to_csv("all_drivers_withiut_unknown.csv", index=False, encoding='utf-8')

In [None]:
drivers_won.to_csv("winning_drivers.csv", index=False, encoding='utf-8')

In [None]:
drivers_won = drivers_won[drivers_won.ne('Unknown').all(axis=1)]

In [None]:
drivers_won.to_csv("winning_drivers_without_unknown.csv", index=False, encoding='utf-8')

In [None]:
drivers_lost.to_csv("losing_drivers.csv", index=False, encoding='utf-8')

In [None]:
drivers_lost = drivers_lost[drivers_lost.ne('Unknown').all(axis=1)]

In [None]:
drivers_lost.to_csv("losing_drivers_without_unknown.csv", index=False, encoding='utf-8')