In [138]:
# importing required libraries 

import pandas as pd
import seaborn as sns
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
import matplotlib.pyplot as plt 
import numpy as np
from sklearn.model_selection import train_test_split
import warnings
warnings.simplefilter("ignore")
pd.set_option('display.max_columns', None)

In [139]:
tyre_data_df = pd.read_csv("data/tyre_data.csv")

In [140]:
tyre_data_df.head()

Unnamed: 0.1,Unnamed: 0,Driver,GP,Tyres,From,To,#Laps,year
0,0,Jenson Button,Australia,Soft Used,1,16,16,2012
1,1,Jenson Button,Australia,Medium New,17,36,20,2012
2,2,Jenson Button,Australia,Medium New,37,58,22,2012
3,3,Sebastian Vettel,Australia,Soft Used,1,16,16,2012
4,4,Sebastian Vettel,Australia,Soft Used,17,37,21,2012


In [141]:
tyre_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13302 entries, 0 to 13301
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  13302 non-null  int64 
 1   Driver      13297 non-null  object
 2   GP          13302 non-null  object
 3   Tyres       13302 non-null  object
 4   From        13302 non-null  int64 
 5   To          13302 non-null  int64 
 6   #Laps       13302 non-null  int64 
 7   year        13302 non-null  int64 
dtypes: int64(5), object(3)
memory usage: 831.5+ KB


In [142]:
tyre_data_df.isna().sum()

Unnamed: 0    0
Driver        5
GP            0
Tyres         0
From          0
To            0
#Laps         0
year          0
dtype: int64

In [143]:
tyre_data_df.year.unique()

array([2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022,
       2023])

In [144]:
tyre_data_df = tyre_data_df.drop("Unnamed: 0", axis=1)

In [145]:
tyre_data_df.columns = tyre_data_df.columns.str.lower()

In [146]:
tyre_data_df.head()

Unnamed: 0,driver,gp,tyres,from,to,#laps,year
0,Jenson Button,Australia,Soft Used,1,16,16,2012
1,Jenson Button,Australia,Medium New,17,36,20,2012
2,Jenson Button,Australia,Medium New,37,58,22,2012
3,Sebastian Vettel,Australia,Soft Used,1,16,16,2012
4,Sebastian Vettel,Australia,Soft Used,17,37,21,2012


In [147]:
tyre_data_df.tyres.unique()

array(['Soft Used', 'Medium New', 'Soft New', 'Medium Used',
       'Intermediate New', 'Wet New', 'Hard New', 'Intermediate Used',
       'Lluvia Extrema Usado', 'Hard Used', 'Supersoft Used',
       'Supersoft New', 'Ultrasoft New', 'Ultrasoft Used',
       'Hiper Blando Usado', 'Hiper Blando Nuevo'], dtype=object)

In [148]:
# Remove "Used" and "New" from the "tyres" column values
#tyre_data_df['tyres'] = tyre_data_df['tyres'].str.replace(' Used', '').str.replace(' New', '')


In [149]:
# Remove " Used" and " New" from the 'tyres' column
tyre_data_df['tyres'] = tyre_data_df['tyres'].str.replace(' Used', '').str.replace(' New', '')

# Print the DataFrame to verify the changes
print(tyre_data_df)


                 driver           gp         tyres  from  to  #laps  year
0         Jenson Button    Australia          Soft     1  16     16  2012
1         Jenson Button    Australia        Medium    17  36     20  2012
2         Jenson Button    Australia        Medium    37  58     22  2012
3      Sebastian Vettel    Australia          Soft     1  16     16  2012
4      Sebastian Vettel    Australia          Soft    17  37     21  2012
...                 ...          ...           ...   ...  ..    ...   ...
13297       Zhou Guanyu  Netherlands          Soft    38  60     23  2023
13298    Logan Sargeant  Netherlands          Soft     1  14     14  2023
13299    Max Verstappen  Netherlands  Intermediate    65  72      8  2023
13300      Pierre Gasly  Netherlands          Soft    47  60     14  2023
13301   Nico Hulkenberg  Netherlands        Medium    17  48     32  2023

[13302 rows x 7 columns]


In [150]:
tyre_data_df.tyres.unique()

array(['Soft', 'Medium', 'Intermediate', 'Wet', 'Hard',
       'Lluvia Extrema Usado', 'Supersoft', 'Ultrasoft',
       'Hiper Blando Usado', 'Hiper Blando Nuevo'], dtype=object)

In [151]:
# Define a dictionary to map old values to new values
tyres_mapping = {
    'Hiper Blando Usado': 'Hyper Soft',
    'Hiper Blando Nuevo': 'Hyper Soft',
    'Lluvia Extrema Usado': 'Full Wet'
}

# Use the replace() method to update the values in the "tyres" column
tyre_data_df['tyres'] = tyre_data_df['tyres'].replace(tyres_mapping)


In [152]:
tyred_data_df = tyre_data_df

In [153]:
tyre_data_df.tyres.unique()

array(['Soft', 'Medium', 'Intermediate', 'Wet', 'Hard', 'Full Wet',
       'Supersoft', 'Ultrasoft', 'Hyper Soft'], dtype=object)

In [154]:
tyre_data_df.gp.unique()

array(['Australia', 'Malaysia', 'China', 'Bahrain', 'Spain', 'Monaco',
       'Canada', 'Europa', 'Great Britain', 'Germany', 'Hungary',
       'Belgium', 'Italy', 'Singapore', 'Japan', 'Korea', 'India',
       'Abu Dhabi', 'United States', 'Brazil', 'Austria', 'Russia',
       'Mexico', 'Europe', 'Azerbaijan', 'France', 'Styria',
       '70TH Anniversary', 'Tuscany', 'Russian', 'Portugal', 'Imola',
       'Turkey', 'Sakhir', 'Netherlands', 'Qatar', 'Saudi Arabia',
       'Italia', 'Miami', 'Francia'], dtype=object)

In [155]:
tyre_data_df.gp.nunique()

40

USA Grand Prix is in Austin

In [156]:
desired_values = ['Europa', 'Europe']

# Create a boolean mask to filter rows where the 'gp' column matches the desired values
mask = tyre_data_df['gp'].isin(desired_values)

# Use the mask to select the rows and print them
filtered_rows = tyre_data_df[mask]
print(filtered_rows)


               driver      gp      tyres  from  to  #laps  year
530   Fernando Alonso  Europa       Soft     1  15     15  2012
531   Fernando Alonso  Europa       Soft    16  28     13  2012
532   Fernando Alonso  Europa     Medium    29  57     29  2012
533    Kimi Räikkönen  Europa       Soft     1  14     14  2012
534    Kimi Räikkönen  Europa       Soft    15  28     14  2012
...               ...     ...        ...   ...  ..    ...   ...
5538  Pascal Wehrlein  Europe       Soft     1  29     29  2016
5539  Pascal Wehrlein  Europe     Medium    30  39     10  2016
5540     Carlos Sainz  Europe       Soft     5  27     23  2016
5541     Carlos Sainz  Europe  Supersoft    28  31      4  2016
5542  Marcus Ericsson  Europe  Supersoft    17  25      9  2016

[125 rows x 7 columns]


In [157]:
# List of years from 2012 to 2016
years = [2012, 2013, 2014, 2015, 2016]

# Loop through the years and print unique values for each year
for year in years:
    # Filter the DataFrame for the current year
    tyre_data_year = tyre_data_df[tyre_data_df['year'] == year]
    
    # Get the unique values from the "gp" column for the current year
    unique_gp_values = tyre_data_year['gp'].unique()
    
    # Print the year and its unique values
    print(f"Year {year}:")
    print(unique_gp_values)
    print()


Year 2012:
['Australia' 'Malaysia' 'China' 'Bahrain' 'Spain' 'Monaco' 'Canada'
 'Europa' 'Great Britain' 'Germany' 'Hungary' 'Belgium' 'Italy'
 'Singapore' 'Japan' 'Korea' 'India' 'Abu Dhabi' 'United States' 'Brazil']

Year 2013:
['Australia' 'Malaysia' 'China' 'Bahrain' 'Spain' 'Monaco' 'Canada'
 'Great Britain' 'Germany' 'Hungary' 'Belgium' 'Italy' 'Singapore' 'Korea'
 'Japan' 'India' 'Abu Dhabi' 'United States' 'Brazil']

Year 2014:
['Australia' 'Malaysia' 'Bahrain' 'China' 'Spain' 'Monaco' 'Canada'
 'Austria' 'Great Britain' 'Germany' 'Hungary' 'Belgium' 'Italy'
 'Singapore' 'Japan' 'Russia' 'United States' 'Brazil' 'Abu Dhabi']

Year 2015:
['Australia' 'Malaysia' 'China' 'Bahrain' 'Spain' 'Monaco' 'Canada'
 'Austria' 'Great Britain' 'Hungary' 'Belgium' 'Italy' 'Singapore' 'Japan'
 'Russia' 'United States' 'Mexico' 'Brazil' 'Abu Dhabi']

Year 2016:
['Australia' 'Bahrain' 'China' 'Russia' 'Spain' 'Monaco' 'Canada' 'Europe'
 'Austria' 'Great Britain' 'Hungary' 'Germany' 'Belgium' 'It

Click [here](https://en.wikipedia.org/wiki/European_Grand_Prix) If you want to know more about the Grand Prix of "Europe" 

Click [here](https://en.wikipedia.org/wiki/70th_Anniversary_Grand_Prix) to know more about the '70TH Anniversary' Grand Prix.

In [158]:
import plotly.express as px

# List of street circuit names
street_circuits = ['Monaco', 'Australia', 'Singapore', 'Russia', 'Azerbaijan','Miami', 'Saudi Arabia', 'Canada']

# Filter the DataFrame to select rows where the "gp" column is in the list of street circuits
street_circuit_rows = tyre_data_df[tyre_data_df['gp'].isin(street_circuits)]

# Create a non-stacked histogram to visualize the distribution of updated tire types
fig = px.histogram(street_circuit_rows, x='tyres', color='gp', title='Tire Type Distribution on Street Circuits',
                   labels={'tyres': 'Tyre Types', 'gp': 'Street Circuit'},
                   barmode='group')  # Set barmode to 'group' for non-stacked bars

# Customize the layout
fig.update_layout(xaxis_tickangle=-45, xaxis_title=None, yaxis_title='Count')

# Show the plot
fig.show()


In [159]:
import plotly.express as px

# List of street circuit names
street_circuits = ['Monaco', 'Australia', 'Singapore', 'Russia', 'Azerbaijan','Miami', 'Saudi Arabia','Canada']

# Filter the DataFrame to select rows where the "gp" column is in the list of street circuits
street_circuit_rows = tyre_data_df[tyre_data_df['gp'].isin(street_circuits)]

# Calculate the percentage distribution of tire types
percentage_distribution = street_circuit_rows['tyres'].value_counts(normalize=True) * 100

# Create a DataFrame with the calculated percentages
percentage_df = percentage_distribution.reset_index()
percentage_df.columns = ['Tyre Types', 'Percentage (%)']

# Create a bar chart to visualize the percentage distribution of updated tire types
fig = px.bar(percentage_df, x='Tyre Types', y='Percentage (%)',
             title='Tyre Type Distribution on Street Circuits 2012-2023<br><sup>Street Circuits: Monaco, Australia, Singapore, Russia, Azerbaijan, Saudi Arabia, Miami and Canada</sup>',
             color_discrete_sequence=["#00A08B"],
             text=percentage_df['Percentage (%)'].apply(lambda x: f'{x:.0f}%'))

# Update marker properties for the bars
fig.update_traces(
    marker=dict(line=dict(width=1, color='black'))
)

# Increase title font size
fig.update_layout(
    title_font=dict(size=24),
    xaxis_title_font=dict(size=18),  # Increase x-axis label font size
    yaxis_title_font=dict(size=18)   # Increase y-axis label font size
)

# Show the plot
fig.show()


In [161]:
race_circuits = ['Malaysia', 'China', 'Bahrain', 'Spain',
       'Europa', 'Great Britain', 'Germany', 'Hungary',
       'Belgium', 'Italy','Japan', 'Korea', 'India',
       'Abu Dhabi', 'United States', 'Brazil', 'Austria',
       'Mexico', 'Europe','France', 'Styria',
       '70TH Anniversary', 'Tuscany', 'Russian', 'Portugal', 'Imola',
       'Turkey', 'Sakhir', 'Netherlands', 'Qatar',
       'Italia', 'Francia']

# Filter the DataFrame to select rows where the "gp" column is in the list of street circuits
race_circuit_rows = tyre_data_df[tyre_data_df['gp'].isin(race_circuits)]

# Calculate the percentage distribution of tire types
percentage_distribution = race_circuit_rows['tyres'].value_counts(normalize=True) * 100

# Create a DataFrame with the calculated percentages
percentage_df = percentage_distribution.reset_index()
percentage_df.columns = ['Tyre Types', 'Percentage (%)']

# Create a bar chart to visualize the percentage distribution of updated tire types
fig = px.bar(percentage_df, x='Tyre Types', y='Percentage (%)',
             title='Tyre Type Distribution on Race Circuits 2012-2023',
             color_discrete_sequence=["#00A08B"],
             text=percentage_df['Percentage (%)'].apply(lambda x: f'{x:.0f}%'))

# Update marker properties for the bars
fig.update_traces(
    marker=dict(line=dict(width=1, color='black'))
)

# Increase title font size
fig.update_layout(
    title_font=dict(size=24),
    xaxis_title_font=dict(size=18),  # Increase x-axis label font size
    yaxis_title_font=dict(size=18)   # Increase y-axis label font size
)

# Show the plot
fig.show()

In [163]:
# Export the cleaned DataFrame to a CSV file
tyre_data_df.to_csv('tyre_data_df_cleaned.csv', index=False)

# The 'index=False' argument ensures that the index column is not included in the CSV file

In [164]:
tyre_data_df.head()

Unnamed: 0,driver,gp,tyres,from,to,#laps,year
0,Jenson Button,Australia,Soft,1,16,16,2012
1,Jenson Button,Australia,Medium,17,36,20,2012
2,Jenson Button,Australia,Medium,37,58,22,2012
3,Sebastian Vettel,Australia,Soft,1,16,16,2012
4,Sebastian Vettel,Australia,Soft,17,37,21,2012


In [165]:
# Split the "Driver" column into "Forename" and "Surname"
tyre_data_df[['forename', 'surname']] = tyre_data_df['driver'].str.split(' ', n=1, expand=True)

# Drop the original "Driver" column
tyre_data_df.drop('driver', axis=1, inplace=True)

# Print the updated DataFrame
print(tyre_data_df)


                gp         tyres  from  to  #laps  year   forename     surname
0        Australia          Soft     1  16     16  2012     Jenson      Button
1        Australia        Medium    17  36     20  2012     Jenson      Button
2        Australia        Medium    37  58     22  2012     Jenson      Button
3        Australia          Soft     1  16     16  2012  Sebastian      Vettel
4        Australia          Soft    17  37     21  2012  Sebastian      Vettel
...            ...           ...   ...  ..    ...   ...        ...         ...
13297  Netherlands          Soft    38  60     23  2023       Zhou      Guanyu
13298  Netherlands          Soft     1  14     14  2023      Logan    Sargeant
13299  Netherlands  Intermediate    65  72      8  2023        Max  Verstappen
13300  Netherlands          Soft    47  60     14  2023     Pierre       Gasly
13301  Netherlands        Medium    17  48     32  2023       Nico  Hulkenberg

[13302 rows x 8 columns]


In [166]:
tyre_data_df.head()

Unnamed: 0,gp,tyres,from,to,#laps,year,forename,surname
0,Australia,Soft,1,16,16,2012,Jenson,Button
1,Australia,Medium,17,36,20,2012,Jenson,Button
2,Australia,Medium,37,58,22,2012,Jenson,Button
3,Australia,Soft,1,16,16,2012,Sebastian,Vettel
4,Australia,Soft,17,37,21,2012,Sebastian,Vettel


In [167]:
# Export the cleaned DataFrame to a CSV file
tyre_data_df.to_csv('tyre_data_df_cleaned.csv', index=False)

# The 'index=False' argument ensures that the index column is not included in the CSV file