In [2]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import seaborn as sns


# Load the raw CSV tables. The dataset directory is named once so the
# notebook can be pointed at another location by editing a single line.
DATA_DIR = '../dataset'

drivers = pd.read_csv(f'{DATA_DIR}/drivers.csv')
constructors = pd.read_csv(f'{DATA_DIR}/constructors.csv')
race_results = pd.read_csv(f'{DATA_DIR}/results.csv')
races = pd.read_csv(f'{DATA_DIR}/races.csv')
circuits = pd.read_csv(f'{DATA_DIR}/circuits.csv')
pit_stops = pd.read_csv(f'{DATA_DIR}/pitStops.csv')

# stock data
# Each table is trimmed to its join key(s) plus the columns the analysis uses
# *before* merging. With no shared non-key columns left, pandas never has to
# invent `_x` / `_y` suffixes, so the result does not depend on which tables
# happen to share column names such as `name`, `nationality` or `url`.
# NOTE(review): the table each column is taken from assumes the usual
# Ergast-style F1 schema (driver nationality in drivers.csv, `country`/`alt`
# in circuits.csv, `positionOrder`/`points`/`grid` in results.csv) — confirm.
driver_cols = drivers[['driverId', 'driverRef', 'nationality']]
result_cols = race_results[['raceId', 'driverId', 'constructorId', 'positionOrder', 'points', 'grid']]
race_cols = races[['raceId', 'circuitId']]
circuit_cols = circuits[['circuitId', 'country', 'alt']]
# The constructor's `name` is renamed up front so it is unambiguous downstream.
constructor_cols = constructors[['constructorId', 'name']].rename(columns={'name': 'constructor'})

# Inner joins, in the same order as before: driver -> result -> race -> circuit -> constructor.
driver_results = pd.merge(driver_cols, result_cols, on='driverId')
driver_race_results = pd.merge(driver_results, race_cols, on='raceId')
driver_race_circuit_results = pd.merge(driver_race_results, circuit_cols, on='circuitId')
driver_constructor_race_circuit_results = pd.merge(driver_race_circuit_results, constructor_cols, on='constructorId')

# Final column order; the join keys raceId / driverId are kept for the pit-stop merge.
filtered_stock = driver_constructor_race_circuit_results[['raceId', 'driverId', 'driverRef', 'nationality', 'constructor', 'positionOrder', 'points', 'grid', 'country', 'alt']]

#calculated data
# Both pit-stop features are computed per (raceId, driverId) pair from one
# shared grouping of the pit-stop table.
pit_keys = ["raceId", "driverId"]
pit_groups = pit_stops.groupby(pit_keys)

# pit num: count of non-null `stop` entries in each group
pit_num = pit_groups["stop"].count().reset_index()

# pit avg time: mean of the `milliseconds` column in each group
pit_time = pit_groups["milliseconds"].mean().reset_index()

# Combine the two aggregates side by side on the grouping keys.
pits = pit_num.merge(pit_time, on=pit_keys, how="left")

# data to use
# Right join: every (raceId, driverId) row of `pits` is kept; rows without a
# match in `filtered_stock` get NaN in the stock columns. The join keys are
# dropped afterwards because they are identifiers, not model inputs.
join_keys = ["raceId", "driverId"]
data = (
    filtered_stock
    .merge(pits, how="right", on=join_keys)
    .drop(columns=join_keys)
)
data.head()

ModuleNotFoundError: No module named 'pandas'

In [None]:
# Summary statistics of `data` (pandas `describe` defaults), as a quick sanity check.
data.describe()

In [None]:
# Correlation heatmap of the numeric columns only. `data` still holds the
# categorical columns that are one-hot encoded later, and a correlation
# matrix is only defined for numeric data; selecting by dtype keeps this cell
# working regardless of how the installed pandas treats non-numeric columns
# in `corr()`.
numeric_data = data.select_dtypes(include="number")
# `linewidths` is seaborn's documented name for the width of the cell borders.
sns.heatmap(numeric_data.corr(), annot=True, linewidths=1)

In [None]:
# Target is the finishing order. `points` is left out of the features as well
# (presumably because it is derived from the finishing position — confirm).
target_col = "positionOrder"
features = data.drop(columns=[target_col, "points"])
Y = data[target_col]

# One-hot encode the categorical columns; drop_first removes the first level
# of each so the dummy columns are not perfectly collinear.
X = pd.get_dummies(features, drop_first=True)
X.head()

In [None]:
# Hold out 40% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.4, random_state=101
)

# Print the shape of each split, in the order X_train, X_test, Y_train, Y_test.
for split in (X_train, X_test, Y_train, Y_test):
    print(split.shape)

In [None]:
# Fit an ordinary least-squares regression on the training split.
# `fit` returns the estimator itself, so construction and fitting are chained.
model = LinearRegression().fit(X_train, Y_train)
print(model.intercept_)

In [None]:
# One fitted coefficient per feature column, labelled by the column name.
coeff_param = pd.DataFrame(
    data=model.coef_,
    index=X.columns,
    columns=["Coefficient"],
)
coeff_param.head()

In [None]:
# Predict the finishing order for the held-out rows and plot actual (x)
# against predicted (y) values with a fitted regression line.
predictions = model.predict(X_test)
# x / y are passed by keyword: recent seaborn releases accept only `data`
# positionally, so the positional form raises a TypeError there.
sns.regplot(x=Y_test, y=predictions)

In [None]:
# Refit the same regression with statsmodels to get the full inference table
# (standard errors, p-values, R^2).
# The features are cast to float first: if the one-hot columns are boolean
# (the default dtype of `pd.get_dummies` in newer pandas), the mixed
# bool/numeric frame becomes an object array and statsmodels refuses it.
# statsmodels does not add an intercept on its own, hence `add_constant`.
X_train_Sm = sm.add_constant(X_train.astype(float))
ls = sm.OLS(Y_train, X_train_Sm).fit()
print(ls.summary())