# Outline

# Import libraries and data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, make_scorer
from sklearn.metrics import f1_score, confusion_matrix, precision_recall_curve, roc_curve, plot_roc_curve
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.tree import export_graphviz
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsClassifier

from lightgbm import LGBMRegressor

import os

In [None]:
# Unzip archive
!Unzip -o archive.zip

## Merge raw data

In [None]:
# Create empty DataFrames to accumulate the loaded values
# F1_data = pd.DataFrame()
# F1_data_year = pd.DataFrame()

In [None]:
# Utility Function
# def get_DataFrame_F1(dirname):
    #global F1_data
    #global F1_data_year
    #foldernames = os.listdir(dirname)
    #for folder in foldernames:
        #foldername = os.path.join(dirname, folder)
        #for file in foldername:
            #filenames = os.listdir(foldername)
            #for file in filenames:
                #filename = os.path.join(foldername, file)
                #F1_data_temp = pd.read_csv(filename)
                #F1_data_year = pd.concat([F1_data_year, F1_data_temp])
    #F1_data = pd.concat([F1_data, F1_data_year])
    #return F1_data

In [None]:
# Define dirname and create full DataFrame for EDA
# dirname = 'race_wise_data'
# get_DataFrame_F1(dirname)

## Load merged data

In [None]:
# Load the merged dataset.
# The file is looked up as 'final.csv' in the working directory rather than through a
# machine-specific absolute path; set the F1_DATA_PATH environment variable to point elsewhere.
F1_data = pd.read_csv(os.environ.get('F1_DATA_PATH', 'final.csv'))
F1_data

## Dataset Overview

# Data Preprocessing

In [None]:
# List the column names of 'F1_data', then preview its first five rows
feature_names = F1_data.columns
print('Features: ', feature_names)
F1_data.head(5)

In [None]:
# Count missing (Null / NaN) values per column of 'F1_data'
F1_data.isna().sum()

In [None]:
# Additional arranging of 'F1_data': order rows by season and round, drop the
# 'Unnamed: 0' column (presumably a leftover index from the CSV export) and renumber rows.
# Written as a non-mutating chain with errors='ignore' so the cell can be re-run:
# the in-place drop raised KeyError on a second run because the column was already gone.
F1_data = (
    F1_data
    .sort_values(by=['season', 'round'], ascending=True)
    .drop(columns='Unnamed: 0', errors='ignore')
    .reset_index(drop=True)
)
F1_data

In [None]:
# Build the training frame from 'F1_data' without the identifier / categorical columns
dropped_columns = ['season', 'round', 'circuit_id', 'driver', 'nationality', 'constructor']
F1_data_train = F1_data.drop(columns=dropped_columns)

# Encode every weather flag as an integer: 1 where the value equals True, 0 otherwise
weathers = ['weather_warm', 'weather_cold', 'weather_dry', 'weather_wet', 'weather_cloudy']
for weather in weathers:
    F1_data_train[weather] = F1_data_train[weather].eq(True).astype('int64')

F1_data_train

## Summary
The dataset has already been preprocessed, so there is nothing left to do.  
However, it contains 21 features, which is relatively many compared with other datasets.


# Decomposition

In [None]:
# Check correlation of 'F1_data'
# Only numeric / boolean columns are correlated. 'F1_data' still holds non-numeric columns
# (e.g. 'driver', 'nationality' — presumably text), and DataFrame.corr() raises on those
# with pandas >= 2.0 instead of silently skipping them.
plt.figure(figsize=(15, 15))
plt.title('Corrleation Heatmap of F1_data')
sns.heatmap(F1_data.select_dtypes(include=['number', 'bool']).corr(), annot=True, fmt='.1g', linewidths=.3)

## Description
As the heatmap above shows, the features from 'grid' to 'constructor_standings_pos' are strongly correlated with one another.  
Therefore, we will reduce their dimensionality with PCA.

In [None]:
# Standardise the seven correlated features and fit a 2-component PCA on them.
# (The former `F1_data_scaled = F1_data.copy()` was a dead full-frame copy: the name
# is bound to the scaler output right below, so the copy was never used.)
features_dec = ['grid', 'podium', 'driver_points', 'driver_standings_pos', 'constructor_points', 'constructor_wins', 'constructor_standings_pos']

scaler = StandardScaler()
F1_data_scaled = scaler.fit_transform(F1_data[features_dec])
pca = PCA(n_components=2)
pca.fit(F1_data_scaled)
# Fraction of the total variance captured by each principal component
print('Variability by PCA Components: ', pca.explained_variance_ratio_)

## Description
We can explain the variance of the 7 features with 2 PCA components.  
Together they capture about 73% of the total variance, with the first component alone accounting for 55%.

In [None]:
# Separate the label 'driver_wins' from the remaining feature columns of the training frame
F1_data_scaled = F1_data_train.copy()

y_label = F1_data_scaled['driver_wins']
X_features = F1_data_scaled.drop(columns='driver_wins')
print('Shape of X_features: {0} / Shape of y_label: {1}'.format(X_features.shape, y_label.shape))

In [None]:
# Baseline: 3-fold cross-validated accuracy of a random forest on the original features
rf_clf = RandomForestClassifier(n_estimators=300, random_state=11)
scores = cross_val_score(
    rf_clf,
    X_features,
    y_label,
    scoring='accuracy',
    cv=3,
)

print('Accuracy by each fold: ', scores)
print('Average Accuracy: {0:.4f}'.format(scores.mean()))

In [None]:
# Same 3-fold evaluation after projecting the features onto 7 principal components
# NOTE(review): the PCA is fitted once on all rows of X_features, outside the CV folds,
# and X_features looks unscaled at this point — confirm this is intended.
pca = PCA(n_components=7)
df_pca = pca.fit_transform(X_features)
scores_pca = cross_val_score(
    rf_clf,
    df_pca,
    y_label,
    scoring='accuracy',
    cv=3,
)

print('Accuracy by each fold PCA converted: ', scores_pca)
print('Average Accuracy PCA converted: {0:.4f}'.format(scores_pca.mean()))

## Summary
The number of columns was reduced from 14 to 7, a reduction of 50%.  
However, the accuracy dropped from 88.99% to 83.44%, a decrease of about 5.6 percentage points.  

# Visualization

## Distribution of Weather

In [None]:
# Extract the weather columns (selected by position) for plotting
F1_data_weather = F1_data.iloc[:, [3, 4, 5, 6, 7]]

# One distribution plot per weather feature on a 2 x 3 grid
fig, axs = plt.subplots(nrows=2, ncols=3, figsize=(12, 6))

for i, feature in enumerate(F1_data_weather.columns):
    row, col = divmod(i, 3)
    sns.distplot(F1_data_weather.iloc[:, i], ax=axs[row][col])

# The grid has more panels than features: hide the unused ones
for spare_ax in axs.flat[len(F1_data_weather.columns):]:
    spare_ax.set_visible(False)

plt.suptitle('Distirbution of Weather')
# tight_layout must be called; the bare attribute reference was a no-op
plt.tight_layout()

In [None]:
# Count the 0/1 occurrences of every weather flag for the pie charts.
# Series.value_counts() replaces the deprecated top-level pd.value_counts(); the counts are the same.
F1_data_weather_binary = F1_data_train.iloc[:, range(0, 5)]
F1_weather_warm = F1_data_weather_binary['weather_warm'].value_counts()
F1_weather_cold = F1_data_weather_binary['weather_cold'].value_counts()
F1_weather_dry = F1_data_weather_binary['weather_dry'].value_counts()
F1_weather_wet = F1_data_weather_binary['weather_wet'].value_counts()
F1_weather_cloudy = F1_data_weather_binary['weather_cloudy'].value_counts()

In [None]:
fig, axs = plt.subplots(ncols=5, figsize=(24, 3))

# One pie per weather flag, drawn left to right: warm, cold, dry, wet, cloudy
weather_counts = [
    F1_weather_warm,
    F1_weather_cold,
    F1_weather_dry,
    F1_weather_wet,
    F1_weather_cloudy,
]
for pie_ax, counts in zip(axs, weather_counts):
    counts.plot.pie(ax=pie_ax)

plt.suptitle('Distribution of Weather by Pie Chart')

## Description
The plots above show the distribution of the weather conditions.  
Generally, the weather was warm and moderate.

## Distribution of Nationality

In [None]:
# Create DataFrame for Pie Chart: one row per nationality with its number of entries.
# rename_axis + reset_index(name=...) fixes the column names explicitly. The previous
# rename mapping relied on pre-2.0 value_counts() naming; on pandas >= 2.0 it produced
# two 'count' columns and the sort below failed.
F1_data_nation = (
    F1_data['nationality']
    .value_counts()
    .rename_axis('nationality')
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)
F1_data_nation

In [None]:
# Add an integer code for every nationality label
le = LabelEncoder()
F1_data_nation['nationality_le'] = le.fit_transform(F1_data_nation['nationality'])
F1_data_nation

In [None]:
# Pie chart of the entry count of every nationality
pie, ax = plt.subplots(figsize=[10, 10])
labels = F1_data_nation['nationality']
ax.pie(F1_data_nation['count'], labels=labels, autopct="%.1f%%", pctdistance=0.5)
ax.set_title("Distribution of Nationality", fontsize=14)

In [None]:
# Pie chart restricted to the first ten rows of 'F1_data_nation'
pie, ax = plt.subplots(figsize=[10, 10])
top_rows = F1_data_nation.iloc[0:10]
labels = top_rows.iloc[:, 0]
ax.pie(top_rows.iloc[:, 1], labels=labels, autopct="%.1f%%", pctdistance=0.5)
ax.set_title("Distribution of Nationality [Top 10]", fontsize=14)

## Description
European nationalities dominate the distribution compared with other countries.

## Distribution of Age

In [None]:
# Density plot of the driver ages on a dedicated 10 x 6 figure
age_fig, age_ax = plt.subplots(figsize=(10, 6))
age_ax.set_title('Distribution of Age')
sns.distplot(F1_data['driver_age'], ax=age_ax)

## Description
The distribution of driver ages is close to a normal distribution, which is suitable for training.

# Create Datasets

In [None]:
# Scaling: standardise the feature columns only.
# The label 'driver_wins' keeps its original values. It used to be standardised together
# with the features, and the later `astype(int)` cast then truncated the standardised
# values toward zero, merging distinct win counts into the same class.
feature_cols = F1_data_train.columns.drop('driver_wins')
scaler = StandardScaler()
# StandardScaler works column-wise, so one fit over all feature columns matches the per-column loop
F1_data_train[feature_cols] = scaler.fit_transform(F1_data_train[feature_cols])

F1_data_train

In [None]:
# Label: 'driver_wins' cast to int; features: every other column of the training frame
# NOTE(review): astype(int) truncates toward zero — if 'driver_wins' is not integer-valued
# at this point, distinct values collapse into one class. Verify the column's scale.
y = F1_data_train['driver_wins'].astype(int)
X = F1_data_train.drop(columns='driver_wins')

In [None]:
# Hold out 20% of the rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=11,
)

# Report the shape of every split
for label, part in (
    ('Shape of X_train: ', X_train),
    ('Shape of X_test: ', X_test),
    ('Shape of y_train: ', y_train),
    ('Shape of y_test: ', y_test),
):
    print(label, part.shape)

# Classification

In [None]:
# Utility Function
def get_clf_eval(y_test, pred=None, pred_proba=None):
    """Print the confusion matrix and macro-averaged classification metrics.

    Parameters
    ----------
    y_test : array-like
        True labels.
    pred : array-like
        Predicted labels. Every metric needs it; the ``None`` default is kept
        only so the signature stays backward-compatible.
    pred_proba : array-like, optional
        Class scores as returned by ``predict_proba``. When given, the ROC AUC
        (one-vs-rest, macro average) is appended to the report. When omitted,
        the AUC is skipped instead of failing inside ``roc_auc_score``.

    Returns
    -------
    None
        The report is printed to stdout.
    """
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred, average='macro')
    recall = recall_score(y_test, pred, average='macro')
    f1 = f1_score(y_test, pred, average='macro')

    report = 'Accuracy: {0:.4f}, Precision: {1:.4f}, Recall {2:.4f}, F1: {3:.4f}'.format(accuracy, precision, recall, f1)
    if pred_proba is not None:
        scores = np.asarray(pred_proba)
        # Two-column scores mean a binary problem: roc_auc_score then expects
        # only the positive-class column, not the full (n_samples, 2) matrix.
        if scores.ndim == 2 and scores.shape[1] == 2:
            scores = scores[:, 1]
        roc_auc = roc_auc_score(y_test, scores, multi_class='ovr', average='macro')
        report += ', AUC:{0:.4f}'.format(roc_auc)

    print('Confusion Matrix')
    print(confusion)
    print(report)

In [None]:
# Utility Function
def get_clf_eval_edit(y_test, pred=None, pred_proba=None):
    """Print the confusion matrix plus accuracy and macro precision / recall / F1.

    Variant of the evaluation helper that reports no ROC AUC.

    Parameters
    ----------
    y_test : array-like
        True labels.
    pred : array-like
        Predicted labels.
    pred_proba : array-like, optional
        Accepted for signature compatibility; not used.

    Returns
    -------
    None
        The report is printed to stdout.
    """
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    # Precision, recall and F1 share the same call shape: macro average over classes
    precision, recall, f1 = [
        metric(y_test, pred, average='macro')
        for metric in (precision_score, recall_score, f1_score)
    ]

    summary = 'Accuracy: {0:.4f}, Precision: {1:.4f}, Recall {2:.4f}, F1: {3:.4f}'.format(accuracy, precision, recall, f1)
    print('Confusion Matrix')
    print(confusion)
    print(summary)

## Logistic Regression

In [None]:
# Train a LogisticRegression classifier with default hyper-parameters
lr_clf = LogisticRegression()
lr_clf.fit(X_train, y_train)

# Class predictions and per-class probabilities for the held-out set
lr_pred, lr_pred_proba = lr_clf.predict(X_test), lr_clf.predict_proba(X_test)

# Print the confusion matrix and the macro-averaged metrics
get_clf_eval_edit(y_test, lr_pred, lr_pred_proba)

## Comment
In fact, estimators such as LogisticRegression or DecisionTreeClassifier are fundamental yet powerful on binary classification problems.  
Even though I chose LogisticRegression, its accuracy and AUC are 94.27% and 99.19%, respectively.  
LogisticRegression is known as a good estimator not only for binary problems but also for multiclass problems.  

However, this dataset has no binary features.  
Therefore, it might be a good idea to binarize some features with custom threshold values.  
I will post an update soon.

Thanks for reviewing my Notebook!