# Testing One-Hot Encoding with MasterData5

Our original "Testing One-Hot Encoding" workbook used MasterData3 and looked at encoding for our multi-class models. Here we'll use MasterData5 and look at encoding for our binary models, with Completion Status as our target variable instead of familyStatus.

## Set Up Libraries and Data

In [1]:
# Import necessary data libraries.
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
import category_encoders as ce
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [2]:
# Set up URLs.
master_data_url = 'https://raw.githubusercontent.com/georgetown-analytics/Formula1/main/data/processed/MasterData5.csv'

In [3]:
# Set up dataframes.
master_data = pd.read_csv(master_data_url, sep = ',', engine = 'python')

## Establishing Variables

In [4]:
# Are there any NAs in our data?
master_data = master_data.drop(columns=['laps'],axis=1,inplace=False)
master_data.isna().sum()

raceId               0
driverId             0
constructorId        0
grid                 0
familyStatus         0
Completion Status    0
year                 0
circuitId            0
country              0
alt                  0
isHistoric           0
trackType            0
nationality          0
total_lap_time       0
average_lap_time     0
minimum_lap_time     0
PRCP                 0
TAVG                 0
TMAX                 0
TMIN                 0
binned_circuits      0
dtype: int64

In [5]:
# Looking at how many uniques we have in each major column
#the columns below are what we would like to oneHot
print(
    #master_data['Completion Status'].value_counts()
    #master_data['isHistoric'].value_counts()
    #master_data['trackType'].value_counts()
    master_data['binned_circuits'].value_counts()
)

Tier1    2558
Tier2    2262
Tier3    1771
Tier4    1218
Tier5    1030
Tier6     419
Name: binned_circuits, dtype: int64


In [6]:
def circuit_binner3(row):
    if row['circuitId'] in (9, 4, 11, 14, 6): #500s
        val = "1"
    elif row['circuitId'] in (18, 7, 1, 22, 13): #400s
        val = "2"
    elif row['circuitId'] in (2, 10, 3, 17, 20): #300s
        val = "3"
    elif row['circuitId'] in (8, 21, 15, 24, 70): #200s
        val = "4"
    elif row['circuitId'] in (5, 69, 23, 19, 71, 12, 32, 73): #100s
        val = "5"
    elif row['circuitId'] in (35, 68, 25, 34, 16, 75, 26, 27, 76): #<100s
        val = "6"
    else:
        val = "error"
    return val



In [63]:
master_data.head()

Unnamed: 0,raceId,driverId,constructorId,grid,familyStatus,CompletionStatus,year,circuitId,country,alt,...,trackType,nationality,total_lap_time,average_lap_time,minimum_lap_time,PRCP,TAVG,TMAX,TMIN,binned_circuits
0,1,2,2,9,4,1,2009,1,Australia,10,...,2,German,5662869,97635.672414,88283,0.0,72.0,78.0,66.0,2
1,1,3,3,5,4,1,2009,1,Australia,10,...,2,German,5661506,97612.172414,87706,0.0,72.0,78.0,66.0,2
2,1,4,4,10,4,1,2009,1,Australia,10,...,2,Spanish,5660663,97597.637931,88712,0.0,72.0,78.0,66.0,2
3,1,6,3,11,1,0,2009,1,Australia,10,...,2,Japanese,1560978,91822.235294,89923,0.0,72.0,78.0,66.0,2
4,1,7,5,17,4,1,2009,1,Australia,10,...,2,French,5662082,97622.103448,89823,0.0,72.0,78.0,66.0,2


# Quick Column Rename

In [7]:
# Rename Completion Status so it doesn't have any spaces. This will make it easier to use in the code below.
master_data = master_data.rename(columns={"Completion Status": "CompletionStatus"})

# X,y Setup

In [13]:
X = master_data.loc[:, ['average_lap_time', #Numeric
                        'trackType', #categorical
                        'alt', #numeric
                        'grid', #numeric
                        'average_lap_time', #numeric
                        'minimum_lap_time', #numeric
                        'year', #numeric
                        'PRCP', #numeric
                        'TAVG', #numeric
                        'isHistoric', #categorical
                        #'binned_circuits' #categorical
                       ]]
y = master_data.loc[:, 'CompletionStatus'] #categorical

# Basic Linear Regression Test

In [14]:
logreg = LogisticRegression(solver = 'lbfgs')

In [15]:
cross_val_score(logreg, X, y, cv=5, scoring='accuracy').mean()

0.7664731456815708

In [67]:
#notice we are NOT beating null hypothesis here... so with literally one X variable, we aren't getting much juice
y.value_counts(normalize=True)

1    0.767768
0    0.232232
Name: CompletionStatus, dtype: float64

# Transformations

In [19]:
column_trans =  make_column_transformer(
    (circuit_binner3(), ['binned_circuits']),
    (OneHotEncoder(), ['trackType', 'isHistoric', 'binned_circuits'],
    remainder='passthrough')

TypeError: circuit_binner3() missing 1 required positional argument: 'row'

In [18]:
column_trans.fit_transform(X)

NameError: name 'column_trans' is not defined

In [70]:
pipe = make_pipeline(column_trans, logreg) 

In [71]:
cross_val_score(pipe, X, y, cv=5, scoring='accuracy').mean()

0.7666891867451252

## Start One-Hot Encoding

We used these sites (https://towardsdatascience.com/target-encoding-for-multi-class-classification-c9a7bcb1a53 and https://www.analyticsvidhya.com/blog/2021/05/how-to-perform-one-hot-encoding-for-multi-categorical-variables/) as foundations for our code.

In [32]:
"""
Encode and transform CompletionStatus using the one-hot encoding code from Towards Data Science.
Note that the target variable must be a string here.
"""
encodeFamilyStatus = ce.OneHotEncoder().fit(master_data.CompletionStatus.astype(str))
y_onehot = encodeFamilyStatus.transform(master_data.CompletionStatus.astype(str))
y_onehot

  elif pd.api.types.is_categorical(cols):


Unnamed: 0,CompletionStatus_1,CompletionStatus_2
0,1,0
1,0,1
2,0,1
3,0,1
4,1,0
...,...,...
9461,0,1
9462,0,1
9463,0,1
9464,0,1


In [33]:
"""
One-hot encode the country column using the for loop shown in the Towards Data Science article.
"""
class_names = y_onehot.columns
for class_ in class_names:
  encodeCountry = ce.TargetEncoder(smoothing = 0)
  print(encodeCountry.fit_transform(master_data["country"], y_onehot[class_]))

       country
0     0.351812
1     0.351812
2     0.351812
3     0.351812
4     0.351812
...        ...
9461  0.271930
9462  0.271930
9463  0.271930
9464  0.271930
9465  0.271930

[9466 rows x 1 columns]
       country
0     0.648188
1     0.648188
2     0.648188
3     0.648188
4     0.648188
...        ...
9461  0.728070
9462  0.728070
9463  0.728070
9464  0.728070
9465  0.728070

[9466 rows x 1 columns]


  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):


In [34]:
"""
The Towards Data Science article gives a function, target_encode_multiclass,
in which one-hot encodes the entire dataset with the given target variable. That function is below.
"""
def target_encode_multiclass(X,y): #X,y are pandas df and series
    y = y.astype(str)   #convert to string to onehot encode
    enc = ce.OneHotEncoder().fit(y)
    y_onehot = enc.transform(y)
    class_names = y_onehot.columns  #names of onehot encoded columns
    X_obj = X.select_dtypes('object') #separate categorical columns
    X = X.select_dtypes(exclude='object') 
    for class_ in class_names:
      
        enc = ce.TargetEncoder()
        enc.fit(X_obj,y_onehot[class_]) #convert all categorical 
        temp = enc.transform(X_obj)       #columns for class_
        temp.columns = [str(x)+'_'+str(class_) for x in temp.columns]
        X = pd.concat([X,temp],axis=1)    #add to original dataset
      
    return X

In [35]:
# Use the above function to one-hot encode our master_data dataset with CompletionStatus as our target variable.
onehot_data = target_encode_multiclass(master_data, master_data["CompletionStatus"])

  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):


In [36]:
# Take a look at the new dataset using the describe() function.
onehot_data.describe()

Unnamed: 0,raceId,driverId,constructorId,grid,position,positionOrder,laps,fastestLap,rank,fastestLapSpeed,...,trackType_CompletionStatus_1,nationality_CompletionStatus_1,bundled_circuitId_CompletionStatus_1,binned_circuits_CompletionStatus_1,positionText_CompletionStatus_2,country_CompletionStatus_2,trackType_CompletionStatus_2,nationality_CompletionStatus_2,bundled_circuitId_CompletionStatus_2,binned_circuits_CompletionStatus_2
count,9466.0,9466.0,9466.0,9466.0,7302.0,9466.0,9466.0,6721.0,6762.0,6721.0,...,9466.0,9466.0,9466.0,9466.0,9466.0,9466.0,9466.0,9466.0,9466.0,9466.0
mean,500.169977,249.438411,36.706634,11.070357,8.744043,10.817135,52.982252,42.216039,10.692399,202.509826,...,0.249102,0.249099,0.249102,0.249102,0.750949,0.750898,0.750898,0.750901,0.750898,0.750898
std,408.988287,355.593273,63.937258,6.24087,5.090236,6.043638,17.737604,17.000168,6.059511,21.342117,...,0.021213,0.054796,0.017268,0.02031,0.407406,0.06322,0.021213,0.054796,0.017268,0.02031
min,1.0,1.0,1.0,0.0,1.0,1.0,0.0,2.0,0.0,89.54,...,0.237243,0.078947,0.159763,0.213611,0.0,0.515625,0.712955,0.428572,0.703704,0.722412
25%,121.0,15.0,4.0,6.0,4.0,6.0,49.0,32.0,5.0,192.346,...,0.237243,0.209566,0.250732,0.235686,0.877551,0.720901,0.762757,0.707641,0.749268,0.746549
50%,236.0,35.0,9.0,11.0,8.0,11.0,56.0,45.0,11.0,203.989,...,0.237243,0.240838,0.250732,0.253451,0.96868,0.746711,0.762757,0.759162,0.749268,0.746549
75%,934.0,810.0,20.0,16.0,13.0,16.0,66.0,54.0,16.0,215.688,...,0.237243,0.292359,0.250732,0.253451,0.997835,0.780303,0.762757,0.790434,0.749268,0.764314
max,1060.0,854.0,214.0,24.0,24.0,24.0,87.0,85.0,24.0,257.32,...,0.287045,0.571428,0.296296,0.277588,1.0,0.9,0.762757,0.921053,0.840237,0.786389


### Create a CSV file with our new one-hot encoded dataset.

In [37]:
# Use pandas.DataFrame.to_csv to create the CSV file.
onehot_data.to_csv("data/processed/OneHot_MasterData5.csv", index = False)