In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from category_encoders import TargetEncoder
from sklearn.impute import SimpleImputer

In [3]:
data = pd.read_csv('football_data.csv',low_memory=False)
target = data['Win_Prob']
data = data.drop(columns=['Win_Prob'])
data.size

1010000

In [4]:
data = data.loc[:, (data != 0).any(axis=0)]
data.size

1010000

In [5]:
if 'Date' in data.columns:
    data['Date'] = pd.to_datetime(data['Date'])
    data['Year'] = data['Date'].dt.year
    data['Month'] = data['Date'].dt.month
    data['Day'] = data['Date'].dt.day
    data.drop(columns=['Date'], inplace=True)

if 'time' in data.columns:
    data['Time'] = pd.to_datetime(data['time'], format='%H:%M:%S', errors='coerce')
    data['Hour'] = data['Time'].dt.hour
    data['Minute'] = data['Time'].dt.minute
    data.drop(columns=['Time'], inplace=True)

In [6]:
threshold = len(data) * 0.01  # 1% of total rows to keep
data = data.dropna(thresh=threshold, axis=1)
data.size

970000

In [7]:
distinct_threshold = 0.99  # 99% distinct values
data = data.loc[:, data.nunique() / len(data) < distinct_threshold]
data.size

970000

In [8]:
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=42)

In [9]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [10]:
categorical_columns = X_train.select_dtypes(include=['object']).columns.tolist()
target_encoder = TargetEncoder(cols=categorical_columns)

In [11]:
# Fit the encoder on the training data and transform
X_train_encoded = target_encoder.fit_transform(X_train[categorical_columns], y_train)

# Non-categorical columns in the training set
non_categorical_columns = X_train.select_dtypes(exclude=['object']).columns.tolist()

# Concatenating encoded categorical features with the non-categorical columns
X_train_encoded = pd.concat([X_train_encoded, X_train[non_categorical_columns].reset_index(drop=True)], axis=1)

# Transform validation and test data
X_val_encoded = target_encoder.transform(X_val[categorical_columns])
X_val_encoded = pd.concat([X_val_encoded, X_val[non_categorical_columns].reset_index(drop=True)], axis=1)

X_test_encoded = target_encoder.transform(X_test[categorical_columns])
X_test_encoded = pd.concat([X_test_encoded, X_test[non_categorical_columns].reset_index(drop=True)], axis=1)

In [12]:
# Step 10: Impute missing values
imputer = SimpleImputer(strategy='mean')  
X_train_encoded_imputed = imputer.fit_transform(X_train_encoded)
X_train_encoded = pd.DataFrame(X_train_encoded_imputed, columns=X_train_encoded.columns)

X_val_encoded_imputed = imputer.transform(X_val_encoded)
X_val_encoded = pd.DataFrame(X_val_encoded_imputed, columns=X_val_encoded.columns)

X_test_encoded_imputed = imputer.transform(X_test_encoded)
X_test_encoded = pd.DataFrame(X_test_encoded_imputed, columns=X_test_encoded.columns)

In [13]:
# Display final shapes
print("Train shape:", X_train_encoded.shape)
print("Validation shape:", X_val_encoded.shape)
print("Test shape:", X_test_encoded.shape)

Train shape: (8691, 97)
Validation shape: (2946, 97)
Test shape: (3590, 97)


In [14]:
data.shape

(10000, 97)