In [4]:
%load_ext autoreload

In [3]:
%autoreload 2

import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from data_prep import load_and_preprocess_data, split_data


In [None]:
# Assume 'path_to_data_folder' is the path to the folder containing all player CSV files
# NOTE: the literal below is a placeholder and has to be replaced before this
# cell can load anything.
data_folder = 'path_to_data_folder'  # Change this to the actual path
# Forwarded unchanged to load_and_preprocess_data; presumably the number of
# consecutive rows per window -- TODO confirm against data_prep.
window_size = 6

# Load and preprocess the data using the project-local data_prep helper.
# NOTE(review): the result is used below like a DataFrame (df['features']);
# its actual schema is defined in data_prep and is not visible here.
df = load_and_preprocess_data(data_folder, window_size)

# Feature columns, grouped by the kind of preprocessing each group receives.
# NOTE(review): 'other_numerical_features' looks like a placeholder column
# name -- confirm the real column list before running.
numerical_features = ['goals', 'assists', 'other_numerical_features']
categorical_features = ['position']  # 'position' is assumed to be categorical

# One transformer per group: standardise the numeric columns and one-hot
# encode the categorical ones.
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder()

# Route each column group to its transformer.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

# Wrap the preprocessor in a single-step pipeline.
pipeline = Pipeline([('preprocessor', preprocessor)])

# Apply the transformations to the features: fit the pipeline on
# df['features'] and overwrite that column with the transformed output.
# NOTE(review): the ColumnTransformer in this pipeline selects columns by
# name ('goals', 'assists', 'position', ...). scikit-learn only supports
# string column selectors for DataFrame input, so if df['features'] is a
# single Series this call will presumably raise -- confirm what
# load_and_preprocess_data returns.
# NOTE(review): fit_transform runs on the whole dataset before split_data is
# called, so the scaler statistics and encoder categories are learned from
# rows that later land in the validation/test sets. Consider fitting on the
# training split only -- verify against split_data's contract first.
df['features'] = pipeline.fit_transform(df['features'])

# Split data into train, validation, and test sets.
# split_data is project-local (data_prep); the unpacking below relies on it
# returning exactly six values in this order.
X_train, y_train, X_val, y_val, X_test, y_test = split_data(df)

# Report the shape of every split, one line per array, in the order
# train / validation / test with features before targets.
for shape_label, split_array in (
    ("X_train shape:", X_train),
    ("y_train shape:", y_train),
    ("X_val shape:", X_val),
    ("y_val shape:", y_val),
    ("X_test shape:", X_test),
    ("y_test shape:", y_test),
):
    print(shape_label, split_array.shape)