In [None]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Load data
# Parse the comma-separated file into a DataFrame
data = pd.read_csv('dataset.csv')

# Separate features and target
y = data['target_column']               # the column the model should predict
X = data.drop(columns='target_column')  # every remaining column is a feature

# Handle missing values
# Split the feature columns by dtype. A column counts as numerical only when
# pandas reports a numeric dtype (integers, floats, booleans); every other
# column -- object, string, category, ... -- is treated as categorical.
# Testing against 'object' alone would route string/category columns into the
# mean-imputation + scaling branch, which cannot process non-numeric values.
# NOTE(review): datetime/timedelta columns also land in the categorical group;
# convert or drop them beforehand if one-hot encoding them is not intended.
numerical_features = [
    column for column, dtype in X.dtypes.items()
    if pd.api.types.is_numeric_dtype(dtype)
]
categorical_features = [
    column for column, dtype in X.dtypes.items()
    if not pd.api.types.is_numeric_dtype(dtype)
]

# Numerical branch
# Step 1 replaces missing values with the column mean; step 2 standardises the
# column to zero mean and unit variance.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch
# Step 1 replaces missing values with the most frequent category; step 2
# one-hot encodes the column. handle_unknown='ignore' makes the encoder skip
# categories it did not see while fitting instead of raising.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns through its matching branch
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

# Split data
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Training rows: fit the preprocessor and transform them in one call
X_train_preprocessed = preprocessor.fit_transform(X_train)

# Test rows: transform only, reusing what was fitted on the training rows so
# that nothing from the test set leaks into the preprocessing
X_test_preprocessed = preprocessor.transform(X_test)


In [None]:
#NBA Data Preprocessing Template

# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Define the columns for the dependent (y) and independent (X) variables
y_column = 'WL_encoded'

# Columns deliberately kept out of the feature matrix; each name appears once.
x_excluded_independent_variables = [
    'GAME_DATE', 'TEAM_ABBREVIATION', 'WL', 'ORB%', 'DRB%', 'FGA', 'TOV', 'TOV%',
    'TEAM_NAME', 'MATCHUP', 'USG%', 'FG3A', 'OREB', 'PF', 'MIN', 'PTS', 'FGM',
    'FG3M', 'FTM', 'FTA', 'REB', 'AST', 'STL', 'BLK', 'DREB',
]

# Everything that must not be used as a feature: the target plus the exclusions
columns_to_drop = [y_column] + x_excluded_independent_variables

# Separate features and target
# NOTE(review): `season_data` is not created in this cell; it has to exist
# already (presumably loaded by an earlier cell) before this cell is run.
X = season_data.drop(columns=columns_to_drop)  # `columns=` already implies axis=1
y = season_data[y_column]  # reuse y_column so target and drop list cannot drift apart

# Handle missing values
# Split the feature columns by dtype. A column counts as numerical only when
# pandas reports a numeric dtype (integers, floats, booleans); every other
# column -- object, string, category, ... -- is treated as categorical.
# Testing against 'object' alone would route string/category columns into the
# mean-imputation + scaling branch, which cannot process non-numeric values.
# NOTE(review): datetime/timedelta columns also land in the categorical group;
# convert or drop them beforehand if one-hot encoding them is not intended.
numerical_features = [
    column for column, dtype in X.dtypes.items()
    if pd.api.types.is_numeric_dtype(dtype)
]
categorical_features = [
    column for column, dtype in X.dtypes.items()
    if not pd.api.types.is_numeric_dtype(dtype)
]

# Numerical branch
# Step 1 replaces missing values with the column mean; step 2 standardises the
# column to zero mean and unit variance.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch
# Step 1 replaces missing values with the most frequent category; step 2
# one-hot encodes the column. handle_unknown='ignore' makes the encoder skip
# categories it did not see while fitting instead of raising.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns through its matching branch
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])
# Quick look at the first rows of the feature matrix before splitting
print(X.head())

# Split data
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Training rows: fit the preprocessor and transform them in one call
X_train_preprocessed = preprocessor.fit_transform(X_train)

# Test rows: transform only, reusing what was fitted on the training rows so
# that nothing from the test set leaks into the preprocessing
X_test_preprocessed = preprocessor.transform(X_test)
