# Capstone Two: Pre‐processing & Training Data Development


In [59]:
# 0) imports

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline       import Pipeline
from sklearn.compose        import ColumnTransformer
from sklearn.impute         import SimpleImputer
from sklearn.preprocessing  import StandardScaler, OneHotEncoder


In [60]:
# Read the three input tables in one pass; the order of the file names
# matches the order of the names on the left-hand side.
adv, raw, sal = (
    pd.read_csv(csv_name)
    for csv_name in (
        "advanced_player_stats_checked.csv",   # advanced metrics
        "nba_player_stats_checked.csv",        # basic stats
        "nba_salary_checked.csv",              # salaries
    )
)




In [61]:
# Clean the Salary column
def clean_currency(x):
    """Convert a currency-formatted string such as "$1,234,567" to a float.

    Parameters
    ----------
    x : object
        A single cell value. For strings, every '$' and ',' is removed and
        surrounding whitespace is stripped before conversion. Any non-string
        value is returned unchanged.

    Returns
    -------
    float or object
        The parsed amount for a string, NaN for a string that is blank once
        the currency characters are removed, otherwise ``x`` itself.

    Raises
    ------
    ValueError
        If a non-blank string is still not a valid number after cleaning.
    """
    if not isinstance(x, str):
        return x
    cleaned = x.replace('$', '').replace(',', '').strip()
    # A blank cell (e.g. "" or "$") carries no amount; report it as missing
    # instead of letting float('') raise ValueError.
    if not cleaned:
        return float('nan')
    return float(cleaned)

# Run clean_currency over every Salary entry and store the result back.
# NOTE(review): `df` is not created in any cell shown in this notebook
# (execution counts jump from 61 to 63) — presumably it is a merge of
# adv/raw/sal; confirm that cell exists and runs before this one.
df['Salary'] = df['Salary'].map(clean_currency)


In [63]:
# Separate the prediction target (Salary) from the feature matrix.
# drop_cols lists the columns kept out of the features; the _x/_y names are
# presumably merge-suffixed duplicates — confirm against the merge step.
drop_cols = ['Player','Team','Rk_x','Rk_y','Age_x','Age_y','Pos_x','Pos_y']
y = df['Salary']
X = df.drop(columns=[*drop_cols, 'Salary'])


In [64]:
# Hold out 25% of the rows as the test set; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.25)


In [65]:
# Identify which columns are numeric vs. categorical.
# 'number' matches every numeric width (int32, float32, ...) rather than only
# int64/float64, and 'category' columns are accepted alongside object columns.
numeric_feats     = X_train.select_dtypes(include='number').columns.tolist()
categorical_feats = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

# A column that falls in neither list is never handed to a transformer built
# from these lists, so report it here instead of losing it without notice.
unassigned_feats = X_train.columns.difference(numeric_feats + categorical_feats).tolist()
if unassigned_feats:
    print("Columns not assigned to numeric or categorical features:", unassigned_feats)

In [66]:
# Build preprocessing pipeline

# Numeric columns: fill missing values with the column median, then standardise.
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler',   StandardScaler())
])

# Categorical columns: fill missing values with the most frequent value, then
# one-hot encode. handle_unknown='ignore' keeps transform() from raising on a
# category that was not present when the encoder was fitted.
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot',  OneHotEncoder(handle_unknown='ignore'))
])

# Route each column list to its pipeline. The output column order is the
# numeric block followed by the one-hot block.
preprocessor = ColumnTransformer(
    [
        ('num', numeric_pipeline,     numeric_feats),
        ('cat', categorical_pipeline, categorical_feats)
    ],
    # Always return a dense array. With the default threshold the stacked
    # output can become a scipy sparse matrix when the one-hot block dominates,
    # and the later pd.DataFrame(..., columns=...) step needs a dense 2-D array.
    sparse_threshold=0,
)

In [67]:
# Fit the preprocessor on the training data, then transform both sets.
# Only X_train is used for fitting; X_test is passed through transform() alone,
# so it is processed with whatever the preprocessor learned from X_train.
X_train_proc = preprocessor.fit_transform(X_train)
X_test_proc  = preprocessor.transform(X_test)

In [68]:
# 8) Recover column names
# Pull the fitted encoder out of the 'cat' pipeline and ask it for the names
# of the one-hot columns it produced for the categorical features.
ohe = preprocessor.named_transformers_['cat'].named_steps['onehot']
dummy_cols = list(ohe.get_feature_names_out(categorical_feats))
# Numeric names first, then the one-hot names, matching the order in which
# the transformers were listed in the ColumnTransformer.
all_cols   = [*numeric_feats, *dummy_cols]

# Wrap both processed arrays as DataFrames, keeping each split's original
# row index.
X_train_df, X_test_df = (
    pd.DataFrame(values, columns=all_cols, index=source.index)
    for values, source in ((X_train_proc, X_train), (X_test_proc, X_test))
)

In [70]:
# 10) Quick checks
# Print the processed feature-matrix shapes, then the target lengths.
for _label, _frame in (
    ("Train set shape (after preprocess):", X_train_df),
    ("Test set shape (after preprocess):", X_test_df),
):
    print(_label, _frame.shape)
print("y_train length:", len(y_train), "y_test length:", len(y_test))

Train set shape (after preprocess): (202, 75)
Test set shape (after preprocess): (68, 75)
y_train length: 202 y_test length: 68
