In [1]:
import wandb
import pandas as pd
import numpy as np
# from wandb import WandbCallback
from sklearn.model_selection import train_test_split

# wandb.init(  entity="drewmacl-uchicago", project="hw2-experiments")

## 1) Load in Athletes.csv

In [2]:
cols = ['candj','snatch', 'deadlift', 'backsq']
data = pd.read_csv('athletes.csv')
data['total_lift'] = data[cols].sum(axis=1)
data.head(1)

Unnamed: 0,athlete_id,name,region,team,affiliate,gender,age,height,weight,fran,...,deadlift,backsq,pullups,eat,train,background,experience,schedule,howlong,total_lift
0,2554.0,Pj Ablang,South West,Double Edge,Double Edge CrossFit,Male,24.0,70.0,166.0,,...,400.0,305.0,,,I workout mostly at a CrossFit Affiliate|I hav...,I played youth or high school level sports|I r...,I began CrossFit with a coach (e.g. at an affi...,I do multiple workouts in a day 2x a week|,4+ years|,925.0


In [3]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from catboost import CatBoostRegressor


## 2) Use cleaning Pipeline. Have ML pipeline also attached that could be used

In [4]:
# Using ChatGPT-4o along with previous data cleaning from assignemnt 1 and https://www.turing.com/kb/building-ml-pipeline-in-python-with-scikit-learn
class DropMissingSubset(BaseEstimator, TransformerMixin):
    def __init__(self, subset): self.subset = subset
    def fit(self, X, y=None): return self
    def transform(self, X): return X.dropna(subset=self.subset).copy()

class DropColumns(BaseEstimator, TransformerMixin):
    def __init__(self, columns): self.columns = columns
    def fit(self, X, y=None): return self
    def transform(self, X): return X.drop(columns=self.columns).copy()

class FilterOutliers(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None): return self
    def transform(self, X):
        df = X.copy()
        df = df[df['weight'] < 1500]
        df = df[df['gender'] != '--']
        df = df[df['age'] >= 18]
        df = df[(df['height'] < 96) & (df['height'] > 48)]
        df = df[((df['deadlift'] > 0) & ((df['deadlift'] <= 1105) |
               ((df['gender'] == 'Female') & (df['deadlift'] <= 636)))) &
                (df['candj'] > 0) & (df['candj'] <= 395) &
                (df['snatch'] > 0) & (df['snatch'] <= 496) &
                (df['backsq'] > 0) & (df['backsq'] <= 1069)]
        return df

class CleanSurvey(BaseEstimator, TransformerMixin):
    def __init__(self, decline_dict, subset):
        self.decline_dict = decline_dict
        self.subset = subset
    def fit(self, X, y=None): return self
    def transform(self, X):
        df = X.replace(self.decline_dict).copy()
        return df.dropna(subset=self.subset)

class DropAllMissing(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None): return self
    def transform(self, X): return X.dropna().copy()

# --- Combine into cleaning pipeline ---
cleaning_pipeline = Pipeline([
    ("drop_missing", DropMissingSubset(subset=[
        'region','age','weight','height','howlong','gender','eat','train',
        'background','experience','schedule','howlong',
        'deadlift','candj','snatch','backsq'
    ])),
    ("drop_cols", DropColumns(columns=[
        'affiliate','team','name','fran','helen','grace',
        'filthy50','fgonebad','run400','run5k','pullups','train'
    ])),
    ("outliers", FilterOutliers()),
    ("survey", CleanSurvey(
        decline_dict={'Decline to answer|': np.nan},
        subset=['background','experience','schedule','howlong','eat']
    )),
    ("drop_all_na", DropAllMissing())
])

# --- Custom full pipeline including split + model ---
class FullCatBoostPipeline:
    def __init__(self, cleaning_pipeline, target_col="total_lift", test_size=0.2, random_state=42):
        self.cleaning_pipeline = cleaning_pipeline
        self.target_col = target_col
        self.test_size = test_size
        self.random_state = random_state
        self.model = None

    def fit(self, data):
        # Step 1: Clean
        cleaned = self.cleaning_pipeline.fit_transform(data)

        # Step 2: Separate target
        X = cleaned.drop(columns=[self.target_col])
        y = cleaned[self.target_col]

        # Step 3: Train/test split
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=self.test_size, random_state=self.random_state
        )

        # Step 4: Detect categorical features
        categorical_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()
        self.categorical_cols = categorical_cols

        # Step 5: Fit CatBoost
        self.model = CatBoostRegressor(
            iterations=500,
            learning_rate=0.1,
            depth=6,
            cat_features=categorical_cols,
            verbose=0
        )
        self.model.fit(X_train, y_train)

        # Step 6: Evaluate
        y_pred = self.model.predict(X_test)
        print(f"MSE: {mean_squared_error(y_test, y_pred):.4f}")
        print(f"MAE: {mean_absolute_error(y_test, y_pred):.4f}")
        print(f"R²:  {r2_score(y_test, y_pred):.4f}")

        # Save for later use
        self.X_train, self.X_test = X_train, X_test
        self.y_train, self.y_test = y_train, y_test
        self.y_pred = y_pred

    def predict(self, X_new):
        return self.model.predict(X_new)

    def get_model(self):
        return self.model

In [5]:
data = cleaning_pipeline.fit_transform(data)

## 3) Using Feast Feature Store 

In [6]:
# Creating feature store with help from: https://pypi.org/project/feast/ and Chat Gpt-4o. Did so within Bash
from feast import FeatureStore

store = FeatureStore(repo_path="hw2_feature_repo/feature_repo")


## 4) Load data (using this logic in the files within feature_views folder. Differences in versions come from using different subsets

In [7]:
from datetime import datetime
data["event_timestamp"] = datetime.now()
data.to_parquet("hw2_feature_repo/feature_repo/data/athlete_features.parquet", index=False)

data_v2 = data[['athlete_id','gender', 'height', 'weight', 'total_lift']].copy()
data_v2["event_timestamp"] = datetime.now()
data_v2.to_parquet("hw2_feature_repo/feature_repo/data/athlete_features_v2.parquet", index=False)

In [8]:
# Put into python file feature_views/athlete_features.py

# did the same for athlete_features_v2.py

# from datetime import datetime
# data["event_timestamp"] = datetime.now()
# data.to_parquet("hw2_feature_repo/feature_repo/data/athlete_features.parquet", index=False)

# from feast import Entity, FeatureView, Field
# from feast.types import Float32, Int64, String
# from feast.infra.offline_stores.file_source import FileSource

# athlete = Entity(
#     name="athlete_id",
#     join_keys=["athlete_id"],
#     # value_type=Int64,
#     description="Unique athlete identifier"
# )

# athlete_source = FileSource(
#     path="data/athlete_features.parquet",
#     event_timestamp_column="event_timestamp",
# )

# athlete_features = FeatureView(
#     name="athlete_features",
#     entities=[athlete],
#     ttl=None,
#     schema=[
#         Field(name="gender", dtype=String),
#         Field(name="age", dtype=Float32),
#         Field(name="height", dtype=Float32),
#         Field(name="weight", dtype=Float32),
#         Field(name="candj", dtype=Float32),
#         Field(name="snatch", dtype=Float32),
#         Field(name="deadlift", dtype=Float32),
#         Field(name="backsq", dtype=Float32),
#         Field(name="eat", dtype=String),
#         Field(name="train", dtype=String),
#         Field(name="background", dtype=String),
#         Field(name="experience", dtype=String),
#         Field(name="schedule", dtype=String),
#         Field(name="howlong", dtype=String),
#     ],
#     source=athlete_source,
#     online=True,
# )

In [9]:
data.head(1)

Unnamed: 0,athlete_id,region,gender,age,height,weight,candj,snatch,deadlift,backsq,eat,background,experience,schedule,howlong,total_lift,event_timestamp
21,21269.0,Southern California,Male,30.0,71.0,200.0,235.0,175.0,385.0,315.0,I eat whatever is convenient|,I played youth or high school level sports|I p...,I began CrossFit by trying it alone (without a...,I do multiple workouts in a day 1x a week|I ty...,1-2 years|,1110.0,2025-07-17 16:58:25.943233


In [10]:
data.shape

(30015, 17)

In [11]:
data_v2.head(1)

Unnamed: 0,athlete_id,gender,height,weight,total_lift,event_timestamp
21,21269.0,Male,71.0,200.0,1110.0,2025-07-17 16:58:25.976887


In [12]:
data_v2.shape

(30015, 6)

In [13]:
entity_df = data[['athlete_id', 'event_timestamp']]
entity_df2 = data_v2[['athlete_id', 'event_timestamp']]

## 5a) Created 2 different versions of features

In [14]:
training_df_v1 = store.get_historical_features(
    entity_df=entity_df,
    features=[
        "athlete_features:gender",
        "athlete_features:candj",
        "athlete_features:snatch",
        "athlete_features:deadlift",
        "athlete_features:backsq",
        "athlete_features:eat",
        "athlete_features:background",
        "athlete_features:experience",
        "athlete_features:schedule",
        "athlete_features:howlong",
        "athlete_features:total_lift",
    ]
).to_df()

training_df_v1.head()

Unnamed: 0,athlete_id,event_timestamp,gender,candj,snatch,deadlift,backsq,eat,background,experience,schedule,howlong,total_lift
0,21269.0,2025-07-17 16:58:25.943233+00:00,Male,235.0,175.0,385.0,315.0,I eat whatever is convenient|,I played youth or high school level sports|I p...,I began CrossFit by trying it alone (without a...,I do multiple workouts in a day 1x a week|I ty...,1-2 years|,1110.0
1,209747.0,2025-07-17 16:58:25.943233+00:00,Male,165.0,145.0,300.0,245.0,I eat quality foods but don't measure the amount|,I have no athletic background besides CrossFit|,I began CrossFit with a coach (e.g. at an affi...,I usually only do 1 workout a day|,1-2 years|,855.0
2,537592.0,2025-07-17 16:58:25.943233+00:00,Male,155.0,105.0,275.0,205.0,I eat quality foods but don't measure the amount|,I played youth or high school level sports|I r...,I began CrossFit with a coach (e.g. at an affi...,I usually only do 1 workout a day|,6-12 months|,740.0
3,496716.0,2025-07-17 16:58:25.943233+00:00,Male,176.0,99.0,265.0,220.0,I eat quality foods but don't measure the amou...,I have no athletic background besides CrossFit|,I began CrossFit with a coach (e.g. at an affi...,I usually only do 1 workout a day|,Less than 6 months|,760.0
4,272938.0,2025-07-17 16:58:25.943233+00:00,Male,165.0,121.0,298.0,209.0,I eat quality foods but don't measure the amou...,I played professional sports|I regularly play ...,I began CrossFit by trying it alone (without a...,I usually only do 1 workout a day|I do multipl...,6-12 months|,793.0


In [15]:
# V2 has limited number of features. 
training_df_v2 = store.get_historical_features(
    entity_df=entity_df2,
    features=[
        "athlete_features_v2:gender",
        "athlete_features_v2:height",
        "athlete_features_v2:weight",
        "athlete_features_v2:total_lift",
    ]
).to_df()

training_df_v2.head()

Unnamed: 0,athlete_id,event_timestamp,gender,height,weight,total_lift
0,21269.0,2025-07-17 16:58:25.976887+00:00,Male,71.0,200.0,1110.0
1,209747.0,2025-07-17 16:58:25.976887+00:00,Male,72.0,203.0,855.0
2,537592.0,2025-07-17 16:58:25.976887+00:00,Male,70.0,195.0,740.0
3,496716.0,2025-07-17 16:58:25.976887+00:00,Male,71.0,167.0,760.0
4,272938.0,2025-07-17 16:58:25.976887+00:00,Male,72.0,167.0,793.0


## 5b) Train algorithm with 2 different versions of features

In [16]:
# using chat-gpt4-o for help

In [17]:
class FullCatBoostPipeline:
    def __init__(self,target_col="total_lift", test_size=0.2, random_state=42, catboost_params=None):
        self.target_col = target_col
        self.test_size = test_size
        self.random_state = random_state
        self.catboost_params = catboost_params or {
            "iterations": 500,
            "learning_rate": 0.1,
            "depth": 6,
            "verbose": 0
        }
        self.model = None

    def fit(self, data):

        X = data.drop(columns=[self.target_col, 'event_timestamp'])
        y = data[self.target_col]

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=self.test_size, random_state=self.random_state
        )

        categorical_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()
        self.categorical_cols = categorical_cols

        self.model = CatBoostRegressor(
            cat_features=categorical_cols,
            **self.catboost_params
        )
        self.model.fit(X_train, y_train)

        y_pred = self.model.predict(X_test)
        print(f"MSE: {mean_squared_error(y_test, y_pred):.4f}")
        print(f"MAE: {mean_absolute_error(y_test, y_pred):.4f}")
        print(f"R²:  {r2_score(y_test, y_pred):.4f}")

        self.X_train, self.X_test = X_train, X_test
        self.y_train, self.y_test = y_train, y_test
        self.y_pred = y_pred

In [18]:
feature_versions = {
    "v1_more_columns": training_df_v1,
    "v2_age_height_gender": training_df_v2
}

param_sets = [
    {"iterations": 300, "learning_rate": 0.05, "depth": 4, "verbose": 0},
    {"iterations": 600, "learning_rate": 0.1, "depth": 6, "verbose": 0}
]

In [19]:
import os
os.environ["WANDB_NOTEBOOK_NAME"] = "debug_notebook"
os.environ["WANDB_MODE"] = "online"  # force online sync

import wandb
wandb.login()  # or wandb.login(key="your-key")

[34m[1mwandb[0m: Currently logged in as: [33mdrewmacl[0m ([33mdrewmacl-uchicago[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [24]:
from codecarbon import EmissionsTracker

In [26]:
# 4 DIFFERENT RUNS WITH DIFFERENT DATA VERSIONS AND HYPER PARAMETERS

for version_name, df in feature_versions.items():
    for i, params in enumerate(param_sets):
        run = wandb.init(
            project="hw2-feature-versioning",
            name=f"catboost_{version_name}_hp{i+1}",
            config={
                "features_version": version_name,
                **params
            }
        )

        tracker = EmissionsTracker()
        tracker.start()
        
        pipe = FullCatBoostPipeline(
            catboost_params=params
        )

        pipe.fit(df)

        emissions_data = tracker.stop()
        # Log evaluation metrics
        wandb.log({
            "mse": mean_squared_error(pipe.y_test, pipe.y_pred),
            "mae": mean_absolute_error(pipe.y_test, pipe.y_pred),
            "r2": r2_score(pipe.y_test, pipe.y_pred),
            "carbon_emissions_kg": emissions_data 
        })
        run.finish()

  self.scope.user = {"email": email}
  self.scope.user = {"email": email}


[codecarbon INFO @ 17:05:00] [setup] RAM Tracking...
[codecarbon INFO @ 17:05:00] [setup] CPU Tracking...
 Mac OS and ARM processor detected: Please enable PowerMetrics sudo to measure CPU

[codecarbon INFO @ 17:05:00] CPU Model on constant consumption mode: Apple M1
[codecarbon INFO @ 17:05:00] [setup] GPU Tracking...
[codecarbon INFO @ 17:05:00] No GPU found.
[codecarbon INFO @ 17:05:00] The below tracking methods have been set up:
                RAM Tracking Method: RAM power estimation model
                CPU Tracking Method: global constant
                GPU Tracking Method: Unspecified
            
[codecarbon INFO @ 17:05:00] >>> Tracker's metadata:
[codecarbon INFO @ 17:05:00]   Platform system: macOS-13.5.2-arm64-arm-64bit
[codecarbon INFO @ 17:05:00]   Python version: 3.10.18
[codecarbon INFO @ 17:05:00]   CodeCarbon version: 3.0.4
[codecarbon INFO @ 17:05:00]   Available RAM : 16.000 GB
[codecarbon INFO @ 17:05:00]   CPU count: 8 thread(s) in 1 physical CPU(s)
[codecarb

MSE: 35.6330
MAE: 4.5134
R²:  0.9995


0,1
carbon_emissions_kg,▁
mae,▁
mse,▁
r2,▁

0,1
carbon_emissions_kg,0.0
mae,4.5134
mse,35.63302
r2,0.99955


  self.scope.user = {"email": email}
  self.scope.user = {"email": email}


[codecarbon INFO @ 17:05:04] [setup] RAM Tracking...
[codecarbon INFO @ 17:05:04] [setup] CPU Tracking...
 Mac OS and ARM processor detected: Please enable PowerMetrics sudo to measure CPU

[codecarbon INFO @ 17:05:04] CPU Model on constant consumption mode: Apple M1
[codecarbon INFO @ 17:05:04] [setup] GPU Tracking...
[codecarbon INFO @ 17:05:04] No GPU found.
[codecarbon INFO @ 17:05:04] The below tracking methods have been set up:
                RAM Tracking Method: RAM power estimation model
                CPU Tracking Method: global constant
                GPU Tracking Method: Unspecified
            
[codecarbon INFO @ 17:05:04] >>> Tracker's metadata:
[codecarbon INFO @ 17:05:04]   Platform system: macOS-13.5.2-arm64-arm-64bit
[codecarbon INFO @ 17:05:04]   Python version: 3.10.18
[codecarbon INFO @ 17:05:04]   CodeCarbon version: 3.0.4
[codecarbon INFO @ 17:05:04]   Available RAM : 16.000 GB
[codecarbon INFO @ 17:05:04]   CPU count: 8 thread(s) in 1 physical CPU(s)
[codecarb

MSE: 9.1734
MAE: 1.9461
R²:  0.9999


0,1
carbon_emissions_kg,▁
mae,▁
mse,▁
r2,▁

0,1
carbon_emissions_kg,0.0
mae,1.94614
mse,9.1734
r2,0.99988


  self.scope.user = {"email": email}
  self.scope.user = {"email": email}


[codecarbon INFO @ 17:05:10] [setup] RAM Tracking...
[codecarbon INFO @ 17:05:10] [setup] CPU Tracking...
 Mac OS and ARM processor detected: Please enable PowerMetrics sudo to measure CPU

[codecarbon INFO @ 17:05:10] CPU Model on constant consumption mode: Apple M1
[codecarbon INFO @ 17:05:10] [setup] GPU Tracking...
[codecarbon INFO @ 17:05:10] No GPU found.
[codecarbon INFO @ 17:05:10] The below tracking methods have been set up:
                RAM Tracking Method: RAM power estimation model
                CPU Tracking Method: global constant
                GPU Tracking Method: Unspecified
            
[codecarbon INFO @ 17:05:10] >>> Tracker's metadata:
[codecarbon INFO @ 17:05:10]   Platform system: macOS-13.5.2-arm64-arm-64bit
[codecarbon INFO @ 17:05:10]   Python version: 3.10.18
[codecarbon INFO @ 17:05:10]   CodeCarbon version: 3.0.4
[codecarbon INFO @ 17:05:10]   Available RAM : 16.000 GB
[codecarbon INFO @ 17:05:10]   CPU count: 8 thread(s) in 1 physical CPU(s)
[codecarb

MSE: 25759.3896
MAE: 124.8895
R²:  0.6727


0,1
carbon_emissions_kg,▁
mae,▁
mse,▁
r2,▁

0,1
carbon_emissions_kg,0.0
mae,124.88946
mse,25759.38962
r2,0.67269


  self.scope.user = {"email": email}
  self.scope.user = {"email": email}


[codecarbon INFO @ 17:05:11] [setup] RAM Tracking...
[codecarbon INFO @ 17:05:11] [setup] CPU Tracking...
 Mac OS and ARM processor detected: Please enable PowerMetrics sudo to measure CPU

[codecarbon INFO @ 17:05:12] CPU Model on constant consumption mode: Apple M1
[codecarbon INFO @ 17:05:12] [setup] GPU Tracking...
[codecarbon INFO @ 17:05:12] No GPU found.
[codecarbon INFO @ 17:05:12] The below tracking methods have been set up:
                RAM Tracking Method: RAM power estimation model
                CPU Tracking Method: global constant
                GPU Tracking Method: Unspecified
            
[codecarbon INFO @ 17:05:12] >>> Tracker's metadata:
[codecarbon INFO @ 17:05:12]   Platform system: macOS-13.5.2-arm64-arm-64bit
[codecarbon INFO @ 17:05:12]   Python version: 3.10.18
[codecarbon INFO @ 17:05:12]   CodeCarbon version: 3.0.4
[codecarbon INFO @ 17:05:12]   Available RAM : 16.000 GB
[codecarbon INFO @ 17:05:12]   CPU count: 8 thread(s) in 1 physical CPU(s)
[codecarb

MSE: 25906.6816
MAE: 125.0986
R²:  0.6708


0,1
carbon_emissions_kg,▁
mae,▁
mse,▁
r2,▁

0,1
carbon_emissions_kg,0.0
mae,125.0986
mse,25906.68155
r2,0.67081
