#### Data Exploration code

In [None]:
import os
import warnings
from concurrent.futures import ThreadPoolExecutor, as_completed

import pandas as pd
from prophet import Prophet
from tqdm import tqdm

warnings.filterwarnings("ignore")

In [None]:
import pandas as pd


class ActigraphAggregation:
    def __init__(self, root_dir):
        self.root_dir = root_dir

    def load_data(self, participant_id):
        file_path = os.path.join(self.root_dir, f"id={participant_id}", "part-0.parquet")
        data = pd.read_parquet(file_path)
        data["id"] = participant_id
        print(f"Loaded data columns for {participant_id}: {data.columns.tolist()}")
        return data

    def aggregate_actigraphy(self, data):
        """
        Aggregate the actigraphy data for a single participant.

        Parameters:
        - data (DataFrame): Actigraphy data for a participant.

        Returns:
        - DataFrame: Aggregated actigraphy data with one row per 'id'.
        """
        aggregated_df = (
            data.groupby("id")
            .agg(
                {
                    "X": ["mean", "std", "max", "min"],
                    "Y": ["mean", "std", "max", "min"],
                    "Z": ["mean", "std", "max", "min"],
                    "enmo": ["mean", "std", "max", "min"],
                    "anglez": "mean",
                    "non-wear_flag": "sum",
                    "light": ["mean", "std", "max", "min"],
                    "battery_voltage": "mean",
                }
            )
            .reset_index()
        )

        # Flatten column names
        aggregated_df.columns = ["_".join(col).strip() if col[1] else col[0] for col in aggregated_df.columns.values]
        return aggregated_df

    def process_participant_data(self, participant_id):
        data = self.load_data(participant_id)
        aggregated_data = self.aggregate_actigraphy(data)
        return aggregated_data

    def process_all_participants(self):
        all_data = []
        with ThreadPoolExecutor() as executor:
            futures = []
            for id_folder in os.listdir(self.root_dir):
                if not id_folder.startswith("id="):
                    continue
                participant_id = id_folder.split("=")[-1]
                futures.append(executor.submit(self.process_participant_data, participant_id))

            for future in tqdm(as_completed(futures), total=len(futures), desc="Processing participants"):
                participant_data = future.result()
                all_data.append(participant_data)

        feature_table = pd.concat(all_data, ignore_index=True)
        return feature_table

In [None]:
# Initialize with the root directory containing participant data
aggregator = ActigraphAggregation(root_dir="../../data/series_test.parquet/")

# Process all participants and get the final aggregated feature table
feature_table = aggregator.process_all_participants()

# Inspect the feature table
print(feature_table.head())

In [None]:
import numpy as np
import pandas as pd


class ActigraphAggregation:
    def __init__(self, root_dir):
        self.root_dir = root_dir

    def load_data(self, participant_id):
        file_path = os.path.join(self.root_dir, f"id={participant_id}", "part-0.parquet")
        if not os.path.exists(file_path):
            print(f"File not found for participant {participant_id}")
            return pd.DataFrame()  # Return an empty DataFrame if file doesn't exist
        data = pd.read_parquet(file_path)
        data["id"] = participant_id
        print(f"Loaded data columns for {participant_id}: {data.columns.tolist()}")
        return data

    def temporal_aggregations(self, data):
        # Classify weekday/weekend
        data["weekday_flag"] = data["weekday"].apply(lambda x: "weekday" if x < 5 else "weekend")

        # Classify time of day
        conditions = [
            (data["time_of_day"] < 6 * 3600),  # Midnight to 6 AM
            (data["time_of_day"] >= 6 * 3600) & (data["time_of_day"] < 12 * 3600),  # 6 AM to Noon
            (data["time_of_day"] >= 12 * 3600) & (data["time_of_day"] < 18 * 3600),  # Noon to 6 PM
            (data["time_of_day"] >= 18 * 3600),  # 6 PM to Midnight
        ]
        choices = ["night", "morning", "afternoon", "evening"]
        data["time_period"] = np.select(conditions, choices, default="unknown")

        # Aggregate by weekday/weekend and time of day
        temporal_agg = (
            data.groupby(["id", "weekday_flag", "time_period"])
            .agg({"enmo": "mean", "light": "mean", "non-wear_flag": "sum"})
            .unstack(fill_value=0)
        )
        temporal_agg.columns = ["_".join(col).strip() for col in temporal_agg.columns.values]

        return temporal_agg.reset_index()

    def process_participant_data(self, participant_id):
        data = self.load_data(participant_id)
        if data.empty:  # Skip processing if data is empty
            return pd.DataFrame()
        temporal_data = self.temporal_aggregations(data)
        return temporal_data

    def process_all_participants(self):
        all_data = []
        with ThreadPoolExecutor() as executor:
            futures = []
            for id_folder in os.listdir(self.root_dir):
                if not id_folder.startswith("id="):
                    continue
                participant_id = id_folder.split("=")[-1]
                futures.append(executor.submit(self.process_participant_data, participant_id))

            for future in tqdm(as_completed(futures), total=len(futures), desc="Processing participants"):
                participant_data = future.result()
                if not participant_data.empty:  # Only append non-empty data
                    all_data.append(participant_data)

        feature_table = pd.concat(all_data, ignore_index=True)
        return feature_table


# Initialize with the root directory containing participant data
temporal = ActigraphAggregation(root_dir="../../data/series_test.parquet/")

# Process all participants and get the final aggregated feature table
feature_table = temporal.process_all_participants()

# Inspect the feature table
print(feature_table.head())

In [None]:
import pandas as pd


class ActigraphAggregation:
    def __init__(self, root_dir):
        self.root_dir = root_dir

    def load_data(self, participant_id):
        file_path = os.path.join(self.root_dir, f"id={participant_id}", "part-0.parquet")
        if not os.path.exists(file_path):
            print(f"File not found for participant {participant_id}")
            return pd.DataFrame()  # Return an empty DataFrame if file doesn't exist
        data = pd.read_parquet(file_path)
        data["id"] = participant_id
        print(f"Loaded data columns for {participant_id}: {data.columns.tolist()}")
        return data

    def aggregate_actigraphy(self, data):
        """
        Aggregate the actigraphy data for each participant with summary statistics.

        Parameters:
        - data (DataFrame): Actigraphy data for a participant.

        Returns:
        - DataFrame: Aggregated actigraphy data with summary statistics for each `id`.
        """
        aggregated_df = (
            data.groupby("id")
            .agg(
                {
                    "X": ["mean", "std", "max", "min"],
                    "Y": ["mean", "std", "max", "min"],
                    "Z": ["mean", "std", "max", "min"],
                    "enmo": ["mean", "std", "max", "min"],
                    "anglez": "mean",  # Mean of the angle metric
                    "non-wear_flag": "sum",  # Total non-wear time
                    "light": ["mean", "std", "max", "min"],
                    "battery_voltage": "mean",  # Average battery voltage
                }
            )
            .reset_index()
        )

        # Flatten the multi-level column names
        aggregated_df.columns = ["_".join(col).strip() if col[1] else col[0] for col in aggregated_df.columns.values]
        return aggregated_df

    def temporal_aggregations(self, data):
        # Classify weekday/weekend
        data["weekday_flag"] = data["weekday"].apply(lambda x: "weekday" if x < 5 else "weekend")

        # Classify time of day
        conditions = [
            (data["time_of_day"] < 6 * 3600),  # Midnight to 6 AM
            (data["time_of_day"] >= 6 * 3600) & (data["time_of_day"] < 12 * 3600),  # 6 AM to Noon
            (data["time_of_day"] >= 12 * 3600) & (data["time_of_day"] < 18 * 3600),  # Noon to 6 PM
            (data["time_of_day"] >= 18 * 3600),  # 6 PM to Midnight
        ]
        choices = ["night", "morning", "afternoon", "evening"]
        data["time_period"] = np.select(conditions, choices, default="unknown")

        # Aggregate by weekday/weekend and time of day
        temporal_agg = (
            data.groupby(["id", "weekday_flag", "time_period"])
            .agg({"enmo": "mean", "light": "mean", "non-wear_flag": "sum"})
            .unstack(fill_value=0)
        )
        temporal_agg.columns = ["_".join(col).strip() for col in temporal_agg.columns.values]

        return temporal_agg.reset_index()

    def activity_ratios(self, data):
        """
        Calculate the ratio of non-wear time to total measurement time for each participant.

        Parameters:
        - data (DataFrame): Actigraphy data for a participant.

        Returns:
        - DataFrame: Contains `non_wear_ratio` for each `id`.
        """
        total_time = data.groupby("id").size().rename("total_time")
        non_wear_time = data.groupby("id")["non-wear_flag"].sum().rename("non_wear_time")

        # Calculate ratio
        ratios = pd.concat([total_time, non_wear_time], axis=1)
        ratios["non_wear_ratio"] = ratios["non_wear_time"] / ratios["total_time"]

        return ratios[["non_wear_ratio"]].reset_index()

    def process_participant_data(self, participant_id):
        data = self.load_data(participant_id)
        if data.empty:  # Skip processing if data is empty
            return pd.DataFrame()

        # Calculate aggregate statistics, temporal aggregation, and activity ratios
        aggregate_data = self.aggregate_actigraphy(data)
        temporal_data = self.temporal_aggregations(data)
        ratio_data = self.activity_ratios(data)

        # Merge all feature data on `id`
        participant_data = aggregate_data.merge(temporal_data, on="id", how="left")
        participant_data = participant_data.merge(ratio_data, on="id", how="left")
        return participant_data

    def process_all_participants(self):
        all_data = []
        with ThreadPoolExecutor() as executor:
            futures = []
            for id_folder in os.listdir(self.root_dir):
                if not id_folder.startswith("id="):
                    continue
                participant_id = id_folder.split("=")[-1]
                futures.append(executor.submit(self.process_participant_data, participant_id))

            for future in tqdm(as_completed(futures), total=len(futures), desc="Processing participants"):
                participant_data = future.result()
                if not participant_data.empty:  # Only append non-empty data
                    all_data.append(participant_data)

        feature_table = pd.concat(all_data, ignore_index=True)
        return feature_table


# Initialize with the root directory containing participant data
aggregator = ActigraphAggregation(root_dir="../../data/series_test.parquet/")

# Process all participants and get the final aggregated feature table
feature_table = aggregator.process_all_participants()

# Inspect the feature table
print(feature_table.head())

In [None]:
class ActigraphyDataProcessor:
    def __init__(self, root_dir, feature_table_path):
        self.root_dir = root_dir
        self.feature_table_path = feature_table_path
        self.features_to_forecast = ["enmo", "light", "battery_voltage"]

    def load_data(self, participant_id):
        file_path = os.path.join(self.root_dir, f"id={participant_id}", "part-0.parquet")
        data = pd.read_parquet(file_path)
        data["id"] = participant_id
        print(f"Loaded data columns for {participant_id}: {data.columns.tolist()}")
        return data

    def compute_daily_summary(self, data):
        daily_summary = (
            data.groupby(["id", "relative_date_PCIAT"])
            .agg(
                {
                    "X": ["mean", "max", "min", "std"],
                    "Y": ["mean", "max", "min", "std"],
                    "Z": ["mean", "max", "min", "std"],
                    "enmo": ["mean", "max", "std"],
                    "anglez": "mean",
                    "non-wear_flag": "sum",
                    "light": ["mean", "max", "min"],
                    "battery_voltage": "mean",
                }
            )
            .reset_index()
        )
        daily_summary.columns = ["_".join(col).strip() if col[1] else col[0] for col in daily_summary.columns.values]
        print(f"Daily summary columns after aggregation: {daily_summary.columns.tolist()}")
        return daily_summary

    def add_temporal_features(self, daily_summary):
        for col in ["enmo_mean", "light_mean", "battery_voltage_mean"]:
            if col in daily_summary.columns:
                daily_summary[f"{col}_lag1"] = daily_summary.groupby("id")[col].shift(1)
                daily_summary[f"{col}_rolling3"] = (
                    daily_summary.groupby("id")[col].rolling(window=3).mean().reset_index(0, drop=True)
                )
            else:
                print(f"Warning: Column {col} not found in daily_summary.")
        return daily_summary

    def forecast_features(self, data, participant_id):
        date_mapping = data[["time_of_day", "relative_date_PCIAT"]].drop_duplicates()
        date_mapping["ds"] = pd.to_datetime(date_mapping["time_of_day"], unit="s", errors="coerce")

        predictions = []
        for feature in self.features_to_forecast:
            df = data[["time_of_day", feature]].dropna().rename(columns={"time_of_day": "ds", feature: "y"})
            max_time = df["ds"].max()
            if max_time > 10**12:
                df["ds"] = pd.to_datetime(df["ds"] / 10**3, unit="s", errors="coerce")
            else:
                df["ds"] = pd.to_datetime(df["ds"], unit="s", errors="coerce")
            df = df.dropna(subset=["ds"])

            model = Prophet(daily_seasonality=True, weekly_seasonality=True)
            model.fit(df)
            future = model.make_future_dataframe(periods=7, freq="D")
            forecast = model.predict(future)

            forecast["id"] = participant_id
            forecast = forecast[["id", "ds", "yhat"]].rename(columns={"yhat": f"{feature}_forecast"})
            predictions.append(forecast)

        predictions_df = pd.concat(predictions, axis=1)
        predictions_df = predictions_df.loc[:, ~predictions_df.columns.duplicated()]
        predictions_df = pd.merge(predictions_df, date_mapping[["ds", "relative_date_PCIAT"]], on="ds", how="left")

        return predictions_df

    def process_participant_data(self, participant_id):
        data = self.load_data(participant_id)
        daily_summary = self.compute_daily_summary(data)
        print(f"Daily summary columns for {participant_id}: {daily_summary.columns.tolist()}")
        daily_summary = self.add_temporal_features(daily_summary)
        forecasts = self.forecast_features(data, participant_id)
        print(f"Forecast columns for {participant_id}: {forecasts.columns.tolist()}")
        processed_data = pd.merge(daily_summary, forecasts, on=["id", "relative_date_PCIAT"], how="left")
        return processed_data

    def process_all_participants(self):
        all_data = []
        with ThreadPoolExecutor() as executor:
            futures = []
            for id_folder in os.listdir(self.root_dir):
                if not id_folder.startswith("id="):
                    continue
                participant_id = id_folder.split("=")[-1]
                futures.append(executor.submit(self.process_participant_data, participant_id))

            for future in tqdm(as_completed(futures), total=len(futures), desc="Processing participants"):
                participant_data = future.result()
                all_data.append(participant_data)

        feature_table = pd.concat(all_data, ignore_index=True)
        return feature_table

    def save_feature_table(self):
        os.makedirs(os.path.dirname(self.feature_table_path), exist_ok=True)
        feature_table = self.process_all_participants()
        feature_table.to_parquet(self.feature_table_path, index=False)
        print(f"Feature table saved at {self.feature_table_path}")

#### Main Codes

In [None]:
import joblib
import lightgbm as lgb
import matplotlib.pyplot as plt
import pandas as pd
import yaml
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split


class MultiClassLightGBM:
    def __init__(self, config_path):
        self.config = self.load_config(config_path)
        self.model = None

    def load_config(self, config_path):
        """Load configuration file."""
        with open(config_path, "r") as file:
            config = yaml.safe_load(file)
        return config

    def load_data(self, filepath):
        """Load dataset and split into features and target based on config."""
        data = pd.read_csv(filepath)
        X = data[self.config["num_features"] + self.config["cat_features"]]
        y = data[self.config["target"]]
        return X, y

    def preprocess_data(self, X):
        """Preprocess the data by encoding categorical variables."""
        X = pd.get_dummies(X, columns=self.config["cat_features"], drop_first=True)
        return X

    def train(self, X, y):
        """Train the LightGBM model."""
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

        # LightGBM dataset
        train_data = lgb.Dataset(X_train, label=y_train)
        val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

        # Set LightGBM parameters for multiclass classification
        params = {
            "objective": "multiclass",
            "num_class": len(y.unique()),
            "learning_rate": self.config["model_parameters"]["learning_rate"],
            "n_estimators": self.config["model_parameters"]["n_estimators"],
            "max_depth": self.config["model_parameters"]["max_depth"],
            "metric": "multi_logloss",
        }

        # Train the model
        self.model = lgb.train(
            params, train_data, valid_sets=[train_data, val_data], early_stopping_rounds=50, verbose_eval=10
        )

        # Save the model
        joblib.dump(self.model, "lgbm_model.pkl")
        print("Model training complete and saved as 'lgbm_model.pkl'")

    def evaluate(self, X, y):
        """Evaluate the model on the test set."""
        y_pred = self.model.predict(X, num_iteration=self.model.best_iteration)
        y_pred = y_pred.argmax(axis=1)  # Get class with max probability

        # Print classification report
        print("Evaluation Results:")
        print(classification_report(y, y_pred))
        print("Accuracy:", accuracy_score(y, y_pred))

    def feature_importance(self, X):
        """Display the feature importance."""
        importance = self.model.feature_importance(importance_type="gain")
        feature_names = X.columns
        feature_importance = pd.DataFrame({"feature": feature_names, "importance": importance})
        feature_importance = feature_importance.sort_values(by="importance", ascending=False)

        # Plotting feature importance
        plt.figure(figsize=(10, 8))
        plt.barh(feature_importance["feature"], feature_importance["importance"], color="skyblue")
        plt.xlabel("Importance")
        plt.title("Feature Importance")
        plt.gca().invert_yaxis()
        plt.show()

    def load_model(self, model_path="lgbm_model.pkl"):
        """Load a saved model."""
        self.model = joblib.load(model_path)
        print("Model loaded from", model_path)

    def predict(self, X):
        """Make predictions with the trained model."""
        X_preprocessed = self.preprocess_data(X)
        y_pred = self.model.predict(X_preprocessed, num_iteration=self.model.best_iteration)
        return y_pred.argmax(axis=1)


if __name__ == "__main__":
    # Configuration and paths
    config_path = "project_config.yml"
    train_path = "train.csv"

    # Initialize model class
    lgbm_classifier = MultiClassLightGBM(config_path)

    # Load and preprocess data
    X, y = lgbm_classifier.load_data(train_path)
    X = lgbm_classifier.preprocess_data(X)

    # Train model
    lgbm_classifier.train(X, y)

    # Evaluate model
    lgbm_classifier.evaluate(X, y)

    # Display feature importance
    lgbm_classifier.feature_importance(X)

In [None]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler


class TrainDataProcessor:
    def __init__(self, train_df, config_path):
        self.train_df = train_df
        self.config = self.load_config(config_path)
        self.num_features = self.config["num_features"]
        self.cat_features = self.config["cat_features"]
        self.target = self.config["target"]

    def load_config(self, config_path):
        """Load the project configuration file."""
        with open(config_path, "r") as file:
            config = yaml.safe_load(file)
        return config

    def preprocess_data(self):
        """Preprocess the train dataset by handling missing values and data type conversions."""
        self.handle_missing_values()
        self.convert_data_types()
        return self.train_df

    def handle_missing_values(self):
        """Handle missing values in the train dataset."""
        # Fill numeric columns with mean
        numeric_cols = self.train_df[self.num_features]
        imputer = SimpleImputer(strategy="mean")
        self.train_df[self.num_features] = imputer.fit_transform(numeric_cols)

        # Fill categorical columns with mode
        for col in self.cat_features:
            self.train_df[col].fillna(self.train_df[col].mode()[0], inplace=True)

    def convert_data_types(self):
        """Convert categorical columns to appropriate data types."""
        # Convert 'Sex' to binary encoding if it's part of numerical features
        if "Basic_Demos-Sex" in self.num_features:
            self.train_df["Basic_Demos-Sex"] = self.train_df["Basic_Demos-Sex"].map({"Male": 1, "Female": 0})

    def feature_engineering(self):
        """Perform feature engineering to create new features."""
        self.add_age_groups()
        self.one_hot_encode_seasons()
        self.calculate_behavioral_scores()
        self.add_interaction_features()
        return self.train_df

    def add_age_groups(self):
        """Add age groups based on age."""
        if "Basic_Demos-Age" in self.num_features:
            self.train_df["Age_Group"] = pd.cut(
                self.train_df["Basic_Demos-Age"], bins=[0, 12, 17, 25], labels=["Child", "Teen", "Young Adult"]
            )

    def one_hot_encode_seasons(self):
        """One-hot encode season columns."""
        for col in self.cat_features:
            if "Season" in col:
                one_hot = pd.get_dummies(self.train_df[col], prefix=col)
                self.train_df = pd.concat([self.train_df, one_hot], axis=1)

    def calculate_behavioral_scores(self):
        """Calculate behavioral and psychological indicators."""
        # Bin PCIAT total score
        if "PCIAT-PCIAT_Total" in self.num_features:
            self.train_df["PCIAT_Bin"] = pd.cut(
                self.train_df["PCIAT-PCIAT_Total"], bins=[0, 20, 40, 60], labels=["Mild", "Moderate", "Severe"]
            )

        # Categorize internet use
        if "PreInt_EduHx-computerinternet_hoursday" in self.num_features:
            self.train_df["Internet_Use_Category"] = pd.cut(
                self.train_df["PreInt_EduHx-computerinternet_hoursday"],
                bins=[0, 1, 3, 6, np.inf],
                labels=["Low", "Moderate", "High", "Very High"],
            )

    def add_interaction_features(self):
        """Add interaction features, such as age-adjusted scores."""
        # Age-adjusted CGAS Score
        if "CGAS-CGAS_Score" in self.num_features and "Basic_Demos-Age" in self.num_features:
            self.train_df["Age_Adjusted_CGAS"] = self.train_df["CGAS-CGAS_Score"] / self.train_df["Basic_Demos-Age"]

        # BMI Categories
        if "Physical-BMI" in self.num_features:
            self.train_df["BMI_Category"] = pd.cut(
                self.train_df["Physical-BMI"],
                bins=[0, 18.5, 25, 30, np.inf],
                labels=["Underweight", "Normal", "Overweight", "Obese"],
            )

    def scale_numeric_features(self):
        """Scale numeric features in the final dataset."""
        scaler = StandardScaler()
        self.train_df[self.num_features] = scaler.fit_transform(self.train_df[self.num_features])

    def process(self):
        """Run the complete processing pipeline."""
        self.preprocess_data()
        self.feature_engineering()
        self.scale_numeric_features()
        return self.train_df


# Example usage:

# Load train data
train_df = pd.read_csv("path/to/train.csv")

# Initialize and process the data using the config file
processor = TrainDataProcessor(train_df, "project_config.yml")
processed_df = processor.process()

# Inspect the processed dataframe
print(processed_df.head())

In [1]:
import pandas as pd


class ActigraphAggregation:
    def __init__(self, root_dir):
        self.root_dir = root_dir

    def load_data(self, participant_id):
        file_path = os.path.join(self.root_dir, f"id={participant_id}", "part-0.parquet")
        if not os.path.exists(file_path):
            print(f"File not found for participant {participant_id}")
            return pd.DataFrame()  # Return an empty DataFrame if file doesn't exist
        data = pd.read_parquet(file_path)
        data["id"] = participant_id
        # print(f"Loaded data columns for {participant_id}: {data.columns.tolist()}")
        return data

    def aggregate_actigraphy(self, data):
        """
        Aggregate the actigraphy data for each participant with summary statistics.

        Parameters:
        - data (DataFrame): Actigraphy data for a participant.

        Returns:
        - DataFrame: Aggregated actigraphy data with summary statistics for each `id`.
        """
        aggregated_df = (
            data.groupby("id")
            .agg(
                {
                    "X": ["mean", "std", "max", "min"],
                    "Y": ["mean", "std", "max", "min"],
                    "Z": ["mean", "std", "max", "min"],
                    "enmo": ["mean", "std", "max", "min"],
                    "anglez": "mean",  # Mean of the angle metric
                    "non-wear_flag": "sum",  # Total non-wear time
                    "light": ["mean", "std", "max", "min"],
                    "battery_voltage": "mean",  # Average battery voltage
                }
            )
            .reset_index()
        )

        # Flatten the multi-level column names
        aggregated_df.columns = ["_".join(col).strip() if col[1] else col[0] for col in aggregated_df.columns.values]
        return aggregated_df

    def temporal_aggregations(self, data):
        # Classify weekday/weekend
        data["weekday_flag"] = data["weekday"].apply(lambda x: "weekday" if x < 5 else "weekend")

        # Classify time of day
        conditions = [
            (data["time_of_day"] < 6 * 3600),  # Midnight to 6 AM
            (data["time_of_day"] >= 6 * 3600) & (data["time_of_day"] < 12 * 3600),  # 6 AM to Noon
            (data["time_of_day"] >= 12 * 3600) & (data["time_of_day"] < 18 * 3600),  # Noon to 6 PM
            (data["time_of_day"] >= 18 * 3600),  # 6 PM to Midnight
        ]
        choices = ["night", "morning", "afternoon", "evening"]
        data["time_period"] = np.select(conditions, choices, default="unknown")

        # Aggregate by weekday/weekend and time of day
        temporal_agg = (
            data.groupby(["id", "weekday_flag", "time_period"])
            .agg({"enmo": "mean", "light": "mean", "non-wear_flag": "sum"})
            .unstack(fill_value=0)
        )
        temporal_agg.columns = ["_".join(col).strip() for col in temporal_agg.columns.values]

        return temporal_agg.reset_index()

    def activity_ratios(self, data):
        """
        Calculate the ratio of non-wear time to total measurement time for each participant.

        Parameters:
        - data (DataFrame): Actigraphy data for a participant.

        Returns:
        - DataFrame: Contains `non_wear_ratio` for each `id`.
        """
        total_time = data.groupby("id").size().rename("total_time")
        non_wear_time = data.groupby("id")["non-wear_flag"].sum().rename("non_wear_time")

        # Calculate ratio
        ratios = pd.concat([total_time, non_wear_time], axis=1)
        ratios["non_wear_ratio"] = ratios["non_wear_time"] / ratios["total_time"]

        return ratios[["non_wear_ratio"]].reset_index()

    def process_participant_data(self, participant_id):
        data = self.load_data(participant_id)
        if data.empty:  # Skip processing if data is empty
            return pd.DataFrame()

        # Calculate aggregate statistics, temporal aggregation, and activity ratios
        aggregate_data = self.aggregate_actigraphy(data)
        temporal_data = self.temporal_aggregations(data)
        ratio_data = self.activity_ratios(data)

        # Merge all feature data on `id`
        participant_data = aggregate_data.merge(temporal_data, on="id", how="left")
        participant_data = participant_data.merge(ratio_data, on="id", how="left")
        return participant_data

    def process_all_participants(self):
        all_data = []
        with ThreadPoolExecutor() as executor:
            futures = []
            for id_folder in os.listdir(self.root_dir):
                if not id_folder.startswith("id="):
                    continue
                participant_id = id_folder.split("=")[-1]
                futures.append(executor.submit(self.process_participant_data, participant_id))

            for future in tqdm(as_completed(futures), total=len(futures), desc="Processing participants"):
                participant_data = future.result()
                if not participant_data.empty:  # Only append non-empty data
                    all_data.append(participant_data)

        feature_table = pd.concat(all_data, ignore_index=True)
        return feature_table


# Initialize with the root directory containing participant data
aggregator = ActigraphAggregation(root_dir="../../data/series_train.parquet/")

# Process all participants and get the final aggregated feature table
feature_table = aggregator.process_all_participants()

# Inspect the feature table
print(feature_table.head())

Processing participants: 100%|██████████| 996/996 [02:09<00:00,  7.72it/s]


         id    X_mean     X_std     X_max     X_min    Y_mean     Y_std  \
0  6b6467f4 -0.425470  0.471329  1.006836 -0.992422  0.056812  0.513878   
1  92bb8516 -0.360749  0.487472  2.691724 -2.232233 -0.065675  0.473331   
2  92bb8516 -0.360749  0.487472  2.691724 -2.232233 -0.065675  0.473331   
3  0d01bbf2 -0.478973  0.429476  1.159667 -3.298790 -0.037643  0.518888   
4  0d01bbf2 -0.478973  0.429476  1.159667 -3.298790 -0.037643  0.518888   

      Y_max     Y_min    Z_mean  ...  light_min  battery_voltage_mean  \
0  1.009336 -3.549023 -0.307925  ...        0.0           4095.802734   
1  1.429382 -2.847736  0.091206  ...        0.0           3948.139404   
2  1.429382 -2.847736  0.091206  ...        0.0           3948.139404   
3  2.525316 -3.262288 -0.215956  ...        0.0           3876.515625   
4  2.525316 -3.262288 -0.215956  ...        0.0           3876.515625   

   weekday_flag  enmo_evening  enmo_night  light_evening  light_night  \
0       weekday      0.053129    0.01

In [3]:
feature_table.columns

Index(['id', 'X_mean', 'X_std', 'X_max', 'X_min', 'Y_mean', 'Y_std', 'Y_max',
       'Y_min', 'Z_mean', 'Z_std', 'Z_max', 'Z_min', 'enmo_mean', 'enmo_std',
       'enmo_max', 'enmo_min', 'anglez_mean', 'non-wear_flag_sum',
       'light_mean', 'light_std', 'light_max', 'light_min',
       'battery_voltage_mean', 'weekday_flag', 'enmo_evening', 'enmo_night',
       'light_evening', 'light_night', 'non-wear_flag_evening',
       'non-wear_flag_night', 'non_wear_ratio'],
      dtype='object')

In [4]:
# merge train csv with feature table
df_train = pd.read_csv("../../data/childhealth.csv")

df_merge = df_train.merge(feature_table, on="id", how="left")

In [6]:
df_merge.shape

(4940, 113)

In [7]:
df_train.shape

(3960, 82)

In [8]:
feature_table.shape

(1976, 32)

In [None]:
import pandas as pd


class ChildHealthModel:
    def __init__(self, config_path):
        self.config = self.load_config(config_path)
        self.model = None

    def load_config(self, config_path):
        """Load configuration file."""
        with open(config_path, "r") as file:
            config = yaml.safe_load(file)
        return config

    def load_data(self, filepath):
        """Load dataset and split into features and target based on config."""
        data = pd.read_csv(filepath)
        X = data[self.config["num_features"] + self.config["cat_features"]]
        y = data[self.config["target"]]
        return X, y

    def preprocess_data(self, X):
        """Preprocess the data by encoding categorical variables."""
        X = pd.get_dummies(X, columns=self.config["cat_features"], drop_first=True)
        return X

    def train(self, X, y):
        """Train the LightGBM model."""
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

        # LightGBM dataset
        train_data = lgb.Dataset(X_train, label=y_train)
        val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

        # Set LightGBM parameters for multiclass classification
        params = {
            "objective": "multiclass",
            "num_class": len(y.unique()),
            "learning_rate": self.config["model_parameters"]["learning_rate"],
            "n_estimators": self.config["model_parameters"]["n_estimators"],
            "max_depth": self.config["model_parameters"]["max_depth"],
            "metric": "multi_logloss",
        }

        # Train the model
        self.model = lgb.train(
            params, train_data, valid_sets=[train_data, val_data], early_stopping_rounds=50, verbose_eval=10
        )

        # Save the model
        joblib.dump(self.model, "lgbm_model.pkl")
        print("Model training complete and saved as 'lgbm_model.pkl'")

    def evaluate(self, X, y):
        """Evaluate the model on the test set."""
        y_pred = self.model.predict(X, num_iteration=self.model.best_iteration)
        y_pred = y_pred.argmax(axis=1)  # Get class with max probability

        # Print classification report
        print("Evaluation Results:")
        print(classification_report(y, y_pred))
        print("Accuracy:", accuracy_score(y, y_pred))

    def feature_importance(self, X):
        """Display the feature importance."""
        importance = self.model.feature_importance(importance_type="gain")
        feature_names = X.columns
        feature_importance = pd.DataFrame({"feature": feature_names, "importance": importance})
        feature_importance = feature_importance.sort_values(by="importance", ascending=False)

        # Plotting feature importance
        plt.figure(figsize=(10, 8))
        plt.barh(feature_importance["feature"], feature_importance["importance"], color="skyblue")
        plt.xlabel("Importance")
        plt.title("Feature Importance")
        plt.gca().invert_yaxis()
        plt.show()

    def load_model(self, model_path="lgbm_model.pkl"):
        """Load a saved model."""
        self.model = joblib.load(model_path)
        print("Model loaded from", model_path)

    def predict(self, X):
        """Make predictions with the trained model."""
        X_preprocessed = self.preprocess_data(X)
        y_pred = self.model.predict(X_preprocessed, num_iteration=self.model.best_iteration)
        return y_pred.argmax(axis=1)


if __name__ == "__main__":
    # Configuration and paths
    config_path = "project_config.yml"
    train_path = "train.csv"

    # Initialize model class
    child_health_model = ChildHealthModel(config_path)

    # Load and preprocess data
    X, y = child_health_model.load_data(train_path)
    X = child_health_model.preprocess_data(X)

    # Train model
    child_health_model.train(X, y)

    # Evaluate model
    child_health_model.evaluate(X, y)

    # Display feature importance
    child_health_model.feature_importance(X)