In [None]:
import pandas as pd 
import numpy as np 
import os 
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.inspection import permutation_importance
from collections import defaultdict

Preprocessing before applying imputation

In [None]:
# Give every raw hourly sensor file the same schema: any expected column a
# file lacks is created (empty), userId/date columns are added, and the file
# is rewritten in place with a fixed column order.

desired_columns = [
    'timestamp', 'hr', 'magnitude_mAcc', 'bvp_positive',
    'bvp_negative', 'temp', 'magnitude_e4Acc', 'lat', 'lon', 'accuracy',
    'magnitude_mMag', 'eda', 'magnitude_mGyr',
]

directory = "./train_dataset/sensor_data/hourly_remade"
# Alternative settings for running the same pass over the label files:
#desired_columns = ['timestamp', 'action','condition', 'place', 'emotionPositive', 'emotionTension', 'activity']                   
#directory = "./train_dataset/sensor_data/labels"

for filename in os.listdir(directory):
    # Only files whose name mentions "hourly" are touched.
    if 'hourly' not in filename:
        continue

    filepath = os.path.join(directory, filename)
    df = pd.read_csv(filepath)

    # Expected columns absent from this file are created as empty strings.
    absent_columns = [name for name in desired_columns if name not in df.columns]
    for col in absent_columns:
        df[col] = ''

    # The user ID is whatever precedes "_hourly.csv" in the file name.
    userId = filename.split('_hourly.csv')[0]
    df['userId'] = userId
    df['date'] = pd.to_datetime(df['timestamp']).dt.date

    # Keep only the identifier columns plus the expected features, in order,
    # and overwrite the source file.
    df = df[['userId', 'date'] + desired_columns]
    df.to_csv(filepath, index=False)

print("Files have been processed and saved successfully.")

Move the emotion features from the label files into the sensor data

In [None]:
# Folder holding the hourly sensor CSVs (destination of the emotion columns)
# and folder holding the label CSVs (their source).
hourly_directory = './train_dataset/sensor_data/hourly_remade'
labels_directory = './train_dataset/sensor_data/labels'

# Emotion columns that are taken out of each label file.
columns_to_move = ['emotionPositive', 'emotionTension']

# Bare CSV file names (no directory part) found in each folder.
hourly_files = [name for name in os.listdir(hourly_directory) if name.endswith('.csv')]
labels_files = [name for name in os.listdir(labels_directory) if name.endswith('.csv')]

def extract_user_id(filename):
    """Return the part of ``filename`` that precedes its first underscore.

    When the name contains no underscore the whole string is returned.
    """
    user_id, _, _ = filename.partition('_')
    return user_id

# Look-up table: user ID -> file name of that user's label CSV.
labels_map = dict((extract_user_id(name), name) for name in labels_files)

# For each hourly file that has a label file for the same user, copy the
# emotion columns into the hourly file, remove them from the label file and
# rewrite both CSVs.
for hourly_file in hourly_files:
    user_id = extract_user_id(hourly_file)
    if user_id not in labels_map:
        continue

    hourly_file_path = os.path.join(hourly_directory, hourly_file)
    label_file_path = os.path.join(labels_directory, labels_map[user_id])

    hourly_df = pd.read_csv(hourly_file_path)
    label_df = pd.read_csv(label_file_path)

    # Label files lacking any of the columns are left untouched (this is also
    # what makes a second run of this cell a no-op for already-processed users).
    if not set(columns_to_move).issubset(label_df.columns):
        continue

    # NOTE(review): this assignment pairs rows through the default integer
    # index of the two frames, i.e. by row position rather than by timestamp.
    # Confirm both files list the same hours in the same order.
    hourly_df[columns_to_move] = label_df[columns_to_move]
    label_df = label_df.drop(columns=columns_to_move)

    hourly_df.to_csv(hourly_file_path, index=False)
    label_df.to_csv(label_file_path, index=False)

SAITS imputer for numeric columns

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from pypots.imputation import SAITS
from pypots.utils.metrics import calc_mae
from sklearn.impute import SimpleImputer

def mcar(data, missing_rate):
    """Return a copy of ``data`` with entries set to NaN completely at random.

    Every entry is masked independently with probability ``missing_rate``.
    The mask is drawn from a private generator seeded with 0, so it is the
    same on every call for a given shape, and the global NumPy random state
    is left untouched.

    Parameters
    ----------
    data : array-like with ``.copy()`` and ``.shape`` (e.g. numpy.ndarray)
        Values to mask; not modified.
    missing_rate : float
        Probability of masking each entry.

    Returns
    -------
    Same type as ``data``
        Copy of ``data`` with the selected entries replaced by NaN.
    """
    # RandomState(0).rand produces the same stream as np.random.seed(0)
    # followed by np.random.rand, but without reseeding the global generator
    # that the rest of the notebook may rely on.
    rng = np.random.RandomState(0)
    data_with_nan = data.copy()
    missing_mask = rng.rand(*data.shape) < missing_rate
    data_with_nan[missing_mask] = np.nan
    return data_with_nan

def impute_SAITS(path):
    """Impute the missing values of one hourly CSV file.

    Numeric feature columns are standardised, 10% of their entries are
    additionally masked as a held-out evaluation set, and a SAITS model is
    trained on the result (10 epochs). The mean absolute error on the held-out
    entries is printed. In the returned frame only entries that were missing
    in the file carry model output; every observed value is kept as read.

    Non-numeric feature columns are filled with the mode of their ``date``
    group, then with the overall most frequent value for anything still
    missing.

    Parameters
    ----------
    path : str
        CSV file with a 'timestamp' column. A 'date' column is required when
        the file has non-numeric feature columns (it is the grouping key).

    Returns
    -------
    pandas.DataFrame
        Numeric columns (original scale), then non-numeric columns, then
        'timestamp'.

    Raises
    ------
    KeyError
        If the file has no 'timestamp' column.
    """
    data = pd.read_csv(path)

    if 'timestamp' not in data.columns:
        raise KeyError("The 'timestamp' column is not found in the dataset.")

    time_column = 'timestamp'
    feature_columns = data.columns.drop([time_column])

    # Separate numeric and non-numeric columns
    numeric_columns = data[feature_columns].select_dtypes(include=[np.number]).columns
    non_numeric_columns = data[feature_columns].select_dtypes(exclude=[np.number]).columns

    # One sample per distinct timestamp; the step axis is inferred by reshape.
    num_samples = data[time_column].nunique()
    data_numeric = data[numeric_columns]
    data_non_numeric = data[non_numeric_columns]

    # Standardize numeric data
    scaler = StandardScaler()
    data_numeric_scaled = pd.DataFrame(scaler.fit_transform(data_numeric), columns=numeric_columns)

    # X_numeric_ori holds the data as observed; X_numeric additionally has a
    # random 10% of entries masked so imputation error can be measured.
    X_numeric_ori = data_numeric_scaled.to_numpy().reshape(num_samples, -1, len(numeric_columns))
    X_numeric = mcar(X_numeric_ori, 0.1)
    dataset_numeric = {"X": X_numeric}
    print(X_numeric.shape)  # (num_samples, 1, len(numeric_columns))

    # Model training
    saits = SAITS(
        n_steps=X_numeric.shape[1],
        n_features=X_numeric.shape[2],
        n_layers=2,
        d_model=256,
        d_ffn=128,
        n_heads=4,
        d_k=64,
        d_v=64,
        dropout=0.1,
        epochs=10
    )

    # The whole dataset is used for training; the held-out entries are hidden
    # from the model because they are NaN in X_numeric.
    saits.fit(dataset_numeric)
    imputation = saits.impute(dataset_numeric)

    # True only where a value was observed in the file but artificially masked.
    indicating_mask = np.isnan(X_numeric) ^ np.isnan(X_numeric_ori)
    mae = calc_mae(imputation, np.nan_to_num(X_numeric_ori), indicating_mask)

    print(f"Mean Absolute Error for numeric data: {mae}")

    # The artificial masking exists only to compute the MAE above. Put every
    # originally observed value back so the output does not replace real
    # measurements with model estimates.
    imputation = np.where(np.isnan(X_numeric_ori), imputation, X_numeric_ori)

    # Convert imputed data back to a DataFrame on the original scale
    imputed_numeric_data = imputation.reshape(-1, len(numeric_columns))
    imputed_numeric_df = pd.DataFrame(imputed_numeric_data, columns=numeric_columns)
    imputed_numeric_df = pd.DataFrame(scaler.inverse_transform(imputed_numeric_df), columns=numeric_columns)

    if len(non_numeric_columns) > 0:
        imputed_non_numeric_df = data_non_numeric.copy()

        def mode_impute(series):
            """Fill NaNs in one group with the group's mode; leave all-NaN groups as is."""
            mode_val = series.mode()
            if mode_val.empty:
                return series
            return series.fillna(mode_val.iloc[0])

        for col in non_numeric_columns:
            imputed_non_numeric_df[col] = data.groupby('date')[col].transform(mode_impute)

        # Anything still missing (groups with no mode) gets the column-wide
        # most frequent value.
        for col in non_numeric_columns:
            if imputed_non_numeric_df[col].isnull().any():
                print(f"Handling remaining missing values in column: {col}")
                fallback_imputer = SimpleImputer(strategy='most_frequent')
                filled = fallback_imputer.fit_transform(imputed_non_numeric_df[[col]])
                # fit_transform returns a 2-D result; flatten it to one column.
                imputed_non_numeric_df[col] = np.asarray(filled).ravel()

        imputed_df = pd.concat([imputed_numeric_df, imputed_non_numeric_df], axis=1)
    else:
        imputed_df = imputed_numeric_df

    # 'userId' and 'date' (when present) travel with the feature columns, so
    # only 'timestamp' has to be added back.
    imputed_df['timestamp'] = data['timestamp']

    return imputed_df


# Run the SAITS imputation for a single user's hourly file and preview the result.
path = "./train_dataset/sensor_data/hourly_remade/user30_hourly.csv"
try:
    imputed_df = impute_SAITS(path)
    #import ace_tools as tools; tools.display_dataframe_to_user(name="Imputed Data", dataframe=imputed_df)
    print(imputed_df.head())
# Errors are reported by printing only; execution continues and imputed_df
# is not (re)assigned when impute_SAITS fails.
except ValueError as e:
    print(f"ValueError: {e}")
except Exception as e:
    print(f"Unexpected error: {e}")

# NOTE(review): saving is disabled, and the commented-out target name says
# user01 while `path` points at user30 — confirm before re-enabling.
#imputed_df.to_csv('./train_dataset/sensor_data_imputed/user01_sensor_imputed.csv')

mode imputer for non-numeric columns

In [None]:
def impute_by_mode(df, group_col, cols_to_impute):
    """Fill NaNs in each listed column with the mode of its ``group_col`` group.

    Groups that have no mode (every value missing) are left unchanged.
    ``df`` is modified in place and also returned.
    """
    def fill_with_group_mode(values):
        modes = values.mode()
        if modes.empty:
            # Filling a group with itself changes nothing.
            return values.fillna(values)
        return values.fillna(modes.iloc[0])

    for col in cols_to_impute:
        grouped = df.groupby(group_col)[col]
        df[col] = grouped.transform(fill_with_group_mode)
    return df

def impute_missing_values(file_path):
    """Load a label CSV and fill its missing categorical state values.

    'action', 'condition', 'place' and 'activity' are first filled with the
    mode of their 'date' group (via ``impute_by_mode``); values still missing
    afterwards get the column-wide mode. A column with no observed value at
    all is left as it is.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with the four columns imputed.
    """
    df = pd.read_csv(file_path)
    columns_to_impute = ['action', 'condition', 'place', 'activity']
    imputed_df = impute_by_mode(df, 'date', columns_to_impute)

    # Fallback for groups that had no mode of their own.
    for col in columns_to_impute:
        if not imputed_df[col].isnull().any():
            continue
        modes = imputed_df[col].mode()
        if modes.empty:
            # Entirely missing column: there is no value to fall back on.
            continue
        # Assign the result back instead of calling fillna(inplace=True) on
        # the column selection, which does not update the frame when pandas
        # Copy-on-Write is active.
        imputed_df[col] = imputed_df[col].fillna(modes.iloc[0])

    return imputed_df

# Label CSVs are read from input_dir; their imputed versions go to output_dir.
input_dir = './train_dataset/sensor_data/labels'
output_dir = './train_dataset/sensor_data_imputed'
os.makedirs(output_dir, exist_ok=True)

for filename in os.listdir(input_dir):
    if not filename.endswith('.csv'):
        continue

    file_path = os.path.join(input_dir, filename)
    imputed_df = impute_missing_values(file_path)

    # The output name is built from the userId stored in the first row of
    # the file, not from the input file name.
    user_id = imputed_df['userId'].iloc[0]

    output_file_path = os.path.join(output_dir, f'{user_id}_state_imputed.csv')
    imputed_df.to_csv(output_file_path, index=False)

print("Imputation and saving completed.")

combine data

In [None]:
# Directories: per-user sensor/state CSVs are read from input_dir and the
# merged per-user files are written to output_dir. Both names are rebound
# here (an earlier cell used them for different folders).
input_dir = './train_dataset/sensor_data_imputed'
output_dir = './train_dataset/sensor_data_merged'

# Ensure the output directory exists (no error if it is already there)
os.makedirs(output_dir, exist_ok=True)

def merge_files(sensor_file, state_file):
    """Join one user's sensor CSV and state CSV into a single frame.

    Both file names are resolved against the module-level ``input_dir``.
    Columns whose name starts with 'Unnamed' are discarded, the frames are
    inner-joined on ('userId', 'timestamp'), a single 'date' column is
    restored from the merge suffixes, and the known columns are put in a
    fixed order with any others appended after them.
    """
    def load(name):
        frame = pd.read_csv(os.path.join(input_dir, name))
        keep = ~frame.columns.str.contains('^Unnamed')
        return frame.loc[:, keep]

    sensor_df = load(sensor_file)
    state_df = load(state_file)

    merged_df = pd.merge(sensor_df, state_df, on=['userId', 'timestamp'])

    # When both inputs carry 'date', the merge yields date_x / date_y.
    has_left_date = 'date_x' in merged_df.columns
    has_right_date = 'date_y' in merged_df.columns
    if has_left_date and has_right_date:
        # Prefer the sensor-side date, falling back to the state-side one.
        merged_df['date'] = merged_df['date_x'].combine_first(merged_df['date_y'])
        merged_df = merged_df.drop(columns=['date_x', 'date_y'])
    elif has_left_date:
        merged_df = merged_df.rename(columns={'date_x': 'date'})
    elif has_right_date:
        merged_df = merged_df.rename(columns={'date_y': 'date'})

    column_order = [
        'userId', 'date', 'timestamp','hr', 'magnitude_mAcc', 'bvp_positive', 'bvp_negative', 'temp', 
        'magnitude_e4Acc', 'lat', 'lon', 'accuracy', 'magnitude_mMag', 'eda', 
        'magnitude_mGyr', 'emotionPositive', 'emotionTension', 'action', 'condition', 'place', 'activity'
    ]

    # Known columns first (in the order above), everything else afterwards.
    known = [col for col in column_order if col in merged_df.columns]
    extras = [col for col in merged_df.columns if col not in column_order]
    return merged_df[known + extras]

# Every entry of the input directory.
files = os.listdir(input_dir)

# Pair up each user's files: the user ID is the text before the first
# underscore, and the file kind is recognised by 'sensor' / 'state' in the name.
# NOTE(review): the only visible statement that would write a '*sensor*' file
# into this folder is commented out — confirm those files are produced elsewhere.
file_groups = {}
for filename in files:
    if not filename.endswith('.csv'):
        continue
    user_id = filename.split('_')[0]
    group = file_groups.setdefault(user_id, {'sensor': None, 'state': None})
    if 'sensor' in filename:
        group['sensor'] = filename
    elif 'state' in filename:
        group['state'] = filename

# Merge and save only for users that have both kinds of file.
for user_id, file_pair in file_groups.items():
    if not (file_pair['sensor'] and file_pair['state']):
        continue

    merged_df = merge_files(file_pair['sensor'], file_pair['state'])

    output_file_path = os.path.join(output_dir, f'{user_id}_imputed.csv')
    merged_df.to_csv(output_file_path, index=False)

print("Merging and saving completed.")


In [None]:
# Stack every per-user merged CSV into one table and save it.
directory = './train_dataset/sensor_data_merged'

csv_files = [name for name in os.listdir(directory) if name.endswith('.csv')]

# One frame per file, in directory-listing order.
dataframes = []
for csv_file in csv_files:
    file_path = os.path.join(directory, csv_file)
    df = pd.read_csv(file_path)
    dataframes.append(df)

# Stack the users, then switch to the long-form sensor column names.
merged_df = pd.concat(dataframes, ignore_index=True).rename(
    columns={'hr':'heart_rate', 'lat':'latitude', 'lon':'longitude'}
)
# The row index is written as well (no index=False), so reading this file
# back yields an extra leading unnamed column.
merged_df.to_csv("./combined_after_imputation.csv")

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
# NOTE: this rebinds the name `mcar`, replacing the function of the same name
# defined earlier in the notebook for every later call.
from pygrinder import mcar
# SAITS is the class exported by pypots.imputation; a bare `import SAITS`
# would bind a module, which cannot be called as SAITS(...) below.
from pypots.imputation import SAITS
from pypots.utils.metrics import calc_mae

# load data
# NOTE(review): file_path is not assigned in this cell (the assignment below
# is commented out), so whatever value an earlier cell left behind is used.
# Confirm which file is intended.
#file_path = "./train_dataset/sensor_data_imputed/combined_sensor_data.csv"
data = pd.read_csv(file_path)

# split df into time_column and features_columns
time_column = 'timestamp'  
feature_columns = data.columns.drop(time_column)

# data preprocessing: one sample per distinct timestamp
num_samples = data[time_column].nunique()
data = data.drop([time_column], axis=1)
X = data.to_numpy().reshape(num_samples, -1, len(feature_columns))
X_ori = X  # keep X_ori for validation
X = mcar(X, 0.1)  # randomly hold out 10% observed values as ground truth
dataset = {"X": X}
print(X.shape)  # (684, 1, 13)


# model training
saits = SAITS(n_steps=X.shape[1], n_features=X.shape[2], n_layers=2, d_model=256, d_ffn=128, n_heads=4, d_k=64, d_v=64, dropout=0.1, epochs=10)

# here we use the whole dataset as the training set because ground truth is not visible to the model, you can also split it into train/val/test sets
saits.fit(dataset)
imputation = saits.impute(dataset)  # impute the originally-missing values and artificially-missing values
indicating_mask = np.isnan(X) ^ np.isnan(X_ori)  # indicating mask for imputation error calculation
mae = calc_mae(imputation, np.nan_to_num(X_ori), indicating_mask)  # calculate mean absolute error on the ground truth (artificially-missing values)

print(f"Mean Absolute Error: {mae}")

# The 10% hold-out exists only to measure the error above: keep every value
# that was actually observed and use model output only where data was missing.
imputation = np.where(np.isnan(X_ori), imputation, X_ori)

# convert imputed data back to DataFrame
imputed_data = imputation.reshape(-1, len(feature_columns))
imputed_df = pd.DataFrame(imputed_data, columns=feature_columns)

# Drop a stray 'index' column if the input file carried one.
imputed_df = imputed_df.drop(columns=['index'], errors='ignore')
#imputed_df.to_csv("./train_dataset/sensor_data_imputed/combined_sensor_data_imputed.csv")


Density plots before and after imputing the training data

In [None]:
def plot_density_distributions(before_path, after_path, n_cols=3):
    """Compare column distributions of two CSV files (before vs. after imputation).

    Numeric columns of the "before" file are drawn as paired KDE curves and
    its non-numeric columns as paired value-count bar charts, ``n_cols``
    panels per row. Columns that do not exist in the "after" file are
    reported and skipped, and a column family with nothing to plot produces
    no figure.

    Parameters
    ----------
    before_path : str
        CSV file holding the data before imputation.
    after_path : str
        CSV file holding the data after imputation.
    n_cols : int, default 3
        Number of panels per figure row.
    """
    original_data = pd.read_csv(before_path)
    imputed_data = pd.read_csv(after_path)

    def present_in_both(columns):
        """Keep the columns that also exist in the imputed file; report the rest."""
        present = [c for c in columns if c in imputed_data.columns]
        skipped = [c for c in columns if c not in imputed_data.columns]
        if skipped:
            print(f"Skipping columns missing from {after_path}: {skipped}")
        return present

    def make_axes(n_panels):
        """Create the panel grid and return its axes as a flat array."""
        n_rows = int(np.ceil(n_panels / n_cols))
        # squeeze=False always yields a 2-D array, so flatten() also works
        # when the grid consists of a single panel.
        _, grid = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)
        axes = grid.flatten()
        # Remove the unused trailing panels of the last row.
        for ax in axes[n_panels:]:
            ax.remove()
        return axes

    # Identify numeric and categorical columns
    numeric_columns = present_in_both(original_data.select_dtypes(include=[np.number]).columns)
    categorical_columns = present_in_both(original_data.select_dtypes(exclude=[np.number]).columns)

    # Plot numeric columns (skipped entirely when there are none, because a
    # grid with zero rows cannot be created).
    if numeric_columns:
        axes = make_axes(len(numeric_columns))
        for ax, column in zip(axes, numeric_columns):
            original_col = original_data[column].dropna().values
            imputed_col = imputed_data[column].dropna().values

            sns.kdeplot(original_col, ax=ax, label='Original', fill=False, alpha=0.5)
            sns.kdeplot(imputed_col, ax=ax, label='Imputed', fill=False, alpha=0.5)
            ax.set_title(column)
            ax.legend()

        plt.tight_layout()
        plt.show()

    # Plot categorical columns
    if categorical_columns:
        axes = make_axes(len(categorical_columns))
        width = 0.35  # width of the bars
        for ax, column in zip(axes, categorical_columns):
            original_counts = original_data[column].value_counts()
            imputed_counts = imputed_data[column].value_counts()

            # position=0 / position=1 place the two bar series side by side.
            original_counts.plot(kind='bar', ax=ax, position=0, width=width, label='Original', color='blue', alpha=0.5)
            imputed_counts.plot(kind='bar', ax=ax, position=1, width=width, label='Imputed', color='orange', alpha=0.5)

            ax.set_title(column)
            ax.legend()

        plt.tight_layout()
        plt.show()

# File paths of the combined data before and after imputation.
# NOTE(review): no visible cell writes './combined_before_imputation.csv';
# presumably it is produced elsewhere — confirm it exists before running.
before_imputation_path = './combined_before_imputation.csv'
after_imputation_path = './combined_after_imputation.csv'

# Plot the density distributions before and after imputation
plot_density_distributions(before_imputation_path, after_imputation_path)


Re-build train set

In [None]:
# Combined sensor + state table written by the "combine data" step.
sensor_state_imputed = pd.read_csv('./combined_after_imputation.csv')
#sensor_state_original = pd.read_csv('./combined_before_imputation.csv')

# Survey: missing caffeine/alcohol answers become the category 'unknown',
# and the input bookkeeping columns are removed.
survey = pd.read_csv("./train_dataset/user_survey_2020_transformed.csv")
columns_to_replace = ['caffeine', 'cAmount(ml)', 'alcohol', 'aAmount(ml)']
survey[columns_to_replace] = survey[columns_to_replace].fillna('unknown')
survey = survey.drop(columns=['amPm', 'startInput', 'endInput'])

# Labels and sleep records.
labels = pd.read_csv("./train_dataset/train_label.csv")
sleep = pd.read_csv('./train_dataset/user_sleep_2020.csv')
sleep = sleep.drop(columns=[ 'timezone', 'startDt', 'endDt', 'lastUpdate'])

# Labels identify the user as 'subject_id'; align the name with the other tables.
labels = labels.rename(columns={'subject_id':'userId'})
labels = labels.drop(columns=['Unnamed: 0'])

# Left-join everything onto the labels, one table at a time. The bare
# .nunique() calls are leftover sanity checks: their results are discarded
# because they are not the last expression of the cell.
merge_keys = ['userId', 'date']
labels_sensor_state_merged = labels.merge(sensor_state_imputed, on=merge_keys, how='left')
labels_sensor_state_merged['date'].nunique()
labels_sensor_state_survey_merged = labels_sensor_state_merged.merge(survey, on=merge_keys, how='left')
labels_sensor_state_survey_merged['date'].nunique()
all_merged = labels_sensor_state_survey_merged.merge(sleep, on=merge_keys, how='left')
all_merged['date'].nunique()

all_merged = (
    all_merged
    .drop(columns=['Unnamed: 0', 'hr_min', 'hr_max', 'rr_min', 'rr_max'])
    .dropna(subset=['timestamp'])  # judged as corrupted so dropped these
)

Impute once again for the post-merging dataset

In [None]:
def impute_by_mode(df, group_cols, cols_to_impute):
    """Fill NaNs in each listed column with the mode of its ``group_cols`` group.

    Groups that have no mode (every value missing) are left unchanged.
    ``df`` is modified in place and also returned.
    """
    def fill_with_group_mode(values):
        modes = values.mode()
        if modes.empty:
            # Filling a group with itself changes nothing.
            return values.fillna(values)
        return values.fillna(modes.iloc[0])

    for col in cols_to_impute:
        grouped = df.groupby(group_cols)[col]
        df[col] = grouped.transform(fill_with_group_mode)
    return df

def impute_nonnumeric_missing_values(df):
    """Fill missing 'action', 'condition', 'place' and 'activity' values.

    Each column is first filled with the mode of its ('userId', 'date') group
    (via ``impute_by_mode``); values still missing afterwards get the
    column-wide mode. A column with no observed value at all is left as it is.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the four columns plus 'userId' and 'date'. It is
        modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame, imputed.
    """
    columns_to_impute = ['action', 'condition', 'place', 'activity']
    group_cols = ['userId', 'date']
    imputed_df = impute_by_mode(df, group_cols, columns_to_impute)

    # Fallback for groups that had no mode of their own.
    for col in columns_to_impute:
        if not imputed_df[col].isnull().any():
            continue
        modes = imputed_df[col].mode()
        if modes.empty:
            # Entirely missing column: there is no value to fall back on.
            continue
        # Assign the result back instead of calling fillna(inplace=True) on
        # the column selection, which does not update the frame when pandas
        # Copy-on-Write is active.
        imputed_df[col] = imputed_df[col].fillna(modes.iloc[0])

    return imputed_df


# Fill the categorical state columns of the merged training table.
all_merged = impute_nonnumeric_missing_values(all_merged)
# Leftover manual check for rows whose 'condition' is still missing:
#all_merged[all_merged['condition'].isna()]

def impute_by_median(df, group_cols, cols_to_impute):
    """Fill NaNs in numeric columns with group medians, then the overall median.

    For every numeric column in ``cols_to_impute``, missing values are
    replaced by the median of their ``group_cols`` group; values still missing
    afterwards (groups with no observed value) get the column-wide median.
    Non-numeric columns are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute; modified in place.
    group_cols : str or list of str
        Column(s) defining the groups.
    cols_to_impute : list of str
        Columns to fill.

    Returns
    -------
    pandas.DataFrame
        The same frame, imputed.
    """
    numeric_cols = [col for col in cols_to_impute if pd.api.types.is_numeric_dtype(df[col])]

    for col in numeric_cols:
        df[col] = df.groupby(group_cols)[col].transform(lambda x: x.fillna(x.median()))

    # Fallback for groups without any observed value. The result is assigned
    # back rather than using fillna(inplace=True) on the column selection,
    # which does not update the frame when pandas Copy-on-Write is active.
    for col in numeric_cols:
        if df[col].isnull().any():
            overall_median = df[col].median()
            df[col] = df[col].fillna(overall_median)

    return df

# Numeric sensor and sleep features that receive median imputation.
cols_to_impute = [
    'heart_rate', 'magnitude_mAcc', 'temp', 'magnitude_e4Acc', 'latitude', 'longitude',
    'accuracy', 'magnitude_mMag', 'magnitude_mGyr',
    'wakeupduration', 'lightsleepduration', 'deepsleepduration', 'wakeupcount',
    'durationtosleep', 'remsleepduration', 'durationtowakeup',
    'hr_average', 'rr_average', 'breathing_disturbances_intensity', 'snoring',
    'snoringepisodecount', 'sleep_score',
]
group_cols = ['userId', 'date']

# Median per (user, day) first, overall median as the fallback.
all_merged = impute_by_median(all_merged, group_cols, cols_to_impute)

# Remaining missing values per column, for inspection.
na_values_after = all_merged.isna().sum()
print(f"NA values {na_values_after}")

# Prefix the target columns so they can later be selected with 'daily_'.
all_merged = all_merged.rename(columns={
    'Q1':'daily_Q1',
    'Q2':'daily_Q2',
    'Q3':'daily_Q3',
    'S1':'daily_S1',
    'S2':'daily_S2',
    'S3':'daily_S3',
    'S4':'daily_S4',
})

One last feature engineering: encoding categorical features

In [None]:
# Leftover inspection of the non-numeric columns; the result is discarded
# because this is not the last expression of the cell.
all_merged.select_dtypes(exclude=[int, float])

# Categorical columns to expand into one indicator column per category.
specified_columns = ['action', 'condition', 'place', 'activity', 'caffeine', 'alcohol', 'cAmount(ml)', 'aAmount(ml)']
categorical_part = all_merged[specified_columns]
one_hot_encoded_df = pd.get_dummies(categorical_part, drop_first=False)
one_hot_encoded_df_int = one_hot_encoded_df.astype(int)

# Final table: every other column, followed by the 0/1 indicator columns.
remaining_part = all_merged.drop(columns=specified_columns)
final_df = pd.concat([remaining_part, one_hot_encoded_df_int], axis=1)
final_df.head()

# Persist as CSV (row index included) and as a pickle.
final_df.to_csv("train_imputed.csv")
with open('imputed_train_with_labels.pickle', 'wb') as f:
    pickle.dump(final_df, f)

Validation imputation

Step 1: Load the Data and Separate Features and Labels

In [None]:
# Imputed training table saved by the encoding step.
with open('./imputed_train_with_labels.pickle', 'rb') as f:
    train_data = pickle.load(f)

# Validation table (with labels); not produced by any cell shown above.
with open('./validation_with_labels.pickle', 'rb') as f:
    validation_data = pickle.load(f)

# Targets are the columns prefixed 'daily_'; features are everything else
# except the row identifiers.
label_cols = [name for name in train_data.columns if name.startswith('daily_')]
train_features = train_data.drop(columns=label_cols + ['userId', 'date', 'timestamp'])
train_labels = train_data[label_cols]

Step 2: Standardization

In [None]:
# X_train is the same object as train_features, so the dtype conversion
# below is visible through both names.
X_train = train_features
y_train = train_labels

# (A train/test-split variant of this step is disabled; the whole training
# set is standardised.)
scaler = StandardScaler()

# One-hot indicator columns are recognised by their name prefix and stored as
# uint8, which lets the dtype-based selection below separate them from the
# numeric features.
prefixes = ['action', 'condition', 'place', 'activity', 'caffeine', 'alcohol', 'cAmount(ml)', 'aAmount(ml)']
onehot_cols = [col for col in train_features.columns if col.startswith(tuple(prefixes))]
train_features[onehot_cols] = train_features[onehot_cols].astype('uint8')

numeric_cols = train_features.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = train_features.select_dtypes(include=['uint8', 'bool']).columns

# Fit the scaler on the numeric training columns and wrap the result in a
# frame that keeps the original row index and column names.
X_train_numeric_scaled = scaler.fit_transform(X_train[numeric_cols])
X_train_numeric_scaled_df = pd.DataFrame(
    X_train_numeric_scaled,
    index=X_train.index,
    columns=numeric_cols,
)

# Scaled numeric columns first, untouched indicator columns after them.
X_train_scaled = pd.concat([X_train_numeric_scaled_df, X_train[categorical_cols]], axis=1)


Step 3: Preprocess validation data

In [None]:
# Split the validation table into targets ('daily_' columns) and features
# (everything else except the row identifiers).
daily_cols = [name for name in validation_data.columns if name.startswith('daily_')]
validation_features = validation_data.drop(columns=daily_cols + ['subject_id', 'date', 'timestamp'])
validation_labels = validation_data[daily_cols]

# Columns that exist only in the validation set are dropped further down.
val = set(validation_features.columns)
train = set(train_features.columns)
in_val_not_in_train = val - train
only_in_val = list(in_val_not_in_train)

# Training columns split by name prefix: one-hot indicators vs. the rest.
prefixes = ['action', 'condition', 'place', 'activity', 'caffeine', 'alcohol', 'cAmount(ml)', 'aAmount(ml)']
onehot_cols = [col for col in train_features.columns if col.startswith(tuple(prefixes))]
numeric_cols = train_features.columns.difference(onehot_cols).tolist()

# Training features absent from the validation set are added as all-NaN
# columns so that they can be imputed later.
missing_features = [col for col in train_features.columns if col not in val]
for feature in missing_features:
    validation_features[feature] = float('nan')

# X_missing: the validation features restricted to columns known from
# training; the absent ones are (re)set to NaN.
X_missing = validation_features.drop(columns=only_in_val)
for feature in missing_features:
    X_missing[feature] = float('nan')

The cell below puts the validation and training feature matrices into the same column order.

In [None]:
# Canonical feature order shared by the validation matrix (X_missing) and the
# scaled training matrix (X_train_scaled). It is defined once so the two
# selections below cannot drift apart.
feature_order = [
    # sensor features
    'heart_rate', 'magnitude_mAcc', 'bvp_positive', 'bvp_negative', 'temp',
    'magnitude_e4Acc', 'latitude', 'longitude', 'accuracy', 'magnitude_mMag',
    'eda', 'magnitude_mGyr', 'emotionPositive', 'emotionTension',
    # survey features
    'sleep', 'sleepProblem', 'dream', 'amCondition', 'amEmotion', 'pmEmotion',
    'pmStress', 'pmFatigue',
    # sleep-record features
    'wakeupduration', 'lightsleepduration', 'deepsleepduration', 'wakeupcount',
    'durationtosleep', 'remsleepduration', 'durationtowakeup', 'hr_average',
    'rr_average', 'breathing_disturbances_intensity', 'snoring',
    'snoringepisodecount', 'sleep_score',
    # one-hot: action
    'action_care_housemem', 'action_community_interaction', 'action_entertainment',
    'action_hobby', 'action_household', 'action_meal', 'action_outdoor_act',
    'action_personal_care', 'action_recreation_etc', 'action_recreation_media',
    'action_shop', 'action_sleep', 'action_socialising', 'action_study',
    'action_travel', 'action_work',
    # one-hot: condition
    'condition_ALONE', 'condition_WITH_MANY', 'condition_WITH_ONE',
    # one-hot: place
    'place_home', 'place_other_indoor', 'place_outdoor', 'place_restaurant',
    'place_workplace',
    # one-hot: activity
    'activity_IN_VEHICLE', 'activity_ON_FOOT', 'activity_STILL', 'activity_UNKNOWN',
    # one-hot: caffeine
    'caffeine_caffeinated drink', 'caffeine_coffee', 'caffeine_coke', 'caffeine_tea',
    'caffeine_unknown',
    # one-hot: alcohol
    'alcohol_beer', 'alcohol_beer&rice wine', 'alcohol_not specified', 'alcohol_soju',
    'alcohol_soju&beer', 'alcohol_unknown', 'alcohol_wine',
    # one-hot: caffeine amount
    'cAmount(ml)_100.0', 'cAmount(ml)_150.0', 'cAmount(ml)_200.0', 'cAmount(ml)_250.0',
    'cAmount(ml)_260.0', 'cAmount(ml)_280.0', 'cAmount(ml)_300.0', 'cAmount(ml)_350.0',
    'cAmount(ml)_355.0', 'cAmount(ml)_360.0', 'cAmount(ml)_400.0', 'cAmount(ml)_450.0',
    'cAmount(ml)_500.0', 'cAmount(ml)_560.0', 'cAmount(ml)_600.0', 'cAmount(ml)_700.0',
    'cAmount(ml)_750.0', 'cAmount(ml)_900.0', 'cAmount(ml)_1000.0', 'cAmount(ml)_1500.0',
    'cAmount(ml)_unknown',
    # one-hot: alcohol amount
    'aAmount(ml)_200.0', 'aAmount(ml)_250.0', 'aAmount(ml)_300.0', 'aAmount(ml)_330.0',
    'aAmount(ml)_400.0', 'aAmount(ml)_500.0', 'aAmount(ml)_600.0', 'aAmount(ml)_700.0',
    'aAmount(ml)_720.0', 'aAmount(ml)_750.0', 'aAmount(ml)_800.0', 'aAmount(ml)_900.0',
    'aAmount(ml)_1000.0', 'aAmount(ml)_1500.0', 'aAmount(ml)_2000.0', 'aAmount(ml)_3000.0',
    'aAmount(ml)_3500.0', 'aAmount(ml)_4000.0', 'aAmount(ml)_unknown',
]

# Selecting with the list both reorders the columns and drops any column not
# named in it; a name missing from either frame raises KeyError.
X_missing = X_missing[feature_order]
X_train_scaled = X_train_scaled[feature_order]

In [None]:
# Standardise X_missing with the statistics of the scaler fitted on the
# training data. The imputer in the next step is trained on X_train_scaled,
# so the validation features must be on the same scale; fitting a fresh
# scaler on the validation set (and only on a few columns) would feed it
# differently scaled values. `scaler` is deliberately not re-created here.
col_order = X_train_scaled.columns.tolist()
#X_missing.drop(columns=['Unnamed: 0'], inplace=True)

# Columns the training scaler was fitted on, in fit order.
fitted_cols = getattr(scaler, 'feature_names_in_', None)
if fitted_cols is None:
    # Older scikit-learn without feature_names_in_: rebuild the selection
    # that was used when the scaler was fitted.
    fitted_cols = train_features.select_dtypes(include=['float64', 'int64']).columns
fitted_cols = list(fitted_cols)

# Per-column training mean and scale, addressable by column name.
train_mean = pd.Series(scaler.mean_, index=fitted_cols)
train_scale = pd.Series(scaler.scale_, index=fitted_cols)

# Only the fitted columns that are present in X_missing are scaled; NaN
# entries simply stay NaN.
numeric_cols = [col for col in fitted_cols if col in X_missing.columns]
X_missing_numeric_scaled_df = (X_missing[numeric_cols] - train_mean[numeric_cols]) / train_scale[numeric_cols]
X_missing_numeric_scaled = X_missing_numeric_scaled_df.to_numpy()

# Combine scaled numeric columns with the remaining (indicator) columns and
# restore the training column order.
categorical_df = X_missing.drop(columns=numeric_cols)
X_missing_scaled = pd.concat([X_missing_numeric_scaled_df, categorical_df], axis=1)
X_missing_scaled = X_missing_scaled[col_order]

Step 4: Impute validation data

In [None]:
# IterativeImputer is an experimental estimator: this enabling import has to
# run before it can be imported from sklearn.impute.
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge

# Initialize the IterativeImputer (each feature is modelled from the others
# with a Bayesian ridge regression; random_state fixes the posterior sampling)
imputer = IterativeImputer(estimator=BayesianRidge(), random_state=0, max_iter=20, sample_posterior=True)

# Fit the imputer on the training data
imputer.fit(X_train_scaled)

# Fill the missing validation values with the model learned from training
X_imputed = imputer.transform(X_missing_scaled)

# Label the result with the columns and row index of the matrix that was
# actually transformed, so rows can still be matched to validation_labels.
validation_imputed = pd.DataFrame(
    X_imputed,
    columns=X_missing_scaled.columns,
    index=X_missing_scaled.index,
)

with open('imputed_validation.pickle', 'wb') as f:
    pickle.dump(validation_imputed, f)


Step 5: Visualize X_train_scaled and validation_imputed

In [None]:
# One stacked subplot per feature, overlaying the training distribution (blue)
# and the imputed validation distribution (red).
num_vars = len(X_train_scaled.columns)
fig, axes = plt.subplots(nrows=num_vars, ncols=1, figsize=(8, 4 * num_vars))

# plt.subplots returns a single Axes instead of an array when there is one row,
# so normalise to a 1-D sequence before pairing axes with columns
for ax, var in zip(np.atleast_1d(axes), X_train_scaled.columns):
    sns.histplot(
        X_train_scaled[var],
        kde=True,
        color='blue',
        label=f'X_train_scaled {var}',
        ax=ax,
    )
    sns.histplot(
        validation_imputed[var],
        kde=True,
        color='red',
        label=f'validation_imputed {var}',
        ax=ax,
    )
    ax.set_title(f'Distribution of {var}')
    ax.legend()

# Adjust layout
plt.tight_layout()
plt.show()

Step 6: Impute test data

In [None]:
# Load the imputed training frame and the raw test frame, then give the test
# frame the same feature columns as the training frame.
# NOTE: pickle.load can execute arbitrary code - only open pickles produced by this project.
with open('./imputed_train_with_labels.pickle', 'rb') as f:
    train_data = pickle.load(f)

with open('./test.pickle', 'rb') as f:
    test_data = pickle.load(f)

# Separate features and target labels: drop the 'daily_*' columns and the identifier columns
train_features = train_data.drop(columns=[col for col in train_data.columns if col.startswith('daily_')])
train_features = train_features.drop(columns=['userId', 'date', 'timestamp'])

test_features = test_data.drop(columns=[col for col in test_data.columns if col.startswith('daily_')])
test_features = test_features.drop(columns=['subject_id', 'date', 'timestamp'])

# Columns that exist in the test set but not in the training set
test = set(test_features.columns)
train = set(train_features.columns)
in_test_not_in_train = test - train
only_in_test = list(in_test_not_in_train)

# Columns whose name starts with one of these prefixes are treated as one-hot encoded
prefixes = ['action', 'condition', 'place', 'activity', 'caffeine', 'alcohol', 'cAmount(ml)', 'aAmount(ml)']
onehot_cols = [col for col in train_features.columns if col.startswith(tuple(prefixes))]
# Identify numeric columns (those that are not one-hot encoded)
numeric_cols = train_features.columns.difference(onehot_cols).tolist()

# Training features that are missing from the test set
missing_features = [col for col in train_features.columns if col not in test_features.columns]

# Append the missing features to the test set with placeholder values (NaN)
for feature in missing_features:
    test_features[feature] = float('nan')

# Drop the test-only columns. The NaN placeholders added above are not in
# only_in_test, so drop() carries them over and they need no second insertion.
X_missing_test = test_features.drop(columns=only_in_test)

# Restrict X_missing_test to a fixed list of feature columns, in a fixed order.
# The list is spelled out as the un-prefixed columns followed by one group per
# category prefix; the resulting names and their order are unchanged.
X_missing_test = X_missing_test.loc[:, (
    [
        'heart_rate', 'magnitude_mAcc', 'bvp_positive', 'bvp_negative', 'temp',
        'magnitude_e4Acc', 'latitude', 'longitude', 'accuracy', 'magnitude_mMag',
        'eda', 'magnitude_mGyr', 'emotionPositive', 'emotionTension', 'sleep',
        'sleepProblem', 'dream', 'amCondition', 'amEmotion', 'pmEmotion',
        'pmStress', 'pmFatigue', 'wakeupduration', 'lightsleepduration',
        'deepsleepduration', 'wakeupcount', 'durationtosleep', 'remsleepduration',
        'durationtowakeup', 'hr_average', 'rr_average',
        'breathing_disturbances_intensity', 'snoring', 'snoringepisodecount',
        'sleep_score',
    ]
    + [f'action_{suffix}' for suffix in (
        'care_housemem', 'community_interaction', 'entertainment', 'hobby',
        'household', 'meal', 'outdoor_act', 'personal_care', 'recreation_etc',
        'recreation_media', 'shop', 'sleep', 'socialising', 'study', 'travel',
        'work',
    )]
    + [f'condition_{suffix}' for suffix in ('ALONE', 'WITH_MANY', 'WITH_ONE')]
    + [f'place_{suffix}' for suffix in (
        'home', 'other_indoor', 'outdoor', 'restaurant', 'workplace',
    )]
    + [f'activity_{suffix}' for suffix in ('IN_VEHICLE', 'ON_FOOT', 'STILL', 'UNKNOWN')]
    + [f'caffeine_{suffix}' for suffix in (
        'caffeinated drink', 'coffee', 'coke', 'tea', 'unknown',
    )]
    + [f'alcohol_{suffix}' for suffix in (
        'beer', 'beer&rice wine', 'not specified', 'soju', 'soju&beer',
        'unknown', 'wine',
    )]
    + [f'cAmount(ml)_{suffix}' for suffix in (
        '100.0', '150.0', '200.0', '250.0', '260.0', '280.0', '300.0',
        '350.0', '355.0', '360.0', '400.0', '450.0', '500.0', '560.0',
        '600.0', '700.0', '750.0', '900.0', '1000.0', '1500.0',
        'unknown',
    )]
    + [f'aAmount(ml)_{suffix}' for suffix in (
        '200.0', '250.0', '300.0', '330.0', '400.0', '500.0', '600.0',
        '700.0', '720.0', '750.0', '800.0', '900.0', '1000.0', '1500.0',
        '2000.0', '3000.0', '3500.0', '4000.0',
        'unknown',
    )]
)]


# Standardize the four numeric test columns, keep every other column untouched,
# and put the result back into the training column order.
# NOTE(review): the scaler is fit on the test data itself rather than reusing the
# statistics learned from the training data - confirm this is intended.
numeric_cols = ['heart_rate', 'latitude', 'longitude', 'magnitude_mAcc']
col_order = list(X_train_scaled.columns)

scaler = StandardScaler()
X_missing_test_numeric_scaled = scaler.fit_transform(X_missing_test.loc[:, numeric_cols])

# fit_transform returns a bare array; wrap it so it lines up with X_missing_test's rows
X_missing_test_numeric_scaled_df = pd.DataFrame(
    data=X_missing_test_numeric_scaled,
    index=X_missing_test.index,
    columns=numeric_cols,
)

# Everything that was not scaled is carried over as-is
categorical_df = X_missing_test.drop(columns=numeric_cols)
X_missing_test_scaled = pd.concat(
    [X_missing_test_numeric_scaled_df, categorical_df],
    axis=1,
).loc[:, col_order]


# Impute the test features with an imputer that is fit on the training data only.
# Assumes X_train_scaled is the training data without missing values and
# X_missing_test_scaled is the test data with missing values.

# Initialize the IterativeImputer (random_state fixed so posterior sampling is repeatable)
imputer = IterativeImputer(estimator=BayesianRidge(), random_state=0, max_iter=20, sample_posterior=True)

# Fit the imputer on the training data
imputer.fit(X_train_scaled)

# Transform the data with missing values
X_imputed = imputer.transform(X_missing_test_scaled)

# transform() returns a bare array laid out like X_missing_test_scaled (training
# column order), so take both the column labels and the row index from that frame.
# Labelling with X_missing_test.columns would mislabel values whenever the two orders differ.
test_imputed = pd.DataFrame(
    X_imputed,
    columns=X_missing_test_scaled.columns,
    index=X_missing_test_scaled.index,
)

with open('imputed_test.pickle', 'wb') as f:
    pickle.dump(test_imputed, f)

Two-stage modeling

- Set 1: Features only in the training set.
- Set 2: Features shared by the training and validation sets.
- Set 3: Features only in the validation set.

In [None]:
# Feature groups for the two-stage model.
# Sets 1 and 2 both start out as the full list of training columns
# (same as list(validation_imputed.columns)).
set1_features = X_train_scaled.columns.tolist()
set2_features = X_train_scaled.columns.tolist()

# Set 3: the additional features listed in the section text as validation-only
set3_features = [
    'highest_prob_sound',
    'altitude',
    'speed_x',
    'speed_y',
    'm_light',
    'total_time_sum',
    'burned_calories',
    'distance',
    'running_steps',
    'steps',
    'step_frequency',
    'walking_steps',
]

In [None]:
# Explicit set-1 feature list: the un-prefixed columns first, then one group per
# category prefix. The resulting names and their order are unchanged.
set1_features = (
    [
        'heart_rate', 'magnitude_mAcc', 'bvp_positive', 'bvp_negative', 'temp',
        'magnitude_e4Acc', 'latitude', 'longitude', 'accuracy', 'magnitude_mMag',
        'eda', 'magnitude_mGyr', 'emotionPositive', 'emotionTension', 'sleep',
        'sleepProblem', 'dream', 'amCondition', 'amEmotion', 'pmEmotion',
        'pmStress', 'pmFatigue', 'wakeupduration', 'lightsleepduration',
        'deepsleepduration', 'wakeupcount', 'durationtosleep', 'remsleepduration',
        'durationtowakeup', 'hr_average', 'rr_average',
        'breathing_disturbances_intensity', 'snoring', 'snoringepisodecount',
        'sleep_score',
    ]
    + [f'action_{suffix}' for suffix in (
        'care_housemem', 'community_interaction', 'entertainment', 'hobby',
        'household', 'meal', 'outdoor_act', 'personal_care', 'recreation_etc',
        'recreation_media', 'shop', 'sleep', 'socialising', 'study', 'travel',
        'work',
    )]
    + [f'condition_{suffix}' for suffix in ('ALONE', 'WITH_MANY', 'WITH_ONE')]
    + [f'place_{suffix}' for suffix in (
        'home', 'other_indoor', 'outdoor', 'restaurant', 'workplace',
    )]
    + [f'activity_{suffix}' for suffix in ('IN_VEHICLE', 'ON_FOOT', 'STILL', 'UNKNOWN')]
    + [f'caffeine_{suffix}' for suffix in (
        'caffeinated drink', 'coffee', 'coke', 'tea', 'unknown',
    )]
    + [f'alcohol_{suffix}' for suffix in (
        'beer', 'beer&rice wine', 'not specified', 'soju', 'soju&beer',
        'unknown', 'wine',
    )]
    + [f'cAmount(ml)_{suffix}' for suffix in (
        '100.0', '150.0', '200.0', '250.0', '260.0', '280.0', '300.0',
        '350.0', '355.0', '360.0', '400.0', '450.0', '500.0', '560.0',
        '600.0', '700.0', '750.0', '900.0', '1000.0', '1500.0',
        'unknown',
    )]
    + [f'aAmount(ml)_{suffix}' for suffix in (
        '200.0', '250.0', '300.0', '330.0', '400.0', '500.0', '600.0',
        '700.0', '720.0', '750.0', '800.0', '900.0', '1000.0', '1500.0',
        '2000.0', '3000.0', '3500.0', '4000.0',
        'unknown',
    )]
)

In [None]:
# Explicit set-2 feature list: the un-prefixed columns first, then one group per
# category prefix. The resulting names and their order are unchanged.
set2_features = (
    [
        'heart_rate', 'magnitude_mAcc', 'bvp_positive', 'bvp_negative', 'temp',
        'magnitude_e4Acc', 'latitude', 'longitude', 'accuracy', 'magnitude_mMag',
        'eda', 'magnitude_mGyr', 'emotionPositive', 'emotionTension', 'sleep',
        'sleepProblem', 'dream', 'amCondition', 'amEmotion', 'pmEmotion',
        'pmStress', 'pmFatigue', 'wakeupduration', 'lightsleepduration',
        'deepsleepduration', 'wakeupcount', 'durationtosleep', 'remsleepduration',
        'durationtowakeup', 'hr_average', 'rr_average',
        'breathing_disturbances_intensity', 'snoring', 'snoringepisodecount',
        'sleep_score',
    ]
    + [f'action_{suffix}' for suffix in (
        'care_housemem', 'community_interaction', 'entertainment', 'hobby',
        'household', 'meal', 'outdoor_act', 'personal_care', 'recreation_etc',
        'recreation_media', 'shop', 'sleep', 'socialising', 'study', 'travel',
        'work',
    )]
    + [f'condition_{suffix}' for suffix in ('ALONE', 'WITH_MANY', 'WITH_ONE')]
    + [f'place_{suffix}' for suffix in (
        'home', 'other_indoor', 'outdoor', 'restaurant', 'workplace',
    )]
    + [f'activity_{suffix}' for suffix in ('IN_VEHICLE', 'ON_FOOT', 'STILL', 'UNKNOWN')]
    + [f'caffeine_{suffix}' for suffix in (
        'caffeinated drink', 'coffee', 'coke', 'tea', 'unknown',
    )]
    + [f'alcohol_{suffix}' for suffix in (
        'beer', 'beer&rice wine', 'not specified', 'soju', 'soju&beer',
        'unknown', 'wine',
    )]
    + [f'cAmount(ml)_{suffix}' for suffix in (
        '100.0', '150.0', '200.0', '250.0', '260.0', '280.0', '300.0',
        '350.0', '355.0', '360.0', '400.0', '450.0', '500.0', '560.0',
        '600.0', '700.0', '750.0', '900.0', '1000.0', '1500.0',
        'unknown',
    )]
    + [f'aAmount(ml)_{suffix}' for suffix in (
        '200.0', '250.0', '300.0', '330.0', '400.0', '500.0', '600.0',
        '700.0', '720.0', '750.0', '800.0', '900.0', '1000.0', '1500.0',
        '2000.0', '3000.0', '3500.0', '4000.0',
        'unknown',
    )]
)

In [None]:
# Full validation set: the original validation features side by side with
# validation_imputed (which has the same columns as the train set).
with open('./validation_with_labels.pickle', 'rb') as f:
    validation_data = pickle.load(f)

with open('./imputed_validation.pickle', 'rb') as f:
    validation_imputed = pickle.load(f)

# Everything except the 'daily_*' columns counts as a feature here
validation_features = validation_data.drop(
    columns=[name for name in validation_data.columns if name.startswith('daily_')]
)

# NOTE(review): concat(axis=1) pairs rows by index label, so both frames need
# matching indexes for the rows to line up - confirm.
validation_data = pd.concat(
    [validation_features, validation_imputed],
    axis=1,
).drop(columns=['activity', 'subject_id', 'date', 'timestamp'])

Stage 1: Train an initial model on the training set

1. Initial model training (features used: set 2)
- Train the initial model using only the features available in the training set.
- This model will be used to generate predictions that will serve as additional features in the second stage.

In [None]:
# Stage-1 model: a random forest trained on the set-2 features of the training data.
# RandomForestClassifier is already imported in the notebook's first cell.
# random_state is fixed (as for the imputers) so the stage-1 probabilities that
# feed stage 2 are repeatable between runs.
model_stage1 = RandomForestClassifier(random_state=0)
model_stage1.fit(X_train_scaled[set2_features], y_train)

2. Generate predictions for validation set
- Use the trained model (model_stage1) to predict on the validation set.
- These predictions will act as an additional feature.

In [None]:
# Stage-1 class probabilities for the validation rows, computed from the set-2 columns only
val_predictions_stage1 = model_stage1.predict_proba(
    validation_imputed.loc[:, set2_features]
)

3. Add predictions to the validation set

In [None]:
# Attach the stage-1 predictions to validation_imputed, one new column per
# entry of val_predictions_stage1.

# Report the shape of every prediction array
shapes = [array.shape for array in val_predictions_stage1]
for array_no, shape in enumerate(shapes, start=1):
    print(f"Shape of array {array_no}: {shape}")

# Validate all arrays before touching the frame: each one needs one row per validation row
expected_rows = validation_imputed.shape[0]
first_mismatch = next(
    (output_no
     for output_no, preds in enumerate(val_predictions_stage1, start=1)
     if len(preds) != expected_rows),
    None,
)
if first_mismatch is not None:
    raise ValueError(f"Number of predictions for output {first_mismatch} does not match the number of rows in validation_imputed")

# Column 1 of each array is stored (treated as the probability of the positive class)
for output_no, preds in enumerate(val_predictions_stage1, start=1):
    validation_imputed[f'stage1_predictions_{output_no}'] = preds[:, 1]

validation_imputed

Stage 2: Train a second model using the stage-1 predictions and the additional features

4. Train second model
- Train a new model using both the original features and the additional features, along with the predictions from the first stage.

In [None]:
# Stage-2 design matrix: set-2 features + set-3 features + one stage-1
# probability column per entry of val_predictions_stage1.
features_stage2 = set2_features + set3_features + [f"stage1_predictions_{i+1}" for i in range(len(val_predictions_stage1))]

# reindex() does not fail on unknown labels: it silently adds them as all-NaN
# columns. Report any such columns so an empty feature cannot go unnoticed.
# NOTE(review): set3_features are described as validation-only features, while
# validation_imputed is built on the training columns - confirm which frame
# they are supposed to come from.
absent_stage2_features = [col for col in features_stage2 if col not in validation_imputed.columns]
if absent_stage2_features:
    print(f"Warning: {len(absent_stage2_features)} stage-2 feature(s) are absent from validation_imputed "
          f"and will be all-NaN: {absent_stage2_features}")

validation_data_stage2 = validation_imputed.reindex(columns=features_stage2)

# random_state is fixed (as for the imputers) so the final predictions are repeatable
model_stage2 = RandomForestClassifier(random_state=0)
model_stage2.fit(validation_data_stage2, validation_labels)

5. Evaluate and predict on test set
- Use the first model to generate predictions for the test set.
- Combine these predictions with the original features and the additional features of the test set to form the input for the second model.

In [None]:
# Run the test set through both stages.

# Stage 1: class probabilities from the set-2 columns of the imputed test data
test_predictions_stage1 = model_stage1.predict_proba(test_imputed.loc[:, set2_features])

# Store column 1 of each probability array as its own feature column
# (treated as the probability of the positive class)
for output_no, output_proba in enumerate(test_predictions_stage1, start=1):
    test_imputed[f"stage1_predictions_{output_no}"] = output_proba[:, 1]

# Stage 2 input: exactly the columns the second model was trained on, in the same order
test_data_stage2 = test_imputed.reindex(columns=features_stage2)

# Final predictions, labelled with the validation label column names
final_test_predictions = model_stage2.predict(test_data_stage2)
final_test_predictions_df = pd.DataFrame(
    data=final_test_predictions,
    columns=validation_labels.columns,
)
final_test_predictions_df

In [None]:
# Assemble the submission file: the identifier columns of the raw test set
# next to the final predictions.
# NOTE: pickle.load can execute arbitrary code - only open pickles produced by this project.
with open('./test.pickle', 'rb') as f:
    test_data = pickle.load(f)

# Separate features and target labels
test_features = test_data.drop(columns=[col for col in test_data.columns if col.startswith('daily_')])
test_features_append = test_features[['subject_id', 'date']]

# One prediction row is expected per test row; a mismatch would otherwise be
# padded with NaN by concat and written to the submission unnoticed.
if len(test_features_append) != len(final_test_predictions_df):
    raise ValueError(
        f"Row count mismatch: {len(test_features_append)} test rows vs "
        f"{len(final_test_predictions_df)} prediction rows"
    )

# concat(axis=1) pairs rows by index label. The predictions frame is built from
# a bare array and so has a default 0..n-1 index; pair the rows by position so
# a differently indexed test frame cannot misalign identifiers and predictions.
submit = pd.concat(
    [
        test_features_append.reset_index(drop=True),
        final_test_predictions_df.reset_index(drop=True),
    ],
    axis=1,
)
submit.to_csv("submit_0625.csv", index=False)