In [None]:
import pandas as pd
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.inspection import permutation_importance
from collections import defaultdict


In [None]:
def _read_pickle(path):
    """Unpickle and return the object stored in the file at ``path``."""
    # NOTE: pickle.load can execute arbitrary code; only point this at trusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


train_set = _read_pickle('train_with_labels.pickle')
val_set = _read_pickle('validation_with_labels.pickle')

In [None]:
def one_hot_encode(df, exclude_columns=None):
    """One-hot encode the object-dtype columns of ``df``.

    Args:
        df: Input DataFrame.
        exclude_columns: Column names that must not be encoded even when
            they have object dtype. Defaults to
            ``['date', 'userId', 'timestamp']``.

    Returns:
        A DataFrame in which each encoded column is replaced by its
        ``pd.get_dummies`` indicator columns and every bool column
        (the new indicators as well as pre-existing bool columns) is
        cast to ``int``.
    """
    skipped = ['date', 'userId', 'timestamp'] if exclude_columns is None else exclude_columns

    to_encode = [
        name
        for name in df.select_dtypes(include=['object']).columns
        if name not in skipped
    ]
    dummies = pd.get_dummies(df, columns=to_encode)

    # get_dummies may emit bool indicators; cast every bool column to int in one pass.
    bool_names = dummies.select_dtypes(include=['bool']).columns
    return dummies.astype({name: int for name in bool_names})


# Raw label column -> name used by the modelling cells; shared by both splits
# so the two frames cannot drift apart.
label_renames = {label: f'daily_{label}' for label in ('Q1', 'Q2', 'Q3', 'S1', 'S2', 'S3', 'S4')}

# NOTE(review): the training frame drops 'userId' while the validation frame
# drops 'subject_id' — confirm the two pickles really use different id columns.
train_id_columns = ['userId', 'date', 'timestamp']
val_id_columns = ['subject_id', 'date', 'timestamp']

# NOTE(review): the two splits are encoded independently, so their dummy
# columns can differ when a category value is absent from one of them.
train_encoded = (
    one_hot_encode(train_set)
    .drop(columns=train_id_columns)
    .rename(columns=label_renames)
)

# 'subject_id' is not in one_hot_encode's default exclusions. Without listing it
# here, a string-typed 'subject_id' would be expanded into dummy columns and the
# drop below would raise KeyError.
val_encoded = (
    one_hot_encode(val_set, exclude_columns=['date', 'userId', 'timestamp', 'subject_id'])
    .drop(columns=val_id_columns)
    .rename(columns=label_renames)
)

In [None]:
# Target columns: one classifier is trained per entry.
metrics = [f'daily_{code}' for code in ('Q1', 'Q2', 'Q3', 'S1', 'S2', 'S3', 'S4')]

# Model inputs whose column names are used verbatim.
_plain_features = [
    'heart_rate', 'magnitude_mAcc', 'bvp_positive', 'bvp_negative', 'temp',
    'magnitude_e4Acc', 'latitude', 'longitude', 'accuracy', 'magnitude_mMag',
    'eda', 'magnitude_mGyr',
    'emotionPositive', 'emotionTension', 'sleep', 'sleepProblem', 'dream',
    'amCondition', 'amEmotion', 'pmEmotion', 'pmStress', 'pmFatigue',
    'wakeupduration', 'lightsleepduration', 'deepsleepduration', 'wakeupcount',
    'durationtosleep', 'remsleepduration', 'durationtowakeup', 'hr_average',
    'rr_average', 'breathing_disturbances_intensity', 'snoring',
    'snoringepisodecount', 'sleep_score',
]

# Model inputs named '<prefix>_<value>' (presumably the dummy columns produced
# by one_hot_encode — confirm against the encoded frame), listed per prefix.
_prefixed_features = [
    ('action', [
        'care_housemem', 'community_interaction', 'entertainment', 'hobby',
        'household', 'meal', 'outdoor_act', 'personal_care', 'recreation_etc',
        'recreation_media', 'shop', 'sleep', 'socialising', 'study', 'travel',
        'work',
    ]),
    ('condition', ['ALONE', 'WITH_MANY', 'WITH_ONE']),
    ('place', ['home', 'other_indoor', 'outdoor', 'restaurant', 'workplace']),
    ('activity', ['IN_VEHICLE', 'ON_FOOT', 'STILL', 'UNKNOWN']),
    ('caffeine', ['caffeinated drink', 'coffee', 'coke', 'tea', 'unknown']),
    ('alcohol', [
        'beer', 'beer&rice wine', 'not specified', 'soju', 'soju&beer',
        'unknown', 'wine',
    ]),
]

# Full, ordered list of model input columns.
features = _plain_features + [
    f'{prefix}_{value}'
    for prefix, values in _prefixed_features
    for value in values
]


# One seed for the split, the forest and the permutation shuffles, so that a
# fresh "Restart & Run All" reproduces the same ranking.
SEED = 42
# How many of the highest-ranked features are reported per metric.
TOP_K = 3

# metric name -> list of "<feature> (imp: ..., std: ...)" labels, best first
top_features_dict = defaultdict(list)

# The feature matrix is identical for every metric, so select it once.
X = train_encoded[features]

for metric in metrics:
    y = train_encoded[metric]

    # Hold out 20% of the rows; importance is measured on this held-out part.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

    # Seed the forest as well: an unseeded forest makes the reported top
    # features change from run to run.
    rf = RandomForestClassifier(random_state=SEED)
    rf.fit(X_train, y_train)

    # Permutation importance of each feature on the held-out rows.
    result = permutation_importance(rf, X_test, y_test, n_repeats=10, random_state=SEED, n_jobs=-1)

    # Rank features from most to least important.
    perm_importance_df = pd.DataFrame({
        'feature': features,
        'importance': result.importances_mean,
        'std': result.importances_std
    }).sort_values(by='importance', ascending=False)

    # Keep the TOP_K rows as display labels carrying the mean importance and its std.
    top_rows = perm_importance_df.head(TOP_K)
    top_features = [
        f"{name} (imp: {importance:.4f}, std: {std:.4f})"
        for name, importance, std in zip(top_rows['feature'], top_rows['importance'], top_rows['std'])
    ]
    top_features_dict[metric].extend(top_features)


# One column per metric, one row per rank.
top_features_df = pd.DataFrame(top_features_dict)


def _feature_name(label):
    """Return the feature name from a '<name> (imp: ..., std: ...)' cell label."""
    # Feature names may themselves contain spaces (e.g. 'alcohol_not specified'),
    # so cut at the ' (imp:' marker rather than at the first space.
    return label.rsplit(' (imp:', 1)[0]


# Identify the features that appear exactly once across the whole table
all_features = [_feature_name(label) for label in top_features_df.values.flatten()]

occurrences = {}
for name in all_features:
    occurrences[name] = occurrences.get(name, 0) + 1
unique_features = {name for name, seen in occurrences.items() if seen == 1}


# Function to highlight unique features with CSS
def highlight_unique(s):
    """Return one CSS string per cell of ``s``.

    Cells whose feature name is in ``unique_features`` get a light-blue
    background; all other cells get a white background.
    """
    return [
        'background-color: lightblue; color: black'
        if _feature_name(val) in unique_features
        else 'background-color: white; color: black'
        for val in s
    ]


# Apply the highlighting function column by column
styled_top_features_df = top_features_df.style.apply(highlight_unique, axis=0)


# Display the styled DataFrame
styled_top_features_df
