In [1]:
import pandas as pd
import numpy as np
import sklearn

In [2]:
train_file = "./kaggle/brist1d/train.csv"
test_file = "./kaggle/brist1d/test.csv"

df = pd.read_csv(train_file)
df_test = pd.read_csv(test_file)

for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = df[col].astype('category')

df.dtypes

  df = pd.read_csv(train_file)


id               category
p_num            category
time             category
bg-5:55           float64
bg-5:50           float64
                   ...   
activity-0:15    category
activity-0:10    category
activity-0:05    category
activity-0:00    category
bg+1:00           float64
Length: 508, dtype: object

## Data Cleaning - handling missing values

In [3]:
features = df.columns
bg_cols = [col for col in features if "bg" in col]
insulin_cols = [col for col in features if "insulin" in col]
carbs_cols = [col for col in features if "carbs" in col]
hr_cols = [col for col in features if "hr" in col]
steps_cols = [col for col in features if "steps" in col]
cals_cols = [col for col in features if "cals" in col]
activity_cols = [col for col in features if "activity" in col]

Fill missing values for features based on:
- mean value: blood glucose, carbs, cals
- zeroes: insulin
- forward filled (i.e. propagates the last observation forward): heart rate, steps

In [4]:
for feature in [*bg_cols, *carbs_cols, *cals_cols]:
    df[feature] = df[feature].fillna(df[feature].mean())

for feature in insulin_cols:
    df[feature] = df[feature].fillna(0)

for feature in [*hr_cols, *steps_cols]:
    df[feature] = df[feature].fillna(method="ffill")

df.isna().sum()

id                    0
p_num                 0
time                  0
bg-5:55               0
bg-5:50               0
                  ...  
activity-0:15    174293
activity-0:10    174287
activity-0:05    174271
activity-0:00    174287
bg+1:00               0
Length: 508, dtype: int64

### Fit a simple XGBoost regressor 

In [5]:
#df = df.drop(["p_num", "id", "time", *activity_cols], axis=1)
df = df.drop(["p_num", "id", "time"], axis=1)
X = df.drop("bg+1:00", axis=1)
y = df["bg+1:00"]
df.columns

Index(['bg-5:55', 'bg-5:50', 'bg-5:45', 'bg-5:40', 'bg-5:35', 'bg-5:30',
       'bg-5:25', 'bg-5:20', 'bg-5:15', 'bg-5:10',
       ...
       'activity-0:40', 'activity-0:35', 'activity-0:30', 'activity-0:25',
       'activity-0:20', 'activity-0:15', 'activity-0:10', 'activity-0:05',
       'activity-0:00', 'bg+1:00'],
      dtype='object', length=505)

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert activity columns to category type
for col in activity_cols:
    X_train[col] = X_train[col].astype('category')
    X_test[col] = X_test[col].astype('category')

In [8]:

# 3. Scale the features
scaler = StandardScaler()
#X_train_scaled = scaler.fit_transform(X_train)
#X_test_scaled = scaler.transform(X_test)
X_train_scaled = X_train
X_test_scaled = X_test

feature_types = ["c" if feat in activity_cols else "q" for feat in X_train.columns]
# 4. Initialize and train the XGBoost model
xgb_model = XGBRegressor(
    feature_types=feature_types,
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    min_child_weight=1,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='rmse',
    random_state=42,
    enable_categorical=True
)

# Train the model
xgb_model.fit(
    X_train_scaled, 
    y_train,
    eval_set=[(X_train_scaled, y_train), (X_test_scaled, y_test)],
    verbose=False
)

# 5. Make predictions
y_pred = xgb_model.predict(X_test_scaled)

# 6. Evaluate the model
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"Model Performance Metrics:")
print(f"RMSE: {rmse:.4f}")
print(f"R² Score: {r2:.4f}")

Model Performance Metrics:
RMSE: 1.9605
R² Score: 0.5750


In [10]:
# X_testing = df_test.drop(["p_num", "id", "time", *activity_cols], axis=1)
X_testing = df_test.drop(["p_num", "id", "time"], axis=1)

for col in activity_cols:
    X_testing[col] = X_testing[col].astype('category')
    
y_testing_pred = xgb_model.predict(X_testing)
y_testing_pred

array([12.411012,  8.457562, 12.448939, ..., 10.205155,  9.483859,
       10.063545], dtype=float32)

### Writing the predictions to submission file
Submission file has format: p_number, prediction_value

In [None]:
df_test

Unnamed: 0,id,p_num,time,bg-5:55,bg-5:50,bg-5:45,bg-5:40,bg-5:35,bg-5:30,bg-5:25,...,activity-0:45,activity-0:40,activity-0:35,activity-0:30,activity-0:25,activity-0:20,activity-0:15,activity-0:10,activity-0:05,activity-0:00
0,p01_8459,p01,06:45:00,,9.2,,,10.2,,,...,,,,,,,,,,
1,p01_8460,p01,11:25:00,,,9.9,,,9.4,,...,,,,,,,,Walk,Walk,Walk
2,p01_8461,p01,14:45:00,,5.5,,,5.5,,,...,,,,,,,,,,
3,p01_8462,p01,04:30:00,,3.4,,,3.9,,,...,,,,,,,,,,
4,p01_8463,p01,04:20:00,,,8.3,,,10.0,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3639,p24_256,p24,06:40:00,7.3,7.3,7.1,6.7,6.8,6.8,6.9,...,,,,,,,,,,
3640,p24_257,p24,12:30:00,6.0,6.2,6.2,6.2,5.8,5.5,5.7,...,,,,,,,,,,
3641,p24_258,p24,03:45:00,12.4,12.5,12.7,13.1,13.2,13.3,13.0,...,,,,,,,,,,
3642,p24_259,p24,06:10:00,8.3,8.3,8.2,8.2,7.8,7.4,6.9,...,,,,,,,,,,


In [11]:
print(df_test["id"].shape, y_testing_pred.shape, type(y_testing_pred))
#out = np.zeros((2, df_test["id"].shape[0]))
out = np.column_stack([df_test["id"], y_testing_pred])
out = pd.DataFrame(out, columns=["id", "bg+1:00"])

out.to_csv("submission.csv", index=False)


(3644,) (3644,) <class 'numpy.ndarray'>


## Iteration 2.0

In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt


In [None]:
# turn the df into a tensor
X = X.astype(np.float)
print(X.columns)
input = torch.from_numpy(np.array(X))
print(input.shape)


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  X = X.astype(np.float)


ValueError: Cannot cast object dtype to float64

In [None]:
model = torch.nn.RNN(input_size=)