In [2]:
import pandas as pd
import numpy as np


def clean_tabular(dataframe: pd.DataFrame) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    return (
        dataframe.drop(columns=["LABEL"]).values,
        np.array(dataframe["LABEL"].values),
        np.array(dataframe["patient_id"].values),
    )

In [6]:
from src.dataset import load_train_csv, load_test_csv

train_df, val_df = load_train_csv(fold_id=3, fold_numbers=4, add_file_path=False)
X_train, y_train, id_train = clean_tabular(train_df)
X_val, y_val, id_val = clean_tabular(val_df)

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score

scores = []
test_pred = []

total_folds = 4
X_test, y_test, id_test = clean_tabular(load_test_csv(add_file_path=False))

for fold in range(total_folds):
    train_df, val_df = load_train_csv(
        fold_id=fold, fold_numbers=total_folds, add_file_path=False
    )
    X_train, y_train, id_train = clean_tabular(train_df)
    X_val, y_val, id_val = clean_tabular(val_df)
    clf = LogisticRegression(random_state=0).fit(X_train, y_train)
    y_pred = clf.predict(X_val)
    scores.append(balanced_accuracy_score(y_val, y_pred))
    test_pred.append(clf.predict(X_test))

print(np.mean(scores))

0.8408527535682708


In [9]:
from sklearn.ensemble import RandomForestClassifier

scores = []
test_pred = []

X_test, y_test, id_test = clean_tabular(load_test_csv(add_file_path=False))

for fold in range(5):
    train_df, val_df = load_train_csv(fold_id=fold, add_file_path=False)
    X_train, y_train, id_train = clean_tabular(train_df)
    X_val, y_val, id_val = clean_tabular(val_df)
    clf = RandomForestClassifier(random_state=0).fit(X_train, y_train)
    y_pred = clf.predict(X_val)
    scores.append(balanced_accuracy_score(y_val, y_pred))
    test_pred.append(clf.predict(X_test))

print(np.mean(scores))

0.7881422924901187


In [14]:
prediction = np.array(test_pred)
prediction = np.mean(prediction, axis=0)
prediction = (prediction > 0.5).astype(int)

patient_list = [f"P{p}" for p in id_test]

submission = pd.DataFrame({"Id": patient_list, "Predicted": prediction})

submission.to_csv("submission.csv", index=False)