In [None]:
import pandas as pd
import numpy as np
import json

from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

import utils

In [None]:
def calculate_kidscored3(row):
    """
    Assigns a KIDScore D3 value (1-5) from an embryo's morphokinetic timings.

    The rules are evaluated in order and the first one that matches decides
    the score; when none matches, the score is 5.

    Arguments:
            row: a mapping (dict or DataFrame row) providing the keys
                 'tPNf', 't2', 't3', 't5' and 't8'.
    Returns:
            The KIDScore D3 as an int between 1 and 5.
    """
    tPNf, t2, t3, t5, t8 = (row[key] for key in ('tPNf', 't2', 't3', 't5', 't8'))

    # Any comparison involving NaN is False, so missing timings skip the
    # affected rules and the row can fall through to the final score of 5.
    if t3 - tPNf < 11.48:
        return 1
    if t3 >= 42.91:
        return 2

    # Share of the t2 -> t5 interval that lies after t3.
    # t5 == t2 divides by zero here (ZeroDivisionError for plain Python numbers).
    ratio = (t5 - t3) / (t5 - t2)
    if ratio < 0.3408:
        return 3
    if ratio >= 0.5781:
        return 4

    return 4 if t8 > 66 else 5


In [None]:
# Clinical records, restricted to the embryos that also appear in the
# adjacency/bounding-box dataset (matched on 'id').
df = pd.read_csv('data/clinical.csv')
df_adj = pd.read_csv('data/adjacency_and_bbox_dataset.csv')
df = df.loc[df['id'].isin(df_adj['id'].tolist())]

In [None]:
# Total contact count per embryo: the sum of every entry of the JSON-encoded
# adjacency matrix (a nested list of numbers).
# NOTE(review): if the matrix is symmetric each contact is counted twice — confirm.
df_adj['total_contacts'] = df_adj['adjacency'].apply(
    lambda x: sum(sum(y) for y in json.loads(x))
)
# Drop any 'total_contacts' left over from a previous run of this cell so that
# re-running it does not create suffixed 'total_contacts_x'/'total_contacts_y'
# columns (which would break the later column selection).
df = df.drop(columns='total_contacts', errors='ignore').merge(
    df_adj[['total_contacts', 'id']], on='id'
)

In [None]:
# Score every embryo row-by-row with the KIDScore D3 rules defined above.
df['KIDScore'] = df.apply(calculate_kidscored3, axis='columns')

In [None]:
# Keep only embryos with a recorded outcome. A single .loc selection followed
# by .copy() yields an independent frame, so adding 'label' below cannot
# trigger SettingWithCopyWarning.
df_train = df.loc[
    df['simplified_outcome'].notna(),
    ['KIDScore', 'total_contacts', 'simplified_outcome'],
].copy()
# Binary target: 1 for 'live birth', 0 for any other recorded outcome.
df_train['label'] = (df_train['simplified_outcome'] == 'live birth').astype('int64')
df_train

In [None]:
def compute_metrics(targets, preds):
    """
    Computes binary classification metrics for one set of predictions.

    Precision, sensitivity and F1 are computed with class 0 as the positive
    class; 'spec' is therefore the recall of class 1.

    Arguments:
            targets: the true labels.
            preds: the predicted labels.
    Returns:
            A dictionary with the keys 'acc', 'prec', 'sens', 'spec', 'f1'
            and 'auc'.
    """
    class_zero_positive = {'pos_label': 0}

    scores = {}
    scores['acc'] = metrics.accuracy_score(targets, preds)
    scores['prec'] = metrics.precision_score(targets, preds, **class_zero_positive)
    scores['sens'] = metrics.recall_score(targets, preds, **class_zero_positive)
    scores['spec'] = metrics.recall_score(targets, preds, pos_label=1)
    scores['f1'] = metrics.f1_score(targets, preds, **class_zero_positive)
    # NOTE(review): the AUC is computed from the same hard predictions as the
    # other metrics, not from scores/probabilities — confirm this is intended.
    scores['auc'] = metrics.roc_auc_score(targets, preds)
    return scores

In [None]:
def train_cv(parameter, n_repeats=10, n_splits=5, seed=0):
    """
    Evaluates a single-feature logistic regression with repeated stratified
    k-fold cross-validation on df_train.

    Arguments:
            parameter: name of the df_train column used as the only predictor.
            n_repeats: number of times the k-fold split is repeated.
            n_splits: number of folds in each repetition.
            seed: base seed for shuffling; repetition r uses seed + r so every
                  repetition gets a different but reproducible split. Pass
                  None for unseeded (non-reproducible) shuffling.
    Returns:
            The result of utils.mean_and_std_over_dict applied to the list of
            per-fold metric dictionaries produced by compute_metrics.
    """
    # Build the (n_samples, 1) feature matrix and label vector once instead of
    # re-slicing the DataFrame in every fold.
    features = df_train[parameter].to_numpy().reshape(-1, 1)
    labels = df_train['label'].to_numpy()

    val_results = []
    for repeat in range(n_repeats):
        random_state = None if seed is None else seed + repeat
        kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
        for train_idxs, val_idxs in kfold.split(features, labels):
            model = LogisticRegression().fit(features[train_idxs], labels[train_idxs])
            preds = model.predict(features[val_idxs])
            val_results.append(compute_metrics(labels[val_idxs], preds))

    return utils.mean_and_std_over_dict(val_results)

In [None]:
# Cross-validated metrics using the KIDScore as the only predictor.
train_cv(parameter='KIDScore')

In [None]:
# Cross-validated metrics using the total contact count as the only predictor.
train_cv(parameter='total_contacts')