In [1]:
# %matplotlib widget
import pandas as pd
import numpy as np
import tools as t
from scipy.stats import ttest_ind, chisquare
from collections import Counter
import pickle

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, adjusted_rand_score
from sklearn.pipeline import Pipeline

import matplotlib.pyplot as plt
import datetime

import plotly.graph_objects as go

import seaborn as sns

# Setup

## Load k-centers

In [2]:
# Divisors that normalize() applies to the `ct`, `ddxcov` and `linfocitos` columns.
# NOTE(review): presumably the maxima observed in the reference cohort — confirm.
MAX_CT = 41.7
MAX_DDXCOV = 22
MAX_LINFOCITOS = 4.2

# Maps the column names stored in k_centers.csv to the short names used in this notebook.
k_centers_rename = {
    'Ct': 'ct',
    'DeltaDDXCOVSimptomes': 'ddxcov',
    'Linfocitos_ing': 'linfocitos'
}
# Read the cluster centers and rename their columns.
# NOTE(review): the rendered output shows an extra 'Unnamed: 0' column (most likely the
# CSV's saved index); consider `index_col=0` or dropping it before the frame is used as
# a set of feature vectors.
k_centers = pd.read_csv('k_centers.csv').rename(columns=k_centers_rename)
k_centers

Unnamed: 0,ct,ddxcov,linfocitos
0,0.682468,0.635464,0.227487
1,0.626177,0.226799,0.454241
2,0.716184,0.140868,0.19112
3,0.746602,0.375494,0.19962
4,0.548729,0.330954,0.184573
5,0.454371,0.108235,0.171471


## Shared functions

In [3]:
def perc(value, decimals=1):
    """Turn a fraction into a percentage rounded to `decimals` places.

    Example: ``perc(0.25)`` returns ``25.0``.
    """
    percentage = 100 * value
    return round(percentage, decimals)


def compute_perc(df, stat_columns=None, remde_delay=None, recycle_remde_delay=True, remde_dead_threshold=None):
    """Build a per-cluster table of 60-day deaths split by Remdesivir treatment.

    Columns read from `df`: `cluster` (1-based label), `remdesivir`, `dead_60d`,
    plus `remde_delay` when `remde_delay` is given and `x_dead_after_remde_days`
    when `remde_dead_threshold` is given.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per patient.
    stat_columns : list of str, optional
        Extra columns for which "(min/mean/max) / (min/mean/max)" of the
        treated / untreated groups is appended to each row.
    remde_delay : number, optional
        When given, only patients with `remde_delay <= remde_delay` count as
        treated. A value of 0 is a valid limit.
    recycle_remde_delay : bool
        If True, treated patients beyond `remde_delay` are counted as
        untreated; if False they are left out (and removed from the overall
        total used for the '%' column).
    remde_dead_threshold : number, optional
        Treated patients with `x_dead_after_remde_days` below this value are
        moved to the untreated group (they are printed and displayed first).

    Returns
    -------
    pandas.DataFrame
        Indexed by `cluster`, with the columns 'n', '%', 'n Remde',
        'n No Remde', '% Death Remde', '% Death No Remde', '% Death',
        'pvalue', 'stat' and one column per entry of `stat_columns`.
        'pvalue'/'stat' come from `ttest_ind` on the two `dead_60d` groups and
        are None unless both groups have more than one patient.
    """
    # A fresh list per call instead of a shared mutable default.
    stat_columns = [] if stat_columns is None else list(stat_columns)
    # `is not None` so that a delay limit of 0 is honoured rather than ignored.
    has_delay = remde_delay is not None

    # An empty frame simply yields an empty table instead of a ValueError from max().
    max_cluster = max(df.cluster) if len(df) else 0

    n_total = df.shape[0]
    # Only shrink the total when there is a delay limit to compare against;
    # comparing the column with None would fail.
    if not recycle_remde_delay and has_delay:
        n_total = df[(df.remdesivir != True) | (df.remde_delay <= remde_delay)].shape[0]

    rows = []
    for cluster in range(1, max_cluster + 1):
        patients = df[df.cluster == cluster]

        remde = patients[patients.remdesivir == True]
        no_remde = patients[patients.remdesivir != True]
        if has_delay:
            remde = patients[(patients.remdesivir == True) & (patients.remde_delay <= remde_delay)]
            if recycle_remde_delay:
                no_remde = patients[(patients.remdesivir != True) | (patients.remde_delay > remde_delay)]

        if remde_dead_threshold is not None:
            died_early = remde.x_dead_after_remde_days < remde_dead_threshold
            remde_dead_patients = remde[died_early]
            if not remde_dead_patients.empty:
                print("The following patients will move from remde to no remde:")
                # `display` is the notebook (IPython) built-in.
                display(remde_dead_patients)
                # Keep the complement of the mask: re-filtering with `>=` would also
                # drop rows whose x_dead_after_remde_days is NaN from both groups.
                remde = remde[~died_early]
                # pd.concat expects a list of frames as its first argument.
                no_remde = pd.concat([no_remde, remde_dead_patients])

        total_remde = remde.shape[0]
        total_no_remde = no_remde.shape[0]
        n_cluster = total_remde + total_no_remde

        deaths_remde = sum(remde.dead_60d)
        deaths_no_remde = sum(no_remde.dead_60d)

        stat, pvalue = None, None
        if total_remde > 1 and total_no_remde > 1:
            stat, pvalue = ttest_ind(list(remde.dead_60d), list(no_remde.dead_60d))

        row = [
            cluster,
            n_cluster,
            perc(n_cluster / n_total) if n_total else None,  # share of all patients
            total_remde,
            total_no_remde,
            _format_deaths(deaths_remde, total_remde),
            _format_deaths(deaths_no_remde, total_no_remde),
            _format_deaths(deaths_remde + deaths_no_remde, n_cluster),
            pvalue,
            stat,
        ]
        for stat_column in stat_columns:
            row.append(f"{_min_mean_max(remde[stat_column])} / {_min_mean_max(no_remde[stat_column])}")

        rows.append(row)

    stats = pd.DataFrame(rows, columns=['cluster', 'n', '%', 'n Remde', 'n No Remde', '% Death Remde', '% Death No Remde', '% Death', 'pvalue', 'stat'] + stat_columns)
    return stats.set_index('cluster')


def _format_deaths(deaths, total):
    """Return "deaths (xx.x%)" for a group, or None when the group is empty."""
    if not total:
        return None
    return f"{deaths} ({perc(deaths / total)}%)"


def _min_mean_max(series):
    """Return "(min/mean/max)" of a column, each rounded to one decimal."""
    return f"({round(series.min(), 1)}/{round(series.mean(), 1)}/{round(series.max(), 1)})"


def normalize(df_orig):
    """Return a copy of `df_orig` with `ct`, `ddxcov` and `linfocitos` rescaled.

    Each of the three columns is divided by its module-level constant
    (MAX_CT, MAX_DDXCOV, MAX_LINFOCITOS); the input frame is left untouched.
    """
    scaled = df_orig.copy()
    scaled.ct = df_orig.ct / MAX_CT
    scaled.ddxcov = df_orig.ddxcov / MAX_DDXCOV
    scaled.linfocitos = df_orig.linfocitos / MAX_LINFOCITOS
    return scaled

# Feature columns that define a patient's position in cluster space.
_CLUSTER_FEATURES = ['ct', 'ddxcov', 'linfocitos']


def _nearest_center_labels(points, centers):
    """Return, for each row of `points`, the 1-based index of the closest row of `centers`."""
    points = np.asarray(points, dtype=float)
    centers = np.asarray(centers, dtype=float)
    if centers.ndim != 2 or centers.shape[0] == 0:
        raise ValueError("k_centers must be a non-empty 2-D collection of centers")
    if points.shape[1] != centers.shape[1]:
        raise ValueError(
            f"k_centers has {centers.shape[1]} features, expected {points.shape[1]}"
        )
    # Broadcast (n, 1, d) - (1, k, d) -> (n, k, d), then Euclidean norm over d.
    distances = np.linalg.norm(points[:, None, :] - centers[None, :, :], axis=2)
    # A NaN distance can never be the minimum; a row whose distances are all NaN
    # (missing features) falls back to the first center, i.e. cluster 1.
    distances = np.where(np.isnan(distances), np.inf, distances)
    return (np.argmin(distances, axis=1) + 1).tolist()


def add_cluster_column(df, k_centers, column_name='cluster', add_to=None):
    """Label every row of `df` with its nearest cluster center.

    Distances are Euclidean over the `ct`, `ddxcov` and `linfocitos` columns.
    Labels are 1-based and follow the row order of `k_centers`; ties go to the
    lower label.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the `ct`, `ddxcov` and `linfocitos` columns.
    k_centers : pandas.DataFrame or array-like of shape (k, 3)
        A DataFrame is reduced to the three feature columns by name, so extra
        columns (for example a saved index such as 'Unnamed: 0') are ignored
        instead of turning every distance into NaN. An array is used as is,
        with columns in the order ct, ddxcov, linfocitos.
    column_name : str
        Name of the inserted column.
    add_to : pandas.DataFrame, optional
        Frame that receives the new column as its first column (modified in
        place). Defaults to a copy of `df`.

    Returns
    -------
    pandas.DataFrame
        `add_to` (or the copy of `df`) with the label column inserted at
        position 0.

    Raises
    ------
    KeyError
        If a feature column is missing from `df` or from a DataFrame `k_centers`.
    ValueError
        If `k_centers` is empty or has the wrong number of features, or if
        `column_name` already exists in the target frame.
    """
    if add_to is None:
        add_to = df.copy()

    if isinstance(k_centers, pd.DataFrame):
        centers = k_centers[_CLUSTER_FEATURES]
    else:
        centers = k_centers

    labels = _nearest_center_labels(df[_CLUSTER_FEATURES], centers)
    add_to.insert(0, column_name, labels)
    return add_to


def sci_add_cluster_column(df, k_centers, column_name='cluster', add_to=None):
    """Same as `add_cluster_column`, kept for callers that pass the centers as a
    plain (k, 3) array instead of a DataFrame."""
    return add_cluster_column(df, k_centers, column_name=column_name, add_to=add_to)

def from01toTrueFalse(df, columns):
    """Return a copy of `df` where each listed column is replaced by a boolean
    flag: True where the original value equals 1, False everywhere else."""
    converted = df.copy()
    for name in columns:
        converted[name] = df[name] == 1
    return converted

def show(df, col, cluster=3):
    """Print the mean and the 25/50/75% quantiles of `col` for one cluster's rows."""
    in_cluster = df.cluster == cluster
    selected = df.loc[in_cluster, col]
    print("mean: ", selected.mean())
    quartiles = selected.quantile([0.25, 0.5, 0.75])
    print("\nquartiles: \n", quartiles)

def show2(df1, df2, col, cluster=3):
    """Compare `col` between two frames for one cluster: print the t-test result,
    both means and both sets of 25/50/75% quantiles."""
    first = df1.loc[df1.cluster == cluster, col]
    second = df2.loc[df2.cluster == cluster, col]
    print(ttest_ind(first, second))
    print("mean: ", first.mean(), second.mean())
    levels = [0.25, 0.5, 0.75]
    print("\nquartiles: \n", first.quantile(levels), "\n", second.quantile(levels))
    

# Load data

## Drop patients who died at the start of Remdesivir treatment

# Scratchpad

In [30]:
# NOTE(review): `mt_val` is not defined in any cell visible above; this scratch cell
# relies on a definition elsewhere (or on leftover kernel state) — confirm it survives
# Restart Kernel -> Run All, or remove the cell.
mt_val.shape

(902, 14)