In [2]:
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt 
import numpy as np 
import warnings
import pickle as pkl 
from sklearn.utils.class_weight import compute_class_weight
import math
import csv
from sklearn.model_selection import cross_val_predict, StratifiedKFold, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_score, accuracy_score
warnings.filterwarnings("ignore")

# demographics and reference data 
Biobank demographics data and the reference-sample metadata used to annotate the clusters.

In [None]:
# Pipe-delimited biobank demographics table; low_memory=False makes pandas
# read the file in a single pass instead of inferring dtypes chunk by chunk.
demo = pd.read_csv(
    "../../pheno/Demographics.txt",
    sep="|",
    low_memory=False,
)

In [None]:
# Tab-delimited reference metadata (joined to the clusters on SGDP_ID further down).
ref = pd.read_csv(
    "../reference_data.txt",
    sep="\t",
)

# clusters 
Final Louvain clusters after merging.

In [None]:
# Load the merged Louvain assignments and give the three columns stable names:
# sample id, the pre-merge cluster id, and the final cluster name.
clusters = (
    pd.read_csv("fst_merged/louvain_original_001.csv")
    .set_axis(["sid", "old_cluster", "name"], axis=1)
)

# ancestry 
Country-of-birth information for participants, their parents, and their grandparents.

In [None]:
def set_ancestor_countries(df, columns, new_column, default_column, threshold=2): 
    """
    Fill ``new_column`` with the consensus value of a set of ancestor columns.

    For each row, the most frequent non-missing value across ``columns`` is
    written to ``new_column`` when it occurs at least ``threshold`` times.
    Otherwise the row keeps the value of ``default_column``. When several
    values tie, the first one returned by ``DataFrame.mode`` is used.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to annotate. It is modified in place.
    columns : list of str
        Ancestor columns to compare (e.g. the birthplace of each parent).
    new_column : str
        Name of the column to create or overwrite.
    default_column : str
        Column providing the fallback value when there is no consensus.
    threshold : int, default 2
        Minimum number of ancestors that must share the most common value.

    Returns
    -------
    None
    """
    # Snapshot the ancestor columns before writing, in case new_column is one of them.
    ancestors = df[columns]

    # Every row starts from the fallback; rows with a consensus are overwritten below.
    df[new_column] = df[default_column]

    # With no ancestor information at all there is no mode to compute.
    if not ancestors.notna().any().any():
        return

    mode_values = ancestors.mode(axis=1).iloc[:, 0]

    # How often the row's mode appears in that row. This equals the maximum of
    # a per-row value count, without a slow row-wise apply or the deprecated
    # top-level pd.value_counts.
    mode_counts = ancestors.eq(mode_values, axis=0).sum(axis=1)

    has_consensus = (mode_values.notna() & (mode_counts >= threshold)).to_numpy()

    # Positional assignment, so it does not depend on index alignment.
    df.loc[has_consensus, new_column] = mode_values[has_consensus].to_numpy()
    

In [None]:
# Tab-delimited ancestry table (joined to the clusters on subject_id further down).
ancestry = pd.read_csv(
    "../../pheno/ancestry.txt",
    sep="\t",
)

# merged datasets 

In [None]:
# Attach the cluster assignment to the ancestry and demographics tables.
# how="right" keeps every row of `clusters`, including subjects with no match.
# NOTE(review): `clusters` is given the columns ["sid", "old_cluster", "name"]
# when it is loaded, so selecting "SUBJECT_ID" below raises KeyError on a fresh
# kernel unless that column is added somewhere not shown here — confirm which
# key links clusters to subjects before relying on this cell.
clusters_ancestry = ancestry.merge(clusters[["SUBJECT_ID", "name", "sid"]], 
                                  left_on="subject_id", right_on="SUBJECT_ID", 
                                  how="right")

clusters_demo = demo.merge(clusters[["SUBJECT_ID", "name", "sid"]], 
                                  left_on="subject_id", right_on="SUBJECT_ID", 
                                  how="right")

# Annotate cluster members with reference metadata matched on sid == SGDP_ID;
# how="left" keeps every cluster row even without a reference entry.
clusters_ref = clusters.merge(ref, left_on="sid", right_on="SGDP_ID", how="left")

In [None]:
# clean cluster information 

# Non-informative religion answers are treated as missing values.
uninformative_religions = [
    "UNAFFILIATED/NONE",
    "UNKNOWN/UNABLE TO OBTAIN",
    "PATIENT DECLINED",
]
clusters_demo["religion"] = clusters_demo["religion"].replace(
    dict.fromkeys(uninformative_religions, np.nan)
)

# Blank out placeholder values anywhere in the ancestry table.
# NOTE(review): the first key is " USA" with a leading space, so only cells
# spelled exactly that way are replaced — confirm this matches the raw data.
clusters_ancestry = clusters_ancestry.replace(
    {" USA": np.nan, "Unknown or Not reported": np.nan}
)

In [None]:
# parental origin 
# Both parents must share a birthplace (threshold=2); otherwise the
# participant's own country of birth is used.
birth_places = ["MOTHER_BIRTHPLACE", "FATHER_BIRTHPLACE"]
set_ancestor_countries(
    df=clusters_ancestry,
    columns=birth_places,
    new_column="parental_origin",
    default_column="COUNTRY_OF_BIRTH",
    threshold=2,
)

In [None]:
# grandparental origin 


# At least three of the four grandparents must share a birthplace
# (threshold=3); otherwise fall back to the parental origin.
birth_places = [
    "PAT_GRANDFATHER_BIRTHPLACE",
    "PAT_GRANDMOTHER_BIRTHPLACE",
    "MAT_GRANDFATHER_BIRTHPLACE",
    "MAT_GRANDMOTHER_BIRTHPLACE",
]

set_ancestor_countries(
    df=clusters_ancestry,
    columns=birth_places,
    new_column="grandparental_origin",
    default_column="parental_origin",
    threshold=3,
)

# PPV 

Calculate the positive predictive value (PPV) of each demographic characteristic per cluster.

In [None]:
def calculate_ppv(estimated_categories):
    """
    Summarise, per cluster, the labels with the highest positive predictive value.

    ``estimated_categories`` must provide the columns "cluster_name",
    "true_label" and "predicted_label". For each cluster the precision of every
    label is computed, labels with a precision of zero are discarded, and up to
    three (label, ppv) pairs with the highest precision are kept.

    Returns a two-column DataFrame: the cluster and its list of (label, ppv) pairs.
    """
    rows = []

    for cluster_name, members in estimated_categories.groupby("cluster_name"):
        observed = members["true_label"]
        predicted = members["predicted_label"]

        # average=None yields one precision per label, in sorted label order,
        # which is the order reproduced by `labels` below.
        per_label_ppv = precision_score(observed, predicted, average=None)
        labels = sorted(set(observed) | set(predicted))

        scores = pd.Series(per_label_ppv, index=labels)
        best = scores[scores > 0].nlargest(3)

        rows.append([cluster_name, list(zip(best.index, best.values))])

    return pd.DataFrame(rows)

In [3]:
def perform_ppv_prediction(df, variable_column, cluster_column, K=5): 
    """
    Predict a demographic attribute from cluster membership and report its PPV per cluster.

    Rows where ``variable_column`` is missing are dropped. Cluster membership
    is one-hot encoded and used as the only feature of a multinomial logistic
    regression. Labels are predicted out-of-fold with stratified K-fold
    cross-validation, and the predictions are summarised by ``calculate_ppv``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding both ``variable_column`` and ``cluster_column``.
    variable_column : str
        Column with the attribute to predict (e.g. "religion").
    cluster_column : str
        Column with the cluster assignment used as predictor.
    K : int, default 5
        Number of cross-validation folds.

    Returns
    -------
    pandas.DataFrame
        Columns "cluster_name" and f"{variable_column}_ppv", the latter holding
        the (label, ppv) pairs produced by ``calculate_ppv``.
    """
    labelled = df.dropna(subset=[variable_column])

    X = pd.get_dummies(labelled[cluster_column], dtype=float)

    # Factorize once: `codes` are the integer targets, `uniques` maps them back to labels.
    codes, uniques = pd.factorize(labelled[variable_column])
    label_values = uniques.to_numpy()

    # Multinomial logistic regression for the demographic category.
    # NOTE(review): `multi_class` is deprecated in recent scikit-learn
    # releases — confirm against the pinned version before upgrading.
    logreg = LogisticRegression(multi_class="multinomial")

    # Out-of-fold predictions: each row is predicted by a model that did not see it.
    predicted = cross_val_predict(logreg, X, codes, cv=StratifiedKFold(n_splits=K), method="predict")

    estimated_categories = pd.DataFrame({
        "true": codes,
        "predicted": predicted,
        # Decode by direct indexing instead of a dict-based replace.
        "true_label": label_values[codes],
        "predicted_label": label_values[predicted],
        "cluster_name": labelled[cluster_column].values,
    })

    ppv_df = calculate_ppv(estimated_categories)  # PPV of the predicted labels per cluster
    ppv_df.columns = ["cluster_name", f"{variable_column}_ppv"]

    return ppv_df
    

## country of origin ppv

In [None]:
# Same prediction for the participant's own, parental and grandparental origin.
birth_ppv, parent_birth_ppv, grandparent_birth_ppv = (
    perform_ppv_prediction(clusters_ancestry, origin_column, "name")
    for origin_column in ("COUNTRY_OF_BIRTH", "parental_origin", "grandparental_origin")
)

## heritage ppv 

In [None]:
# PPV of the heritage description given the cluster name.
heritage_ppv = perform_ppv_prediction(
    df=clusters_ancestry,
    variable_column="DESCRIPT_OF_HERITAGE",
    cluster_column="name",
)

## reference data ppv 

In [None]:
# PPV of the reference population and of the reference region given the cluster name.
ref_ppv, region_ref_ppv = (
    perform_ppv_prediction(clusters_ref, reference_column, "name")
    for reference_column in ("Population_ID", "Region2")
)

## religion ppv

In [5]:
# PPV of religion given the cluster name.
religion_ppv = perform_ppv_prediction(
    df=clusters_demo,
    variable_column="religion",
    cluster_column="name",
)

# combine ppv data

In [None]:
# Combine all per-attribute PPV tables into one wide table keyed by cluster.
# Outer joins keep clusters that are missing from some of the tables.
df_list = [
    ref_ppv,
    region_ref_ppv,
    religion_ppv,
    heritage_ppv,
    birth_ppv,
    parent_birth_ppv,
    grandparent_birth_ppv,
]

ppv_df = df_list[0].merge(df_list[1], on="cluster_name", how="outer")

for ppv_table in df_list[2:]:
    ppv_df = ppv_df.merge(ppv_table, on="cluster_name", how="outer")

In [None]:
# Preview the first rows of the combined PPV table.
ppv_df.head(n=5)