In [None]:
import pandas as pd
import os
import time
import dxpy

In [None]:
# Load the tab-separated sample QC annotation table, keeping only the columns
# that the filters in this notebook read.
sample_qc_annot_file = "/mnt/project/notebooks/wes/sample_qc/data/sample_qc_annot_all.tsv"
sample_qc_annot_df = pd.read_table(
    sample_qc_annot_file,
    usecols=[
        "s",
        "duplicate",
        "f_stat",
        "survey_sex",
        "array_sex",
        "exome_sex",
        "hetz_concordance_array",
        "sex_chromosome_aneuploidy",
        "genetic_kinship_to_other_participants",
        "out_hetz_missing",
        "call_rate",
        "r_ti_tv_residual",
        "r_het_hom_var_residual",
        "r_insertion_deletion_residual",
        "n_singleton_residual",
        "r_snv_indel_residual",
    ],
)

# Sample Quality Control metrics to filter on:

1. Missing array data
2. Duplicates
3. Sex concordance between survey, exome and array
4. Heterozygote concordance between high quality exome variants and array calls
5. Sex chromosome aneuploidy based on array calls
6. Outliers in heterozygosity and missing rates based on array calls
7. Exome variant call rate
8. More than eight standard deviations (SD) from the mean of the ancestry-normalized (residual) values of:
    - Transition/transversion ratio
    - Insertion/deletion allele ratio
    - Heterozygous/homozygous call ratio
    - SNV/indel ratio
    - Number of singletons

# Samples with missing array data

In [None]:
# A sample is treated as lacking array data when its exome/array heterozygote
# concordance value is null.
sample_qc_annot_df["missing_array"] = sample_qc_annot_df["hetz_concordance_array"].isnull()

In [None]:
# Number of samples flagged as missing array data.
int(sample_qc_annot_df["missing_array"].eq(True).sum())

# Check number of duplicates based on high quality autosomal variants

In [None]:
# Number of samples whose `duplicate` annotation equals True.
int(sample_qc_annot_df["duplicate"].eq(True).sum())

# Check discordant sex from survey, exome and array

In [None]:
# Discordant when the survey sex differs from the exome sex or from the array sex.
# A missing value never compares equal, so a sample with any missing sex call is
# flagged as discordant as well.
survey_vs_exome_differs = sample_qc_annot_df["survey_sex"] != sample_qc_annot_df["exome_sex"]
survey_vs_array_differs = sample_qc_annot_df["survey_sex"] != sample_qc_annot_df["array_sex"]
sample_qc_annot_df["sex_discordance"] = survey_vs_exome_differs | survey_vs_array_differs


In [None]:
# Number of samples flagged for sex discordance.
int(sample_qc_annot_df["sex_discordance"].eq(True).sum())

# Heterozygote concordance <80% between high quality exomes and array data

In [None]:
# Flag heterozygote concordance with the array below 0.8; null concordance
# compares False here and is therefore not flagged by this filter.
sample_qc_annot_df["array_discordance"] = sample_qc_annot_df["hetz_concordance_array"].lt(0.8)

In [None]:
# Number of samples flagged for low exome/array heterozygote concordance.
int(sample_qc_annot_df["array_discordance"].eq(True).sum())

# Sex chromosome aneuploidy

In [None]:
# Turn the "Yes"/other annotation into a boolean flag, stored under the same
# column name because later cells reference it by that name.
# The dtype guard keeps this cell idempotent: without it, re-running the cell
# would compare the already-boolean column against "Yes" and silently reset
# every flag to False.
if not pd.api.types.is_bool_dtype(sample_qc_annot_df["sex_chromosome_aneuploidy"].dtype):
    sample_qc_annot_df["sex_chromosome_aneuploidy"] = (
        sample_qc_annot_df["sex_chromosome_aneuploidy"] == "Yes"
    )

In [None]:
# Number of samples flagged for sex chromosome aneuploidy.
int(sample_qc_annot_df["sex_chromosome_aneuploidy"].eq(True).sum())

# Outliers in heterozygosity or missingness

In [None]:
# Turn the "Yes"/other annotation into a boolean flag, stored under the same
# column name because later cells reference it by that name.
# The dtype guard keeps this cell idempotent: without it, re-running the cell
# would compare the already-boolean column against "Yes" and silently reset
# every flag to False.
if not pd.api.types.is_bool_dtype(sample_qc_annot_df["out_hetz_missing"].dtype):
    sample_qc_annot_df["out_hetz_missing"] = (
        sample_qc_annot_df["out_hetz_missing"] == "Yes"
    )

In [None]:
# Number of samples flagged as heterozygosity/missingness outliers.
int(sample_qc_annot_df["out_hetz_missing"].eq(True).sum())

# Less than 90% exome variant call rate

In [None]:
# Flag samples whose call rate is below 0.9; a null call rate compares False.
sample_qc_annot_df["low_call_rate"] = sample_qc_annot_df["call_rate"].lt(0.9)

In [None]:
# Number of samples flagged for low call rate.
int(sample_qc_annot_df["low_call_rate"].eq(True).sum())

# Outliers: more than eight SD from the mean of each residual metric

In [None]:
def get_metric_stats(ser):
    """Return the mean and standard deviation of a metric.

    Parameters
    ----------
    ser : pandas.Series
        Numeric values of one metric.

    Returns
    -------
    tuple
        ``(mean, sd)`` as computed by ``Series.mean()`` and ``Series.std()``.
    """
    return ser.mean(), ser.std()


def get_deviation(ser, n_sd=8):
    """Flag values lying more than ``n_sd`` standard deviations from the mean.

    Parameters
    ----------
    ser : pandas.Series
        Numeric values of one metric.
    n_sd : float, default 8
        Number of standard deviations beyond which a value is an outlier.

    Returns
    -------
    pandas.Series
        Boolean series aligned with ``ser``; True marks an outlier. Null inputs
        compare False and are therefore not flagged.
    """
    mean, sd = get_metric_stats(ser)
    # Measure the distance from the mean itself. Comparing |value| with
    # mean + n_sd * sd is only equivalent when the mean is exactly zero: it misses
    # low-side outliers for a positive mean and flags everything for a negative one.
    return (ser - mean).abs() > n_sd * sd


In [None]:
# Residual metrics screened for outliers; each one gets a boolean
# "<metric>_outlier" column, and the metric name and its outlier count are printed.
metrics = [
    "r_ti_tv_residual",
    "r_het_hom_var_residual",
    "r_insertion_deletion_residual",
    "n_singleton_residual",
    "r_snv_indel_residual",
]
for metric in metrics:
    outlier_flags = get_deviation(sample_qc_annot_df[metric])
    sample_qc_annot_df[f"{metric}_outlier"] = outlier_flags
    print(metric)
    print(int(outlier_flags.sum()))

# Flag samples using the filters

In [None]:
# Boolean flag columns that make up the per-sample filter set, in reporting order.
filter_columns = [
    # flags derived from array / survey annotations
    "missing_array",
    "duplicate",
    "sex_discordance",
    "array_discordance",
    "sex_chromosome_aneuploidy",
    "out_hetz_missing",
    # exome call rate
    "low_call_rate",
    # outlier flags on the residual metrics
    "r_ti_tv_residual_outlier",
    "r_het_hom_var_residual_outlier",
    "r_insertion_deletion_residual_outlier",
    "n_singleton_residual_outlier",
    "r_snv_indel_residual_outlier",
]

In [None]:
# Build a "|"-separated list of the filters each sample fails ("" when none).
# - Names are joined in `filter_columns` order. Joining a set gives an arbitrary
#   order that can change between runs, making the output file non-reproducible.
# - A flag counts only when it equals True, matching the `== True` counts used in
#   this notebook; plain truthiness would treat a missing value (NaN) as flagged.
# - Iterating plain tuples avoids the per-row Series construction of apply(axis=1).
flag_matrix = sample_qc_annot_df[filter_columns].eq(True)
sample_qc_annot_df["filters"] = [
    "|".join(col for col, flagged in zip(filter_columns, row_flags) if flagged)
    for row_flags in flag_matrix.itertuples(index=False, name=None)
]


# Upload flagged samples to the project

In [None]:
# Output table: the "s" column, the combined filter string, then each individual flag.
flagged_samples_df = sample_qc_annot_df[["s", "filters", *filter_columns]]

In [None]:
def upload_file_to_project(filename, proj_dir):
    """Upload a local file into a project folder via dxpy and print a confirmation.

    Parameters
    ----------
    filename : str
        Path of the local file to upload.
    proj_dir : str
        Destination folder; missing parent folders are created (``parents=True``).

    Returns
    -------
    None
    """
    dxpy.upload_local_file(filename=filename, folder=proj_dir, parents=True)
    message = f"*********{filename} uploaded!!*********"
    print(message)


In [None]:
# Write the flagged-sample table to a local TSV, then upload it to the project folder.
filename = "flagged_samples.tsv"
proj_dir = "/notebooks/wes/sample_qc/data/"
flagged_samples_df.to_csv(filename, sep="\t", index=False)
upload_file_to_project(filename, proj_dir)