In [None]:
import pandas as pd
import os
import dxpy

In [None]:
def upload_file_to_project(filename, proj_dir):
    """Upload a local file to a project folder through ``dxpy`` and report it.

    Args:
        filename: Path of the local file to upload.
        proj_dir: Destination folder, forwarded as ``folder``. ``parents=True``
            is also forwarded (presumably so missing parent folders get
            created; confirm against the dxpy documentation).

    Returns:
        None. The handle returned by ``dxpy.upload_local_file`` is discarded.
    """
    dxpy.upload_local_file(filename=filename, folder=proj_dir, parents=True)
    message = f"*********{filename} uploaded!!*********"
    print(message)



# Divide the samples of the autosomal dataset into 5 subsets to speed up KING inference

Ref: KING options `--kinship --projection N`

In [None]:
# Path to the .fam file stored alongside the high-quality autosomal variant set.
fam_file = "/mnt/project/notebooks/wes/sample_qc/high_quality_variants/autosomes/autosome_hqc.fam"

In [None]:
# The file is read as tab-separated with no header row; only the first two
# columns are kept (presumably the family and individual IDs of the .fam format).
fam_df = pd.read_csv(
    fam_file,
    sep="\t",
    header=None,
    usecols=[0, 1],
)

In [None]:
# Despite its name, this is the SIZE of one subset: the row count floor-divided by 5.
five_sets = fam_df.shape[0] // 5

In [None]:
# Column 2 holds the subset label. Every row starts in subset 1, then consecutive
# blocks of `five_sets` rows are relabelled 2, 3, 4 and 5. Rows at positions past
# 5 * five_sets (when the row count is not a multiple of 5) keep the default label 1.
fam_df[2] = 1
for subset_id in range(2, 6):
    block = slice(five_sets * (subset_id - 1), five_sets * subset_id)
    fam_df.iloc[block, 2] = subset_id

In [None]:
# Write the two ID columns plus the subset label as a headerless, tab-separated
# file and upload it to the project's relatedness folder.
proj_dir = "/notebooks/wes/sample_qc/relatedness/"
filename = "subsets.txt"
# header=False is the documented way to omit the header row; the previous
# header=None only worked because None is falsy.
fam_df.to_csv(filename, index=False, sep="\t", header=False)
upload_file_to_project(filename, proj_dir)

In [None]:
# Show the number of rows in each full subset block.
print(five_sets)

# Create subsets of plink files based on the 5 defined subsets

**This step is run using the Swiss Army Knife app.**

# Get KING estimates on the five subsets

**This step is run using the Swiss Army Knife app.**

# Read and compile KING-produced files

Estimated kinship coefficient ranges and their corresponding relationships:

 - \>0.354: duplicate/MZ twin
 - 0.177 to 0.354: 1st-degree
 - 0.0884 to 0.177: 2nd-degree
 - 0.0442 to 0.0884: 3rd-degree 


Ref: https://www.kingrelatedness.com/manual.shtml#WITHIN

In [None]:
# Output (.kin0) of the earlier KING run made at the second-degree level,
# read as a tab-separated table with a header row.
second_degree_filename = "/mnt/project/notebooks/wes/sample_qc/relatedness/king.kin0"
second_degree_df = pd.read_csv(second_degree_filename, sep="\t")


In [None]:
# Third-degree KING output produced on the subsets: one .kin0 file per subset
# (subset1 .. subset5) ...
third_degree_same_set_filenames = [
    f"/mnt/project/notebooks/wes/sample_qc/relatedness/subset{i}.kin0"
    for i in range(1, 6)
]
# ... and one per unordered pair of distinct subsets (subset12, subset13, ..., subset45).
third_degree_diff_set_filenames = [
    f"/mnt/project/notebooks/wes/sample_qc/relatedness/subset{i}{j}.kin0"
    for i in range(1, 6)
    for j in range(i + 1, 6)
]
# Stack all 15 tables. ignore_index=True gives the combined frame a unique
# 0..n-1 index instead of repeating each file's own row labels, so label-based
# operations on the result cannot hit ambiguous duplicate labels.
third_degree_df = pd.concat(
    [
        pd.read_csv(fn, sep="\t")
        for fn in third_degree_same_set_filenames + third_degree_diff_set_filenames
    ],
    ignore_index=True,
)



In [None]:
# Compare the two KING runs at the second-degree cutoff (Kinship >= 0.0884).
second_degree_or_closer = third_degree_df["Kinship"] >= 0.0884
sdk_set = set(second_degree_df[["ID1", "ID2"]].to_numpy().ravel())
tdk_set = set(third_degree_df.loc[second_degree_or_closer, ["ID1", "ID2"]].to_numpy().ravel())
# IDs that reach the cutoff in the subset runs but never appear in the earlier file.
tdk_new = tdk_set - sdk_set
involves_new_id = third_degree_df["ID1"].isin(tdk_new) | third_degree_df["ID2"].isin(tdk_new)
# Display the pairs at or above the cutoff that involve one of those new IDs.
third_degree_df.loc[involves_new_id & second_degree_or_closer]


*Six new comparisons were found, all with Kinship = 0.0884; these were missed during the initial second-degree call.*

# Save individuals who are related based on autosomes

In [None]:
# Every ID that appears in any pair of the combined subset KING output.
pair_ids = third_degree_df[["ID1", "ID2"]].to_numpy().ravel()
all_ind_w_third_degree_relatives = list(set(pair_ids))
# IDs that are part of at least one pair above the duplicate/MZ-twin cutoff.
is_duplicate_pair = third_degree_df["Kinship"] > 0.354
duplicated_ind = set(third_degree_df.loc[is_duplicate_pair, ["ID1", "ID2"]].to_numpy().ravel())


In [None]:
# One row per individual found in the KING pairs; `third_degree` is True for all of them.
related_df = pd.DataFrame(
    {
        "s": all_ind_w_third_degree_relatives,
        "third_degree": [True] * len(all_ind_w_third_degree_relatives),
    }
)


In [None]:
# Flag individuals that belong to at least one duplicate/MZ-twin pair.
related_df["duplicate_ind"] = related_df["s"].isin(duplicated_ind)

In [None]:
# Count flagged versus unflagged individuals.
related_df["duplicate_ind"].value_counts()

# Verify with array generated file

In [None]:
# Tab-separated sample QC table stored under the project's fields/data folder.
geno_sample_qc_file = "/mnt/project/fields/data/sample_qc/sample_qc_info.tsv"
geno_sample_qc_df = pd.read_csv(geno_sample_qc_file, sep="\t")


In [None]:
# Distribution of the array-based kinship categories.
geno_sample_qc_df["genetic_kinship_to_other_participants"].value_counts()

In [None]:
# Samples whose array-based kinship field is present and is not "No kinship found".
kinship_label = geno_sample_qc_df["genetic_kinship_to_other_participants"]
has_array_kinship = kinship_label.ne("No kinship found") & kinship_label.notna()
array_relatives = set(geno_sample_qc_df.loc[has_array_kinship, "sample_names"].astype(str))


In [None]:
# Exome-derived related individuals, as strings so they compare with the array IDs.
exome_relatives = set(related_df["s"].astype(str))


In [None]:
# Individuals flagged as related by either source.
all_samples = array_relatives | exome_relatives


In [None]:
# Build one row per sample with 0/1 membership indicators for each source.
sample_ids = list(all_samples)
data = {
    "sample_id": sample_ids,
    "exome_relative": [int(sample in exome_relatives) for sample in sample_ids],
    "array_relative": [int(sample in array_relatives) for sample in sample_ids],
}
df = pd.DataFrame(data)

# Cross-tabulate exome membership (rows) against array membership (columns).
contingency_table = pd.crosstab(df["exome_relative"], df["array_relative"])


In [None]:
# Display the exome-by-array membership table.
contingency_table


In [None]:
# Fraction of exome-derived related individuals that are also flagged by the array data.
len(exome_relatives & array_relatives) / len(exome_relatives)


*94% of the individuals flagged as related by the exome calls are also flagged as related by the array calls*

# Save file in tsv format

In [None]:
# Export the per-individual relatedness flags as a tab-separated file with a
# header row, then upload it to the project's relatedness folder.
filename = "related_exome.tsv"
proj_dir = "/notebooks/wes/sample_qc/relatedness/"
related_df.to_csv(filename, sep="\t", index=False)
upload_file_to_project(filename, proj_dir)
