In [None]:
# Enter here the data_location you used in the snakemake command (no trailing /).
# The loading cell builds every input path as
# "{parent_folder}/{sample}/arbigent/regenotyper_allsamples_bulk/arbigent_results/res_detail.csv",
# so this must be set before the rest of the notebook is run.
parent_folder = ""

In [6]:
import pandas as pd

def confidence_best_biased(r):
    """Pick the genotype call that carries the higher confidence value.

    Parameters
    ----------
    r : mapping-like (e.g. a DataFrame row)
        Must provide the keys "bb", "alt", "conf_bb" and "conf_alt".

    Returns
    -------
    r["bb"] when r["conf_bb"] is strictly greater than r["conf_alt"];
    in every other case (ties included) r["alt"].
    """
    if r["conf_bb"] > r["conf_alt"]:
        return r["bb"]
    return r["alt"]

# Samples to load. Each name is used both as the sub-folder of `parent_folder`
# and as the column name that is renamed to "GT" further down.
samples = ["RPE1-WT", "RPE-BM510", "LCL", "C7"]

# chr3 start coordinates that are kept; every other chr3 segment is removed.
# Defined once here instead of being rebuilt on every loop iteration.
CHR3_KEPT_STARTS = [60900000, 102800000]

l = list()
for sample in samples:
    # READ DF OUTPUT FROM ARBIGENT (read as tab-separated despite the .csv name)
    file = "{parent_folder}/{sample}/arbigent/regenotyper_allsamples_bulk/arbigent_results/res_detail.csv".format(parent_folder=parent_folder, sample=sample)
    df = pd.read_csv(file, sep="\t")

    # REPROCESS DATA
    # .copy(): the filtered frame gets new columns assigned right below; working
    # on an explicit copy avoids pandas' SettingWithCopyWarning.
    df = df.loc[df["valid_bins"] > 0].copy()
    # The last column holds a colon-separated string with five fields
    # (e.g. "0|0:T:0|0:139.4:139.4"); split it into one column per field.
    # NOTE(review): presumably the last column is the per-sample genotype
    # column that is renamed to "GT" below - confirm against the input file.
    df[["bb", "T/F", "alt", "conf_bb", "conf_alt"]] = df[df.columns[-1]].str.split(":", expand=True)
    df["conf_bb"] = df["conf_bb"].astype(float)
    df["conf_alt"] = df["conf_alt"].astype(float)
    df["len"] = df["len"].astype(float)
    df["start"] = df["start"].astype(int)
    # Keep whichever of the two calls has the higher confidence (row-wise).
    df["final_genotype"] = df.apply(confidence_best_biased, axis=1)

    # REMOVE chr3 randomly sampled SV used for ArbiGent stability (lowconf regions)
    # The removal is keyed on the ID column as read from the file, i.e. before
    # ID is rebuilt from chrom/start/end below.
    is_unlisted_chr3 = (df["chrom"] == "chr3") & ~df["start"].isin(CHR3_KEPT_STARTS)
    rm_l = df.loc[is_unlisted_chr3, "ID"].unique().tolist()
    # .copy() again: "Sample" and "ID" are assigned on this filtered frame.
    df = df.loc[~df["ID"].isin(rm_l)].copy()

    df["Sample"] = sample
    # Rebuild ID as "chrom-start-end"; the pivot cell splits it on "-" again.
    df["ID"] = df["chrom"] + "-" + df["start"].astype(str) + "-" + df["end"].astype(str)
    df = df.rename({sample: "GT"}, axis=1).reset_index(drop=True)
    l.append(df)

# Concat DFs (one block of rows per sample; each block keeps its own 0..n-1 index)
df = pd.concat(l)
df

Unnamed: 0,chrom,start,end,ID,len,valid_bins,GT,bb,T/F,alt,conf_bb,conf_alt,final_genotype,Sample
0,chr12,25100000,25800000,chr12-25100000-25800000,700.0,6473,0|0:T:0|0:139.4:139.4,0|0,T,0|0,139.4,139.4,0|0,RPE1-WT
1,chr15,93300000,101991189,chr15-93300000-101991189,8691.189,81095,0|0:T:0|0:5e+09:5e+09,0|0,T,0|0,5000000000.0,5000000000.0,0|0,RPE1-WT
2,chr16,77600000,78100000,chr16-77600000-78100000,500.0,4862,0|0:T:3010:341.3:431.5,0|0,T,3010,341.3,431.5,3010,RPE1-WT
3,chr17,63100000,63400000,chr17-63100000-63400000,300.0,2911,0010:T:0|0:43.7:15,0010,T,0|0,43.7,15.0,0010,RPE1-WT
4,chr17,63100000,63500000,chr17-63100000-63500000,400.0,3861,0|0:T:0010:41.5:50.9,0|0,T,0010,41.5,50.9,0010,RPE1-WT
5,chr20,3900000,4100000,chr20-3900000-4100000,200.0,1863,1000:T:0|1:32.1:1.6,1000,T,0|1,32.1,1.6,1000,RPE1-WT
6,chr22,37900000,39700000,chr22-37900000-39700000,1800.0,15448,0|0:T:0|0:285.6:285.6,0|0,T,0|0,285.6,285.6,0|0,RPE1-WT
7,chr22,39700000,40400000,chr22-39700000-40400000,700.0,6729,0|0:T:0|0:136.2:136.2,0|0,T,0|0,136.2,136.2,0|0,RPE1-WT
8,chr3,60900000,62300000,chr3-60900000-62300000,1400.0,13273,0|0:T:2010:266.2:307.5,0|0,T,2010,266.2,307.5,2010,RPE1-WT
9,chr3,102800000,103600000,chr3-102800000-103600000,800.0,7426,0|0:T:0|0:159.1:159.1,0|0,T,0|0,159.1,159.1,0|0,RPE1-WT


In [10]:
# PIVOT TABLE
# One row per segment ID, one column per sample; each cell holds the final
# genotype. If an ID occurs several times for a sample, the calls are joined
# with a space.
pivot = pd.pivot_table(
    df,
    columns="Sample",
    index="ID",
    values="final_genotype",
    aggfunc=lambda x: ' '.join(x),
).reset_index()

# Reformat columns
# ID has the form "chrom-start-end": split it once for all rows instead of
# re-splitting the same string in three separate row-wise applies.
id_parts = pivot["ID"].str.split("-", expand=True)
pivot["chrom"] = id_parts[0]
pivot["POS"] = id_parts[1].astype(int)
pivot["END"] = id_parts[2].astype(int)
# Segment length scaled by 1/1000 (presumably kb, if coordinates are in bp).
pivot["Len"] = (pivot["END"] - pivot["POS"]) / 1000

# Categorical typing & sorting
# An ordered categorical gives karyotype order (chr1..chr22, chrX, chrY) rather
# than alphabetical order. Chromosome names outside this list become NaN.
pivot["chrom"] = pd.Categorical(
    pivot["chrom"],
    categories=["chr{}".format(e) for e in range(1, 23)] + ["chrX", "chrY"],
    ordered=True,
)
# END is a tie-breaker: segments sharing chrom and POS are ordered by their
# numeric end coordinate instead of by the lexicographic order of the ID string.
pivot = pivot.sort_values(by=["chrom", "POS", "END"]).drop(["chrom", "POS"], axis=1)

# Output
pivot.to_csv("heatmap_data_arbigent_genotyping.tsv", sep="\t", index=False)
pivot

Sample,ID,C7,LCL,RPE-BM510,RPE1-WT,END,Len
9,chr3-60900000-62300000,0|0,0|0,1020,2010,62300000,1400.0
8,chr3-102800000-103600000,1000,0|0,0|0,0|0,103600000,800.0
10,chr9-21900000-22400000,0|0,0|0,1000,0010,22400000,500.0
0,chr12-25100000-25800000,0|0,0|0,2010,0|0,25800000,700.0
1,chr15-93300000-101991189,2110,0|0,0020,0|0,101991189,8691.189
2,chr16-77600000-78100000,0|0,0|0,1020,3010,78100000,500.0
3,chr17-63100000-63400000,0|0,0|0,1000,0010,63400000,300.0
4,chr17-63100000-63500000,0|0,0|0,1000,0010,63500000,400.0
5,chr20-3900000-4100000,0|0,0|0,1000,1000,4100000,200.0
6,chr22-37900000-39700000,0|0,0|0,2110,0|0,39700000,1800.0
