In [1]:
import os
import pandas as pd
from functools import reduce

In [2]:
combo_files = [
    "/data6/deepro/ukb_bmi/2_rarecomb/data/british/combo2.csv",
    "/data6/deepro/ukb_bmi/2_rarecomb/data/british/combo3.csv"
]

combo_info_files = [
    "/data6/deepro/ukb_bmi/3_characterization/data/combo_info/british/discovery_combo2.csv",
    "/data6/deepro/ukb_bmi/3_characterization/data/combo_info/british/discovery_combo3.csv"
]

combo_info_df = pd.concat([pd.read_csv(cif) for cif in combo_info_files])
combo_df = pd.concat([pd.read_csv(cf, usecols=["uniq_items", "Case_Samples", "Control_Samples"]) for cf in combo_files])

In [3]:
combo_info_df = combo_info_df.merge(combo_df, on="uniq_items")

In [4]:
combo_info_df = combo_info_df.fillna("")

In [5]:
combo_genes = set("|".join(combo_info_df.uniq_items.str.replace("Input_", "").values).split("|"))
combo_samples = set("|".join(combo_info_df.combo_samples.fillna("").values).split("|"))
combo_case_samples = set("|".join(combo_info_df.Case_Samples.values).split("|"))
combo_control_samples = set("|".join(combo_info_df.Control_Samples.values).split("|"))

In [6]:
def get_vtype_df(block_df, combo_genes, combo_samples, combo_case_samples, combo_control_samples):
    # select lof and deleterious missense variants
    block_df = block_df.loc[(block_df.lof==True)|(block_df.splice_lof==True)|((block_df.missense==True)&(block_df.del_score>4))]
    block_df["samples"] = block_df.samples.str.split(",")
    block_df = block_df.explode("samples")
    block_df = block_df.loc[(block_df.samples.isin(combo_samples.union(combo_case_samples).union(combo_control_samples))) & (block_df.gene.isin(combo_genes))]
    block_df["variant"] = block_df.gene + "_" + block_df.locus + "_" + block_df.alleles
    return block_df

In [7]:
vcfs_per_chrm = {
    "chr1": 97, "chr2": 71, "chr3": 56, "chr4": 39, "chr5": 43, "chr6": 48, 
    "chr7": 47, "chr8": 35, "chr9": 42, "chr10": 40, "chr11": 57, "chr12": 52, 
    "chr13": 18, "chr14": 30, "chr15": 34, "chr16": 47, "chr17": 56, "chr18": 16, 
    "chr19": 65, "chr20": 25, "chr21": 11, "chr22": 23, "chrX": 24, "chrY": 1
}
annot_table_dir = "/data6/deepro/ukb_bmi/0_data_preparation_and_download/exome_annot/data/annot_tables_vep109"

variant_df = pd.DataFrame()
for chr_num in [f"chr{i}" for i in range(1,23)] + ["chrX", "chrY"]:
    print(chr_num)
    chr_file_num = vcfs_per_chrm[chr_num]
    for filei in range(chr_file_num):
        block_file = os.path.join(annot_table_dir, f"{chr_num}", f"block_{filei}.tsv.gz")
        block_df = pd.read_csv(block_file, sep="\t", index_col=0)
        block_df = get_vtype_df(block_df, combo_genes, combo_samples, combo_case_samples, combo_control_samples)
        variant_df = pd.concat((variant_df, block_df))

chr1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a 

chr2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a 

chr3


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a 

chr4


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a 

chr5


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a 

chr6


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a 

chr7


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a 

chr8


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a 

chr9


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a 

chr10


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a 

chr11


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a 

chr12


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a 

chr13


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a 

chr14


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a 

chr15


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a 

chr16


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a 

chr17


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a 

chr18


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a 

chr19


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a 

chr20


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a 

chr21


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a 

chr22


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a 

chrX


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a 

chrY


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df["samples"] = block_df.samples.str.split(",")
A value is trying to be set on a copy of a 

In [8]:
def get_combo_case_loci(variant_case_df):
    combo_case_loci = set()
    for gn, gdf in variant_case_df.groupby("samples"):
        gdf = gdf.sort_values(["variant"])
        combo_case_locus = "|".join((gdf.variant).values)
        combo_case_loci.update([combo_case_locus])
    return combo_case_loci

def get_same_variant_samples(variant_subset_df, combo_case_variant_loci):
    same_variant_samples = set()
    diff_variant_samples = set()
    for gn, gdf in variant_subset_df.groupby("samples"):
        gdf = gdf.sort_values(["variant"])
        combo_cont_locus = "|".join((gdf.variant).values)
        if combo_cont_locus in combo_case_variant_loci:
            same_variant_samples.update([gn])
        else:
            diff_variant_samples.update([gn])
    return same_variant_samples, diff_variant_samples

def get_samples_with_same_variants(ser, variant_df, vtype="variant"):
    genes = ser.uniq_items.replace("Input_", "").split("|")
    case_samples = set(ser.Case_Samples.split("|"))
    control_samples = set(ser.Control_Samples.split("|"))
    all_samples = set(ser.combo_samples.split("|"))
    other_samples = all_samples.difference(case_samples.union(control_samples))
    variant_case_df = variant_df.loc[(variant_df.gene.isin(genes))&(variant_df.samples.isin(case_samples))]
    combo_case_variant_loci = get_combo_case_loci(variant_case_df)
    variant_control_df = variant_df.loc[(variant_df.gene.isin(genes))&(variant_df.samples.isin(control_samples))]
    variant_other_df = variant_df.loc[(variant_df.gene.isin(genes))&(variant_df.samples.isin(other_samples))]
    control_samples_same_variants, control_samples_other_variants = get_same_variant_samples(variant_control_df, combo_case_variant_loci)
    other_samples_same_variants, other_samples_other_variants = get_same_variant_samples(variant_other_df, combo_case_variant_loci)
    return pd.Series({
        "control_samples_same_variants": "|".join(control_samples_same_variants), 
        "control_samples_other_variants": "|".join(control_samples_other_variants),
        "other_samples_same_variants": "|".join(other_samples_same_variants), 
        "other_samples_other_variants": "|".join(other_samples_other_variants),
        })

In [9]:
combo_info_df = combo_info_df.merge(combo_info_df.apply(get_samples_with_same_variants, args=(variant_df, "variant"), axis=1), left_index=True, right_index=True)

In [10]:
combo_info_df

Unnamed: 0,uniq_items,combo_samples,Case_Samples,Control_Samples,control_samples_same_variants,control_samples_other_variants,other_samples_same_variants,other_samples_other_variants
0,Input_BMPR1B|Input_SHC2,1096941|1121194|1997309|2079898|2120943|217223...,4851616|2829943|3705558|2192189|3808185|112119...,2374413|5195017|4339656|2664424|2558947|2242075,,2558947|2664424|5195017|2242075|4339656|2374413,1997309|5473504|1096941|3537947,3311896|3749920|4673073|3351476|3732598|375495...
1,Input_BCHE|Input_TRPV4,1133669|1470103|2289357|2326174|2424337|249952...,4756156|5307719|2326174|4542312|1133669|228935...,4562244,,4562244,2930666,2499525|3247815|4007272|4096224|4986289|506183...
2,Input_ABCA13|Input_DDX60L,1182163|1253768|1355755|1397710|1786412|216534...,2614513|4264083|4423253|2165347|3808921|585198...,2501313|5513921|2353082|3186614|5166300|139771...,3186614,5644304|2353082|5166300|2501313|3117422|551392...,2427715|3831601|3576743|4235212,3024661|2312305|3463516|5112202|3018227|178641...
3,Input_MYH14|Input_NR1D1,1010013|1062480|1156553|1363786|1544603|156336...,4933679|1156553|4807178|5534440|3803952|581876...,5308561|3231450|2890441|2610139|2812302|3516504,,3231450|3516504|2812302|5308561|2610139|2890441,1062480,3614794|3624242|2240770|3265530|3932722|590538...
4,Input_ADAM19|Input_MMUT,1793868|1803901|2119938|2191681|2228888|235212...,5378577|3149749|1803901|2191681|4503641|251312...,2228888|5331084,,2228888|5331084,1793868,2869047|2352121|5661332|4460209|3181710|3666590
...,...,...,...,...,...,...,...,...
1829,Input_CPT1B|Input_DRG1|Input_SFI1,1220595|2797301|3109180|3593971|4158131|432321...,4323211|1220595|5831475|2797301|4158131,,,,,3109180|3593971
1830,Input_ACAP3|Input_SLC7A8|Input_TAS1R3,1332204|1545778|3196670|4231707|5749390,3196670|1332204|1545778|5749390|4231707,,,,,
1831,Input_F5|Input_NBEAL2|Input_SPINK8,2828581|4227055|4691233|4840853|5142560|579302...,4840853|4691233|5793025|4227055|2828581,,,,5142560|5803541,
1832,Input_GHDC|Input_KRTAP2-3|Input_TTN,1730047|3401377|4128798|4978416|5099625|523873...,1730047|5238738|5919664|4978416|5099625,,,,4128798,5316853|5246806|3401377


In [11]:
int_case_samples = combo_info_df.iloc[2].Case_Samples.split("|")

In [12]:
get_combo_case_loci(variant_df.loc[(variant_df.gene.isin(["ABCA13", "DDX60L"]))&(variant_df.samples.isin(int_case_samples))])

{'ABCA13_chr7:48192983_T_C|DDX60L_chr4:168384665_G_A',
 'ABCA13_chr7:48219490_C_T|DDX60L_chr4:168384665_G_A',
 'ABCA13_chr7:48271936_T_G|DDX60L_chr4:168384665_G_A',
 'ABCA13_chr7:48271936_T_G|DDX60L_chr4:168394520_A_G',
 'ABCA13_chr7:48271987_G_C|DDX60L_chr4:168420368_G_A',
 'ABCA13_chr7:48271987_G_C|DDX60L_chr4:168432539_G_A',
 'ABCA13_chr7:48272529_G_T|DDX60L_chr4:168403981_C_CA',
 'ABCA13_chr7:48281357_G_C|DDX60L_chr4:168394596_G_A',
 'ABCA13_chr7:48309986_T_G|DDX60L_chr4:168394520_A_G',
 'ABCA13_chr7:48317204_G_C|DDX60L_chr4:168384665_G_A',
 'ABCA13_chr7:48317259_C_T|DDX60L_chr4:168394596_G_A',
 'ABCA13_chr7:48389198_G_A|DDX60L_chr4:168421905_TC_T',
 'ABCA13_chr7:48410540_A_G|DDX60L_chr4:168421905_TC_T',
 'ABCA13_chr7:48410540_A_G|DDX60L_chr4:168471798_A_T',
 'ABCA13_chr7:48412473_C_A|DDX60L_chr4:168471798_A_T',
 'ABCA13_chr7:48507934_C_T|DDX60L_chr4:168384665_G_A',
 'ABCA13_chr7:48507934_C_T|DDX60L_chr4:168396112_G_GT',
 'ABCA13_chr7:48507993_C_A|DDX60L_chr4:168421905_TC_T',
 'ABC

In [25]:
sv_samples = set("|".join(combo_info_df.loc[combo_info_df.control_samples_same_variants!="", "control_samples_same_variants"].values).split("|"))

In [31]:
dv_samples = set("|".join(combo_info_df.loc[combo_info_df.control_samples_other_variants!="", "control_samples_other_variants"].values).split("|"))

In [27]:
pheno_df = pd.read_csv("/data6/deepro/ukb_bmi/0_data_preparation_and_download/phenotype/data/bmi_processed/filtered_bmi_info.csv.gz",
                       usecols=["sample_names", "bmi", "bmi_prs"], dtype={"sample_names": str, "bmi": float, "bmi_prs": float})

In [32]:
pheno_sv_df = pheno_df.loc[pheno_df.sample_names.isin(sv_samples)]
pheno_dv_df = pheno_df.loc[pheno_df.sample_names.isin(dv_samples)]

In [33]:
pheno_sv_df.bmi.describe()

count    136.000000
mean      22.892223
std        2.093977
min       15.640900
25%       21.648600
50%       23.073600
75%       24.324525
max       27.216300
Name: bmi, dtype: float64

In [34]:
pheno_dv_df.bmi.describe()

count    655.000000
mean      22.843780
std        1.900269
min       16.546200
25%       21.711950
50%       22.929700
75%       24.110650
max       27.755100
Name: bmi, dtype: float64

In [35]:
pheno_sv_df.bmi_prs.describe()

count    136.000000
mean       0.019042
std        0.922354
min       -2.026580
25%       -0.640512
50%       -0.007018
75%        0.593020
max        2.882350
Name: bmi_prs, dtype: float64

In [36]:
pheno_dv_df.bmi_prs.describe()

count    655.000000
mean      -0.100142
std        0.936883
min       -2.775770
25%       -0.774491
50%       -0.146371
75%        0.517203
max        2.927600
Name: bmi_prs, dtype: float64

In [22]:
from scipy.stats import kstest,ttest_ind

In [23]:
ttest_ind(pheno_sv_df.bmi, pheno_dv_df.bmi, alternative="greater")

TtestResult(statistic=-0.8417515340151916, pvalue=0.8000037805846016, df=3078.0)

In [24]:
ttest_ind(pheno_sv_df.bmi_prs, pheno_dv_df.bmi_prs, alternative="less") 

TtestResult(statistic=1.8938356711754896, pvalue=0.9708297461958982, df=3078.0)

In [37]:
sv_samples = set("|".join(combo_info_df.loc[combo_info_df.other_samples_same_variants!="", "other_samples_same_variants"].values).split("|"))
dv_samples = set("|".join(combo_info_df.loc[combo_info_df.other_samples_other_variants!="", "other_samples_other_variants"].values).split("|"))

pheno_sv_df = pheno_df.loc[pheno_df.sample_names.isin(sv_samples)]
pheno_dv_df = pheno_df.loc[pheno_df.sample_names.isin(dv_samples)]

In [39]:
pheno_sv_df.bmi.describe()

count    637.000000
mean      27.201390
std        2.081287
min       21.851700
25%       25.822450
50%       27.119100
75%       28.664500
max       33.074200
Name: bmi, dtype: float64

In [40]:
pheno_dv_df.bmi.describe()

count    2443.000000
mean       27.281890
std         2.167117
min        20.635000
25%        25.762450
50%        27.179200
75%        28.816933
max        33.956000
Name: bmi, dtype: float64

In [41]:
pheno_sv_df.bmi_prs.describe()

count    637.000000
mean      -0.229925
std        0.962693
min       -3.497380
25%       -0.859271
50%       -0.264136
75%        0.444506
max        2.758180
Name: bmi_prs, dtype: float64

In [42]:
pheno_dv_df.bmi_prs.describe()

count    2443.000000
mean       -0.312980
std         0.991716
min        -3.997840
25%        -0.992139
50%        -0.327686
75%         0.347911
max         3.134670
Name: bmi_prs, dtype: float64