In [1]:
import pandas as pd, numpy as np
import os
cwd = os.path.expanduser("~/cnv-project/adsp_real_data")

In [2]:
del_comm_cnv = pd.read_csv(f"{cwd}/del_comm_adsp_hq.gz", compression="gzip", sep="\t")
del_comm_cnv = del_comm_cnv.drop(columns=["COUNTED","ALT"])
del_comm_cnv.iloc[:,0] = del_comm_cnv.iloc[:,0].str.replace('chr','')

n_cnv,n_cols = del_comm_cnv.shape

percent_na_indv = [del_comm_cnv.iloc[:,i].isna().sum() / n_cnv for i in range(4,n_cols)]
cutoff_indv = 0.2
indv_to_keep = [index for index, element in enumerate(percent_na_indv) if element <= cutoff_indv]
del_comm_cnv = del_comm_cnv.iloc[:,indv_to_keep]

n_indvs = del_comm_cnv.shape[1] - 4
percent_na_cnv = [del_comm_cnv.iloc[i,:].isna().sum() / (n_indvs) for i in range(n_cnv)] 
cutoff_cnv = 0.4
cnv_to_keep = [index for index, element in enumerate(percent_na_cnv) if element <= cutoff_cnv]
del_comm_cnv = del_comm_cnv.iloc[cnv_to_keep,:]

In [3]:
del_comm_cnv.shape

(8476, 11716)

In [4]:
ref_gene = pd.read_table(f"{cwd}/refGene.clean.gz", compression = "gzip", sep = "\t", header = 0)

In [5]:
cnv = del_comm_cnv.iloc[:,:4]
cnv.head()

Unnamed: 0,CHR,SNP,CM,POS
0,22,chr22:10710932-10711692:DEL:COVERAGE:comm,10710932,10711692
1,22,chr22:10720746-10721165:DEL:COVERAGE:comm,10720746,10721165
2,22,chr22:10740324-10746330:DEL:AGGREGATED:comm,10740324,10746330
4,22,chr22:11030980-11031063:DEL:BREAKPOINT:comm,11030980,11031063
5,22,chr22:11031170-11032159:DEL:COVERAGE:comm,11031170,11032159


In [6]:
cnv_ref_gene = cnv.merge(ref_gene, how='left', on='CHR')
match_res = cnv_ref_gene[(cnv_ref_gene['CM'] >= cnv_ref_gene['start']) & (cnv_ref_gene['CM'] <= cnv_ref_gene['end'])
                        |(cnv_ref_gene['POS'] >= cnv_ref_gene['start']) & (cnv_ref_gene['POS'] <= cnv_ref_gene['end'])
                        |(cnv_ref_gene['CM'] <= cnv_ref_gene['start']) & (cnv_ref_gene['POS'] >= cnv_ref_gene['end'])
                        ]
match_res.shape

(4636, 7)

In [7]:
match_res.head()

Unnamed: 0,CHR,SNP,CM,POS,start,end,gene
15721,22,chr22:19076248-19076387:DEL:AGGREGATED:comm,19076248,19076387,19023794,19109967,DGCR2
17034,22,chr22:19168421-19168478:DEL:AGGREGATED:comm,19168421,19168478,19166985,19279242,CLTCL1
17687,22,chr22:19260392-19260579:DEL:AGGREGATED:comm,19260392,19260579,19166985,19279242,CLTCL1
19660,22,chr22:19924994-19925883:DEL:AGGREGATED:comm,19924994,19925883,19863040,19929341,TXNRD2
20316,22,chr22:19959799-19959866:DEL:AGGREGATED:comm,19959799,19959866,19957418,20004346,ARVCF


In [8]:
# Seperate reference genes to interrupted and normal
genes_1 = match_res['gene'].unique()
genes_0 = ref_gene[~ref_gene['gene'].isin(genes_1)]

gene_dict = {}
for g in genes_1:
    gene_dict[g] = []


In [12]:
def per_indv_annotate(std_match,idx):
    # Prepare standard match results and individual CNV data
    global_match = std_match[['SNP','gene']]
    person = del_comm_cnv.iloc[:,[1,(idx+4)]]
    
    # Make a union of two frames on SNP name (duplicates contaiend)
    # for aligning SNPs and gene
    union = person.merge(global_match, how='outer', on='SNP')
    valid = union[union['gene'].isna() == False]
    
    # Iterate each interrupted gene in an individual and assign binary values
    for g in genes_1:
        if valid[valid['gene'] == g].iloc[:,1].sum() > 0:
            gene_dict[g].append(1)
        else:
            gene_dict[g].append(0)
    
    return None

In [16]:
import time

start = time.time()
for i in range(10):
    per_indv_annotate(match_res,i)
end = time.time()
print(f"Cost {end-start} secs")
#X = pd.DataFrame(gene_dict)

Cost -25.317108392715454 secs


In [17]:
X = pd.DataFrame(gene_dict)
X

Unnamed: 0,DGCR2,CLTCL1,TXNRD2,ARVCF,ZDHHC8,FAM230G,YPEL1,IGLL5,BCR,LINC01659,...,FZD10-AS1,PIWIL1,RIMBP2,LINC01257,SFSWAP,ULK1,EP400,EP400P1,GALNT9,FBRSL1
0,0,1,0,1,0,0,0,0,0,0,...,0,1,1,0,1,0,1,0,1,0
1,0,1,0,1,0,1,0,0,1,0,...,0,1,1,0,1,0,1,1,1,0
2,0,1,0,1,0,0,0,0,0,0,...,0,1,1,1,1,1,1,0,1,0
3,0,1,0,1,0,0,0,0,0,0,...,1,0,1,0,1,1,1,0,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,1,1,0,1,1,1,1,1,0
5,0,1,0,1,0,0,0,0,0,0,...,0,1,1,0,1,0,1,0,1,0
6,0,1,0,1,0,1,0,0,1,0,...,0,1,1,0,1,0,1,1,1,0
7,0,1,0,1,0,0,0,0,0,0,...,0,1,1,1,1,1,1,0,1,0
8,0,1,0,1,0,0,0,0,0,0,...,1,0,1,0,1,1,1,0,1,0
9,0,0,0,0,0,0,0,0,0,0,...,0,1,1,0,1,1,1,1,1,0


3180

In [26]:
global_match = match_res[['SNP','gene']]
person = del_comm_cnv.iloc[:,[1,4]]

In [27]:
global_match.head(10)

Unnamed: 0,SNP,gene
15721,chr22:19076248-19076387:DEL:AGGREGATED:comm,DGCR2
17034,chr22:19168421-19168478:DEL:AGGREGATED:comm,CLTCL1
17687,chr22:19260392-19260579:DEL:AGGREGATED:comm,CLTCL1
19660,chr22:19924994-19925883:DEL:AGGREGATED:comm,TXNRD2
20316,chr22:19959799-19959866:DEL:AGGREGATED:comm,ARVCF
20979,chr22:20121816-20121912:DEL:AGGREGATED:comm,ZDHHC8
21639,chr22:20596335-20599552:DEL:AGGREGATED:comm,FAM230G
22343,chr22:22085877-22086037:DEL:AGGREGATED:comm,YPEL1
26930,chr22:23234669-23235625:DEL:AGGREGATED:comm,IGLL5
27587,chr22:23550492-23550559:DEL:AGGREGATED:comm,BCR


In [28]:
person.head(10)

Unnamed: 0,SNP,0:A-ACT-AC000014-BL-NCR-15AD78694
0,chr22:10710932-10711692:DEL:COVERAGE:comm,1.0
1,chr22:10720746-10721165:DEL:COVERAGE:comm,1.0
2,chr22:10740324-10746330:DEL:AGGREGATED:comm,0.0
4,chr22:11030980-11031063:DEL:BREAKPOINT:comm,1.0
5,chr22:11031170-11032159:DEL:COVERAGE:comm,1.0
6,chr22:11040820-11040874:DEL:AGGREGATED:comm,1.0
8,chr22:11048821-11049011:DEL:AGGREGATED:comm,0.0
9,chr22:11065106-11065225:DEL:COVERAGE:comm,1.0
10,chr22:11481028-11481101:DEL:AGGREGATED:comm,0.0
11,chr22:11588146-11588324:DEL:AGGREGATED:comm,1.0


In [29]:
union = person.merge(global_match, how='outer', on='SNP')

In [30]:
union.shape

(8951, 3)

In [31]:
union.head(10)

Unnamed: 0,SNP,0:A-ACT-AC000014-BL-NCR-15AD78694,gene
0,chr22:10710932-10711692:DEL:COVERAGE:comm,1.0,
1,chr22:10720746-10721165:DEL:COVERAGE:comm,1.0,
2,chr22:10740324-10746330:DEL:AGGREGATED:comm,0.0,
3,chr22:11030980-11031063:DEL:BREAKPOINT:comm,1.0,
4,chr22:11031170-11032159:DEL:COVERAGE:comm,1.0,
5,chr22:11040820-11040874:DEL:AGGREGATED:comm,1.0,
6,chr22:11048821-11049011:DEL:AGGREGATED:comm,0.0,
7,chr22:11065106-11065225:DEL:COVERAGE:comm,1.0,
8,chr22:11481028-11481101:DEL:AGGREGATED:comm,0.0,
9,chr22:11588146-11588324:DEL:AGGREGATED:comm,1.0,


In [38]:
union['SNP'].duplicated().any()

True

In [33]:
person.shape

(8476, 2)

In [37]:
8951 - sum(union['gene'].isna())

4636

In [42]:
valid = union[union['gene'].isna() == False] ####
valid

Unnamed: 0,SNP,0:A-ACT-AC000014-BL-NCR-15AD78694,gene
24,chr22:19076248-19076387:DEL:AGGREGATED:comm,0.0,DGCR2
26,chr22:19168421-19168478:DEL:AGGREGATED:comm,1.0,CLTCL1
27,chr22:19260392-19260579:DEL:AGGREGATED:comm,,CLTCL1
30,chr22:19924994-19925883:DEL:AGGREGATED:comm,0.0,TXNRD2
31,chr22:19959799-19959866:DEL:AGGREGATED:comm,1.0,ARVCF
...,...,...,...
8946,chr12:132730334-132730397:DEL:AGGREGATED:comm,2.0,GALNT9
8947,chr12:132776123-132776202:DEL:AGGREGATED:comm,2.0,GALNT9
8948,chr12:132844445-132844791:DEL:AGGREGATED:comm,0.0,GALNT9
8949,chr12:132868863-132869205:DEL:COVERAGE:comm,1.0,GALNT9


In [55]:
valid[valid['gene'] == 'CLTCL1'].iloc[:,1].sum()

1.0

In [67]:
for g in genes_1:
    if valid[valid['gene'] == g].iloc[:,1].sum() > 0:
        gene_dict[g].append(1)
    else:
        gene_dict[g].append(0)


In [68]:
gene_dict

{'DGCR2': [0],
 'CLTCL1': [1],
 'TXNRD2': [0],
 'ARVCF': [1],
 'ZDHHC8': [0],
 'FAM230G': [0],
 'YPEL1': [0],
 'IGLL5': [0],
 'BCR': [0],
 'LINC01659': [0],
 'GUSBP11': [1],
 'DDT': [1],
 'CABIN1': [0],
 'GGT5': [0],
 'SPECC1L': [0],
 'SPECC1L-ADORA2A': [0],
 'ADORA2A-AS1': [0],
 'POM121L10P': [0],
 'ZNRF3': [0],
 'HORMAD2-AS1': [0],
 'LIMK2': [0],
 'DRG1': [0],
 'SFI1': [0],
 'LINC02558': [0],
 'SYN3': [0],
 'LARGE1': [1],
 'HMGXB4': [0],
 'TOM1': [1],
 'RBFOX2': [0],
 'APOL3': [0],
 'MYH9': [1],
 'CACNG2': [0],
 'C1QTNF6': [0],
 'TMEM184B': [0],
 'KCNJ4': [0],
 'APOBEC3A_B': [1],
 'APOBEC3A': [1],
 'ENTHD1': [1],
 'TNRC6B': [0],
 'ZC3H7B': [1],
 'TNFRSF13C': [0],
 'ARFGAP3': [0],
 'TTLL1': [1],
 'BIK': [0],
 'EFCAB6-AS1': [0],
 'EFCAB6': [0],
 'SULT4A1': [0],
 'PNPLA3': [0],
 'SHISAL1': [1],
 'PRR5': [1],
 'PRR5-ARHGAP8': [0],
 'ARHGAP8': [0],
 'PHF21B': [1],
 'NUP50': [1],
 'LOC105373064': [0],
 'KIAA0930': [0],
 'FBLN1': [1],
 'ATXN10': [0],
 'PPARA': [0],
 'TTC38': [0],
 'TRMU': [

In [None]:
import numpy as np

def compute_gene_values_for_individual(gene_row, individual):
    mask = (
        (cnv['CM'] >= gene_row['start']) & (cnv['CM'] <= gene_row['end']) |
        (cnv['POS'] >= gene_row['start']) & (cnv['POS'] <= gene_row['end']) |
        (cnv['CM'] <= gene_row['start']) & (cnv['POS'] >= gene_row['end'])
    )
    # Extract the values for the individual where the conditions are met
    values = cnv.loc[mask, individual].dropna()  # drop NaNs

    # Check the condition to return appropriate values
    if values.empty or (values == 0).all():
        return gene_row['gene'], 0
    else:
        return gene_row['gene'], 1

# Create an empty dataframe with individuals as index and genes as columns, initialized with NaN
result_df = pd.DataFrame(np.NaN, index=(cnv.columns[4:])[:3], columns=ref_gene['gene'])

n_jobs = -1  # use all available cores

# Compute values for each individual for each gene
for individual in (cnv.columns[4:])[:3]:
    results = Parallel(n_jobs=n_jobs)(delayed(compute_gene_values_for_individual)(gene_row, individual) for _, gene_row in ref_gene.iterrows())
    # Fill the results into the dataframe
    for gene, value in results:
        result_df.at[individual, gene] = value


In [39]:
union[union.duplicated('SNP', keep=False)]

Unnamed: 0,SNP,0:A-ACT-AC000014-BL-NCR-15AD78694,gene
50,chr22:24701680-24701893:DEL:AGGREGATED:comm,0.0,SPECC1L
51,chr22:24701680-24701893:DEL:AGGREGATED:comm,0.0,SPECC1L-ADORA2A
92,chr22:39357654-39357717:DEL:AGGREGATED:comm,2.0,APOBEC3A_B
93,chr22:39357654-39357717:DEL:AGGREGATED:comm,2.0,APOBEC3A
117,chr22:45257968-45260929:DEL:AGGREGATED:comm,0.0,PRR5-ARHGAP8
...,...,...,...
8791,chr12:92816781-92817305:DEL:AGGREGATED:comm,0.0,CLLU1
8795,chr12:93964568-93964864:DEL:COVERAGE:comm,0.0,SOCS2-AS1
8796,chr12:93964568-93964864:DEL:COVERAGE:comm,0.0,SOCS2
8860,chr12:120928543-120928602:DEL:AGGREGATED:comm,1.0,DYNLL1


In [40]:
global_match[global_match.duplicated('SNP', keep=False)]

Unnamed: 0,SNP,gene
32848,chr22:24701680-24701893:DEL:AGGREGATED:comm,SPECC1L
32849,chr22:24701680-24701893:DEL:AGGREGATED:comm,SPECC1L-ADORA2A
59860,chr22:39357654-39357717:DEL:AGGREGATED:comm,APOBEC3A_B
59861,chr22:39357654-39357717:DEL:AGGREGATED:comm,APOBEC3A
75655,chr22:45257968-45260929:DEL:AGGREGATED:comm,PRR5-ARHGAP8
...,...,...
11472928,chr12:92816781-92817305:DEL:AGGREGATED:comm,CLLU1
11478499,chr12:93964568-93964864:DEL:COVERAGE:comm,SOCS2-AS1
11478500,chr12:93964568-93964864:DEL:COVERAGE:comm,SOCS2
11567727,chr12:120928543-120928602:DEL:AGGREGATED:comm,DYNLL1


In [12]:
subset = del_comm_cnv[del_comm_cnv['SNP'].isin(match_res['SNP'])]
subset.shape
subset.head(10)

Unnamed: 0,CHR,SNP,CM,POS,0:A-ACT-AC000014-BL-NCR-15AD78694,0:A-ACT-AC000034-BL-NCR-16AD84906,0:A-ACT-AC000057-BL-NCR-15AD78356,0:A-ACT-AC000072-BL-NCR-15AD77250,0:A-ACT-AC000088-BL-NCR-15AD76548,0:A-ACT-AC000092-BL-NCR-15AD78589,...,0:G-MSBB-MB000322-BR-MSBB-71983,0:G-MSBB-MB000323-BR-MSBB-71984,0:G-MSBB-MB000326-BR-MSBB-71987,0:G-MSBB-MB000327-BR-MSBB-71988,0:G-MSBB-MB000331-BR-MSBB-71993,0:G-MSBB-MB000332-BR-MSBB-71994,0:G-MSBB-MB000333-BR-MSBB-71995,0:G-MSBB-MB000334-BR-MSBB-71996,0:G-MSBB-MB000335-BR-MSBB-74458,0:G-MSBB-MB000337-BR-MSBB-76349
27,22,chr22:19076248-19076387:DEL:AGGREGATED:comm,19076248,19076387,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29,22,chr22:19168421-19168478:DEL:AGGREGATED:comm,19168421,19168478,1.0,1.0,1.0,1.0,,1.0,...,,1.0,1.0,2.0,1.0,2.0,,1.0,1.0,1.0
30,22,chr22:19260392-19260579:DEL:AGGREGATED:comm,19260392,19260579,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,,0.0,0.0,0.0,,0.0,0.0,0.0
34,22,chr22:19924994-19925883:DEL:AGGREGATED:comm,19924994,19925883,0.0,0.0,0.0,0.0,,0.0,...,0.0,0.0,,0.0,0.0,0.0,0.0,,,0.0
35,22,chr22:19959799-19959866:DEL:AGGREGATED:comm,19959799,19959866,1.0,1.0,2.0,2.0,0.0,2.0,...,2.0,2.0,,2.0,1.0,2.0,2.0,1.0,2.0,0.0
36,22,chr22:20121816-20121912:DEL:AGGREGATED:comm,20121816,20121912,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
38,22,chr22:20596335-20599552:DEL:AGGREGATED:comm,20596335,20599552,0.0,1.0,0.0,0.0,0.0,2.0,...,0.0,1.0,0.0,1.0,0.0,0.0,2.0,0.0,1.0,0.0
39,22,chr22:22085877-22086037:DEL:AGGREGATED:comm,22085877,22086037,0.0,0.0,0.0,0.0,,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
46,22,chr22:23234669-23235625:DEL:AGGREGATED:comm,23234669,23235625,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
48,22,chr22:23550492-23550559:DEL:AGGREGATED:comm,23550492,23550559,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
subset['SNP'].duplicated().any()

False

In [9]:
new_subset = match_res[['SNP','gene']].merge(subset, how='left', on='SNP')
new_subset.head()

Unnamed: 0,SNP,gene,CHR,CM,POS,0:A-ACT-AC000014-BL-NCR-15AD78694,0:A-ACT-AC000034-BL-NCR-16AD84906,0:A-ACT-AC000057-BL-NCR-15AD78356,0:A-ACT-AC000072-BL-NCR-15AD77250,0:A-ACT-AC000088-BL-NCR-15AD76548,...,0:G-MSBB-MB000322-BR-MSBB-71983,0:G-MSBB-MB000323-BR-MSBB-71984,0:G-MSBB-MB000326-BR-MSBB-71987,0:G-MSBB-MB000327-BR-MSBB-71988,0:G-MSBB-MB000331-BR-MSBB-71993,0:G-MSBB-MB000332-BR-MSBB-71994,0:G-MSBB-MB000333-BR-MSBB-71995,0:G-MSBB-MB000334-BR-MSBB-71996,0:G-MSBB-MB000335-BR-MSBB-74458,0:G-MSBB-MB000337-BR-MSBB-76349
0,chr22:19076248-19076387:DEL:AGGREGATED:comm,DGCR2,22,19076248,19076387,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,chr22:19168421-19168478:DEL:AGGREGATED:comm,CLTCL1,22,19168421,19168478,1.0,1.0,1.0,1.0,,...,,1.0,1.0,2.0,1.0,2.0,,1.0,1.0,1.0
2,chr22:19260392-19260579:DEL:AGGREGATED:comm,CLTCL1,22,19260392,19260579,,0.0,0.0,0.0,0.0,...,0.0,0.0,,0.0,0.0,0.0,,0.0,0.0,0.0
3,chr22:19924994-19925883:DEL:AGGREGATED:comm,TXNRD2,22,19924994,19925883,0.0,0.0,0.0,0.0,,...,0.0,0.0,,0.0,0.0,0.0,0.0,,,0.0
4,chr22:19959799-19959866:DEL:AGGREGATED:comm,ARVCF,22,19959799,19959866,1.0,1.0,2.0,2.0,0.0,...,2.0,2.0,,2.0,1.0,2.0,2.0,1.0,2.0,0.0


In [10]:
new_subset.shape

(4636, 11717)

In [18]:
sample = del_comm_cnv.iloc[:,[*range(4),4]]
cnv_ref_gene = sample.merge(ref_gene, how='left', on='CHR')
match_res = cnv_ref_gene[(cnv_ref_gene['CM'] >= cnv_ref_gene['start']) & (cnv_ref_gene['CM'] <= cnv_ref_gene['end'])
                        |(cnv_ref_gene['POS'] >= cnv_ref_gene['start']) & (cnv_ref_gene['POS'] <= cnv_ref_gene['end'])
                        |(cnv_ref_gene['CM'] <= cnv_ref_gene['start']) & (cnv_ref_gene['POS'] >= cnv_ref_gene['end'])
                        ]
match_res = match_res.drop_duplicates(subset='gene', keep='first')
match_res.shape

(3180, 8)

In [25]:
match_res['ID']

KeyError: 'ID'

In [34]:
X = pd.DataFrame(columns=ref_gene['gene'], index=del_comm_cnv.columns[4:])
IDs = list((del_comm_cnv.columns[4:])[:5])
for i,ID in enumerate(IDs):
    sample = del_comm_cnv.iloc[:,[*range(4),(i+4)]]
    cnv_ref_gene = sample.merge(ref_gene, how='left', on='CHR')
    match_res = cnv_ref_gene[(cnv_ref_gene['CM'] >= cnv_ref_gene['start']) & (cnv_ref_gene['CM'] <= cnv_ref_gene['end'])
                            |(cnv_ref_gene['POS'] >= cnv_ref_gene['start']) & (cnv_ref_gene['POS'] <= cnv_ref_gene['end'])
                            |(cnv_ref_gene['CM'] <= cnv_ref_gene['start']) & (cnv_ref_gene['POS'] >= cnv_ref_gene['end'])
                            ]
    match_res = match_res.drop_duplicates(subset='gene', keep='first')
    new_values # Must be the whole row!
    X.loc[i,match_res['gene']] = dict(zip(match_res['gene'], match_res[ID]))
X.head()
    

gene,LOC102725121@1,DDX11L1,WASH7P,"MIR6859-1@1,MIR6859-2@1,MIR6859-3@1,MIR6859-4@1","MIR1302-10@1,MIR1302-11@1,MIR1302-2@1,MIR1302-9@1","FAM138A@1,FAM138C@1,FAM138F@1",OR4F5,LOC729737,"LOC100132062@1,LOC100132287@1","OR4F16@1,OR4F29@1,OR4F3@1",...,"MIR1184-1@3,MIR1184-2@3,MIR1184-3@3","H2AB2@3,H2AB3@3",H2AB1@3,"LOC101927830@2,TMLHE-AS1@2",TMLHE,SPRY3,VAMP7,IL9R,WASIR1,DDX11L16
0:A-ACT-AC000014-BL-NCR-15AD78694,,,,,,,,,,,...,,,,,,,,,,
0:A-ACT-AC000034-BL-NCR-16AD84906,,,,,,,,,,,...,,,,,,,,,,
0:A-ACT-AC000057-BL-NCR-15AD78356,,,,,,,,,,,...,,,,,,,,,,
0:A-ACT-AC000072-BL-NCR-15AD77250,,,,,,,,,,,...,,,,,,,,,,
0:A-ACT-AC000088-BL-NCR-15AD76548,,,,,,,,,,,...,,,,,,,,,,


In [35]:
ID

'0:A-ACT-AC000088-BL-NCR-15AD76548'

In [37]:
X.loc[ID,['DDX11L','WASH7P']] = {'DDX11L':0,'WASH7P':1}
X.head()

gene,LOC102725121@1,DDX11L1,WASH7P,"MIR6859-1@1,MIR6859-2@1,MIR6859-3@1,MIR6859-4@1","MIR1302-10@1,MIR1302-11@1,MIR1302-2@1,MIR1302-9@1","FAM138A@1,FAM138C@1,FAM138F@1",OR4F5,LOC729737,"LOC100132062@1,LOC100132287@1","OR4F16@1,OR4F29@1,OR4F3@1",...,"H2AB2@3,H2AB3@3",H2AB1@3,"LOC101927830@2,TMLHE-AS1@2",TMLHE,SPRY3,VAMP7,IL9R,WASIR1,DDX11L16,DDX11L
0:A-ACT-AC000014-BL-NCR-15AD78694,,,,,,,,,,,...,,,,,,,,,,
0:A-ACT-AC000034-BL-NCR-16AD84906,,,,,,,,,,,...,,,,,,,,,,
0:A-ACT-AC000057-BL-NCR-15AD78356,,,,,,,,,,,...,,,,,,,,,,
0:A-ACT-AC000072-BL-NCR-15AD77250,,,,,,,,,,,...,,,,,,,,,,
0:A-ACT-AC000088-BL-NCR-15AD76548,,,1.0,,,,,,,,...,,,,,,,,,,0.0


In [18]:
match_res.head()

Unnamed: 0,CHR,SNP,CM,POS,0:A-ACT-AC000088-BL-NCR-15AD76548,start,end,gene
15721,22,chr22:19076248-19076387:DEL:AGGREGATED:comm,19076248,19076387,0.0,19023794,19109967,DGCR2
17034,22,chr22:19168421-19168478:DEL:AGGREGATED:comm,19168421,19168478,,19166985,19279242,CLTCL1
19660,22,chr22:19924994-19925883:DEL:AGGREGATED:comm,19924994,19925883,,19863040,19929341,TXNRD2
20316,22,chr22:19959799-19959866:DEL:AGGREGATED:comm,19959799,19959866,0.0,19957418,20004346,ARVCF
20979,22,chr22:20121816-20121912:DEL:AGGREGATED:comm,20121816,20121912,0.0,20119326,20135530,ZDHHC8


In [2]:
cwd = os.path.expanduser("~/cnv-project/adsp_real_data")
    
print("Start loading data...")
# Load raw common deletion CNV data and do preprocessing
del_comm_cnv = pd.read_csv(f"{cwd}/del_comm_adsp_hq.gz", compression="gzip", sep="\t")
del_comm_cnv = del_comm_cnv.drop(columns=["COUNTED","ALT"])
del_comm_cnv.iloc[:,0] = del_comm_cnv.iloc[:,0].str.replace('chr','')
cnv = del_comm_cnv.iloc[:,:4]
indvs = del_comm_cnv.drop(columns=["CHR","CM","POS"])
del del_comm_cnv 
print("Data loaded!")

# QC 
print("Start QC...")
n_cnv,n_cols = indvs.shape

percent_na_indv = [indvs.iloc[:,i].isna().sum() / n_cnv for i in range(1,n_cols)]
cutoff_indv = 0.2
indv_to_keep = [index for index, element in enumerate(percent_na_indv) if element <= cutoff_indv]
indvs = indvs.iloc[:,indv_to_keep]

n_indvs = indvs.shape[1] - 1
percent_na_cnv = [indvs.iloc[i,:].isna().sum() / (n_indvs) for i in range(n_cnv)] 
cutoff_cnv = 0.4
cnv_to_keep = [index for index, element in enumerate(percent_na_cnv) if element <= cutoff_cnv]
indvs = indvs.iloc[cnv_to_keep,:]
print("QC completed!")

# Load reference genes
print("Start matching genes and CNVs...")
ref_gene = pd.read_table(f"{cwd}/refGene.clean.gz", compression = "gzip", sep = "\t", header = 0)
cnv_ref_gene = cnv.merge(ref_gene, how='left', on='CHR')
res = cnv_ref_gene[(cnv_ref_gene['CM'] >= cnv_ref_gene['start']) & (cnv_ref_gene['CM'] <= cnv_ref_gene['end'])
                   |(cnv_ref_gene['POS'] >= cnv_ref_gene['start']) & (cnv_ref_gene['POS'] <= cnv_ref_gene['end'])
                   |(cnv_ref_gene['CM'] <= cnv_ref_gene['start']) & (cnv_ref_gene['POS'] >= cnv_ref_gene['end'])
                  ]
print("Matching completed!")

Start loading data...
Data loaded!
Start QC...
QC completed!
Start matching genes and CNVs...
Matching completed!
