In [None]:
# default_exp chplink

# Linkage analysis of CHP markers

This pipeline uses paramlink2 to run linkage analysis. The R code is bridged to Python through rpy2. It runs the linkage analysis batch by batch, taking the intermediate results of seqlink as its input.
Next, I will turn it into an SoS pipeline that runs all the chromosomes in parallel.

In [None]:
import pandas as pd
import numpy as np
import pickle
from SEQLinkage.linkage import *

In [None]:
import time
from concurrent.futures import ProcessPoolExecutor

## Functions to deal with haplotypes

In [None]:
def generate_marker(alleles):
    '''Collapse an array of allele codes (0, 1 or 2) into a single marker code.

    Returns 0 when every entry is 0, 2 when at least one entry equals 2,
    and 1 in every remaining case.
    '''
    nothing_typed = np.all(alleles == 0)
    if nothing_typed:
        return 0
    carries_two = np.any(alleles == 2)
    return 2 if carries_two else 1

In [None]:
def get_fam_marker(fam,hap,vcf=None,cutoff=0.05, recomb=False,halftyped=False):
    '''Collapse the rare variants of one family into a single two-allele marker.

    Parameters
    ----------
    fam : DataFrame
        Pedigree rows of the family; concatenated column-wise (aligned on the
        index, which is expected to hold the individual IDs) in front of the
        two marker columns.
    hap : sequence
        ``[varnames, freqs, haplotypes]``. ``freqs`` holds one frequency per
        variant. ``haplotypes`` is a 2-D array with two consecutive rows per
        individual; column 1 is the individual ID and columns 2+ are the
        per-variant allele strings.
    vcf : mapping or None
        Individual ID -> truthy flag. When given, variants are only collected
        for individuals whose flag is truthy; when None, for everybody.
    cutoff : float
        Variants with frequency strictly below ``cutoff`` are treated as rare.
    recomb : bool
        When False, the family is rejected (None is returned) as soon as an
        allele string of a first haplotype row does not end in ':' or '|'.
    halftyped : bool
        When False, an individual whose two marker alleles contain a 0 gets
        ``[0, 0]``; when True the pair is kept as computed.

    Returns
    -------
    None when no variant is rare or the family is rejected, otherwise
    ``(marker_frame, mfreq)`` where ``mfreq = 1 - prod(1 - freq)`` over the
    rare variants.
    '''
    # Keep the frequencies under their own name: the original rebound `hap`
    # to the allele matrix and then read hap[1] as if it still held freqs.
    freqs = np.asarray(hap[1], dtype=float)
    rare_mask = freqs < cutoff
    if not np.any(rare_mask): return None
    iid,alleles = hap[2][:,1], hap[2][:,2:]
    new_hap,new_iid = [],[]
    for i in range(0,alleles.shape[0],2):
        cur_iid=iid[i]
        new_iid.append(cur_iid)
        hap_a0,hap_a1 = [0],[0] # start at 0 so the marker is 0 when nothing is collected
        for a0,a1,rare in zip(alleles[i],alleles[i+1],rare_mask):
            if not recomb and a0[-1] not in [':','|']:
                return None
            # Parenthesised on purpose: `and` binds tighter than `or`, so the
            # unparenthesised form subscripted vcf even when it was None.
            if rare and (vcf is None or vcf[cur_iid]):
                hap_a0.append(get_allele(a0))
                hap_a1.append(get_allele(a1))
        # generate marker
        m0,m1=generate_marker(np.array(hap_a0)),generate_marker(np.array(hap_a1))
        # Unless half-typed genotypes are allowed, a 0 on either allele blanks both.
        if not halftyped and (m0==0 or m1==0):
            new_hap.append([0,0])
        else:
            new_hap.append([m0,m1])
    new_hap = pd.DataFrame(new_hap)
    new_hap.index = new_iid
    new_hap = pd.concat([fam,new_hap],axis=1)
    # marker freq: 1 - product over rare variants of (1 - freq)
    mfreq=1-np.prod(1-freqs[rare_mask])
    return new_hap,mfreq

In [None]:
def parallel_fam_marker(fams,haps,vcfs=None,max_workers=10):
    '''Run get_fam_marker for every family of one gene in a process pool.

    Parameters
    ----------
    fams : mapping
        Family ID -> pedigree DataFrame; must contain every key of ``haps``.
    haps : mapping
        Family ID -> ``[varnames, freqs, haplotypes]`` as expected by
        get_fam_marker.
    vcfs : mapping or None
        Family ID -> per-individual lookup forwarded as ``vcf``. When None
        (or empty) every family is processed with ``vcf=None``.
    max_workers : int
        Size of the process pool.

    Returns
    -------
    (markers, afreqs) : two dicts keyed by family ID holding, respectively,
    the first and second element of get_fam_marker's result. Families for
    which get_fam_marker returned None are left out of both.
    '''
    fam_ids = list(haps.keys())
    with ProcessPoolExecutor(max_workers = max_workers) as executor:
        # Materialise inside the `with` so worker errors surface here.
        results = list(executor.map(
            get_fam_marker,
            [fams[f] for f in fam_ids],
            [haps[f] for f in fam_ids],
            [vcfs[f] if vcfs else None for f in fam_ids],
        ))
    markers,afreqs={},{}
    for f,g in zip(fam_ids,results):
        if g is not None:
            markers[f] = g[0]
            afreqs[f]= g[1]
    return markers,afreqs

In [None]:
def write_to_runPM(genes,fams,genemap,output,transfer=True,vcfs=None):
    '''Write one CHP marker per gene to pedigree / map files.

    Parameters
    ----------
    genes : mapping
        Gene name -> dict whose ``'predata'`` entry maps family ID to
        ``[varnames, freqs, haplotypes]``.
    fams : DataFrame
        Pedigree table with ``fid`` and ``iid`` columns, indexed by
        individual ID (rows are selected with ``fams.loc[iid]``).
    genemap : DataFrame
        Gene map; column 3 holds the gene name, columns 0 and 1 are written
        next to it. NOTE: modified in place: it is re-indexed by column 3
        and gets a ``'zero'`` column. Later cells rely on that.
    output : str
        Path prefix of the files to write.
    transfer : bool
        True writes ``output + '.tfam'`` / ``'.tped'``. False writes
        ``.ped`` / ``.map`` plus copies sorted by (fid, iid) ascending
        (``as``) and descending (``de``).
    vcfs : mapping or None
        Forwarded to parallel_fam_marker.
    '''
    genemap.index=list(genemap[3])
    genemap['zero']=0
    fams_d={i:fams[fams.fid==i] for i in fams.fid.unique()}
    gene_names,gene_frames=[],[]
    for g in genes.keys():
        # parallel_fam_marker returns (markers, afreqs); only the marker
        # frames are written out. All three arguments are passed explicitly.
        markers = parallel_fam_marker(fams_d,genes[g]['predata'],vcfs)[0]
        if not markers:
            continue  # no family produced a marker for this gene
        gene_names.append(g)
        # columns 6+ are the two marker alleles appended by get_fam_marker
        gene_frames.append(pd.concat(list(markers.values())).iloc[:,6:])
    if not gene_frames:
        raise ValueError('no gene produced a CHP marker; nothing to write')
    tmp1=pd.concat(gene_frames,axis=1)
    tmp1=tmp1.fillna(0)  # individuals absent for a gene are untyped
    tmp1=tmp1.astype(int)
    iid = list(tmp1.index)
    tmp1.index=range(tmp1.shape[0])
    if transfer:
        tmp_a0=tmp1.iloc[:,0::2].copy()
        tmp_a0.columns=gene_names
        tmp_a1=tmp1.iloc[:,1::2].copy()
        tmp_a1.columns=gene_names
        # Stable sort keeps each individual's first allele ahead of the second.
        tped=pd.concat([tmp_a0,tmp_a1],axis=0).sort_index(kind='mergesort')
        tfam=fams.loc[iid,:]
        tped=pd.concat([genemap.loc[tped.columns,[0,3,'zero',1]],tped.transpose()],axis=1)
        tfam.to_csv(output+'.tfam',header=False,index=False,sep='\t')
        tped.to_csv(output+'.tped',header=False,index=False,sep='\t')
    else:
        # tmp1 now has a positional index, so select pedigree rows by the
        # saved individual IDs and realign them positionally.
        ped=pd.concat([fams.loc[iid,:].reset_index(drop=True),tmp1],axis=1)
        mapp=genemap.loc[gene_names,[0,1,3]]
        mapp.columns=['Chromosome','Haldane','Name']
        ped.to_csv(output+'.ped',header=False,index=False,sep='\t')
        mapp.to_csv(output+'.map',header=True,index=False,sep='\t')
        ped.sort_values(['fid','iid'],ascending=True).to_csv(output+'as.ped',header=False,index=False,sep='\t')
        mapp.to_csv(output+'as.map',header=True,index=False,sep='\t')
        ped.sort_values(['fid','iid'],ascending=False).to_csv(output+'de.ped',header=False,index=False,sep='\t')
        mapp.to_csv(output+'de.map',header=True,index=False,sep='\t')

In [None]:
genemap=pd.read_csv('../data/genemap.hg38.txt',sep='\t',header=None)

## All genes from haps to peds

### Read fam

In [None]:
# Load the pedigree table and index it by individual ID.
# sep=r'\s+' is the documented equivalent of the deprecated delim_whitespace=True.
fam17 = pd.read_csv('../data/new_trim_ped_famless17_no:xx.fam',sep=r'\s+',header=None,names=['fid','iid','fathid','mothid','sex','ad'])
fam17.index = list(fam17.iid)
# Recode ad == -9 as 0 with a single .loc assignment (no chained indexing).
fam17.loc[fam17.ad==-9,'ad']=0
# One pedigree frame per family ID.
fam17_d = {i: fam17[fam17.fid==i] for i in fam17.fid.unique()}

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fam17.ad[fam17.ad==-9]=0


In [None]:
with open('../data/wg20220316/fam17_vcf.pickle', 'rb') as handle:
    fam17_vcf = pickle.load(handle)

## Read haplotypes

In [None]:
with open('../data/wg20220311/chr19test/CACHE/chr19test43.pickle', 'rb') as handle:
    genes = pickle.load(handle)

In [None]:
genes.keys()

dict_keys(['LOC100379224', 'ZNF225', 'ZNF234', 'ZNF226', 'ZNF227', 'ZNF233', 'ZNF235', 'ZNF112', 'ZNF285', 'ZNF229', 'ZNF180', 'CEACAM20', 'CEACAM22P', 'IGSF23', 'PVR', 'MIR4531', 'CEACAM19', 'CEACAM16', 'BCL3', 'CBLC', 'BCAM', 'NECTIN2', 'TOMM40', 'APOE', 'APOC1'])

In [None]:
# Per-gene driver: build the per-family CHP markers of every gene in the
# loaded batch and hand them to parallel_lods.
fams=fam17_d
vcfs=fam17_vcf
rho=np.arange(0,0.5,0.05)  # grid of rho values: 0.00, 0.05, ..., 0.45
smy_res = {}  # gene name -> whatever parallel_lods returns for that gene
for g in genes.keys():
    haps = genes[g]['predata']
    # parallel_fam_marker (defined above) returns a (markers, afreqs) tuple.
    gene_haps = parallel_fam_marker(fams,haps,vcfs)
    # NOTE(review): the whole tuple is passed here, while other cells of this
    # notebook pass only the per-family marker frames; confirm which form
    # parallel_lods expects.
    res = parallel_lods(gene_haps,rho)
    smy_res[g] = res

0.6206652149558067


In [None]:
gene_haps = parallel_fam_marker(fams,genes['APOE']['predata'],vcfs)

ERROR! Session/line number was not unique in database. History logging moved to new session 637


In [None]:
1

1

In [None]:
gene_haps.keys()

dict_keys(['1036', '10J_103', '10J_109', '10J_121', '10R_R38', '1154', '1177', '11_19', '1227', '1252', '1359', '1403', '1462', '150', '15_10052', '15_1121', '15_12091', '15_6051', '1713', '175', '1755', '1769', '17_13', '17_15', '17_17', '17_25', '1858', '19_L0010', '2', '20', '205', '2129', '2193', '22_1', '242', '25_37', '25_67', '25_73', '25_76', '263', '26_BES', '26_HIC', '26_KS', '26_SVF', '26_TRL', '26_WKU', '278', '27_107', '27_134', '27_239', '280', '28_4', '2_20', '3000', '304', '3087', '319', '3324', '336', '342', '348', '3541', '357', '3593', '365', '3762', '3768', '3798', '3811', '392', '411', '415', '4184', '424', '434', '476', '492', '4_146', '4_220', '4_364', '4_3742', '4_3758', '4_416', '4_429', '4_472', '4_512', '4_633', '4_92', '504', '506', '569', '577', '587', '5_26057', '5_26229', '5_26441', '615', '622', '665', '673', '686', '692', '6_1541', '753', '754', '756', '767', '780', '7_125', '800', '838', '850', '860', '863', '880', '8_62815', '8_62826', '8_64010', '8_6

In [None]:
smy_res['APOE']

KeyError: 'APOE'

In [None]:
[(j,i.sum()) for j,i in smy_res.items()]

[('LOC100379224', -34.94172634586681),
 ('ZNF225', -19.629969617998043),
 ('ZNF234', -22.79508583565604),
 ('ZNF226', -28.68891473779001),
 ('ZNF227', -18.13970890339937),
 ('ZNF233', -25.35481927315128),
 ('ZNF235', -22.646690015903488),
 ('ZNF112', -9.920684970379622),
 ('ZNF285', -14.584191566810034),
 ('ZNF229', -23.248170669670692),
 ('ZNF180', -8.606145345045856),
 ('CEACAM20', -9.35360734444087),
 ('CEACAM22P', -12.326900012001111),
 ('IGSF23', -10.378036089619906),
 ('PVR', -19.229646699116365),
 ('MIR4531', -0.35268455158429735),
 ('CEACAM19', -17.52338485092126),
 ('CEACAM16', -17.340024873202154),
 ('BCL3', -24.02983332448942),
 ('CBLC', -25.306183406373076),
 ('BCAM', -24.96431654693471),
 ('NECTIN2', -4.568870338884541),
 ('TOMM40', -21.2516701826396),
 ('APOE', -15.626263634012151),
 ('APOC1', -30.738444958869632)]

In [None]:
def run_marker_lod(file,fams,rho=0,vcfs=None):
    '''Compute per-family LOD scores of the CHP marker of every gene in one batch.

    Parameters
    ----------
    file : str
        Path prefix; ``file + '.pickle'`` is read and the result is pickled
        to ``file + 'CHPmarker_rho' + '{:.2f}'.format(rho) + '.result'``.
    fams : mapping
        Family ID -> pedigree DataFrame, forwarded to parallel_fam_marker.
    rho : float
        Forwarded to parallel_lods and used in the output file name.
    vcfs : mapping or None
        Forwarded to parallel_fam_marker.

    The pickled result maps gene name -> Series of ``LOD[0]`` values indexed
    by family ID.
    '''
    # NOTE: pickle.load can execute arbitrary code; only open trusted cache files.
    with open(file+'.pickle', 'rb') as handle:
        genes = pickle.load(handle)
    smy_res = {}
    for g in genes.keys():
        haps = genes[g]['predata']
        # parallel_fam_marker returns (markers, afreqs); the per-family marker
        # frames are what gets scored. All three arguments are passed
        # explicitly so the call does not depend on a default.
        markers, _afreqs = parallel_fam_marker(fams,haps,vcfs)
        # NOTE(review): assumes parallel_lods takes an iterable of per-family
        # frames plus rho and yields objects with a LOD column; confirm.
        res = parallel_lods(markers.values(),rho)
        smy_res[g] = pd.Series([i.LOD[0] for i in res],index=list(markers.keys()))
    with open(file+'CHPmarker'+'_rho'+"{:.2f}".format(rho)+'.result','wb') as handle:
        pickle.dump(smy_res, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
def run_marker_lods(file,fams,rho=np.arange(0,0.5,0.05),vcfs=None):
    '''Compute per-family LOD scores of the CHP marker of every gene in one batch.

    Parameters
    ----------
    file : str
        Path prefix; ``file + '.pickle'`` is read and the result is pickled
        to ``file + 'CHPmarker_rho<tag>.result'``. For a scalar ``rho`` the
        tag is ``'{:.2f}'.format(rho)``; for an array it is ``'<min>-<max>'``
        with the same two-decimal formatting.
    fams : mapping
        Family ID -> pedigree DataFrame, forwarded to parallel_fam_marker.
    rho : float or array-like
        Forwarded unchanged to parallel_lods.
    vcfs : mapping or None
        Forwarded to parallel_fam_marker.

    The pickled result maps gene name -> Series of ``LOD[0]`` values indexed
    by family ID.
    '''
    # An ndarray cannot be formatted with '{:.2f}', which made the default
    # rho crash when the output file name was built.
    if np.ndim(rho) == 0:
        rho_tag = "{:.2f}".format(float(rho))
    else:
        rho_values = np.asarray(rho, dtype=float)
        rho_tag = "{:.2f}-{:.2f}".format(rho_values.min(), rho_values.max())
    # NOTE: pickle.load can execute arbitrary code; only open trusted cache files.
    with open(file+'.pickle', 'rb') as handle:
        genes = pickle.load(handle)
    smy_res = {}
    for g in genes.keys():
        haps = genes[g]['predata']
        # parallel_fam_marker returns (markers, afreqs); the per-family marker
        # frames are what gets scored. All three arguments are passed
        # explicitly so the call does not depend on a default.
        markers, _afreqs = parallel_fam_marker(fams,haps,vcfs)
        # NOTE(review): assumes parallel_lods takes an iterable of per-family
        # frames plus rho; only LOD[0] of each result is kept, which may drop
        # information when rho holds several values. Confirm.
        res = parallel_lods(markers.values(),rho)
        smy_res[g] = pd.Series([i.LOD[0] for i in res],index=list(markers.keys()))
    with open(file+'CHPmarker'+'_rho'+rho_tag+'.result','wb') as handle:
        pickle.dump(smy_res, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
import glob

In [None]:
for i in glob.glob('../data/wg20220311/chr19test/CACHE/chr19test*.pickle'):
    run_marker_lods(i[:-7],fam17_d,rho=0.05)

0.5227391980588436
0.6587398871779442
0.6805614978075027
0.533792469650507
0.52691999822855
0.5056603439152241
0.6595170497894287
0.6337083019316196
0.98335250467062
0.6505585387349129
0.6123692840337753
0.6083393543958664
0.6113101467490196
0.6450104489922523
0.5765268504619598
0.6255120374262333
0.6523417122662067
0.6497547142207623
0.682370837777853
0.7410316728055477
0.5888953655958176
0.6145995892584324
0.6160224191844463
0.18414145335555077
0.5934646315872669
0.3559744395315647
0.618761420249939
0.5939791388809681
0.6557091996073723
0.595153171569109
0.6033066436648369
0.5765733681619167
0.5839221850037575
0.6416466757655144
0.661655131727457
0.8579119443893433
0.5994641147553921
0.6366937272250652
0.5957487337291241
0.5035934150218964
0.48030882328748703
0.5878654159605503
0.5956287123262882
0.6136039644479752
0.613315999507904
0.6410246975719929
0.5918047241866589
0.6063654944300652
0.9415804669260979
0.6767695210874081
1.5063829272985458
0.7392542250454426
1.6487146727740765
1

In [None]:
run_marker_lods('../data/wg20220311/chr19test/CACHE/chr19test44',fam17_d)

0.9677885994315147


In [None]:
with open('../data/wg20220311/chr19test/CACHE/chr19test44CHPmarker_rho0.05.result', 'rb') as handle:
    smy_res = pickle.load(handle)


In [None]:
[(j,i.sum()) for j,i in smy_res.items()]

[('APOC1P1', -13.890457625476389),
 ('APOC4', -12.353239489336831),
 ('APOC4-APOC2', -10.498619033165857),
 ('APOC2', -9.42570679590548),
 ('CLPTM1', -3.476186320447371),
 ('RELB', -2.4385740141677186),
 ('CLASRP', -5.93241794202331),
 ('ZNF296', -15.05026090076652),
 ('GEMIN7', -7.047033080139748),
 ('GEMIN7-AS1', -7.434652576714965),
 ('PPP1R37', -4.911433814887288),
 ('NKPD1', -6.21733641659929),
 ('TRAPPC6A', -7.808250853642941),
 ('BLOC1S3', -11.863940319445005),
 ('EXOC3L2', -7.55635042539838),
 ('MARK4', -1.6715446524766926),
 ('CKM', -4.714058494239083),
 ('KLC3', -9.584005051432024),
 ('ERCC2', -4.177771573203412),
 ('PPP1R13L', -8.252631749636558),
 ('CD3EAP', -11.77208933098256),
 ('ERCC1', -0.539400390022235),
 ('MIR6088', -0.18619753200139222),
 ('FOSB', -11.890591679110718),
 ('RTN2', -5.671116099885959)]

In [None]:
sum(smy_res['APOC1P1'])

-13.890457625476397

In [None]:
fam17

Unnamed: 0,fid,iid,fathid,mothid,sex,ad
1007_39,1007,1007_39,1007_40,1007_4,1,1
1007_99,1007,1007_99,1007_1,1007_2,2,2
1007_3,1007,1007_3,1007_1,1007_2,1,2
1007_5,1007,1007_5,1007_1,1007_2,2,2
1007_40,1007,1007_40,0,0,1,0
...,...,...,...,...,...,...
989_10,989,989_10,989_1,989_2,1,2
990_99,990,990_99,990_1,990_2,2,2
990_2,990,990_2,0,0,2,0
990_12,990,990_12,990_1,990_2,2,2


In [None]:
fam17.sort_index(ascending=False)

Unnamed: 0,fid,iid,fathid,mothid,sex,ad
990_99,990,990_99,990_1,990_2,2,2
990_2,990,990_2,0,0,2,0
990_12,990,990_12,990_1,990_2,2,2
990_1,990,990_1,0,0,1,0
989_99,989,989_99,989_1,989_2,2,2
...,...,...,...,...,...,...
1007_4,1007,1007_4,1007_1,1007_2,2,0
1007_39,1007,1007_39,1007_40,1007_4,1,1
1007_3,1007,1007_3,1007_1,1007_2,1,2
1007_2,1007,1007_2,0,0,2,0


In [None]:
sum(fam17.sort_values(['fid','iid'],ascending=False).index==fam17.sort_index(ascending=False).index)

1964

In [None]:
fam17[fam17.fid=='4']

Unnamed: 0,fid,iid,fathid,mothid,sex,ad
4_8,4,4_8,4_17,4_18,2,2
4_1,4,4_1,0,0,1,0
4_17,4,4_17,0,0,1,0
4_18,4,4_18,0,0,2,0
4_21,4,4_21,0,0,1,0
4_2,4,4_2,4_17,4_18,2,2
4_7,4,4_7,4_17,4_18,2,2
4_11,4,4_11,4_21,4_8,2,2
4_15,4,4_15,4_21,4_8,1,2
4_16,4,4_16,4_21,4_8,1,2


In [None]:
fam17[fam17.fid=='4'].sort_index(ascending=True)

Unnamed: 0,fid,iid,fathid,mothid,sex,ad
4_1,4,4_1,0,0,1,0
4_11,4,4_11,4_21,4_8,2,2
4_14,4,4_14,4_21,4_8,2,2
4_15,4,4_15,4_21,4_8,1,2
4_16,4,4_16,4_21,4_8,1,2
4_17,4,4_17,0,0,1,0
4_18,4,4_18,0,0,2,0
4_2,4,4_2,4_17,4_18,2,2
4_21,4,4_21,0,0,1,0
4_3,4,4_3,4_1,4_2,2,2


In [None]:
np.savetxt('tmp.txt',np.array(fam17.astype(str)),fmt='%s %s %s %s %s %s')

In [None]:
write_to_runPM(genes,fam17,genemap,'../../pseudomarker-2.0-linux/testchp',transfer=True)

In [None]:
genemap.loc[genes.keys(),[0,3,'zero',1]].to_csv('../../pseudomarker-2.0-linux/apoechp_noheader.map',header=False,index=False,sep='\t')

In [None]:
plink --map apoechp_noheader.map --ped apoechp.ped --recode transpose

In [None]:
plink=pd.read_csv('../../pseudomarker-2.0-linux/plink.tfam',header=None,sep=' ')

In [None]:
def write_haps_linkage(haps,output):
    '''Placeholder: the body of this function was never written.

    A bare ``def`` line is a syntax error, so the stub raises explicitly
    until the export of ``haps`` to ``output`` is implemented.
    '''
    raise NotImplementedError('write_haps_linkage is not implemented yet')

In [None]:
from functools import reduce
apoe_haps_all = reduce(lambda x, y: pd.merge(x, y, how = 'outer',left_index=True,right_index=True), [h.T for h in new_haps])

In [None]:
apoe_haps_all = apoe_haps_all.T

In [None]:
apoe_haps_all=apoe_haps_all.fillna(0)

In [None]:
apoe_haps_all.columns[6:][::2]

Index(['chr19:44105413:C:G_A0', 'chr19:44105454:A:C_A0',
       'chr19:44105534:T:TTTTTA_A0', 'chr19:44105571:T:A_A0',
       'chr19:44105621:G:A_A0', 'chr19:44105679:C:T_A0',
       'chr19:44105693:C:G_A0', 'chr19:44105746:C:G_A0',
       'chr19:44105751:A:G_A0', 'chr19:44105765:G:A_A0',
       ...
       'chr19:44918669:G:T_A0', 'chr19:44918692:G:C_A0',
       'chr19:44918715:AG:A_A0', 'chr19:44918722:C:A_A0',
       'chr19:44918903:C:G_A0', 'chr19:44919071:G:GATTC_A0',
       'chr19:44919189:A:T_A0', 'chr19:44919285:G:A_A0',
       'chr19:44919304:T:G_A0', 'chr19:44919330:A:G_A0'],
      dtype='object', length=10013)

In [None]:
apoe_haps_all=apoe_haps_all[['fid', 'iid', 'fathid', 'mothid', 'sex']+list(apoe_haps_all.columns)[:-5]]

In [None]:
var_lst = []
for var in apoe_haps_all.columns[6:][::2]:
    snp = var[:-3]
    var_lst.append(snp.split(':')[:2]+[snp])
variants=pd.DataFrame(var_lst,columns=['Chromosome','Haldane','Name'])
variants.Haldane = variants.Haldane.astype(int)
variants.sort_values('Haldane')

Unnamed: 0,Chromosome,Haldane,Name
0,chr19,44105413,chr19:44105413:C:G
1,chr19,44105454,chr19:44105454:A:C
2,chr19,44105534,chr19:44105534:T:TTTTTA
3,chr19,44105571,chr19:44105571:T:A
4,chr19,44105621,chr19:44105621:G:A
...,...,...,...
10008,chr19,44919071,chr19:44919071:G:GATTC
10009,chr19,44919189,chr19:44919189:A:T
10010,chr19,44919285,chr19:44919285:G:A
10011,chr19,44919304,chr19:44919304:T:G


In [None]:
apoe_haps_all.to_csv('../../pseudomarker-2.0-linux/pseudomarker-sampledata/test.ped',header=False, index=False,sep='\t')

In [None]:
variants.to_csv('../../pseudomarker-2.0-linux/pseudomarker-sampledata/test.map',header=True,index=False,sep='\t')

In [None]:
apoe_haps_all.iloc[:,:26].to_csv('../../pseudomarker-2.0-linux/pseudomarker-sampledata/test_f10.ped',header=False, index=False,sep='\t')

In [None]:
variants[:10].to_csv('../../pseudomarker-2.0-linux/pseudomarker-sampledata/test_f10.map',header=True,index=False,sep='\t')

In [None]:
variants = pd.read_csv('../../pseudomarker-2.0-linux/pseudomarker-sampledata/test_f10.map',header=0,sep='\t')

In [None]:
variants.Chromosome = 19

In [None]:
variants['Zero'] = 0

In [None]:
variants[['Chromosome','Name','Zero','Haldane']].to_csv('../../pseudomarker-2.0-linux/pseudomarker-sampledata/test_f10_nohead.map',header=False,index=False,sep='\t')

In [None]:
(gene_fam_haps['1007']['chr19:44105534:T:TTTTTA_A1']==0) | (gene_fam_haps['1007']['chr19:44105534:T:TTTTTA_A0']==0)

1007_39    False
1007_99    False
1007_3     False
1007_5     False
1007_40     True
1007_6     False
1007_1     False
1007_2     False
1007_4     False
dtype: bool

In [None]:
gene_fam_haps['1007']

Unnamed: 0,fid,iid,fathid,mothid,sex,ad,chr19:44105534:T:TTTTTA_A0,chr19:44105534:T:TTTTTA_A1,chr19:44105571:T:A_A0,chr19:44105571:T:A_A1,...,chr19:44918393:G:A_A0,chr19:44918393:G:A_A1,chr19:44918487:G:T_A0,chr19:44918487:G:T_A1,chr19:44918620:A:G_A0,chr19:44918620:A:G_A1,chr19:44918715:AG:A_A0,chr19:44918715:AG:A_A1,chr19:44918903:C:G_A0,chr19:44918903:C:G_A1
1007_39,1007,1007_39,1007_40,1007_4,1,1,1,1,1,1,...,1,1,1,2,2,1,1,2,1,1
1007_99,1007,1007_99,1007_1,1007_2,2,2,2,2,2,2,...,2,1,1,2,1,1,1,2,2,1
1007_3,1007,1007_3,1007_1,1007_2,1,2,1,2,1,2,...,1,1,1,2,2,1,1,2,1,1
1007_5,1007,1007_5,1007_1,1007_2,2,2,1,2,1,2,...,1,1,1,1,2,1,1,1,1,2
1007_40,1007,1007_40,0,0,1,0,1,0,1,0,...,1,0,2,0,1,0,2,0,1,0
1007_6,1007,1007_6,1007_1,1007_2,2,2,1,2,1,2,...,1,1,1,1,2,1,1,1,1,2
1007_1,1007,1007_1,0,0,1,0,2,2,2,2,...,1,1,1,2,1,1,1,2,2,1
1007_2,1007,1007_2,0,0,2,0,1,2,1,2,...,1,2,1,1,2,1,1,1,1,2
1007_4,1007,1007_4,1007_1,1007_2,2,0,1,2,1,2,...,1,1,1,1,2,1,1,1,1,2


In [None]:
pd.options.mode.chained_assignment = None  # default='warn'

In [None]:
tmp0=gene_fam_haps['1007']['chr19:44105534:T:TTTTTA_A0']
tmp1=gene_fam_haps['1007']['chr19:44105534:T:TTTTTA_A1']

In [None]:
(gene_fam_haps['1007'].shape[1]-6)//2

1750

In [None]:
# Blank out half-typed genotypes: whenever one allele of a pair is 0, set
# both alleles of that pair to 0. Genotype pairs start at column 6.
new_haps=[]
for fam_haps in gene_fam_haps.values():
    # Work on a copy so the input frames stay untouched and the cell can be re-run.
    fam_haps = fam_haps.copy()
    for c in range(6,fam_haps.shape[1],2):
        half_typed = ((fam_haps.iloc[:,c]==0) | (fam_haps.iloc[:,c+1]==0)).to_numpy()
        # Write through the frame itself; assigning into an extracted column
        # relies on it being a view and is a no-op under copy-on-write.
        fam_haps.iloc[half_typed,[c,c+1]] = 0
    new_haps.append(fam_haps)

In [None]:
new_haps[0]

Unnamed: 0,fid,iid,fathid,mothid,sex,ad,chr19:44105534:T:TTTTTA_A0,chr19:44105534:T:TTTTTA_A1,chr19:44105571:T:A_A0,chr19:44105571:T:A_A1,...,chr19:44918393:G:A_A0,chr19:44918393:G:A_A1,chr19:44918487:G:T_A0,chr19:44918487:G:T_A1,chr19:44918620:A:G_A0,chr19:44918620:A:G_A1,chr19:44918715:AG:A_A0,chr19:44918715:AG:A_A1,chr19:44918903:C:G_A0,chr19:44918903:C:G_A1
1007_39,1007,1007_39,1007_40,1007_4,1,1,1,1,1,1,...,1,1,1,2,2,1,1,2,1,1
1007_99,1007,1007_99,1007_1,1007_2,2,2,2,2,2,2,...,2,1,1,2,1,1,1,2,2,1
1007_3,1007,1007_3,1007_1,1007_2,1,2,1,2,1,2,...,1,1,1,2,2,1,1,2,1,1
1007_5,1007,1007_5,1007_1,1007_2,2,2,1,2,1,2,...,1,1,1,1,2,1,1,1,1,2
1007_40,1007,1007_40,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1007_6,1007,1007_6,1007_1,1007_2,2,2,1,2,1,2,...,1,1,1,1,2,1,1,1,1,2
1007_1,1007,1007_1,0,0,1,0,2,2,2,2,...,1,1,1,2,1,1,1,2,2,1
1007_2,1007,1007_2,0,0,2,0,1,2,1,2,...,1,2,1,1,2,1,1,1,1,2
1007_4,1007,1007_4,1007_1,1007_2,2,0,1,2,1,2,...,1,1,1,1,2,1,1,1,1,2


pseudomarker -p test_f10.ped -m test_f10.map --dom

# Merlin to linkage

In [None]:
cmap=pd.read_csv('../data/wg20220316/chr22test/MERLIN/chr22test.chr22.map',sep='\t')

In [None]:
new_map=cmap.iloc[:,[0,2,1]]

In [None]:
new_map.columns = ['Chromosome','Haldane','Name']

In [None]:
new_map.to_csv('../data/wg20220316/chr22test/MERLIN/chr22test.chr22_new.map',header=True,index=False,sep='\t')

In [None]:
new_map

Unnamed: 0,Chromosome,Haldane,Name
0,22,0.000000,DUXAP8
1,22,0.000000,BMS1P22@3
2,22,0.000000,"BMS1P17@3,BMS1P18@3"
3,22,0.000000,PSLNR
4,22,0.000000,POTEH
...,...,...,...
584,22,78.751192,ALG12
585,22,78.788843,CRELD2
586,22,78.895162,PIM3
587,22,79.101809,IL17REL


In [None]:
cped = pd.read_csv('../data/wg20220316/chr22test/MERLIN/chr22test.chr22.ped',sep='\t',header=None)

  cped = pd.read_csv('../data/wg20220316/chr22test/MERLIN/chr22test.chr22.ped',sep='\t',header=None)


In [None]:
cped.shape

(3899, 1184)

In [None]:
# Clean every genotype pair (pairs start at column 6):
#   1. if either allele is 0, set both alleles to 0;
#   2. cap allele codes larger than 2 at 2.
# All writes go through cped.iloc so they always reach the frame; assigning
# into an extracted column triggers SettingWithCopyWarning and may not stick.
for c in range(6,cped.shape[1],2):
    pair = [c,c+1]
    missing = ((cped.iloc[:,c]==0) | (cped.iloc[:,c+1]==0)).to_numpy()
    cped.iloc[missing,pair] = 0
    for col in pair:
        above_two = (cped.iloc[:,col].astype(int)>2).to_numpy()
        cped.iloc[above_two,col] = 2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmp0[ind]=0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmp1[ind]=0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmp0[tmp0.astype(int)>2]=2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmp1[tmp1.astype(int)>2]=2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the 

In [None]:
cped[5]=cped[5].replace(-9,0)

In [None]:
cped.index = list(cped[1])

In [None]:
cped=cped.sort_index()

In [None]:
cped.to_csv('../data/wg20220316/chr22test/MERLIN/chr22test.chr22_new.ped',header=False,index=False,sep='\t')

In [None]:
cped.iloc[:,:26].to_csv('../data/wg20220316/chr22test/MERLIN/chr22test.chr22_new_f10.ped',header=False, index=False,sep='\t')

In [None]:
new_map[:10].to_csv('../data/wg20220316/chr22test/MERLIN/chr22test.chr22_new_f10.map',header=True,index=False,sep='\t')

In [None]:
cped

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1174,1175,1176,1177,1178,1179,1180,1181,1182,1183
1007_1,1007,1007_1,0,0,1,0,1,2,1,2,...,0,0,0,0,1,2,2,1,0,0
1007_2,1007,1007_2,0,0,2,0,1,2,0,0,...,2,2,2,1,0,0,0,0,1,2
1007_3,1007,1007_3,1007_1,1007_2,1,2,0,0,0,0,...,2,1,1,1,0,0,2,2,0,0
1007_39,1007,1007_39,1007_40,1007_4,1,1,2,2,2,1,...,2,2,1,2,2,2,2,2,2,2
1007_4,1007,1007_4,1007_1,1007_2,2,0,1,2,0,0,...,0,0,0,0,1,1,1,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
989_99,989,989_99,989_1,989_2,2,2,0,0,2,2,...,2,2,1,1,0,0,2,2,2,2
990_1,990,990_1,0,0,1,0,0,0,2,2,...,1,2,1,1,2,1,2,2,0,0
990_12,990,990_12,990_1,990_2,2,2,0,0,0,0,...,2,2,2,2,2,1,0,0,0,0
990_2,990,990_2,0,0,2,0,2,2,2,1,...,1,1,0,0,0,0,2,1,0,0


## Run paramlink2 on CHP markers

In [None]:
cped = pd.read_csv('../data/wg20220316/chr22test/MERLIN/chr22test.chr22.ped',sep='\t',header=None)

  cped = pd.read_csv('../data/wg20220316/chr22test/MERLIN/chr22test.chr22.ped',sep='\t',header=None)


In [None]:
cped=cped.replace('?',0)

In [None]:
cped = pd.concat([cped.iloc[:,:4].astype(str),cped.iloc[:,4:].astype(int)],axis=1)

In [None]:
cped.index = list(cped[1])

In [None]:
cped=cped.sort_index()

In [None]:
cped[5]=cped[5].replace(-9,0)

In [None]:
tmp = cped.iloc[:,6:]

In [None]:
tmp[tmp>2]=2

In [None]:
cped = pd.concat([cped.iloc[:,:6],tmp],axis=1)

In [None]:
cped_d={}
for i in cped[0].unique():
    cped_d[i]=cped[cped[0]==i]

In [None]:
calculate_ped_lod(cped_d['1137'])

Unnamed: 0,MARKER,LOD
1,6,
2,8,0.009661
3,10,
4,12,0.009661
5,14,0.283737
...,...,...
585,1174,
586,1176,0.283529
587,1178,-0.005014
588,1180,0.000000


In [None]:
cped_res = parallel_lods(cped_d.values())

50.33882123604417


In [None]:
cmap

Unnamed: 0,CHROMOSOME,MARKER,Unnamed: 2,POSITION,FEMALE_POSITION,MALE_POSITION
0,22,DUXAP8,0.000000,0.000000,0.000000,
1,22,BMS1P22@3,0.000000,0.000000,0.000000,
2,22,"BMS1P17@3,BMS1P18@3",0.000000,0.000000,0.000000,
3,22,PSLNR,0.000000,0.000000,0.000000,
4,22,POTEH,0.000000,0.000000,0.000000,
...,...,...,...,...,...,...
584,22,ALG12,78.751192,68.840233,89.808657,
585,22,CRELD2,78.788843,68.907445,89.814048,
586,22,PIM3,78.895162,69.096823,89.829375,
587,22,IL17REL,79.101809,69.467435,89.859470,


In [None]:
# Sum the LOD of every marker over all per-family results, skipping missing LODs.
variants = {}
for fam_lods in cped_res:
    for marker, lod_value in zip(fam_lods['MARKER'], fam_lods['LOD']):
        if not pd.isna(lod_value):
            try:
                variants[marker] += lod_value
            except KeyError:
                variants[marker] = lod_value
# NOTE: pd.DataFrame(variants) raises on a dict of scalars ("you must pass an
# index"); pd.Series(variants) is the way to tabulate these totals.

ValueError: If using all scalar values, you must pass an index

In [None]:
variants

In [None]:
cped_d['1007'].to_csv('../data/wg20220316/chr22test/MERLIN/chr22test.chr22_new_1007.ped',header=False,index=False,sep='\t')