In [1]:
import tskit
import json
import pandas as pd
import os.path

In [2]:
# Directory that holds the per-arm dated tree-sequence files (*.dated.trees).
# The per-chromosome *_ancestral_variants.tsv tables are written here as well.
DATA_PATH = '/wynton/group/capra/projects/modern_human_3Dgenome/data/tree_sequences/tree_files'


In [25]:
!ls $DATA_PATH

chr10_ancestral_variants.tsv
chr11_ancestral_variants.tsv
chr12_ancestral_variants.tsv
chr13_ancestral_variants.tsv
chr14_ancestral_variants.tsv
chr15_ancestral_variants.tsv
chr16_ancestral_variants.tsv
chr17_ancestral_variants.tsv
chr18_ancestral_variants.tsv
chr19_ancestral_variants.tsv
chr1_ancestral_variants.tsv
chr20_ancestral_variants.tsv
chr21_ancestral_variants.tsv
chr22_ancestral_variants.tsv
chr2_ancestral_variants.tsv
chr3_ancestral_variants.tsv
chr4_ancestral_variants.tsv
chr5_ancestral_variants.tsv
chr6_ancestral_variants.tsv
chr7_ancestral_variants.tsv
chr8_ancestral_variants.tsv
chr9_ancestral_variants.tsv
hgdp_tgp_sgdp_high_cov_ancients_chr10_p.dated.trees
hgdp_tgp_sgdp_high_cov_ancients_chr10_q.dated.trees
hgdp_tgp_sgdp_high_cov_ancients_chr11_p.dated.trees
hgdp_tgp_sgdp_high_cov_ancients_chr11_q.dated.trees
hgdp_tgp_sgdp_high_cov_ancients_chr12_p.dated.trees
hgdp_tgp_sgdp_high_cov_ancients_chr12_q.dated.trees
hgdp_tgp_sgdp_high_cov_ancients

In [23]:
def _load_arm(chrm, arm):
    """Load the tree sequence of one chromosome arm, or return None if its file is absent."""
    path = "%s/hgdp_tgp_sgdp_high_cov_ancients_%s_%s.dated.trees" % (DATA_PATH, chrm, arm)
    if not os.path.exists(path):
        print("ATTENTION: %s arm of %s does not have tree file" % (arm, chrm))
        return None
    print("loading %s arm of %s" % (arm, chrm))
    return tskit.load(path)


def _site_records(tree_seq, chrm):
    """Return one (chr, position, ancestral_state, reference, rsid) tuple per site."""
    records = []
    for site in tree_seq.sites():
        # REF and ID are read from the JSON-encoded site metadata.
        meta = json.loads(site.metadata)
        records.append((chrm, int(site.position), site.ancestral_state, meta["REF"], meta["ID"]))
    return records


def create_variant_files(chrm):
    """Write the ancestral-allele table of one chromosome to a TSV file.

    The p-arm and q-arm tree sequences of ``chrm`` are loaded from ``DATA_PATH``
    when their files exist. For every site the ancestral state, the reference
    allele and the rsID are collected, p arm first, and the result is written to
    ``<DATA_PATH>/<chrm>_ancestral_variants.tsv`` with the header row
    ``chr, position, ancestral_state, reference, rsid``.

    Nothing is written (only an ATTENTION message is printed) when neither arm
    has a tree file, when the last p-arm site is not strictly before the first
    q-arm site, or when the number of collected records differs from the number
    of sites reported by the tree sequences.

    Parameters
    ----------
    chrm : str
        Chromosome label as used in the file names, e.g. ``'chr1'``.

    Returns
    -------
    None
    """
    tsp = _load_arm(chrm, "p")
    tsq = _load_arm(chrm, "q")
    arms = [ts for ts in (tsp, tsq) if ts is not None]

    # The arms are concatenated p-then-q, so the p arm must end before the q arm
    # starts. The comparison is only meaningful when both arms contain sites.
    arms_overlap = (
        len(arms) == 2
        and tsp.num_sites > 0
        and tsq.num_sites > 0
        and not tsp.site(tsp.num_sites - 1).position < tsq.site(0).position
    )
    if not arms or arms_overlap:
        print("ATTENTION: last site in p arm is not less than first site in q arm, or both files don't exist")
        # Previously execution fell through and a header-only file was written.
        print("ATTENTION: no ancestral variants file written for %s" % chrm)
        return

    if len(arms) == 2:
        print("extracting ancestral states, both p and q")
    elif tsp is not None:
        print("extracting ancestral states p")
    else:
        print("extracting ancestral states q")

    records = []
    expected_sites = 0
    for tree_seq in arms:
        records.extend(_site_records(tree_seq, chrm))
        expected_sites += tree_seq.num_sites

    # Compare against the site count reported by tskit rather than against a
    # counter incremented next to each append, which could never disagree.
    if len(records) != expected_sites:
        print("ATTENTION: length of ancestral alleles does not match number of sites")
        return

    print("writing file")
    table = pd.DataFrame(
        records,
        columns=['chr', 'position', 'ancestral_state', 'reference', 'rsid'],
    )
    table.to_csv('%s/%s_ancestral_variants.tsv' % (DATA_PATH, chrm), header=True, index=False, sep='\t')
    return

In [26]:
# Autosomes chr1 .. chr22, in numeric order.
chrms = ['chr%d' % number for number in range(1, 23)]

# Write one ancestral-variants table per chromosome.
for chrm in chrms:
    create_variant_files(chrm)
    

loading p arm of chr1
loading q arm of chr1
extracting ancestral states, both p and q
writing file
loading p arm of chr2
loading q arm of chr2
extracting ancestral states, both p and q
writing file
loading p arm of chr3
loading q arm of chr3
extracting ancestral states, both p and q
writing file
loading p arm of chr4
loading q arm of chr4
extracting ancestral states, both p and q
writing file
loading p arm of chr5
loading q arm of chr5
extracting ancestral states, both p and q
writing file
loading p arm of chr6
loading q arm of chr6
extracting ancestral states, both p and q
writing file
loading p arm of chr7
loading q arm of chr7
extracting ancestral states, both p and q
writing file
loading p arm of chr8
loading q arm of chr8
extracting ancestral states, both p and q
writing file
loading p arm of chr9
loading q arm of chr9
extracting ancestral states, both p and q
writing file
loading p arm of chr10
loading q arm of chr10
extracting ancestral states, both p and q
writing file
loading 

In [13]:
def make_fasta(vp, fp, op, ss):
    """Write a copy of a FASTA sequence with variant alleles substituted in.

    Parameters
    ----------
    vp : str
        Path to a whitespace-delimited variants table. The second column is the
        1-based position in the sequence and the third column is the allele to
        write at that position. A header row (second column equal to
        ``position``) and blank lines are skipped. If a position occurs more
        than once, the last occurrence wins.
    fp : str
        Path to the input FASTA file. The first line is taken as the header and
        all remaining lines are concatenated into a single sequence.
    op : str
        Path of the FASTA file to write.
    ss : int
        Number of sequence characters per output line.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``ss`` is smaller than 1 or the input FASTA file is empty.
    """
    if ss < 1:
        raise ValueError("ss must be at least 1, got %r" % (ss,))

    # Map position -> replacement allele.
    variants = {}
    with open(vp, 'r') as calls:
        for line in calls:
            fields = line.split()
            if not fields or fields[1] == 'position':
                continue
            variants[int(fields[1])] = fields[2]

    # Read the reference completely before the output file is opened, so a bad
    # input cannot leave a truncated output behind.
    with open(fp, 'r') as fasta:
        lines = [line.strip() for line in fasta]
    if not lines:
        raise ValueError("FASTA file %s is empty" % fp)
    header = lines[0]
    seq = ''.join(lines[1:])

    # Splice the replacement alleles between untouched slices of the reference.
    # Positions outside the sequence are ignored.
    # NOTE(review): an allele longer than one character is inserted as-is and
    # therefore shifts everything after it - confirm the table holds SNVs only.
    pieces = []
    cursor = 0
    for pos in sorted(variants):
        if pos < 1 or pos > len(seq):
            continue
        pieces.append(seq[cursor:pos - 1])
        pieces.append(variants[pos])
        cursor = pos
    pieces.append(seq[cursor:])
    new_seq = ''.join(pieces)

    # Write the header followed by the sequence wrapped at ss characters per line.
    with open(op, 'w') as out:
        print(header, file=out)
        for start in range(0, len(new_seq), ss):
            print(new_seq[start:start + ss], file=out)

    return

In [14]:
# Autosomes chr1 .. chr22, in numeric order.
chrms = ['chr%d' % number for number in range(1, 23)]

# Location of the add_variants_to_FASTA.py script.
PYSCRIPT = '/wynton/group/capra/projects/modern_human_3Dgenome/scripts/add_variants_to_FASTA.py'

# Directory with one reference FASTA per chromosome (<chrm>.fa).
FASTA_PATH = '/wynton/home/capra/egilbertson/data/human_genome/chrms'

# Directory that receives the rewritten FASTA files.
OUT_PATH = '/wynton/group/capra/projects/modern_human_3Dgenome/data/genomes/human_archaic_ancestor'

In [15]:
# Build one substituted FASTA per chromosome.
for chrm in chrms:
    print(chrm)
    # Variants table for this chromosome, at the path create_variant_files writes to.
    var_path = '%s/%s_ancestral_variants.tsv' % (DATA_PATH, chrm)
    # Destination FASTA that receives the substituted sequence.
    output = '%s/human_archaic_ancestor_in_hg38_%s.fasta' % (OUT_PATH, chrm)
    # Reference FASTA for this chromosome.
    fasta = '%s/%s.fa' % (FASTA_PATH, chrm)
    # 50 is the number of bases per output line (the ss argument of make_fasta).
    make_fasta(var_path, fasta, output, 50)
    
    

chr1
chr2
chr3
chr4
chr5
chr6
chr7
chr8
chr9
chr10
chr11
chr12
chr13
chr14
chr15
chr16
chr17
chr18
chr19
chr20
chr21
chr22
