# Setup

In [1]:
import os
import sys
sys.path[0] = '../'
import logging
from Bio import SeqIO
from pyard import ARD

import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

from bin.build_gfedb import *

In [16]:
# Notebook config
%config IPCompleter.greedy=True

# Environment

In [2]:
imgt_hla = 'https://www.ebi.ac.uk/ipd/imgt/hla/docs/release.html'
imgt_hla_media_url = 'https://media.githubusercontent.com/media/ANHIG/IMGTHLA/'
imgt_hla_raw_url = 'https://raw.githubusercontent.com/ANHIG/IMGTHLA/'

imgt_kir = 'https://www.ebi.ac.uk/ipd/kir/docs/version.html'
kir_url = 'ftp://ftp.ebi.ac.uk/pub/databases/ipd/kir/KIR.dat'


data_dir = "../../data/" #os.path.dirname(__file__) + "/../../data/"

expre_chars = ['N', 'Q', 'L', 'S']

In [3]:
lastseqid = 1
lastid = 1
lastcdsid = 1

seqids = {}
cdsids = {}
alleleids = {}
group_edges = {}
trans_edges = {}

# The alleles are removed when the allele_nodes.csv is built
skip_alleles = ["HLA-DRB5*01:11", "HLA-DRB5*01:12", "HLA-DRB5*01:13",
                "HLA-DRB5*02:03", "HLA-DRB5*02:04", "HLA-DRB5*02:05",
                "HLA-DRB5*01:01:02", "HLA-DRB5*01:03", "HLA-DRB5*01:05",
                "HLA-DRB5*01:06", "HLA-DRB5*01:07", "HLA-DRB5*01:09",
                "HLA-DRB5*01:10N", "HLA-C*05:208N", "HLA-C*05:206"]

hla_loci = ['HLA-A', 'HLA-B', 'HLA-C', 'HLA-DRB1', 'HLA-DQB1',
            'HLA-DPB1', 'HLA-DQA1', 'HLA-DPA1', 'HLA-DRB3',
            'HLA-DRB4', 'HLA-DRB5']

hla_align = ['HLA-A', 'HLA-B', 'HLA-C', 'HLA-DRB1', 'HLA-DQB1',
             'HLA-DPB1', 'HLA-DQA1', 'HLA-DPA1']

kir_loci = ["KIR3DS1", "KIR3DP1", "KIR3DL3", "KIR3DL2", "KIR3DL1",
            "KIR2DS5", "KIR2DS4", "KIR2DS3", "KIR2DS2", "KIR2DS1",
            "KIR2DP1", "KIR2DL5B", "KIR2DL5A", "KIR2DL4"]

kir_aligloci = ["KIR2DL4", "KIR2DP1", "KIR2DS1", "KIR2DS2", "KIR2DS3",
                "KIR2DS4", "KIR2DS5", "KIR3DL1", "KIR3DL2", "KIR3DL3",
                "KIR3DP1"]

ard_groups = ['G', 'lg', 'lgx']

align = True

In [4]:
kir = None

if kir:
    load_loci = hla_loci + kir_loci
else:
    load_loci = hla_loci

from seqann import gfe
gfe_maker = gfe.GFE(verbose=True, verbosity=2,
                load_features=False, store_features=True,
                loci=load_loci)

# Run (1)

In [5]:
dat_file = '../../data/hla.3360.dat'

In [6]:
ard_groups = ['G', 'lg', 'lgx']
ard = ARD('3360')

In [7]:
a_gen = SeqIO.parse(dat_file, "imgt")

In [97]:
for idx, allele in enumerate(a_gen):
    
    if idx == 1:
            break
        
    if hasattr(allele, 'seq'):
        hla_name = allele.description.split(",")[0]
        print(f'hla_name: {hla_name}\n')
        
        loc = allele.description.split(",")[0].split("*")[0]
        print(f'loc: {loc}\n')
        
        a_name = allele.description.split(",")[0].split("-")[1]
        print(f'a_name: {a_name}\n')
        
        groups = [["HLA-" + ard.redux(a_name, grp), grp] if ard.redux(a_name, grp) != a_name else None for
                                  grp in ard_groups]
        print(f'groups: {groups}')
    
        seco = [[to_second(a_name), "2nd_FIELD"]]
        print(f'seco: {seco}\n')
        
        groups = list(filter(None, groups)) + seco
        print(f'groups: {groups}\n')
        
        complete_annotation = get_features(allele)
        print(f'complete_annotation: {complete_annotation}\n')
        
        ann = Annotation(annotation=complete_annotation,
                         method='match',
                         complete_annotation=True)
        print(f'ann: {ann}\n\n\n\n')
        
        # This process takes a long time
        features, gfe = gfe_maker.get_gfe(ann, loc)
        
        # gen_aln, nuc_aln, prot_aln
        aligned_gen = ''
        aligned_nuc = ''
        aligned_prot = ''

        if align:
            if allele.description.split(",")[0] in gen_aln[loc]:
                aligned_gen = gen_aln[loc][allele.description.split(",")[
                    0]]

            if allele.description.split(",")[0] in nuc_aln[loc]:
                aligned_nuc = nuc_aln[loc][allele.description.split(",")[
                    0]]

            if allele.description.split(",")[0] in prot_aln[loc]:
                aligned_prot = prot_aln[loc][allele.description.split(",")[
                    0]]

01/30/2021 01:41:11 PM - Logger.seqann.gfe - INFO - ID NA         - Storing new feature HLA-A:1:five_prime_UTR:CAGGAGCAGAGGGGTCAGGGCGAAGTCCCAGGGCCCCAGGCGTGGCTCTCAGGGTCTCAGGCCCCGAAGGCGGTGTATGGATTGGGGAGTCCCAGCCTTGGGGATTCCCCAACTCCGCAGTTTCTTTTCTCCCTCTCCCAACCTACGTAGGGTCCTTCATCCTGGATACTCACGACGCGGACCCAGTTCTCACTCCCATTGGGTGTCGGGTTTCCAGAGAAGCCAATCAGTGTCGTCGCGGTCGCTGTTCTAAAGTCCGCACGCACCCACCGGGACTCAGATTCTCCCCAGACGCCGAGG
01/30/2021 01:41:11 PM - Logger.seqann.gfe - INFO - ID NA         - Updated * all_feats 0.0007 MB *


hla_name: HLA-A*01:01:01:01

loc: HLA-A

a_name: A*01:01:01:01

groups: [['HLA-A*01:01:01G', 'G'], ['HLA-A*01:01g', 'lg'], ['HLA-A*01:01', 'lgx']]
seco: [['A*01:01', '2nd_FIELD']]

groups: [['HLA-A*01:01:01G', 'G'], ['HLA-A*01:01g', 'lg'], ['HLA-A*01:01', 'lgx'], ['A*01:01', '2nd_FIELD']]

complete_annotation: {'five_prime_UTR': SeqRecord(seq=Seq('CAGGAGCAGAGGGGTCAGGGCGAAGTCCCAGGGCCCCAGGCGTGGCTCTCAGGG...AGG', IUPACAmbiguousDNA()), id='1', name='<unknown name>', description='<unknown description>', dbxrefs=[]), 'exon_1': SeqRecord(seq=Seq('ATGGCCGTCATGGCGCCCCGAACCCTCCTCCTGCTACTCTCGGGGGCCCTGGCC...CGG', IUPACAmbiguousDNA()), id='1', name='<unknown name>', description='<unknown description>', dbxrefs=[]), 'intron_1': SeqRecord(seq=Seq('GTGAGTGCGGGGTCGGGAGGGAAACCGCCTCTGCGGGGAGAAGCAAGGGGCCCT...CAG', IUPACAmbiguousDNA()), id='1', name='<unknown name>', description='<unknown description>', dbxrefs=[]), 'exon_2': SeqRecord(seq=Seq('GCTCCCACTCCATGAGGTATTTCTTCACATCCGTGTCCCGGCCCGGCCGCGGGG...ACG', 

01/30/2021 01:41:12 PM - Logger.seqann.gfe - INFO - ID NA         - Storing new feature HLA-A:1:exon:ATGGCCGTCATGGCGCCCCGAACCCTCCTCCTGCTACTCTCGGGGGCCCTGGCCCTGACCCAGACCTGGGCGG
01/30/2021 01:41:12 PM - Logger.seqann.gfe - INFO - ID NA         - Updated * all_feats 0.0007 MB *
01/30/2021 01:41:12 PM - Logger.seqann.gfe - INFO - ID NA         - Storing new feature HLA-A:1:intron:GTGAGTGCGGGGTCGGGAGGGAAACCGCCTCTGCGGGGAGAAGCAAGGGGCCCTCCTGGCGGGGGCGCAGGACCGGGGGAGCCGCGCCGGGAGGAGGGTCGGGCAGGTCTCAGCCACTGCTCGCCCCCAG
01/30/2021 01:41:12 PM - Logger.seqann.gfe - INFO - ID NA         - Updated * all_feats 0.0007 MB *
01/30/2021 01:41:12 PM - Logger.seqann.gfe - INFO - ID NA         - Storing new feature HLA-A:2:exon:GCTCCCACTCCATGAGGTATTTCTTCACATCCGTGTCCCGGCCCGGCCGCGGGGAGCCCCGCTTCATCGCCGTGGGCTACGTGGACGACACGCAGTTCGTGCGGTTCGACAGCGACGCCGCGAGCCAGAAGATGGAGCCGCGGGCGCCGTGGATAGAGCAGGAGGGGCCGGAGTATTGGGACCAGGAGACACGGAATATGAAGGCCCACTCACAGACTGACCGAGCGAACCTGGGGACCCTGCGCGGCTACTACAACCAGAGCGAGGACG
01/30/2021 01:41:12

NameError: name 'gen_aln' is not defined

## build_graph( )

Examining arguments of `build_graph()` function.

In [86]:
groups

[['HLA-A*01:01:01G', 'G'],
 ['HLA-A*01:01g', 'lg'],
 ['HLA-A*01:01', 'lgx'],
 ['A*01:01', '2nd_FIELD']]

In [87]:
gfe

'HLA-Aw333-1-1-1-279-1-1-1-1-1-1-1-1-1-1-1-4'

In [88]:
allele

SeqRecord(seq=Seq('CCAGTTCTCACTCCCATTGGGTGTCGGGTTTCCAGAGAAGCCAATCAGTGTCGT...AAA', IUPACAmbiguousDNA()), id='HLA16436.1', name='HLA16436', description='HLA-A*01:01:01:07, Human MHC Class I sequence', dbxrefs=['EMBL:KY350872'])

In [89]:
features

[{'accession': 333,
  'hash_code': None,
  'locus': 'HLA-A',
  'rank': 1,
  'sequence': 'CCAGTTCTCACTCCCATTGGGTGTCGGGTTTCCAGAGAAGCCAATCAGTGTCGTCGCGGTCGCTGTTCTAAAGTCCGCACGCACCCACCGGGACTCAGATTCTCCCCAGACGCCGAGG',
  'term': 'five_prime_UTR'},
 {'accession': 1,
  'hash_code': None,
  'locus': 'HLA-A',
  'rank': 1,
  'sequence': 'ATGGCCGTCATGGCGCCCCGAACCCTCCTCCTGCTACTCTCGGGGGCCCTGGCCCTGACCCAGACCTGGGCGG',
  'term': 'exon'},
 {'accession': 1,
  'hash_code': None,
  'locus': 'HLA-A',
  'rank': 1,
  'sequence': 'GTGAGTGCGGGGTCGGGAGGGAAACCGCCTCTGCGGGGAGAAGCAAGGGGCCCTCCTGGCGGGGGCGCAGGACCGGGGGAGCCGCGCCGGGAGGAGGGTCGGGCAGGTCTCAGCCACTGCTCGCCCCCAG',
  'term': 'intron'},
 {'accession': 1,
  'hash_code': None,
  'locus': 'HLA-A',
  'rank': 2,
  'sequence': 'GCTCCCACTCCATGAGGTATTTCTTCACATCCGTGTCCCGGCCCGGCCGCGGGGAGCCCCGCTTCATCGCCGTGGGCTACGTGGACGACACGCAGTTCGTGCGGTTCGACAGCGACGCCGCGAGCCAGAAGATGGAGCCGCGGGCGCCGTGGATAGAGCAGGAGGGGCCGGAGTATTGGGACCAGGAGACACGGAATATGAAGGCCCACTCACAGACTGACCGAGCGAACCTGGGGACCCTGCGCGGCTAC

In [84]:
dbversion

'3360'

In [71]:
complete_annotation.keys()

dict_keys(['five_prime_UTR', 'exon_1', 'intron_1', 'exon_2', 'intron_2', 'exon_3', 'intron_3', 'exon_4', 'intron_4', 'exon_5', 'intron_5', 'exon_6', 'intron_6', 'exon_7', 'intron_7', 'exon_8', 'three_prime_UTR'])

In [75]:
complete_annotation['exon_1']

SeqRecord(seq=Seq('ATGGCCGTCATGGCGCCCCGAACCCTCCTCCTGCTACTCTCGGGGGCCCTGGCC...CGG', IUPACAmbiguousDNA()), id='1', name='<unknown name>', description='<unknown description>', dbxrefs=[])

In [27]:
allele.description

'HLA-A*01:01:76, Human MHC Class I sequence (partial)'

In [22]:
hla_name

'HLA-A*01:01:76'

In [23]:
loc

'HLA-A'

In [24]:
a_name

'A*01:01:76'

In [25]:
ard.redux(a_name, ard_groups[2])

'A*01:01:76'

In [26]:
groups

[None, None, None]