In [1]:
import re

# Get human ZFs

In [2]:
# Candidate zinc-finger pattern: two residues, C, 2-4 residues, C, 12 residues,
# H, 3-7 residues, and a closing H or C (i.e. XXC-X(2-4)-C-X(12)-H-X(3-7)-[HC]).
zf_re = re.compile(
    '..C.{2,4}C.{12}H.{3,7}[HC]'
)
# Cysteine pair separated by 2-4 residues (C-X(2-4)-C).
zf_cterm = re.compile(
    'C.{2,4}C'
)

In [3]:
# Parse the FASTA file into {accession: sequence}.
# A header is any line starting with '>'; the accession is taken as the second
# '|'-separated field of the header (e.g. '>sp|P12345|NAME ...' -> 'P12345').
full_protein_lib = {}
with open('full_zf.fasta', 'r') as infile:  # 'with' guarantees the handle is closed
    name = None
    seq_parts = []
    for line in infile:
        if line.startswith('>'):
            # A new record starts: store the one collected so far.
            if name is not None:
                full_protein_lib[name] = ''.join(seq_parts)
            name = line.split('|')[1]
            seq_parts = []
        else:
            # Strip only the line terminator so a final line without a trailing
            # newline keeps its last residue, and CRLF files do not leave '\r'
            # inside the sequence.
            seq_parts.append(line.rstrip('\r\n'))
    if name is not None:
        full_protein_lib[name] = ''.join(seq_parts)

In [4]:
# Scan every protein for non-overlapping zf_re matches and keep each distinct
# finger sequence once, under the name '<protein> finger <n>'.
#   allzf_lib : {finger name: finger sequence}
#   zf_order  : finger names in the order they were first seen
zf_order = []
allzf_lib = {}
seen_zf_seqs = set()  # O(1) duplicate check instead of rescanning allzf_lib per match
for protein, seq in full_protein_lib.items():
    # finditer resumes right after the previous match, which is the same scan as
    # repeatedly searching the remainder of the sequence. The finger number counts
    # every match in the protein, including ones skipped as duplicates.
    for fing_num, zf_match in enumerate(zf_re.finditer(seq), start=1):
        zf = zf_match.group()
        if zf in seen_zf_seqs:
            continue
        seen_zf_seqs.add(zf)
        name = protein + ' finger ' + str(fing_num)
        allzf_lib[name] = zf
        zf_order.append(name)
# Hand-set sequences for two fingers.
# NOTE(review): these are written to allzf_lib only, not zf_order; presumably both
# names were already produced by the scan above and are just being overridden --
# confirm, since cells that walk zf_order would otherwise never see them.
allzf_lib['Q9Y2K1 finger 1']='FTCDSCGFGFSCEKLLDEHVLTC'
allzf_lib['Q9GZU2 finger 6']='YECEDCGLGFVDLTDLTDHQKVH'

## Generate input file for MARIA with individual ZFs

In [5]:
# Write the tab-separated MARIA input table: a header row, then one row per finger.
# The peptide column is the last 14 residues of the finger followed by 'TGERP';
# both allele columns are fixed to HLA-DRB1*01:01 and the TPM column is left empty.
with open('maria_nterm_link_zfs_final.txt','w') as outfile:
    outfile.write('Allele1\tAllele2 (Same as Allele1 if analyzing a single allele)\tGene Symbol\tPeptide Sequence\tTPM (Optional)\n')
    for zf_name, zf_seq in allzf_lib.items():
        peptide_core = zf_seq[-14:]
        row = 'HLA-DRB1*01:01\tHLA-DRB1*01:01\t{}\t{}TGERP\t\n'.format(zf_name, peptide_core)
        outfile.write(row)

## Read MHC output files for individual ZFs

In [6]:
# Collect the fingers whose MARIA score is below the cutoff.
#   maria_accepted_zfs : {finger name: finger sequence}
# The finger name is read from the 3rd tab-separated column and the score from
# the 8th column of each data row.
# NOTE(review): 63 is an unexplained cutoff -- confirm its meaning/units and
# consider moving it to a named constant.
maria_accepted_zfs = {}
with open('maria_nterm_final.txt','r') as infile:  # closed even if a row fails to parse
    infile.readline()  # skip the header row
    for line in infile:
        if not line.strip():
            # Blank (e.g. trailing) lines would otherwise raise IndexError below.
            continue
        data = line.split('\t')
        name = data[2]
        score = data[7]
        if float(score) <63:
            maria_accepted_zfs[name] = allzf_lib[name]

## Generate FASTA-format file of ZFs for DeepZF and ZifRC input

In [7]:
# Write every finger as a FASTA-style record (header line, then sequence line).
# The file is opened in a 'with' block so it is flushed and closed before any
# external tool reads it; previously the handle was never closed.
# NOTE(review): records are terminated with '\r' (not '\n'); kept as-is because
# the downstream tools may depend on it -- confirm.
with open('allzf_fasta.txt','w') as outfile:
    for zf in allzf_lib.keys():
        outfile.write('>{}\r{}\r'.format(zf,allzf_lib[zf]))

## Read DeepZF and ZifRC output

In [8]:
# Read DeepZF output: for each finger in zf_order the file is consumed as
# 3 positions x 4 lines, one float per line, taken in the order A, C, G, T.
#   deepzf_codons : {finger name: 3-letter string of the highest-valued base per position}
#   deepzf_scores : {finger name: sum of the three highest values}
# Ties are resolved in the order A, C, G, T (the first maximum wins).
deepzf_codons = {}
deepzf_scores = {}
with open('allzf_deepzf_output.csv','r') as infile:  # closed even if parsing fails
    for zf in zf_order:
        codon = ''
        score = 0
        for _ in range(3):
            a = float(infile.readline())
            c = float(infile.readline())
            g = float(infile.readline())
            t = float(infile.readline())
            # max() returns the first maximal item, which preserves the
            # A > C > G > T tie-break of the original if/elif chain.
            base, value = max(zip('ACGT', (a, c, g, t)), key=lambda pair: pair[1])
            codon += base
            # Fix: the 'T' case previously added the A value to the score.
            score += value
        deepzf_codons[zf]=codon
        deepzf_scores[zf] = score

In [9]:
# Read ZifRC output, one record at a time.
# Layout as consumed here: the first two lines of the file are skipped; each
# record is a name line, four skipped lines, three matrix rows (one per
# position), and five more skipped lines before the next name line.
#   zifrc_codons : {finger name: 3-letter string of the highest-valued base per position}
#   zifrc_scores : {finger name: sum of the three highest values}
# Each matrix row is split on whitespace; the first token is ignored and the
# next four are read as the A, C, G, T values.
zifrc_codons = {}
zifrc_scores = {}
with open('all_zifrc.txt','r') as infile:  # closed even if parsing fails
    for _ in range(3):
        line = infile.readline()  # after this, line holds the first name line
    while line:
        data = line.split('\t')
        name = data[1][:-1]  # second tab field with its last character removed
        for _ in range(5):
            line = infile.readline()  # advance to the first matrix row
        codon = ''
        score = 0
        for _ in range(3):
            # Fix: convert to float BEFORE taking the maximum. Taking max() of the
            # raw strings compares them lexicographically (e.g. '9e-05' > '0.9'),
            # which yields the wrong score and base.
            a, c, g, t = (float(value) for value in line.split()[1:5])
            # First maximum wins, so ties resolve in the order A, C, G, T.
            base, best = max(zip('ACGT', (a, c, g, t)), key=lambda pair: pair[1])
            codon = codon + base
            score = score + best
            line = infile.readline()
        for _ in range(5):
            line = infile.readline()  # advance to the next name line (or EOF)
        zifrc_codons[name] = codon
        zifrc_scores[name] = score

## Export ZF affinity summary files

In [10]:
# Export one tab-separated row per finger: name, sequence, predicted codon, score.
# The ZifRC row is written only for fingers present in zifrc_codons; the DeepZF
# row is written for every finger in allzf_lib.
with open('ZifRC_data.txt','w') as zifrc_file, open('DeepZF_data.txt','w') as deepzf_file:
    for zf_name, zf_seq in allzf_lib.items():
        if zf_name in zifrc_codons:
            zifrc_row = '{}\t{}\t{}\t{}\n'.format(zf_name, zf_seq, zifrc_codons[zf_name], zifrc_scores[zf_name])
            zifrc_file.write(zifrc_row)
        deepzf_row = '{}\t{}\t{}\t{}\n'.format(zf_name, zf_seq, deepzf_codons[zf_name], deepzf_scores[zf_name])
        deepzf_file.write(deepzf_row)

## Determine ZF specificity for MARIA-accepted ZFs

In [11]:
# All 64 three-letter strings over A/C/G/T, ordered with the last letter
# varying fastest ('AAA', 'AAC', ..., 'TTT').
possible_codons = [
    first + second + third
    for first in 'ACGT'
    for second in 'ACGT'
    for third in 'ACGT'
]

In [12]:
# Group the MARIA-accepted fingers by their ZifRC codon.
# Every codon in possible_codons gets an entry, so codons with no finger map to
# an empty list; fingers without a ZifRC prediction are skipped.
maria_zifrc_codons = {codon: [] for codon in possible_codons}
for accepted_zf in maria_accepted_zfs:
    if accepted_zf not in zifrc_codons:
        continue
    predicted = zifrc_codons[accepted_zf]
    maria_zifrc_codons[predicted].append(accepted_zf)

In [13]:
# Group the MARIA-accepted fingers by their DeepZF codon.
# Every codon in possible_codons gets an entry, so codons with no finger map to
# an empty list; fingers without a DeepZF prediction are skipped.
maria_deepzf_codons = {codon: [] for codon in possible_codons}
for accepted_zf in maria_accepted_zfs:
    if accepted_zf not in deepzf_codons:
        continue
    predicted = deepzf_codons[accepted_zf]
    maria_deepzf_codons[predicted].append(accepted_zf)

In [14]:
# Print "<codon>\t<number of MARIA-accepted fingers>" for each ZifRC codon.
for codon, accepted in maria_zifrc_codons.items():
    finger_count = len(accepted)
    print('{}\t{}'.format(codon, finger_count))

AAA	11
AAC	12
AAG	18
AAT	31
ACA	11
ACC	16
ACG	13
ACT	35
AGA	10
AGC	0
AGG	13
AGT	23
ATA	80
ATC	18
ATG	63
ATT	109
CAA	0
CAC	0
CAG	2
CAT	3
CCA	1
CCC	2
CCG	5
CCT	0
CGA	7
CGC	0
CGG	4
CGT	4
CTA	12
CTC	0
CTG	8
CTT	6
GAA	5
GAC	1
GAG	9
GAT	19
GCA	10
GCC	9
GCG	11
GCT	28
GGA	20
GGC	7
GGG	9
GGT	14
GTA	14
GTC	26
GTG	30
GTT	85
TAA	0
TAC	0
TAG	4
TAT	4
TCA	2
TCC	17
TCG	10
TCT	10
TGA	5
TGC	1
TGG	21
TGT	8
TTA	2
TTC	3
TTG	22
TTT	20


In [15]:
# Print "<codon>\t<number of MARIA-accepted fingers>" for each DeepZF codon.
for codon, accepted in maria_deepzf_codons.items():
    finger_count = len(accepted)
    print('{}\t{}'.format(codon, finger_count))

AAA	18
AAC	15
AAG	20
AAT	26
ACA	16
ACC	9
ACG	3
ACT	13
AGA	24
AGC	6
AGG	31
AGT	26
ATA	10
ATC	2
ATG	13
ATT	11
CAA	16
CAC	12
CAG	31
CAT	21
CCA	16
CCC	18
CCG	10
CCT	30
CGA	7
CGC	9
CGG	22
CGT	8
CTA	15
CTC	5
CTG	21
CTT	16
GAA	25
GAC	12
GAG	25
GAT	17
GCA	13
GCC	27
GCG	23
GCT	34
GGA	18
GGC	25
GGG	39
GGT	10
GTA	12
GTC	9
GTG	27
GTT	19
TAA	3
TAC	7
TAG	0
TAT	5
TCA	4
TCC	9
TCG	1
TCT	10
TGA	7
TGC	10
TGG	17
TGT	11
TTA	6
TTC	4
TTG	10
TTT	8
