In [1]:
import numpy as np
import os
from matplotlib import pyplot as plt
from scipy import stats
import pickle

from tqdm import tqdm

In [2]:
def getKmer(segs, l):
    """Return the distinct length-``l`` substrings found in ``segs``.

    Every segment is scanned with a sliding window of width ``l``; segments
    shorter than ``l`` contribute nothing.

    Parameters
    ----------
    segs : iterable of str
        Strings to cut into windows.
    l : int
        Window width.

    Returns
    -------
    list of str
        Each distinct window exactly once, in order of first appearance.
        (A plain ``set`` was used before; its iteration order for strings
        varies with the interpreter's hash seed, which made the downstream
        sample order differ from one kernel session to the next.)
    """
    # dict keys keep insertion order, so this de-duplicates deterministically.
    return list(dict.fromkeys(
        seg[start:start + l]
        for seg in segs
        for start in range(len(seg) - l + 1)
    ))

def sanity(fname, posweight):
    """Check that every record in ``fname`` yields at least one k-mer.

    The file is read as alternating line pairs (0-based line index):

    * even line - header; its first space-separated token, minus the first
      character, is the target sequence ``seq`` and its second token is the
      ``allele``;
    * odd line - a string that is split around every occurrence of ``seq``;
      the pieces are handed to ``getKmer`` (whichever definition is current
      in the kernel) with window width ``len(seq)``.

    At the first record whose k-mer list is empty, a "No negatives" line is
    printed and the scan stops.

    Parameters
    ----------
    fname : str
        Path of the file to scan.
    posweight : float
        Not read by this function.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If a header has fewer than two space-separated tokens, or if its
        sequence is empty (the previous find/slice loop never terminated in
        that case).
    """
    with open(fname, 'rt') as fin:
        # A dedicated name for the line index: the old code reused ``i`` for
        # the find() position as well, which obscured the parity test.
        for lineno, line in tqdm(enumerate(fin)):
            line = line.rstrip('\n')
            if lineno % 2 == 0:
                seq, allele = line.split(' ')[:2]
                # Drop the token's first character (presumably the FASTA '>'
                # marker - confirm against the input files).
                seq = seq[1:]
            else:
                if not seq:
                    raise ValueError(
                        "empty sequence in header for line {} of {}".format(
                            lineno + 1, fname))
                # str.split performs the same left-to-right, non-overlapping
                # cut as the manual find()/slice loop it replaces.
                segs = line.split(seq)
                negkmer = getKmer(segs, len(seq))
                if len(negkmer) == 0:
                    print("No negatives", len(negkmer), seq, allele)
                    return
    return
        
# Run the k-mer check on both input files.
# NOTE(review): sanity() never reads posweight, so the second argument has no
# effect here; the values match those passed to getSamples later on.
sanity(fname="./mhc1.fsa", posweight=1/199)
sanity(fname="./mhc2.fsa", posweight=1/49)

3320it [00:00, 6988.88it/s]
1834it [00:00, 7020.71it/s]


In [3]:
def getKmer(segs, l):
    """Return the distinct length-``l`` substrings found in ``segs``.

    Note: this notebook defines ``getKmer`` in more than one cell; the most
    recently executed definition is the one callers see.

    Parameters
    ----------
    segs : iterable of str
        Strings to cut into windows; segments shorter than ``l`` contribute
        nothing.
    l : int
        Window width.

    Returns
    -------
    list of str
        Each distinct window exactly once, in order of first appearance.
        (``list(set(...))`` was used before; set iteration order for strings
        depends on the interpreter's hash seed, so the result order was not
        reproducible across kernel sessions.)
    """
    unique = {}
    for seg in segs:
        for start in range(len(seg) - l + 1):
            # dict keys keep insertion order; the value is irrelevant.
            unique[seg[start:start + l]] = None
    return list(unique)

def getSamples(fname, posweight):
    """Build per-allele sample and weight lists from ``fname``.

    The file is read as alternating line pairs (0-based line index):

    * even line - header; its first space-separated token, minus the first
      character, is the target sequence ``seq`` and its second token is the
      ``allele``;
    * odd line - a string that is split around every occurrence of ``seq``;
      the pieces are handed to ``getKmer`` (whichever definition is current
      in the kernel) with window width ``len(seq)``.

    For each record, ``seq`` is appended with weight ``posweight``, followed
    by every returned k-mer with weight ``1 / len(negkmer)``.

    Before returning, prints two counters: the number of records processed
    and the total number of samples appended.

    Parameters
    ----------
    fname : str
        Path of the file to read.
    posweight : float
        Weight stored alongside each header sequence.

    Returns
    -------
    (dict, dict)
        ``samples`` and ``weights``, both keyed by allele; the two lists
        under a given key are index-aligned.

    Raises
    ------
    ValueError
        If a header has fewer than two space-separated tokens, or if its
        sequence is empty (the previous find/slice loop never terminated in
        that case).
    """
    samples = {}
    weights = {}
    nsamp1 = 0  # records processed
    nsamp2 = 0  # samples appended (header sequence + k-mers)
    with open(fname, 'rt') as fin:
        # A dedicated name for the line index: the old code reused ``i`` for
        # the find() position as well.
        for lineno, line in tqdm(enumerate(fin)):
            line = line.rstrip('\n')
            if lineno % 2 == 0:
                seq, allele = line.split(' ')[:2]
                # Drop the token's first character (presumably the FASTA '>'
                # marker - confirm against the input files).
                seq = seq[1:]
                samples.setdefault(allele, [])
                weights.setdefault(allele, [])
            else:
                if not seq:
                    raise ValueError(
                        "empty sequence in header for line {} of {}".format(
                            lineno + 1, fname))
                # str.split performs the same left-to-right, non-overlapping
                # cut as the manual find()/slice loop it replaces.
                negkmer = getKmer(line.split(seq), len(seq))

                nsamp1 += 1
                nsamp2 += 1 + len(negkmer)

                samples[allele].append(seq)
                weights[allele].append(posweight)
                if negkmer:
                    # Computed once per record; guarded so an empty k-mer
                    # list never divides by zero.
                    negweight = 1 / len(negkmer)
                    samples[allele].extend(negkmer)
                    weights[allele].extend([negweight] * len(negkmer))
    print(nsamp1, nsamp2)
    return samples, weights
        
# Build the (samples, weights) pair for each input file.
# NOTE(review): the weights 1/199 and 1/49 are not explained anywhere in this
# notebook - confirm where they come from before reusing them.
m1 = getSamples(fname="./mhc1.fsa", posweight=1/199)
m2 = getSamples(fname="./mhc2.fsa", posweight=1/49)

3320it [00:01, 3225.22it/s]
224it [00:00, 2205.62it/s]

1660 1645988


1834it [00:00, 2410.28it/s]

917 1201559





In [23]:
# Persist both getSamples results together as one pickled tuple (m1, m2).
# Pickle files can execute code when loaded, so only load trusted copies.
with open("./data.pkl", mode="wb") as fout:
    pickle.dump((m1, m2), fout)

In [8]:
def getKmer(segs, l):
    """Return the distinct length-``l`` substrings of ``segs`` that lack 'X'.

    Note: this notebook defines ``getKmer`` in more than one cell; the most
    recently executed definition is the one callers see. This variant skips
    any window containing the character 'X' (presumably an unknown-residue
    placeholder - confirm).

    Parameters
    ----------
    segs : iterable of str
        Strings to cut into windows; segments shorter than ``l`` contribute
        nothing.
    l : int
        Window width.

    Returns
    -------
    list of str
        Each retained window exactly once, in order of first appearance.
        (``list(set(...))`` was used before; set iteration order for strings
        depends on the interpreter's hash seed, so the result order was not
        reproducible across kernel sessions.)
    """
    unique = {}
    for seg in segs:
        for start in range(len(seg) - l + 1):
            kmer = seg[start:start + l]
            if 'X' in kmer:
                continue
            # dict keys keep insertion order; the value is irrelevant.
            unique[kmer] = None
    return list(unique)

def getSamplesWithScores(fname, posweight):
    """Build per-allele sample, weight and score lists from ``fname``.

    The file is read as alternating line pairs (0-based line index):

    * even line - header; its first space-separated token, minus the first
      character, is the target sequence ``seq`` and its second token is the
      ``allele``;
    * odd line - a string that is split around every occurrence of ``seq``;
      the pieces are handed to ``getKmer`` (whichever definition is current
      in the kernel) with window width ``len(seq)``.

    For each record, ``seq`` is appended with weight ``posweight`` and score
    1, followed by every returned k-mer with weight ``1 / len(negkmer)`` and
    score 0.

    Parameters
    ----------
    fname : str
        Path of the file to read.
    posweight : float
        Weight stored alongside each header sequence.

    Returns
    -------
    (dict, dict, dict)
        ``samples``, ``weights`` and ``scores``, all keyed by allele; the
        three lists under a given key are index-aligned.

    Raises
    ------
    ValueError
        If a header has fewer than two space-separated tokens, or if its
        sequence is empty (the previous find/slice loop never terminated in
        that case).
    """
    samples = {}
    weights = {}
    scores = {}
    with open(fname, 'rt') as fin:
        # A dedicated name for the line index: the old code reused ``i`` for
        # the find() position as well.
        for lineno, line in tqdm(enumerate(fin)):
            line = line.rstrip('\n')
            if lineno % 2 == 0:
                seq, allele = line.split(' ')[:2]
                # Drop the token's first character (presumably the FASTA '>'
                # marker - confirm against the input files).
                seq = seq[1:]
                samples.setdefault(allele, [])
                weights.setdefault(allele, [])
                scores.setdefault(allele, [])
            else:
                if not seq:
                    raise ValueError(
                        "empty sequence in header for line {} of {}".format(
                            lineno + 1, fname))
                # str.split performs the same left-to-right, non-overlapping
                # cut as the manual find()/slice loop it replaces.
                negkmer = getKmer(line.split(seq), len(seq))

                samples[allele].append(seq)
                weights[allele].append(posweight)
                scores[allele].append(1)
                if negkmer:
                    # Computed once per record; guarded so an empty k-mer
                    # list never divides by zero.
                    negweight = 1 / len(negkmer)
                    samples[allele].extend(negkmer)
                    weights[allele].extend([negweight] * len(negkmer))
                    scores[allele].extend([0] * len(negkmer))
    return samples, weights, scores
        
# Build the (samples, weights, scores) triple for each input file.
# NOTE(review): the weights 1/199 and 1/49 are not explained anywhere in this
# notebook - confirm where they come from before reusing them.
ms1 = getSamplesWithScores(fname="./mhc1.fsa", posweight=1/199)
ms2 = getSamplesWithScores(fname="./mhc2.fsa", posweight=1/49)

3320it [00:01, 2604.86it/s]
1834it [00:00, 2091.28it/s]


In [9]:
# Persist both scored results together as one pickled tuple (ms1, ms2).
# Pickle files can execute code when loaded, so only load trusted copies.
with open("./data_withScores.pkl", mode="wb") as fout:
    pickle.dump((ms1, ms2), fout)