In [1]:
from urslib2 import RSS
import urslib2.SS.Relation as Relation
import os, glob

## Merging identical hits found for several reference motifs

In [None]:
# Parse rawhits.tsv into `hits`, a list of dicts keyed by the header's column names.
# SIZE is converted to int and RMSD to float. Every hit also gets a synthetic 'ID':
# the PDB column followed by "<position>=<value>;" for each position listed below.
ID_POSITIONS = ('-2b', '-2n', '-1b', '-1n', 'L1', 'L1p', '1n',
                '1b', '2b', '2n', '3b', '3n', '4b', '4n')

hits = []
with open("rawhits.tsv") as file:
    lines = file.readlines()

title = lines[0].strip().split('\t')
for line in lines[1:]:
    # Remove only the line terminator. A bare strip() would also eat trailing tabs,
    # i.e. empty last columns, leaving the row shorter than the header.
    row = line.rstrip('\r\n')
    if not row.strip():
        continue  # tolerate blank lines, e.g. at the end of the file
    fields = row.split('\t')
    # Pad rows whose trailing empty columns were trimmed before they reached us.
    fields += [''] * (len(title) - len(fields))
    hit = dict(zip(title, fields))
    hit['SIZE'] = int(hit['SIZE'])
    hit['RMSD'] = float(hit['RMSD'])
    hit['ID'] = hit['PDB'] + ''.join(pos + '=' + hit[pos] + ';' for pos in ID_POSITIONS)
    hits.append(hit)
#print(len(hits))

In [None]:
# For every hit ID remember the best (SIZE, RMSD) pair seen among its rows:
# a larger SIZE always wins; for equal SIZE the smaller RMSD wins.
besthits = {}

for hit in hits:
    key = hit['ID']
    candidate = (hit['SIZE'], hit['RMSD'])
    current = besthits.get(key)
    if current is None:
        # first row seen for this ID
        besthits[key] = candidate
        continue
    bigger = candidate[0] > current[0]
    same_size_closer = candidate[0] == current[0] and candidate[1] < current[1]
    if bigger or same_size_closer:
        besthits[key] = candidate
#print(len(besthits))

In [4]:
# Write mergedhits.tsv: the header followed by exactly one row per hit ID, namely
# the first row whose (SIZE, RMSD) equals the best pair recorded for that ID.
written_ids = set()
with open("mergedhits.tsv",'w') as outp:
    outp.write("\t".join(title)+'\n')
    for hit in hits:
        if hit['ID'] in written_ids:
            # Another row with the same ID tied on (SIZE, RMSD) and was already
            # written; emitting this one too would leave a duplicate ID in the file.
            continue
        if (hit['SIZE'],hit['RMSD']) == besthits[hit['ID']]:
            outp.write('\t'.join([str(hit[x]) for x in title])+'\n')
            written_ids.add(hit['ID'])

## Removing sub-hits and hits missing at least one of these positions: -1b, -1n, L1, 1n, 1b, 2b, 2n

In [None]:
# Parse mergedhits.tsv into `hits`, a list of dicts keyed by the header's column names.
# SIZE is converted to int and RMSD to float. Every hit also gets a synthetic 'ID':
# the PDB column followed by "<position>=<value>;" for each position listed below.
ID_POSITIONS = ('-2b', '-2n', '-1b', '-1n', 'L1', 'L1p', '1n',
                '1b', '2b', '2n', '3b', '3n', '4b', '4n')

hits = []
with open("mergedhits.tsv") as file:
    lines = file.readlines()

title = lines[0].strip().split('\t')
for line in lines[1:]:
    # Remove only the line terminator. A bare strip() would also eat trailing tabs,
    # i.e. empty last columns, leaving the row shorter than the header.
    row = line.rstrip('\r\n')
    if not row.strip():
        continue  # tolerate blank lines, e.g. at the end of the file
    fields = row.split('\t')
    # Pad rows whose trailing empty columns were trimmed before they reached us.
    fields += [''] * (len(title) - len(fields))
    hit = dict(zip(title, fields))
    hit['SIZE'] = int(hit['SIZE'])
    hit['RMSD'] = float(hit['RMSD'])
    hit['ID'] = hit['PDB'] + ''.join(pos + '=' + hit[pos] + ';' for pos in ID_POSITIONS)
    hits.append(hit)
#print(len(hits))

In [None]:

def IsInferior(hit1, hit2):
    """Tell whether hit2 is covered by hit1 over the 14 compared positions.

    Returns True when, at every position, hit2 either has no value or has
    the same value as hit1. Returns False if hit2 holds a value at some
    position where hit1 has none or has a different one.
    Note that two hits with identical positions cover each other.
    """
    positions = ('-2b', '-2n', '-1b', '-1n', 'L1', 'L1p', '1n',
                 '1b', '2b', '2n', '3b', '3n', '4b', '4n')
    values1 = [hit1[key] for key in positions]
    values2 = [hit2[key] for key in positions]
    for mine, theirs in zip(values1, values2):
        # A filled position in hit2 must be filled identically in hit1.
        if theirs and not (mine and mine == theirs):
            return False
    return True

# Collect into `inferior` the IDs of hits that are sub-hits of another hit of the
# same PDB value. IsInferior(a, b) is true when every position filled in b is
# filled identically in a, so in that case b is the one flagged.
inferior = set()

# Group the hits by their PDB column so only hits of the same entry are compared.
pdbs = {}
for hit in hits:
    pdbs.setdefault(hit['PDB'], []).append(hit)

for pdb in pdbs:
    print(pdb[:4],end=' ')  # progress indicator
    group = pdbs[pdb]
    for i in range(len(group)-1):
        for j in range(i+1,len(group)):
            first, second = group[i], group[j]
            if first['ID'] == second['ID']:
                # Duplicate rows of one hit (same PDB, same value at every position).
                # IsInferior is true for identical hits, and `inferior` is keyed by
                # ID, so flagging here would discard every copy of the hit.
                continue
            if IsInferior(first, second):
                inferior.add(second['ID'])
            elif IsInferior(second, first):
                inferior.add(first['ID'])
#len(inferior)

In [12]:
# Write filteredhits.tsv: the header, then every hit that was not flagged as
# inferior and that has a non-empty value at each of the required positions.
required = ('-1b','-1n', 'L1', '1n', '1b', '2b', '2n')
with open("filteredhits.tsv",'w') as outp:
    header = "\t".join(title)
    outp.write(header+'\n')
    for hit in hits:
        if hit['ID'] in inferior:
            continue
        if not all(hit[x] for x in required):
            continue
        cells = [str(hit[x]) for x in title]
        outp.write('\t'.join(cells)+'\n')