**1/14/21**

The purpose of this notebook is to collect all of the human and bacterial proteins identified in the samples and write them to FASTA files, so that they can be submitted to eggNOG-mapper for Gene Ontology (GO) annotation.

In [1]:
from elliot_utils import *

In [2]:
# Directory (relative to the current working directory) under which this notebook's FASTA outputs are placed
analysisPath = Path.cwd() / 'analysis_files/functional_analysis/'

In [3]:
# Gather the tab-separated result files and the FASTA database files.
# NOTE(review): collectProtsInDict pairs results[i] with refs[i] by position, so this relies on
# getOrderedFiles returning both lists in matching order — confirm against elliot_utils.
results = getOrderedFiles(HYBRID_RESULTS, '.tsv')
dbs = getOrderedFiles(HYBRID_DB, '.fasta')
# One ProtRef per database file; used below to look proteins up via getProt().
refs = [ProtRef(x) for x in dbs]

In [4]:
# Peptide collections, one per category; they are used below as allow-lists when collecting proteins.
# NOTE(review): the filtering criteria live in elliot_utils.getFilteredPeptides and are not visible here.
hPeps = getFilteredPeptides(results, 'human')
bPeps = getFilteredPeptides(results, 'bacteria')

In [5]:
# Pull all of the valid, hit proteins out of the results and add them to the dictionary of proteins
# Collapses proteins with the same ID together, adding new taxa to the same protein ID
# Returns a dictionary in the format key=protID, value=protein object
def collectProtsInDict(results, refs, allowedPeps, typeOfProt):
    """Collect the hit proteins of one category from every results file.

    ``results[i]`` is read as a tab-delimited table and its protein hits are
    looked up in ``refs[i]``, so both sequences must be in matching order.
    Proteins that share an ID are collapsed into a single entry: the first
    object seen is kept and the taxa of later ones are added to it.

    Parameters:
        results: sequence of path objects exposing ``open`` (one per results file).
        refs: sequence of reference objects exposing ``getProt(hit)``, aligned
            with ``results``. Extra trailing refs are ignored.
        allowedPeps: container of peptides; rows whose peptide is not in it are skipped.
        typeOfProt: category label; only rows for which ``determineIDType``
            returns this value are used.

    Returns:
        dict with key=protID, value=protein object.

    Raises:
        IndexError: if there are fewer refs than results.

    NOTE(review): the stored values are the very objects returned by
    ``ref.getProt`` and ``addTaxa`` mutates them in place. If ProtRef hands out
    shared objects, the merged taxa are also visible through the reference and
    persist across repeated calls — confirm whether that matters.
    """
    # Fail before any file is read instead of part-way through the loop.
    if len(refs) < len(results):
        raise IndexError(
            'collectProtsInDict needs one reference per results file: got '
            + str(len(results)) + ' results but only ' + str(len(refs)) + ' refs'
        )

    outDict = {}  # key=protID, value=protein object
    for res, ref in zip(results, refs):
        # newline='' lets the csv module do its own line-ending handling,
        # as the csv documentation requires for files passed to csv.reader.
        with res.open(mode='r', newline='') as infile:
            reader = csv.reader(infile, delimiter='\t')
            for row in reader:
                protType = determineIDType(row)
                # Rows classified as 'first' carry no hit (presumably the header row — confirm).
                if protType == 'first':
                    continue
                # Stop reading this file at the first non-significant row.
                # NOTE(review): assumes rows are ordered by significance — confirm.
                if not isSignificant(row):
                    break
                if row[PEPTIDE] in allowedPeps and protType == typeOfProt:
                    for hit in getProteinHitList(row, typeOfProt):
                        prot = ref.getProt(hit)
                        existing = outDict.get(prot.id)
                        if existing is None:
                            outDict[prot.id] = prot
                        elif existing is not prot:
                            # Merge taxa into the stored entry. Iterate over a
                            # snapshot so the collection is never modified
                            # while it is being iterated.
                            for taxon in list(prot.taxa):
                                existing.addTaxa(taxon)
                        # existing is prot: same object seen again, nothing new to merge.
    return outDict

In [6]:
# Build the protID -> protein dictionaries, one per category, each restricted to the
# peptides that were filtered for that category.
hProts = collectProtsInDict(results, refs, hPeps, 'human')
bProts = collectProtsInDict(results, refs, bPeps, 'bacteria')

In [7]:
# Dump every protein held in the dictionary into the given fasta file
def writeDictToFasta(protDict, output):
    """Write the entries of all proteins in ``protDict`` to ``output``.

    Each value's ``getEntry()`` text is concatenated, in dictionary order and
    without any separator, and written in one go. The file is opened in 'w'
    mode with ``newline=''``, so existing content is replaced and line endings
    are written exactly as returned by ``getEntry()``.

    Parameters:
        protDict: dictionary whose values expose ``getEntry()`` returning a string.
        output: path of the file to write.
    """
    # Assemble the full text before touching the file, so a failing getEntry()
    # never leaves a truncated output behind.
    fastaText = ''.join(protein.getEntry() for protein in protDict.values())
    with open(output, 'w', newline='') as fastaFile:
        fastaFile.write(fastaText)

In [8]:
# Output locations: human and bacterial hits go to separate fasta files
humanFile = analysisPath / 'hit_human_proteins.fasta'
bacteriaFile = analysisPath / 'hit_bacterial_proteins.fasta'

In [9]:
# Write one fasta file per category; any existing file at these paths is overwritten.
writeDictToFasta(hProts, humanFile)
writeDictToFasta(bProts, bacteriaFile)