**1/24/24**

The purpose of this notebook is to take bacterial protein homologs identified by BLASTp searches and determine how many of them are present in the metaproteomics protein data, as well as in the genomes of vaginal bacteria more generally.

In [1]:
from elliot_utils import *

In [2]:
# Directory holding every input and output file used in this notebook.
analysisPath = Path.cwd() / 'analysis_files/bacterial_homologs/'
# Bacterial protein summary table; passed as the hitsFile argument of writeHomologsToCSV below.
bacterialData = analysisPath / 'protein_summary_bacteria_strict.csv'

In [3]:
def getHomologsFromCSV(filePath):
    """Return the set of homolog protein ids found in the first column of a CSV file.

    Parameters:
        filePath: path object exposing ``open`` (e.g. ``pathlib.Path``) pointing at the
            CSV file. The file is decoded as ``utf-8-sig`` so that a leading byte-order
            mark is not glued onto the first id.

    Returns:
        set of str: the distinct first-column values. Every non-blank row contributes,
        including the first one (no header row is skipped).
    """
    homologIds = set()
    # newline='' lets the csv module do its own line-ending handling.
    with filePath.open(mode='r', encoding='utf-8-sig', newline='') as infile:
        for row in csv.reader(infile):
            # csv.reader yields [] for a blank line; row[0] would raise IndexError there.
            if row:
                homologIds.add(row[0])
    return homologIds

def writeHomologsToCSV(hitsFile, homologSet, outPath):
    """Copy the rows of the hit bacterial proteins file that belong to the given homologs.

    Parameters:
        hitsFile: path object exposing ``open``; CSV whose first row is a header and whose
            first column holds the protein id of each following row.
        homologSet: container of protein ids to keep (membership is tested with ``in``).
        outPath: path object exposing ``open``; destination CSV, overwritten if present.

    The output contains the first row of ``hitsFile`` unchanged, followed by every later
    row whose first column is in ``homologSet``, in their original order.
    """
    keptRows = []
    # All matching rows are collected before the output is opened, so the input is fully
    # read by the time anything is written.
    with hitsFile.open(mode='r', newline='') as infile:
        for rowNumber, row in enumerate(csv.reader(infile)):
            if rowNumber == 0:
                # The first row is treated as the header and always kept.
                keptRows.append(row)
            elif row and row[0] in homologSet:
                # `row and ...` skips blank lines, for which csv.reader yields [] and
                # row[0] would raise IndexError.
                keptRows.append(row)
    with outPath.open(mode='w', newline='') as outfile:
        csv.writer(outfile).writerows(keptRows)

def writeGenomeHomologsToCSV(blastpResult, bacterialRef, outpath, evalueCutoff=0.0001):
    """Write the significant hits of a blastp result, annotated from the reference database.

    Parameters:
        blastpResult: path object exposing ``open``; header-less CSV of blastp hits.
            Column index 1 is the hit protein id and column index 10 is the e-value
            (presumably the default 12-column BLAST tabular layout - confirm against
            the export settings).
        bacterialRef: vaginal bacterial protein reference database; must provide
            ``getProt(id)`` returning an object with ``id``, ``name`` and
            ``getTaxaString()``.
        outpath: path object exposing ``open``; destination CSV, overwritten if present.
        evalueCutoff: hits are kept only when their e-value is strictly below this
            value. Defaults to 0.0001, the threshold this function used before the
            parameter existed.

    The output has the header ``id, protein, taxa, e-value`` followed by one row per kept
    hit, in input order. The e-value is written exactly as it appears in the input.
    Whatever ``bacterialRef.getProt`` does for an id it does not know (not visible here)
    propagates to the caller.
    """
    outputRows = [['id', 'protein', 'taxa', 'e-value']]
    with blastpResult.open(mode='r', newline='') as infile:
        for row in csv.reader(infile):
            # csv.reader yields [] for blank lines; row[10] would raise IndexError there.
            if not row:
                continue
            if float(row[10]) < evalueCutoff:
                hit = bacterialRef.getProt(row[1])
                outputRows.append([hit.id, hit.name, hit.getTaxaString(), row[10]])
    with outpath.open(mode='w', newline='') as outfile:
        csv.writer(outfile).writerows(outputRows)

In [11]:
### Look for homologs of proteins in the metaproteomics data
# Phosphoenolpyruvate carboxykinase homologs:
# read the ids from homologs_pcka.csv, then copy their rows from the bacterial summary into hits_pcka.csv
pckaHomologs = getHomologsFromCSV(analysisPath / 'homologs_pcka.csv')
writeHomologsToCSV(
    bacterialData,
    pckaHomologs,
    analysisPath / 'hits_pcka.csv',
)

In [13]:
# Malate dehydrogenase homologs:
# read the ids from homologs_mdh.csv, then copy their rows from the bacterial summary into hits_mdh.csv
mdhHomologs = getHomologsFromCSV(analysisPath / 'homologs_mdh.csv')
writeHomologsToCSV(
    bacterialData,
    mdhHomologs,
    analysisPath / 'hits_mdh.csv',
)

In [14]:
# Fumarate dehydrogenase homologs:
# read the ids from homologs_fumb.csv, then copy their rows from the bacterial summary into hits_fumb.csv
# NOTE(review): the file names refer to fumB, which is usually annotated as fumarate hydratase
# (fumarase) rather than a dehydrogenase - confirm the enzyme name used in this label.
fumbHomologs = getHomologsFromCSV(analysisPath / 'homologs_fumb.csv')
writeHomologsToCSV(
    bacterialData,
    fumbHomologs,
    analysisPath / 'hits_fumb.csv',
)

In [12]:
# Pyruvate formate lyase homologs:
# read the ids from homologs_pfl.csv, then copy their rows from the bacterial summary into hits_pfl.csv
pflHomologs = getHomologsFromCSV(analysisPath / 'homologs_pfl.csv')
writeHomologsToCSV(
    bacterialData,
    pflHomologs,
    analysisPath / 'hits_pfl.csv',
)

In [4]:
### Look for homologs of proteins in the genomes of vaginal bacteria
# Protein reference built from community5.fasta; passed to every writeGenomeHomologsToCSV call below.
bacterialRef = ProtRef(analysisPath / 'community5.fasta')

In [20]:
# pfl: filter the blastp hits by e-value and write the annotated hits to genomes_pfl.csv
writeGenomeHomologsToCSV(
    analysisPath / '24-1-24_pfl_all_bacteria.csv',
    bacterialRef,
    analysisPath / 'genomes_pfl.csv',
)

In [21]:
# FDH: filter the blastp hits by e-value and write the annotated hits to genomes_fdh.csv
writeGenomeHomologsToCSV(
    analysisPath / '24-1-22_FDH_All_Bacteria.csv',
    bacterialRef,
    analysisPath / 'genomes_fdh.csv',
)

In [23]:
# NrdD: filter the blastp hits by e-value and write the annotated hits to genomes_NrdD.csv
writeGenomeHomologsToCSV(
    analysisPath / '24-1-23_NrdD_All_Bacteria.csv',
    bacterialRef,
    analysisPath / 'genomes_NrdD.csv',
)

In [25]:
# gdha: filter the blastp hits by e-value and write the annotated hits to genomes_gdha.csv
writeGenomeHomologsToCSV(
    analysisPath / '24-1-24_gdha_all_bacteria.csv',
    bacterialRef,
    analysisPath / 'genomes_gdha.csv',
)

In [26]:
# glua: filter the blastp hits by e-value and write the annotated hits to genomes_glua.csv
writeGenomeHomologsToCSV(
    analysisPath / '24-1-24_glua_all_bacteria.csv',
    bacterialRef,
    analysisPath / 'genomes_glua.csv',
)

In [27]:
# Glutamine synthetase: filter the blastp hits by e-value and write the annotated hits
# to genomes_glutamine-synthetase.csv
writeGenomeHomologsToCSV(
    analysisPath / '24-1-24_glutamine-synthetase_all_bacteria.csv',
    bacterialRef,
    analysisPath / 'genomes_glutamine-synthetase.csv',
)

In [28]:
# TMAO reductase: filter the blastp hits by e-value and write the annotated hits
# to genomes_tmao-reductase.csv
writeGenomeHomologsToCSV(
    analysisPath / '24-1-24_tmao-reductase_all_bacteria.csv',
    bacterialRef,
    analysisPath / 'genomes_tmao-reductase.csv',
)

In [29]:
# Betaine reductase: filter the blastp hits by e-value and write the annotated hits
# to genomes_betaine-reductase.csv
writeGenomeHomologsToCSV(
    analysisPath / '24-1-24_betaine-reductase_all_bacteria.csv',
    bacterialRef,
    analysisPath / 'genomes_betaine-reductase.csv',
)

In [30]:
# cutc: filter the blastp hits by e-value and write the annotated hits to genomes_cutc.csv
writeGenomeHomologsToCSV(
    analysisPath / '24-1-24_cutc_all_bacteria.csv',
    bacterialRef,
    analysisPath / 'genomes_cutc.csv',
)

In [31]:
# Carnitine monooxygenase: filter the blastp hits by e-value and write the annotated hits
# to genomes_carnitine-monooxygenase.csv
writeGenomeHomologsToCSV(
    analysisPath / '24-1-24_carnitine-monooxygenase_all_bacteria.csv',
    bacterialRef,
    analysisPath / 'genomes_carnitine-monooxygenase.csv',
)

In [32]:
# Arginine deiminase: filter the blastp hits by e-value and write the annotated hits
# to genomes_arginine-deiminase.csv
writeGenomeHomologsToCSV(
    analysisPath / '24-1-26_arginine-deiminase_all_bacteria.csv',
    bacterialRef,
    analysisPath / 'genomes_arginine-deiminase.csv',
)

In [33]:
# Ornithine carbamoyltransferase: filter the blastp hits by e-value and write the annotated hits
# to genomes_ornithine-carbamoyltransferase.csv
writeGenomeHomologsToCSV(
    analysisPath / '24-1-26_ornithine-carbamoyltransferase_all_bacteria.csv',
    bacterialRef,
    analysisPath / 'genomes_ornithine-carbamoyltransferase.csv',
)

In [34]:
# Ornithine decarboxylase: filter the blastp hits by e-value and write the annotated hits
# to genomes_ornithine-decarboxylase.csv
writeGenomeHomologsToCSV(
    analysisPath / '24-1-26_ornithine-decarboxylase_all_bacteria.csv',
    bacterialRef,
    analysisPath / 'genomes_ornithine-decarboxylase.csv',
)

In [36]:
# Betaine aldehyde dehydrogenase: filter the blastp hits by e-value and write the annotated hits
# to genomes_betaine-aldehyde-dehydrogenase.csv
writeGenomeHomologsToCSV(
    analysisPath / '24-1-26_betaine_aldehyde_dehydrogenase_all_bacteria.csv',
    bacterialRef,
    analysisPath / 'genomes_betaine-aldehyde-dehydrogenase.csv',
)

In [5]:
# fdni: filter the blastp hits by e-value and write the annotated hits
# to genomes_nitrate-formate-dehydrogenase.csv
writeGenomeHomologsToCSV(
    analysisPath / '24-2-14_fdni_all_bacteria.csv',
    bacterialRef,
    analysisPath / 'genomes_nitrate-formate-dehydrogenase.csv',
)

In [6]:
# fdog: filter the blastp hits by e-value and write the annotated hits
# to genomes_alternate-formate-dehydrogenase.csv
writeGenomeHomologsToCSV(
    analysisPath / '24-2-14_fdog_all_bacteria.csv',
    bacterialRef,
    analysisPath / 'genomes_alternate-formate-dehydrogenase.csv',
)

In [7]:
# fdha: filter the blastp hits by e-value and write the annotated hits
# to genomes_alternate-alternate-formate-dehydrogenase.csv
writeGenomeHomologsToCSV(
    analysisPath / '24-2-14_fdha_all_bacteria.csv',
    bacterialRef,
    analysisPath / 'genomes_alternate-alternate-formate-dehydrogenase.csv',
)