In [42]:
# import packages
import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
import Bio
import statsmodels.api as sm
from scipy.optimize import curve_fit
import pylab



Import datasets and define variables

In [43]:
# import data from computer
path: str = './data/' #folder where files are stored
prokaryotes: pd.DataFrame = pd.read_csv(os.path.join(path, "prokaryotes_unique_prot.csv"), dtype=str)   # imports file from the folder as species, based on name
prokaryotes_all: pd.DataFrame = pd.read_csv(os.path.join(path, "prokaryotes_all.csv"), dtype=str) # imports 3rd file from the folder, based on name

prokaryotes_all['temperature'] = prokaryotes_all['temperature'].astype('float64')
prokaryotes_all['fold_change'] = prokaryotes_all['fold_change'].astype('float64')
prokaryotes_all['Gene Ontology IDs'] = prokaryotes_all['Gene Ontology IDs'].astype('str')


In [45]:
print(prokaryotes_all['Organism'].unique())

['Oleispira antarctica RB-8' 'Thermus thermophilus'
 'Picrophilus torridus (strain ATCC 700027 / DSM 9790 / JCM 10055 / NBRC 100828 / KAW 2/3)'
 'Thermus thermophilus (strain ATCC BAA-163 / DSM 7039 / HB27)'
 'Bacillus subtilis (strain 168)' 'Escherichia coli (strain K12)'
 'Arabidopsis thaliana (Mouse-ear cress)'
 'Danio rerio (Zebrafish) (Brachydanio rerio)'
 'Drosophila melanogaster (Fruit fly)' 'Escherichia coli O157:H7'
 'Escherichia coli str. K-12 substr. MG1655'
 'Escherichia coli O6:H1 (strain CFT073 / ATCC 700928 / UPEC)'
 'Geobacillus stearothermophilus (strain DSM 13240 / CIP 106956 / 10)'
 'Mus musculus (Mouse)' 'Plasmodium falciparum' 'Caenorhabditis elegans']


In [47]:
prokaryotes_Ecoli = prokaryotes[prokaryotes['Organism'].str.contains('Escherichia coli')]

In [37]:
prokaryotes_GO = prokaryotes_all.dropna(subset=['Gene Ontology IDs'])
prokaryotes_GO= prokaryotes_GO[prokaryotes_GO['Gene Ontology IDs'] != 'nan']
GO_dict = {row['Entry Name']: [go_id for go_id in row['Gene Ontology IDs'].split(';') if go_id] for index, row in prokaryotes_GO.iterrows()}



In [38]:
print(GO_dict[list(GO_dict.keys())[0]])


['GO:0005506', ' GO:0009055', ' GO:0016020', ' GO:0016614', ' GO:0020037']


In [17]:
prokaryotes_all.head()

Unnamed: 0,run_name,Organism,Protein_ID,Entry Name,gene_name,Protein names,Temperature dependence,Length,Sequence,temperature,...,Gene Ontology IDs,Gene Ontology (biological process),Gene Ontology (cellular component),Gene Ontology (molecular function),KEGG,Helix,Turn,Beta strand,AlphaFoldDB,PDB
0,Oleispira antarctica_RB-8_lysate_R1,Oleispira antarctica RB-8,R4YQQ5_CccA,R4YQQ5_OLEAN,CccA,"Putative Cytochrome c, class I",,433.0,MLKRIVYSVCGAAIVGLGLFSLFAWNPSIDPVIAPIDTEYSPQIIE...,25.0,...,GO:0005506; GO:0009055; GO:0016020; GO:0016614...,,membrane [GO:0016020],electron transfer activity [GO:0009055]; heme ...,oai:OLEAN_C33350;,,,,R4YQQ5;,
1,Oleispira antarctica_RB-8_lysate_R1,Oleispira antarctica RB-8,R4YQQ5_CccA,R4YQQ5_OLEAN,CccA,"Putative Cytochrome c, class I",,433.0,MLKRIVYSVCGAAIVGLGLFSLFAWNPSIDPVIAPIDTEYSPQIIE...,31.0,...,GO:0005506; GO:0009055; GO:0016020; GO:0016614...,,membrane [GO:0016020],electron transfer activity [GO:0009055]; heme ...,oai:OLEAN_C33350;,,,,R4YQQ5;,
2,Oleispira antarctica_RB-8_lysate_R1,Oleispira antarctica RB-8,R4YQQ5_CccA,R4YQQ5_OLEAN,CccA,"Putative Cytochrome c, class I",,433.0,MLKRIVYSVCGAAIVGLGLFSLFAWNPSIDPVIAPIDTEYSPQIIE...,27.9,...,GO:0005506; GO:0009055; GO:0016020; GO:0016614...,,membrane [GO:0016020],electron transfer activity [GO:0009055]; heme ...,oai:OLEAN_C33350;,,,,R4YQQ5;,
3,Oleispira antarctica_RB-8_lysate_R1,Oleispira antarctica RB-8,R4YQQ5_CccA,R4YQQ5_OLEAN,CccA,"Putative Cytochrome c, class I",,433.0,MLKRIVYSVCGAAIVGLGLFSLFAWNPSIDPVIAPIDTEYSPQIIE...,38.2,...,GO:0005506; GO:0009055; GO:0016020; GO:0016614...,,membrane [GO:0016020],electron transfer activity [GO:0009055]; heme ...,oai:OLEAN_C33350;,,,,R4YQQ5;,
4,Oleispira antarctica_RB-8_lysate_R1,Oleispira antarctica RB-8,R4YQQ5_CccA,R4YQQ5_OLEAN,CccA,"Putative Cytochrome c, class I",,433.0,MLKRIVYSVCGAAIVGLGLFSLFAWNPSIDPVIAPIDTEYSPQIIE...,34.6,...,GO:0005506; GO:0009055; GO:0016020; GO:0016614...,,membrane [GO:0016020],electron transfer activity [GO:0009055]; heme ...,oai:OLEAN_C33350;,,,,R4YQQ5;,


In [12]:
prokaryotes_all['org_prot'] = prokaryotes_all['Organism'] + '_' + prokaryotes_all['Protein_ID'] # creates a new column with the organism and protein name
prokaryotes_all_gr = prokaryotes_all.groupby('org_prot')


Group into cells / lysates, test plots

In [None]:
prokaryotes['Entry Name'].to_csv('prokaryotes_entry_names.csv', index=False)