In [1]:
# import packages
import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
import Bio
import statsmodels.api as sm
from pathlib import Path

Import datasets and define variables

In [2]:
# Load every input table from the "data" folder inside the parent of the
# current working directory.
# The folder name is built by plain concatenation: re-using double quotes inside
# a double-quoted f-string is a SyntaxError on Python versions before 3.12.
path: str = os.path.abspath(os.path.join(os.getcwd(), "..")) + "/data"  # folder where files are stored

# All tables are read with dtype=str; numeric columns are converted explicitly below.
species: pd.DataFrame = pd.read_csv(os.path.join(path, "cross-species.csv"), dtype=str)
identifiers: pd.DataFrame = pd.read_csv(os.path.join(path, "identifiers.tsv"), sep='\t', dtype=str)
prokaryotes_auc: pd.DataFrame = pd.read_csv(os.path.join(path, "data_prokaryotes_auc.csv"), dtype=str)
Uniprot_ID_mapping: pd.DataFrame = pd.read_csv(os.path.join(path, "Uniprot_ID_mapping.tsv"), sep='\t', dtype=str)

# Read the FASTA file with Biopython (Bio): collect record ids and sequences in parallel lists
from Bio import SeqIO
fasta_id = []
fasta_seq = []
with open(os.path.join(path, "identifiers.fasta"), 'r') as handle:
    for record in SeqIO.parse(handle, "fasta"):
        fasta_id.append(record.id)
        fasta_seq.append(str(record.seq))

# Build prot_seq (ID + Sequence).
# Each record id is split on '|' into three fields; the third field is used as the ID.
# The DataFrame constructor raises if an id does not yield exactly three fields.
fasta_id_df = pd.DataFrame([item.split('|') for item in fasta_id], columns=['0', 'From', 'ID'])
fasta_id_EntryName: pd.Series = fasta_id_df['ID']  # third '|' field of every record id (a Series, not a list)
prot_seq = pd.DataFrame(list(zip(fasta_id_EntryName, fasta_seq)), columns=["ID", "Sequence"])  # pairs ids with sequences by position

# Convert the numeric columns (read as strings above) to float64
species['fold_change'] = species['fold_change'].astype('float64')
species['temperature'] = species['temperature'].astype('float64')
identifiers['Length'] = identifiers['Length'].astype('float64')

# Reset the row index of both frames
species = species.reset_index(drop=True)
identifiers = identifiers.reset_index(drop=True)




In [3]:
# Remove the listed columns from the UniProt mapping table.
# errors='ignore' silently skips any listed column that is not present.
Uniprot_ID_mapping = Uniprot_ID_mapping.drop(
    columns=[
        'From',
        'Entry',
        'Gene Ontology (cellular component)',
        'Gene Ontology (biological process)',
        '3D',
        'Subcellular location [CC]',
        'Intramembrane',
        'Topological domain',
        'Transmembrane',
    ],
    errors='ignore',
)


Add the protein sequence from the prot_seq dataframe to identifiers to create identifiers_seq, matching prot_seq 'ID' against identifiers 'Entry Name'.

Create the joint dataset species_seq from species and identifiers_seq, matching the part of Protein_ID before the first underscore against 'Entry'.

In [4]:
# Attach sequences to identifiers: left join of identifiers ('Entry Name') with
# prot_seq ('ID'). Rows without a matching FASTA record get NaN in 'Sequence'.
# Only the original identifiers columns plus 'Sequence' are kept.
identifiers_seq = identifiers.merge(
    prot_seq, how='left', left_on='Entry Name', right_on='ID'
)[[*identifiers.columns, 'Sequence']]

# Split Protein_ID at the first underscore into two helper columns on species
species[['ProtID1', 'ProtID2']] = species['Protein_ID'].str.split("_", n=1, expand=True)

# Outer join of species (on 'ProtID1') with identifiers_seq (on 'Entry')
species_seq = species.merge(identifiers_seq, how='outer', left_on='ProtID1', right_on='Entry')

# Keep only rows that have a sequence, then discard the helper columns
species_seq = species_seq.dropna(subset=['Sequence']).drop(columns=['ProtID1', 'ProtID2'])

Extract the list of unique run_names from species and manually separate them into prokaryotes and eukaryotes. Create the prokaryote dataset (prokaryotes_all) from species_seq; the cell below does not build a eukaryote dataset.

In [5]:
# The unique run names can be inspected with:
#print(species['run_name'].unique())

# Hand-curated run names that are treated as prokaryotes
prokaryotes_list: list = [
    'Bacillus subtilis_168_lysate_R1',
    'Escherichia coli lysate',
    'Geobacillus stearothermophilus NCA26 lysate',
    'Thermus thermophilus HB27 lysate',
    'Thermus thermophilus HB27 cells',
    'Escherichia coli cells',
    'Picrophilus torridus DSM9790 lysate',
    'Oleispira antarctica_RB-8_lysate_R1',
]

# Keep only the species_seq rows whose run_name is in the list, with a fresh 0..n-1 index
prokaryotes_all = (
    species_seq[species_seq['run_name'].isin(prokaryotes_list)]
    .reset_index(drop=True)
)

In [6]:
# run_name spellings used in prokaryotes_all.
prokaryotes_all_list = [
    'Thermus thermophilus HB27 lysate',
    'Thermus thermophilus HB27 cells',
    'Picrophilus torridus DSM9790 lysate',
    'Bacillus subtilis_168_lysate_R1',
    'Escherichia coli lysate',
    'Escherichia coli cells',
    'Geobacillus stearothermophilus NCA26 lysate',
    'Oleispira antarctica_RB-8_lysate_R1',
]

# Sample codes used in prokaryotes_auc. This list is zipped POSITION BY POSITION
# with prokaryotes_all_list, so entry i here must describe the same run as entry i there.
# The two E. coli codes were previously crossed (the '_cells' code sat opposite the
# lysate run name); they are now ordered like the T.thermophilus pair.
# NOTE(review): assumes the '_cells' suffix marks the intact-cell sample, as it does
# for T.thermophilus - confirm against the source of data_prokaryotes_auc.csv.
prokaryotes_auc_list = [
    'T.thermophilus_P023431',        # Thermus thermophilus HB27 lysate
    'T.thermophilus_cells_P023757',  # Thermus thermophilus HB27 cells
    'P.torridus_P023430',            # Picrophilus torridus DSM9790 lysate
    'B.subtilis_P023755',            # Bacillus subtilis_168_lysate_R1
    'E.coli_P023428',                # Escherichia coli lysate
    'E.coli_cells_P023756',          # Escherichia coli cells
    'G.stearothermophilus_P023429',  # Geobacillus stearothermophilus NCA26 lysate
    'O.antarctica_P028248',          # Oleispira antarctica_RB-8_lysate_R1
]
# E.coli_ArcticExpress_P028249 not included

# Prokaryotes from R-shiny (https://meltomeatlas.proteomics.wzw.tum.de/master_meltomeatlasapp/),
# kept as a note only (these were stray string expressions before):
#   'Escherichia coli lysate',
#   'Escherichia coli cells',
#   'Geobacillus stearothermophilus NCA26 lysate',
#   'Thermus thermophilus HB27 lysate',
#   'Thermus thermophilus HB27 cells',
#   'Picrophilus torridus DSM790 lysate'   (sic - the run_name list above spells it 'DSM9790')

('Picrophilus torridus DSM790 lysate',)

Adjust organism name in prokaryotes_auc to match names from prokaryotes_all

In [7]:
# Translate the sample codes in prokaryotes_auc to the run_name spelling;
# the two lists are paired position by position. Values not in the mapping stay unchanged.
sample_renames = dict(zip(prokaryotes_auc_list, prokaryotes_all_list))
prokaryotes_auc['Sample'] = prokaryotes_auc['Sample'].replace(sample_renames)

# Remove the E.coli_ArcticExpress_P028249 sample
keep_rows = prokaryotes_auc['Sample'].ne('E.coli_ArcticExpress_P028249')
prokaryotes_auc = prokaryotes_auc[keep_rows]

Combine prokaryotes_all dataframe with prokaryotes_auc

In [8]:
# Composite join keys: run/sample label concatenated directly with the protein ID
prokaryotes_all['run_name_Protein_ID'] = prokaryotes_all['run_name'] + prokaryotes_all['Protein_ID']
prokaryotes_auc['Sample_Protein_ID'] = prokaryotes_auc['Sample'] + prokaryotes_auc['Protein_ID']

# Left join: every prokaryotes_all row is kept; the prokaryotes_auc columns are
# NaN where no composite key matches.
prokaryotes_all = prokaryotes_all.merge(
    prokaryotes_auc,
    how='left',
    left_on='run_name_Protein_ID',
    right_on='Sample_Protein_ID',
)


Drop unnecessary columns

In [9]:
# Columns to remove from prokaryotes_all
col_drop = ['Reviewed', 'run_name_Protein_ID', 'channel', 'From', 'Protein_ID_y',
            'Proteinname_P023428_E.coli', 'meltPoint_P023428_E.coli', 'gene_name_y',
            'Sample_Protein_ID', 'Sample', 'Entry', 'Gene Names', 'uniprot_ac']
# One drop call instead of a per-column loop; errors='ignore' skips any listed
# column that is not present, which is what the former membership check did.
prokaryotes_all = prokaryotes_all.drop(columns=col_drop, errors='ignore')

# Strip the '_x' merge suffix from the two columns that are kept
prokaryotes_all = prokaryotes_all.rename(
    columns={'Protein_ID_x': 'Protein_ID', 'gene_name_x': 'gene_name'}
)

# 'auc' was read as a string; convert it to float64
prokaryotes_all = prokaryotes_all.astype({'auc': 'float64'})



Combine Uniprot_ID_mapping with prokaryotes dataframe

In [10]:
# Left join of prokaryotes_all with the UniProt mapping table on 'Entry Name':
# every prokaryotes_all row is kept and the mapping columns are added where the name matches.
prokaryotes_all = prokaryotes_all.merge(
    Uniprot_ID_mapping,
    how='left',
    left_on='Entry Name',
    right_on='Entry Name',
)

Re-order dataframe columns

In [11]:
# Print the current column labels of prokaryotes_all (used to write the ordered column list in the next cell)
print(prokaryotes_all.columns)

Index(['run_name', 'Protein_ID', 'gene_name', 'meltPoint', 'fold_change',
       'temperature', 'Entry Name', 'Protein names', 'Organism', 'Length',
       'Gene Ontology (biological process)',
       'Gene Ontology (cellular component)',
       'Gene Ontology (molecular function)', 'Temperature dependence',
       'Sequence', 'auc', 'Gene Ontology IDs', 'Helix', 'Turn', 'Beta strand',
       'AlphaFoldDB', 'PDB', 'KEGG', 'EC number'],
      dtype='object')


In [12]:
# Desired column order for prokaryotes_all; selecting with this list raises a
# KeyError if any listed column is missing.
columns_ordered = [
    'run_name',
    'Organism',
    'Protein_ID',
    'Entry Name',
    'gene_name',
    'Protein names',
    'Temperature dependence',
    'Length',
    'Sequence',
    'temperature',
    'fold_change',
    'meltPoint',
    'auc',
    'Gene Ontology IDs',
    'Gene Ontology (biological process)',
    'Gene Ontology (cellular component)',
    'Gene Ontology (molecular function)',
    'KEGG',
    'EC number',
    'Helix',
    'Turn',
    'Beta strand',
    'AlphaFoldDB',
    'PDB',
]

prokaryotes_all = prokaryotes_all[columns_ordered]

Create a dataframe with one row per unique protein (Protein_ID) for each run_name

In [13]:
# prokaryotes: first row of every (run_name, Protein_ID) pair, re-indexed from 0,
# with the numeric columns cast to float64 in a single step
prokaryotes = (
    prokaryotes_all
    .drop_duplicates(subset=['run_name', 'Protein_ID'], keep='first')
    .reset_index(drop=True)
    .astype({
        'meltPoint': 'float64',
        'temperature': 'float64',
        'fold_change': 'float64',
        'Length': 'float64',
        'auc': 'float64',
    })
)

In [14]:
# Write the full table, the unique-protein table, and the 'Entry Name' column of
# the unique-protein table as CSV files into the data folder (row index omitted).
for frame, filename in (
    (prokaryotes_all, 'prokaryotes_all.csv'),
    (prokaryotes, 'prokaryotes_unique_prot.csv'),
    (prokaryotes['Entry Name'], 'Uniprot_IDs.csv'),
):
    frame.to_csv(os.path.join(path, filename), index=False)