In [None]:
# import packages
import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
import Bio

Import datasets and define variables

In [None]:
# import data from computer
path: str = './data/' #folder where files are stored
species: pd.DataFrame = pd.read_csv(os.path.join(path, "cross-species.csv"), dtype=str)   # imports file from the folder as species, based on name
identifiers: pd.DataFrame = pd.read_csv(os.path.join(path, "identifiers.tsv"), sep='\t', dtype=str) # imports 3rd file from the folder, based on name

#import fasta file with biopython (Bio)
from Bio import SeqIO
fasta_id = []
fasta_seq = []
with open(os.path.join(path, "identifiers.fasta"), 'r') as handle:
    for record in SeqIO.parse(handle, "fasta"): 
        fasta_id.append(record.id) 
        fasta_seq.append(str(record.seq))
# Create df prot_seq which includes proper ID and sequences
fasta_id_df = pd.DataFrame([item.split('|') for item in fasta_id], columns=['0', 'From','ID']) # creates a df by splitting fasta_id list into 3 columns
fasta_id_EntryName:list = fasta_id_df['ID'] # creates a list with the Entry Names from the split df
prot_seq = pd.DataFrame(list(zip(fasta_id_EntryName, fasta_seq)), columns = ["ID", "Sequence"]) # creates a df by combining the Entry Names and the Sequences



#Define datatype for each column
species['fold_change'] = species['fold_change'].astype('float64') # changes fold_change column to float64
species['temperature'] = species['temperature'].astype('float64') # changes temperature column to int64
identifiers['Length'] = identifiers['Length'].astype('float64') # changes Length column to int64




Add protein sequence from protein_seq dataset to identifiers to create identifiers_seq, based on 'ID' and 'Entry Name' respectively

Create joint data set from identifiers_seq and species_seq based on gene_name and Gene Names1 respectively

In [None]:
# merge dataframes based on Entry Name (df.identifiers) andID (df.prot_seq), if they are similar add the sequence to identifiers, if not add NaN
identifiers_seq = identifiers.merge(prot_seq, how='left', left_on='Entry Name', right_on='ID')[identifiers.columns.tolist() + ['Sequence']] # merges identifiers and prot_seq on Entry Name and ID
identifiers_seq.head()

#merge species and identifiers_seq based on gene_name and Gene Names1
identifiers_seq[['Gene Names1', 'Gene Names2']] = identifiers_seq['Gene Names'].str.split(expand = True, n=1) # splits the Gene Names column into 2 columns
species_seq = pd.merge(left = species, right = identifiers_seq, how='outer', left_on='gene_name', right_on='Gene Names1') # merges species and identifiers_seq on gene_name and Gene Names1
#species_seq.head()

Drop all entries from species dataset which don't have a sequence.
Delete unnessesary columns ('Gene Names1', 'Gene Names2')

In [None]:
species_seq.dropna(subset = ['Sequence'], inplace=True) # drops rows with NaN in the Sequence column

for col in ['Gene Names1', 'Gene Names2']:
    if col in species_seq.columns:
        species_seq.drop(columns = ['Gene Names1', 'Gene Names2'], inplace=True) # drops Gene Names1 and Gene Names2 columns
#species_seq.head()


Extract list of unique run_names from species and manually seperate in prokaryotes and eukaryotes. Create split datasets for eukaryotes and prokaryotes based on species_seq

In [None]:
# Get a list of unique entries in the 'run_name' column
#print(species['run_name'].unique())

# Based on uniqe run names all entries split into eukaryotes and prokaryotes (manually)
eukaryotes = ('Saccharomyces cerevisiae lysate', 'Arabidopsis thaliana seedling lysate',
 'Mus musculus liver lysate', 'Drosophila melanogaster SII lysate',
 'Caenorhabditis elegans lysate', 'Danio rerio Zenodo lysate',
 'Mus musculus BMDC lysate', 'Oleispira antarctica_RB-8_lysate_R1',
 'Homo sapiens Jurkat cells', 'Homo sapiens K562 cells')
prokaryotes:list = ['Bacillus subtilis_168_lysate_R1', 'Escherichia coli lysate',
 'Geobacillus stearothermophilus NCA26 lysate',
 'Thermus thermophilus HB27 lysate', 'Thermus thermophilus HB27 cells',
 'Escherichia coli cells', 'Picrophilus torridus DSM9790 lysate']

#creates new dataframes that only contain prokaryotes or eukaryotes based
species_eukaryotes = species_seq[species_seq['run_name'].isin(eukaryotes)]
species_prokaryotes = species_seq[species_seq['run_name'].isin(prokaryotes)]

print(species_prokaryotes.dtypes)





Create a list of each unique protein for each organism

In [11]:
unique_org_prot=[]

for n in range(len(species_prokaryotes)): #iteratue through entire dataset
    name = species.loc[n, 'run_name'] + '_' + species.loc[n, 'Protein_ID'] #combine entry from 'run_name' and 'Protein_ID', seperated by '_' 
    #append name to unique_org_prot if entry is not yet included
    if unique_org_prot != name:
        unique_org_prot.append(name)
        n += 1
    else:
        n +=1


['Bacillus subtilis_168_lysate_R1_C0H3Q1_ytzI', 'Bacillus subtilis_168_lysate_R1_C0H3V2_mtlF', 'Bacillus subtilis_168_lysate_R1_C0H3V8_yyzM', 'Bacillus subtilis_168_lysate_R1_C0H3Y1_yhzD', 'Bacillus subtilis_168_lysate_R1_C0H3Z2_yjzH', 'Bacillus subtilis_168_lysate_R1_C0H405_ykzS', 'Bacillus subtilis_168_lysate_R1_C0H423_yozV', 'Bacillus subtilis_168_lysate_R1_C0H437_yoyH', 'Bacillus subtilis_168_lysate_R1_C0H453_yqzM', 'Bacillus subtilis_168_lysate_R1_C0SP82_yoaE', 'Bacillus subtilis_168_lysate_R1_C0SP85_yukE', 'Bacillus subtilis_168_lysate_R1_C0SP86_sftA', 'Bacillus subtilis_168_lysate_R1_C0SP89_yoaH', 'Bacillus subtilis_168_lysate_R1_C0SP93_accD', 'Bacillus subtilis_168_lysate_R1_C0SP94_yhfQ', 'Bacillus subtilis_168_lysate_R1_C0SP95_ycnJ', 'Bacillus subtilis_168_lysate_R1_C0SP98_ykfD', 'Bacillus subtilis_168_lysate_R1_C0SPA0_amyX', 'Bacillus subtilis_168_lysate_R1_C0SPA5_adhA', 'Bacillus subtilis_168_lysate_R1_C0SPB0_ytcI', 'Bacillus subtilis_168_lysate_R1_C0SPB1_yukJ', 'Bacillus su

Excract ecoli cells only to test some graphs.

In [None]:
species_Ecoli_cells = species_prokaryotes[species_prokaryotes['run_name'].isin(['Escherichia coli cells'])]
species_Ecoli_cells.head()

#reassign floats to values bcs they change after extracting e.coli (why???)
species_Ecoli_cells['fold_change'] = species_Ecoli_cells['fold_change'].astype('float64') # changes fold_change column to float64
species_Ecoli_cells['temperature'] = species_Ecoli_cells['temperature'].astype('float64') # changes temperature column to int64
species_Ecoli_cells['Length'] = species_Ecoli_cells['Length'].astype('float64') # changes Length column to int64

species_Ecoli_cells_lacZ = species_Ecoli_cells[species_Ecoli_cells['gene_name'].isin(['lacZ'])]
species_Ecoli_cells_rpmF=species_Ecoli_cells[species_Ecoli_cells['gene_name'].isin(['rpmF'])]
sns.scatterplot(data=species_Ecoli_cells_lacZ, x='temperature', y='fold_change')
plt.show()
print(species_Ecoli_cells_lacZ.dtypes)


In [None]:
print(species_Ecoli_cells.dtypes)

In [None]:

## aminoacid composition
AA_hydrophobic:list = ['A', 'V', 'I', 'L', 'M', 'F', 'T', 'Y', 'W', ]
AA_polar_neutral:list = ['N', 'C', 'Q', 'S', 'T']
AA_acidic:list = ['D', 'E']
AA_basic:list = ['R', 'H', 'K']
