# Lichen Proteome Workbook for python

Author: Eric Whisnant

Date: 04/02/2025

* Other data wrangling was done in the notebook: `lichen-proteome-pilot.qmd`

In [69]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os


In [81]:
# Read in the two data frames to work with:

combined = pd.read_csv("annotated-lichen-proteins.csv")

peltigera = pd.read_csv("annotated-peltigera-proteins.csv", low_memory=False)

peltigera.shape

(18811, 39)

Here is what needs to happen:

1. Group the rows by metataxa to create seperate dataframes for each metataxa.

* Do this for both dataframes.

DONE

2. Remove columns that are unneccessary

DONE

3. For each `signature_number` extract the sequence and create a fasta file that makes:

"> <match name> | <gene name> | <protein_name>" as the start of the header, followed by the sequence:

DONE

3. Create a consensus profile for each `signature_number` for each of the descriptor columns.

4. Then I need to



## Combined dataframe (skip ahead for Peligera)

In [82]:
combined.columns

## Drop columns (this need only be done once):
combined_clean = combined.drop(columns=["genera", "gene_name.x", "entry", "sequence_uncertainty", "protein_description.y", "Organism"])

# Combined now has these columns dropped
combined_clean.head()

# Group by metataxa:
gb_metataxa_comb = combined_clean.groupby(["metataxa"])

# The way to grab the groups is to use the get_group method:
[gb_metataxa_comb.get_group(x) for x in gb_metataxa_comb.groups]

# The groups are:
phycobiont_comb = gb_metataxa_comb.get_group("Phycobiont")
mycobiont_comb = gb_metataxa_comb.get_group("Mycobiont")

phycobiont_comb.head()
mycobiont_comb.head()



  [gb_metataxa_comb.get_group(x) for x in gb_metataxa_comb.groups]
  phycobiont_comb = gb_metataxa_comb.get_group("Phycobiont")
  mycobiont_comb = gb_metataxa_comb.get_group("Mycobiont")


Unnamed: 0,signature_number,metataxa,database,matched_organisms,protein_name,protein_description.x,nmbr_precursors_experiment_wide,nmbr_precursors_controls,nmbr_precursors_cladonia_sample,nmbr_precursors_peltigera_sample,...,GO_ID,sequence,gene_encoded_by,Ensembl,EnsemblFungi,EnsemblPlants,KEGG,eggNOG,EMBL,RefSeq
78,8,Mycobiont,Lecanaromycetes,Graphis pulverulenta,A0A023W1D8_9LECA,Elongation factor 1-alpha (Fragment),1,0,1,1,...,GO:0003746; GO:0003924; GO:0005525,DGQTREHALLAYTLGVKQLIVAVNKMDTTKWSEDRFNEIVKEVSNF...,,,,,,,KJ441074;,
308,42,Mycobiont,Lecanaromycetes,Alectoria fallacina,A0A8H3FNR3_9LECA,GST C-terminal domain-containing protein,1,0,1,0,...,GO:0004364; GO:0005737,MPNHPPTDGSWRHQLGTPSFPLEPDRYHLYVGLFCPFAHRVIIARE...,,,,,,,CAJPDR010000224;,
309,42,Mycobiont,Lecanaromycetes,Letharia lupina,A0A8H6C7B5_9LECA,GST C-terminal domain-containing protein,1,0,1,0,...,GO:0004364; GO:0005737,MPNHPPTDGSWRHQLGTPSFPVEPDRYHLYVGLFCPFAQRAIIARE...,,,,,,,JACCJB010000024;,
310,42,Mycobiont,Lecanaromycetes,Letharia columbiana,A0A8H6L172_9LECA,GST C-terminal domain-containing protein,1,0,1,0,...,GO:0004364; GO:0005737,MPNHPPTDGSWRHQLGTPSFPVEPDRYYLYVGLFCPFAQRAIIARE...,,,,,,,JACCJC010000054;,
312,42,Mycobiont,Lecanaromycetes,Lepraria neglecta,A0AAD9ZFV9_9LECA,GST N-terminal domain-containing protein,1,0,1,0,...,GO:0004364; GO:0005737,MPNYPPSDGSWRHTLGSPSFPVEPDRYHLYVGLFCPFAHRVIVTRE...,,,,,,,JASNWA010000003;,


Now that they are split apart, lets try and create unique fasta files for each signature_num

In [78]:
# Create fasta files for the signature numbers

# Create a "basename" for the fasta entry
mycobiont_comb["basename"] = mycobiont_comb[["signature_number","metataxa", "matched_organisms", "protein_name" , "protein_description.x" ]].astype(str).agg("|".join, axis=1)
mycobiont_comb["basename"] = mycobiont_comb["basename"].str.replace(" ", "_")

phycobiont_comb["basename"] = phycobiont_comb[["signature_number","metataxa", "matched_organisms", "protein_name" , "protein_description.x" ]].astype(str).agg("|".join, axis=1)
phycobiont_comb["basename"] = phycobiont_comb["basename"].str.replace(" ", "_")

mycobiont_comb

# Group by signature number
gb_sig_num_phycobiont_comb = phycobiont_comb.groupby(["signature_number"])
gb_sig_num_mycobiont_comb = mycobiont_comb.groupby(["signature_number"])

mycobiont_seq = mycobiont_comb[["signature_number", "basename", "sequence"]].copy()
gb_sig_num_mycobiont_seq = mycobiont_seq.groupby(["signature_number"])

phycobiont_seq = phycobiont_comb[["signature_number", "basename", "sequence"]].copy()
gb_sig_num_phycobiont_seq = phycobiont_seq.groupby(["signature_number"])


# Create a directory for the mycobiont FASTA files if it doesn't exist
output_dir_mycobiont = "combined/mycobiont_fasta_files"
os.makedirs(output_dir_mycobiont, exist_ok=True)

# Create a directory for the phycobiont FASTA files if it doesn't exist
output_dir_phycobiont = "combined/phycobiont_fasta_files"
os.makedirs(output_dir_phycobiont, exist_ok=True)

# Create a FASTA file for each signature number
for sig_num, group in gb_sig_num_mycobiont_seq:
    with open(os.path.join(output_dir, f"signature_{sig_num}.faa"), "w") as f:
        for index, row in group.iterrows():
            f.write(f">{row['basename']}\n{row['sequence']}\n")

# Create a FASTA file for each signature number
for sig_num, group in gb_sig_num_phycobiont_seq:
    with open(os.path.join(output_dir_phycobiont, f"signature_{sig_num}.faa"), "w") as f:
        for index, row in group.iterrows():
            f.write(f">{row['basename']}\n{row['sequence']}\n")



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mycobiont_comb["basename"] = mycobiont_comb[["signature_number","metataxa", "matched_organisms", "protein_name" , "protein_description.x" ]].astype(str).agg("|".join, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mycobiont_comb["basename"] = mycobiont_comb["basename"].str.replace(" ", "_")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide

In [76]:
phycobiont_seq

Unnamed: 0,signature_number,basename,sequence
0,1,1|Phycobiont|Auxenochlorella_protothecoides|A0...,MVKKKINGIKKVKIGLTSPREIQEWGERSLLDGTIVGEVSSWETVN...
1,1,1|Phycobiont|Chlorella_variabilis|A0A075WQ93_C...,MNIQKARVFEKLEIGVASPKQIRHWAERFLPNGDTVGEVTSWETVN...
2,1,1|Phycobiont|Auxenochlorella_protothecoides|A0...,VKIGLTSPREIQEWGERSLLDGTIVGEVSSWETVNYKTLKPEMGGL...
3,1,1|Phycobiont|Prasiolopsis_sp._SAG_84.81|A0A097...,MSKSLIGHTSTQKFSNNRHLFKPLKKSSEAQIDSINIGLASPIRIL...
4,1,1|Phycobiont|Edaphochlorella_mirabilis|A0A097K...,MTKQNTFKSIQVESIRIGLASPDCIRKWAERTLPNGKIIGKVTSRE...
...,...,...,...
15997,9737,9737|Phycobiont|Prototheca_wickerhamii|Q5IWX0_...,CHMTGVAAQVVASPSRCLFASSQRTSKTASTSRCPSSRRWASLTTP...
15998,9738,9738|Phycobiont|Chlorella_ellipsoidea|Q6SKR1_C...,ELNLDDVGYDDVGGVRKQMAQIRELVELPLRHPQLFKTIGVKPPKG...
16001,9741,9741|Phycobiont|Trebouxia_jamesii|Q9LL42_9CHLO...,PEANREKMTQIMFETFNTPAMYVAIQAVLSLYASGRTTGIVLDSGD...
16003,9743,9743|Phycobiont|Stichococcus_sp._BCP-ZNP2-VF4|...,TYYTPDYQVKETDVLAAFRMTPQSGVPAEECGAAVAAESSTGTWTT...


### Create fasta files for the peltigera 

In [None]:
peltigera.head()

peltigera_cleaned = peltigera.drop(columns=["genera", "gene_name.x", "entry", "sequence_uncertainty", "protein_description.y", "Organism"])

# Group by metataxa:
gb_metataxa_pelt = peltigera.groupby(["metataxa"])

# The way to grab the groups is to use the get_group method:
[gb_metataxa_pelt.get_group(x) for x in gb_metataxa_pelt.groups]

# The groups are:
cyanobiont_pelt = gb_metataxa_pelt.get_group("Cyanobiont")
mycobiont_pelt = gb_metataxa_pelt.get_group("Mycobiont")

cyanobiont_pelt.head()
mycobiont_pelt.head()

  [gb_metataxa_pelt.get_group(x) for x in gb_metataxa_pelt.groups]
  cyanobiont_pelt = gb_metataxa_pelt.get_group("Cyanobiont")
  mycobiont_pelt = gb_metataxa_pelt.get_group("Mycobiont")


Unnamed: 0,signature_number,metataxa,database,matched_organisms,protein_name,protein_description.x,nmbr_precursors_experiment_wide,nmbr_precursors_controls,nmbr_precursors_peltigera,relative_abundance_controls,...,RefSeq,CAZy,MEROPS,UniLectin,Ensembl,EnsemblFungi,EnsemblPlants,EnsemblBacteria,KEGG,eggNOG
238,2,Mycobiont,Lecanaromycetes,Graphis pulverulenta,A0A023W1D8_9LECA,Elongation factor 1-alpha (Fragment),1,0,1,,...,,,,,,,,,,
239,3,Mycobiont,Lecanaromycetes,Peltigera dolichorrhiza,A0A075QJ18_9LECA,Elongation factor 2 (Fragment),1,0,1,,...,,,,,,,,,,
240,3,Mycobiont,Lecanaromycetes,Peltigera dolichorrhiza,A0A1D9IVW2_9LECA,Elongation factor 2 (Fragment),1,0,1,,...,,,,,,,,,,
241,3,Mycobiont,Lecanaromycetes,Peltigera hymenina,A0A1D9IVW7_9LECA,Elongation factor 2 (Fragment),1,0,1,,...,,,,,,,,,,
242,3,Mycobiont,Lecanaromycetes,Peltigera hymenina,A0A1D9IVW9_9LECA,Elongation factor 2 (Fragment),1,0,1,,...,,,,,,,,,,


In [79]:
# Create fasta files for the signature numbers

# Create a "basename" for the fasta entry
mycobiont_pelt["basename"] = mycobiont_pelt[["signature_number", "metataxa", "matched_organisms", "protein_name", "protein_description.x"]].astype(str).agg("|".join, axis=1)
mycobiont_pelt["basename"] = mycobiont_pelt["basename"].str.replace(" ", "_")

cyanobiont_pelt["basename"] = cyanobiont_pelt[["signature_number", "metataxa", "matched_organisms", "protein_name", "protein_description.x"]].astype(str).agg("|".join, axis=1)
cyanobiont_pelt["basename"] = cyanobiont_pelt["basename"].str.replace(" ", "_")

mycobiont_pelt

# Group by signature number
gb_sig_num_cyanobiont_pelt = cyanobiont_pelt.groupby(["signature_number"])
gb_sig_num_mycobiont_pelt = mycobiont_pelt.groupby(["signature_number"])

mycobiont_seq_pelt = mycobiont_pelt[["signature_number", "basename", "sequence"]].copy()
gb_sig_num_mycobiont_seq_pelt = mycobiont_seq_pelt.groupby(["signature_number"])

cyanobiont_seq = cyanobiont_pelt[["signature_number", "basename", "sequence"]].copy()
gb_sig_num_cyanobiont_seq = cyanobiont_seq.groupby(["signature_number"])

# Create a directory for the mycobiont FASTA files if it doesn't exist
output_dir = "peltigera/mycobiont_fasta_files"
os.makedirs(output_dir, exist_ok=True)

# Create a directory for the cyanobiont FASTA files if it doesn't exist
output_dir_cyanobiont = "peltigera/cyanobiont_fasta_files"
os.makedirs(output_dir_cyanobiont, exist_ok=True)

# Create a FASTA file for each signature number
for sig_num, group in gb_sig_num_mycobiont_seq_pelt:
    with open(os.path.join(output_dir, f"signature_{sig_num}.faa"), "w") as f:
        for index, row in group.iterrows():
            f.write(f">{row['basename']}\n{row['sequence']}\n")

# Create a FASTA file for each signature number
for sig_num, group in gb_sig_num_cyanobiont_seq:
    with open(os.path.join(output_dir_cyanobiont, f"signature_{sig_num}.faa"), "w") as f:
        for index, row in group.iterrows():
            f.write(f">{row['basename']}\n{row['sequence']}\n")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mycobiont_pelt["basename"] = mycobiont_pelt[["signature_number", "metataxa", "matched_organisms", "protein_name", "protein_description.x"]].astype(str).agg("|".join, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mycobiont_pelt["basename"] = mycobiont_pelt["basename"].str.replace(" ", "_")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/

### Let us make a consensus file with all of the signatures

### Now do the peltigera specific 


In [None]:
# peltigera = peltigera.drop(columns=["genera", "gene_name.x", "entry", "sequence_uncertainty", "protein_description.y", "Organism"])


gb_metataxa_pelt = peltigera.groupby(["metataxa"])

[gb_metataxa_pelt.get_group(x) for x in gb_metataxa_pelt.groups]

cyanobiont_pelt = gb_metataxa_pelt.get_group("Cyanobiont")

mycobiont_pelt = gb_metataxa_pelt.get_group("Mycobiont")

# nan_pelt = gb_metataxa_pelt.get_group("NaN")

print(peltigera.shape)

cyanobiont_pelt.head()

print(cyanobiont_pelt.shape)

mycobiont_pelt.head()

print(mycobiont_pelt.shape)

# Note: the missing rows after splitting the data frames are bc the proteins are coming from cow, human, or non lichen source


(18811, 33)
(12074, 33)
(6631, 33)


  [gb_metataxa_pelt.get_group(x) for x in gb_metataxa_pelt.groups]
  cyanobiont_pelt = gb_metataxa_pelt.get_group("Cyanobiont")
  mycobiont_pelt = gb_metataxa_pelt.get_group("Mycobiont")


18705