In [None]:
# Here I search for other known GFP variants, e.g. eGFP, sfGFP and also different colors. They will be tested for homology in blast to find mutations/differences in the sequences.
# Afterwards I look at the original dataframe if we have these mutations and how the amino acid properties have changed.
# It is possible to connect Bio.Blast with Biopython

In [7]:
from Bio.Blast import NCBIWWW, NCBIXML

# Performs a protein BLAST search
result_handle = NCBIWWW.qblast("blastp", "nr", "sequence")

# Parses the result handle
blast_records = NCBIXML.parse(result_handle)

# Processes the results
for blast_record in blast_records:
    for alignment in blast_record.alignments:
        for hsp in alignment.hsps:
            print("****Alignment****")
            print("Sequence:", alignment.title)
            print("E-value:", hsp.expect)
            print(hsp.query[0:75] + "...")
            print(hsp.match[0:75] + "...")
            print(hsp.sbjct[0:75] + "...")
# NCBIWWW.qblast() is used to perform a protein BLAST search

# The parameters passed to the function are the BLAST program (blastp for protein), the database to search (nr for the non-redundant protein database), and the sequence you want to search.

# The result_handle contains the XML results of the BLAST search.

# NCBIXML.parse() function is used to parse the XML and obtain the blast_records. --> parse = analysieren/zergliedern

# Iterate over the blast_records and extract the relevant information from the alignments and high-scoring pairs (HSPs).

# Note that performing a BLAST search can take some time, depending on the size of the sequence and the complexity of the search. Also, keep in mind any usage restrictions and guidelines set by the NCBI when using their services.


# Expected output:
# ****Alignment****
# Sequence: [Alignment Title]
# E-value: [E-value]
# [Query sequence snippet]...
# [Match sequence snippet]...
# [Subject sequence snippet]...


KeyboardInterrupt: 

In [9]:
import pandas as pd
from Bio.Blast import NCBIWWW, NCBIXML

unmutated_sequence = "MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK"

sequence_to_search = unmutated_sequence


result_handle = NCBIWWW.qblast("blastp", "nr", sequence_to_search)

blast_records = NCBIXML.parse(result_handle)

# Create an empty DataFrame to store the results
df_results1 = pd.DataFrame(
    columns=["Sequence", "Alignment Title", "E-value", "Query Sequence", "Match Sequence", "Subject Sequence"])

# Process the results and store in the DataFrame
for blast_record in blast_records:
    for alignment in blast_record.alignments:
        for hsp in alignment.hsps:
            result = {
                "Sequence": sequence_to_search,
                "Alignment Title": alignment.title,
                "E-value": hsp.expect,
                "Query Sequence": hsp.query,
                "Match Sequence": hsp.match,
                "Subject Sequence": hsp.sbjct
            }
            df_results1 = df_results1.append(result, ignore_index=True)

print(df_results1)

# The display of the results is very confusing and unorganized.
# Additionally, configuring all the search parameters is very complicated. It might be easier to use the Blast-Website and then import the data.

                                             Sequence  \
0   MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...   
1   MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...   
2   MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...   
3   MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...   
4   MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...   
5   MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...   
6   MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...   
7   MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...   
8   MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...   
9   MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...   
10  MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...   
11  MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...   
12  MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...   
13  MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...   
14  MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...   
15  MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...   
16  MSKGEELFTGVVPILVELDGDVNGHKF

  df_results1 = df_results1.append(result, ignore_index=True)
  df_results1 = df_results1.append(result, ignore_index=True)
  df_results1 = df_results1.append(result, ignore_index=True)
  df_results1 = df_results1.append(result, ignore_index=True)
  df_results1 = df_results1.append(result, ignore_index=True)
  df_results1 = df_results1.append(result, ignore_index=True)
  df_results1 = df_results1.append(result, ignore_index=True)
  df_results1 = df_results1.append(result, ignore_index=True)
  df_results1 = df_results1.append(result, ignore_index=True)
  df_results1 = df_results1.append(result, ignore_index=True)
  df_results1 = df_results1.append(result, ignore_index=True)
  df_results1 = df_results1.append(result, ignore_index=True)
  df_results1 = df_results1.append(result, ignore_index=True)
  df_results1 = df_results1.append(result, ignore_index=True)
  df_results1 = df_results1.append(result, ignore_index=True)
  df_results1 = df_results1.append(result, ignore_index=True)
  df_res

#FPbase is a really good website for all GFP-proteins

Unmutiert:
>unmutated_GFP_1
MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK
#We do not have a completely unmutated sequence, because we already have the F64L mutation in comparison to avGFP.

eGFP: Tsien RY (1998) The green fluorescent protein. Annu Rev Biochem 67: 509–544.
>eGFP_1
MVSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLTYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITLGMDELYK

sfGFP: https://www.fpbase.org/protein/superfolder-gfp/
>sfGFP_1
MSKGEELFTGVVPILVELDGDVNGHKFSVRGEGEGDATNGKLTLKFICTTGKLPVPWPTLVTTLTYGVQCFSRYPDHMKRHDFFKSAMPEGYVQERTISFKDDGTYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNFNSHNVYITADKQKNGIKANFKIRHNVEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSVLSKDPNEKRDHMVLLEFVTAAGITHGMDELYK

alphaGFP: https://www.fpbase.org/protein/alphagfp/
>alphaGFP_1
MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTFSYGVQCFSRYPDHMKRHDFFKSAMPEGYVQERTISFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYITADKQKNGIKANFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK

avGFP: https://www.fpbase.org/protein/avgfp/
>avGFP_1
MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTFSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK

In [10]:
#BLAST IN PYTHON
from Bio.Blast import NCBIWWW
from Bio.Blast import NCBIXML

unmutated_GFP_1 = "MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK"
eGFP = "MVSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLTYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITLGMDELYK"
sfGFP = "MSKGEELFTGVVPILVELDGDVNGHKFSVRGEGEGDATNGKLTLKFICTTGKLPVPWPTLVTTLTYGVQCFSRYPDHMKRHDFFKSAMPEGYVQERTISFKDDGTYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNFNSHNVYITADKQKNGIKANFKIRHNVEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSVLSKDPNEKRDHMVLLEFVTAAGITHGMDELYK"
alphaGFP = "MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTFSYGVQCFSRYPDHMKRHDFFKSAMPEGYVQERTISFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYITADKQKNGIKANFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK"
avGFP = "MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTFSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK"

result_handle = NCBIWWW.qblast("blastp", "nr",
                               unmutated_GFP_1 + "\n" + eGFP + "\n" + sfGFP + "\n" + alphaGFP + "\n" + avGFP)
blast_records = NCBIXML.parse(result_handle)
for record in blast_records:
    for alignment in record.alignments:
        # Extract information from the alignment
        print(alignment.title)
# Searches the entire database, which is not exactly what I want.

gb|QEG99026.1| codon-optimized 3XeGFP [Cloning vector pTrichoGate-6]
gb|AEG42740.1| SV40-3xeGFP [Cloning vector pPLV04] >gb|AIQ78254.1| n3xGFP [Cloning vector pPLV04_v2]
gb|QNN02187.1| 3xVenus-SYP [Cloning vector PjCADL1pro::3xVenus-SYP] >gb|QNN02189.1| 3xVenus-SYP [Cloning vector PjCADL2pro::3xVenus-SYP] >gb|QNN02191.1| 3xVenus-SYP [Cloning vector PjCADL3pro::3xVenus-SYP] >gb|QNN02193.1| 3xVenus-SYP [Cloning vector PjCADL4pro::3xVenus-SYP] >gb|QNN02195.1| 3xVenus-SYP [Cloning vector PjCADL5pro::3xVenus-SYP]
gb|AHE38517.1| 3x GFP, partial [Cloning vector pGGC025]
dbj|BAS49686.1| three repeats of Citrine with GGSGGS linkers [Binary vector pMpGWB123] >dbj|BAS49690.1| three repeats of Citrine with GGSGGS linkers [Binary vector pMpGWB124] >dbj|BAS49694.1| three repeats of Citrine with GGSGGS linkers [Binary vector pMpGWB125] >dbj|BAS49821.1| three repeats of Citrine with GGSGGS linkers [Binary vector pMpGWB223] >dbj|BAS49825.1| three repeats of Citrine with GGSGGS linkers [Binary vector pM

In [None]:
# Next try: Use clustal omega or muscle because they are better suited for an offline multiple alignment and they achieve a higher accuracy.


2. Try: in Clustal omega website, because integrating python is unnecessarily complicated...
Job ID: clustalo-I20230619-103006-0921-81728093-p1m
https://www.ebi.ac.uk/Tools/services/web/toolresult.ebi?jobId=clustalo-I20230619-103006-0921-81728093-p1m
CLUSTAL O(1.2.4) multiple sequence alignment


sfGFP_1              -MSKGEELFTGVVPILVELDGDVNGHKFSVRGEGEGDATNGKLTLKFICTTGKLPVPWPT	59
alphaGFP_1           -MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPT	59
eGFP_1               MVSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPT	60
unmutated_GFP_1      -MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPT	59
avGFP_1              -MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPT	59
                      :**************************** ******** ********************

sfGFP_1              LVTTLTYGVQCFSRYPDHMKRHDFFKSAMPEGYVQERTISFKDDGTYKTRAEVKFEGDTL	119
alphaGFP_1           LVTTFSYGVQCFSRYPDHMKRHDFFKSAMPEGYVQERTISFKDDGNYKTRAEVKFEGDTL	119
eGFP_1               LVTTLTYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTL	120
unmutated_GFP_1      LVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTL	119
avGFP_1              LVTTFSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTL	119
                     ****::**************:****************** *****.**************

sfGFP_1              VNRIELKGIDFKEDGNILGHKLEYNFNSHNVYITADKQKNGIKANFKIRHNVEDGSVQLA	179
alphaGFP_1           VNRIELKGIDFKEDGNILGHKLEYNYNSHNVYITADKQKNGIKANFKIRHNIEDGSVQLA	179
eGFP_1               VNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLA	180
unmutated_GFP_1      VNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLA	179
avGFP_1              VNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLA	179
                     *************************:******* *********.*******:********

sfGFP_1              DHYQQNTPIGDGPVLLPDNHYLSTQSVLSKDPNEKRDHMVLLEFVTAAGITHGMDELYK	238
alphaGFP_1           DHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK	238
eGFP_1               DHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITLGMDELYK	239
unmutated_GFP_1      DHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK	238
avGFP_1              DHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK	238
                     **************************.************************ *******
Tree data
(
(
(
sfGFP_1:0.02941,
alphaGFP_1:0.00420)
:0.01366,
eGFP_1:0.01155)
:0.00105,
unmutated_GFP_1:0.00105,
avGFP_1:0.00315);

3. Try: in MUSCLE (is supposed to be faster and more accurate)
muscle-I20230619-103332-0882-21245675-p1m
https://www.ebi.ac.uk/Tools/services/web/toolresult.ebi?jobId=muscle-I20230619-103332-0882-21245675-p1m
CLUSTAL multiple sequence alignment by MUSCLE (3.8)


sfGFP_1              -MSKGEELFTGVVPILVELDGDVNGHKFSVRGEGEGDATNGKLTLKFICTTGKLPVPWPT
alphaGFP_1           -MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPT
eGFP_1               MVSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPT
unmutated_GFP_1      -MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPT
avGFP_1              -MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPT
                      :**************************** ******** ********************

sfGFP_1              LVTTLTYGVQCFSRYPDHMKRHDFFKSAMPEGYVQERTISFKDDGTYKTRAEVKFEGDTL
alphaGFP_1           LVTTFSYGVQCFSRYPDHMKRHDFFKSAMPEGYVQERTISFKDDGNYKTRAEVKFEGDTL
eGFP_1               LVTTLTYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTL
unmutated_GFP_1      LVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTL
avGFP_1              LVTTFSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTL
                     ****::**************.****************** *****.**************

sfGFP_1              VNRIELKGIDFKEDGNILGHKLEYNFNSHNVYITADKQKNGIKANFKIRHNVEDGSVQLA
alphaGFP_1           VNRIELKGIDFKEDGNILGHKLEYNYNSHNVYITADKQKNGIKANFKIRHNIEDGSVQLA
eGFP_1               VNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLA
unmutated_GFP_1      VNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLA
avGFP_1              VNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLA
                     *************************:******* *********.*******:********

sfGFP_1              DHYQQNTPIGDGPVLLPDNHYLSTQSVLSKDPNEKRDHMVLLEFVTAAGITHGMDELYK
alphaGFP_1           DHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK
eGFP_1               DHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITLGMDELYK
unmutated_GFP_1      DHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK
avGFP_1              DHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK
                     **************************.************************ *******
 Tree data
(
(
(
sfGFP_1:0.02941,
alphaGFP_1:0.00420)
:0.01366,
eGFP_1:0.01155)
:0.00105,
unmutated_GFP_1:0.00105,
avGFP_1:0.00315);

Next steps:
#Calculate all the properties of all the GFP variants and compare the differences.
#Check whether we have the mutations in our data set that lead from one variant to the other.

In [12]:
import pandas as pd

# Defines the protein sequences and their respective sequence names
sequences = [unmutated_GFP_1, eGFP, sfGFP, avGFP, alphaGFP]
sequence_names = ["unmutated GFP", "eGFP", "sfGFP", "avGFP", "alphaGFP"]

ASE_datei = pd.read_csv(r"C:\Users\roman\Desktop\AS Eigenschaften\aminoacids.csv")

columns_to_delete = ['Name', 'Abbr', 'Molecular Formula', 'Residue Formula']
ASE_datei.drop(columns=columns_to_delete, inplace=True)
delete_columns = ["carbon", "hydrogen", "nitrogen", "oxygen", "sulfur"]
ASE_datei.drop(columns=delete_columns, inplace=True)

row_index_to_delete = 12
ASE_clear = ASE_datei.drop(row_index_to_delete)

# Creates a dictionary for each property mapping
property_maps = {}
for col in ASE_clear.columns:
    if col != "Letter":  # Skip the "Letter" column
        property_maps[col] = dict(zip(ASE_clear["Letter"].dropna(), ASE_clear[col].dropna()))

# Creates an empty DataFrame to store the results
results_df = pd.DataFrame(columns=["Sequence Name"] + list(property_maps.keys()))

# Calculates the sum of products for each property for each sequence
sequence_results = []
for sequence, sequence_name in zip(sequences, sequence_names):
    results = {}
    for prop, property_map in property_maps.items():
        result = 0
        for AA in property_map.keys():
            quantity = sequence.count(AA)
            result += property_map[AA] * quantity
        results[prop] = result


    sequence_results.append(pd.DataFrame({"Sequence Name": sequence_name, **results}, index=[0]))

# Concatenates all sequence results into the main results DataFrame
results_MuSe_ASE_df = pd.concat(sequence_results, ignore_index=True)

# Print the results DataFrame
print(results_MuSe_ASE_df)


   Sequence Name  Molecular Weight  Residue Weight    pKa1     pKb2    pKx3  \
0  unmutated GFP          31122.34        26834.81  498.47  2133.09  746.86   
1           eGFP          31229.54        26923.99  501.72  2142.86  736.79   
2          sfGFP          31051.24        26763.72  502.89  2150.77  746.86   
3          avGFP          31156.35        26868.83  497.94  2132.62  757.39   
4       alphaGFP          31066.16        26778.65  497.64  2132.74  755.04   

       pl4      H      VSC      P1      P2     SASA     NCISC  
0  1412.64  -9.68  15894.1  1997.6  38.159  406.603  9.501487  
1  1416.91  -7.28  16076.1  1996.9  38.570  409.399  9.600551  
2  1422.51 -15.07  16105.9  2019.2  38.825  411.887  9.558542  
3  1412.14  -9.55  15916.1  1997.9  38.263  406.900  9.487367  
4  1417.35 -13.48  15693.4  2007.5  37.670  404.607  9.418697  


Colour variants of GFP:
CFP: https://www.fpbase.org/protein/cfp/ Y66W
>CFP_1
MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTFSWGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK

RFP: https://www.fpbase.org/protein/eqfp611/ native sequence from Entacmaea quadricolor
> eqFP611_1
MNSLIKENMRMMVVMEGSVNGYQFKCTGEGDGNPYMGTQTMRIKVVEGGPLPFAFDILATSFMYGSKTFIKHTKGIPDFFKQSFPEGFTWERVTRYEDGGVFTVMQDTSLEDGCLVYHAKVTGVNFPSNGAVMQKKTKGWEPNTEMLYPADGGLRGYSQMALNVDGGGYLSCSFETTYRSKKTVENFKMPGFHFVDHRLERLEESDKEMFVVQHEHAVAKFCDLPSKLGRL

hyperfold_YFP: has a lot of mutations away from avGFP https://www.fpbase.org/protein/hyperfolder-yfp/
> hfYFP_1
MVSKGEELFTGVVPILVELDGDVNGHKFSVRGEGEGDATNGKLTLKLISTTGKLPVPWPTLVTTLGYGLMVFARYPDHMKQHDFFKSAMPEGYVQERTISFEDDGYYKTRAEVKFEGDTLVNRIVLKGIDFKEDGNILGHKLEYNFNSHNVYITADKQKNGIKANFKIRHNVEDGGVQLADHYQQNTPIGDGPVLLPDNHYLSYQSVLSKDPNEKRDHMVLKERVTAAGITHDMNELYK

avGFP: https://www.fpbase.org/protein/avgfp/
>avGFP_1
MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTFSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK

mOFP: https://www.fpbase.org/protein/morange/
>mOFP_1
MVSKGEENNMAIIKEFMRFKVRMEGSVNGHEFEIEGEGEGRPYEGFQTAKLKVTKGGPLPFAWDILSPQFTYGSKAYVKHPADIPDYFKLSFPEGFKWERVMNFEDGGVVTVTQDSSLQDGEFIYKVKLRGTNFPSDGPVMQKKTMGWEASSERMYPEDGALKGEIKMRLKLKDGGHYTSEVKTTYKAKKPVQLPGAYIVGIKLDITSHNEDYTIVEQYERAEGRHSTGGMDELYK

Results of the Muscle of the color variants:
CFP_1           -MSKGEELFTGVVP----ILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF-ICTTGKLP
avGFP_1         -MSKGEELFTGVVP----ILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF-ICTTGKLP
hfYFP_1         MVSKGEELFTGVVP----ILVELDGDVNGHKFSVRGEGEGDATNGKLTLKL-ISTTGKLP
eqFP611_1       --------MNSLIKENMRMMVVMEGSVNGYQFKCTGEGDGNPYMGTQTMRIKVVEGGPLP
mOFP_1          MVSKGEENNMAIIKEFMRFKVRMEGSVNGHEFEIEGEGEGRPYEGFQTAKLKVTKGGPLP
                          .::     : * ::*.***::*.  ***:* .  *  * .: :   * **

CFP_1           VPWPTLVTTFSWGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKF
avGFP_1         VPWPTLVTTFSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKF
hfYFP_1         VPWPTLVTTLGYGLMVFARYPDHMKQHDFFKSAMPEGYVQERTISFEDDGYYKTRAEVKF
eqFP611_1       FAFDILATSFMYGSKTFIKHTKGIP--DFFKQSFPEGFTWERVTRYEDGGVFTVMQDTSL
mOFP_1          FAWDILSPQFTYGSKAYVKHPADIP--DYFKLSFPEGFKWERVMNFEDGGVVTVTQDSSL
                ..:  * . : :*   : .:.  :   *:** ::***: .**.  ::*.*  ..  : .:

CFP_1           EGDTLVNRIELKGIDFKEDGNILGHK-LEYNYNSHNVYIMADKQKNGIKVNFKIRHNIED
avGFP_1         EGDTLVNRIELKGIDFKEDGNILGHK-LEYNYNSHNVYIMADKQKNGIKVNFKIRHNIED
hfYFP_1         EGDTLVNRIVLKGIDFKEDGNILGHK-LEYNFNSHNVYITADKQKNGIKANFKIRHNVED
eqFP611_1       EDGCLVYHAKVTGVNFPSNGAVMQKKTKGWEPNTEMLY----PADGGLRGYSQMALNVDG
mOFP_1          QDGEFIYKVKLRGTNFPSDGPVMQKKTMGWEASSERMY----PEDGALKGEIKMRLKLKD
                :.. :: .  : * :* .:* :: :*   :: .:  :*      ...:.   ::  ::..

CFP_1           GS---VQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAA--G
avGFP_1         GS---VQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAA--G
hfYFP_1         GG---VQLADHYQQNTPIGDGPVLLPDNHYLSYQSVLSKDPNEKRDHMVLKERVTAA--G
eqFP611_1       GGYLSCSFETTYRSKKTVEN--FKMPGFHFVDHRLERLEE-SDKEMFVVQHEHAVAKFCD
mOFP_1          GGHYTSEVKTTYKAKKPVQ-----LPGAYIVGIKLDITSH-NEDYTIVEQYERAEGR--H
                *.    ..   *. :..:      :*. : :. .    .  .:.   :   * . .

CFP_1           ITHGMDELYK
avGFP_1         ITHGMDELYK
hfYFP_1         ITHDMNELYK
eqFP611_1       LPSKLGRL--
mOFP_1          STGGMDELYK
                 .  :. *
Tree data
(
CFP_1:0.00427,
avGFP_1:0.00000,
(
hfYFP_1:0.04931,
(
eqFP611_1:0.28275,
mOFP_1:0.21725)
:0.42158)
:0.05994);

In [14]:
CFP = "MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTFSWGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK"
RFP = "MNSLIKENMRMMVVMEGSVNGYQFKCTGEGDGNPYMGTQTMRIKVVEGGPLPFAFDILATSFMYGSKTFIKHTKGIPDFFKQSFPEGFTWERVTRYEDGGVFTVMQDTSLEDGCLVYHAKVTGVNFPSNGAVMQKKTKGWEPNTEMLYPADGGLRGYSQMALNVDGGGYLSCSFETTYRSKKTVENFKMPGFHFVDHRLERLEESDKEMFVVQHEHAVAKFCDLPSKLGRL"
hfYFP = "MVSKGEELFTGVVPILVELDGDVNGHKFSVRGEGEGDATNGKLTLKLISTTGKLPVPWPTLVTTLGYGLMVFARYPDHMKQHDFFKSAMPEGYVQERTISFEDDGYYKTRAEVKFEGDTLVNRIVLKGIDFKEDGNILGHKLEYNFNSHNVYITADKQKNGIKANFKIRHNVEDGGVQLADHYQQNTPIGDGPVLLPDNHYLSYQSVLSKDPNEKRDHMVLKERVTAAGITHDMNELYK"
avGFP = "MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTFSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK"
mOFP = "MVSKGEENNMAIIKEFMRFKVRMEGSVNGHEFEIEGEGEGRPYEGFQTAKLKVTKGGPLPFAWDILSPQFTYGSKAYVKHPADIPDYFKLSFPEGFKWERVMNFEDGGVVTVTQDSSLQDGEFIYKVKLRGTNFPSDGPVMQKKTMGWEASSERMYPEDGALKGEIKMRLKLKDGGHYTSEVKTTYKAKKPVQLPGAYIVGIKLDITSHNEDYTIVEQYERAEGRHSTGGMDELYK"


In [15]:
import pandas as pd

# Defines the protein sequences and their respective sequence names
sequences = [CFP, RFP, hfYFP, avGFP, mOFP]
sequence_names = ["CFP", "RFP", "hfYFP", "avGFP", "mOFP"]

ASE_datei = pd.read_csv(r"C:\Users\roman\Desktop\AS Eigenschaften\aminoacids.csv")

columns_to_delete = ['Name', 'Abbr', 'Molecular Formula', 'Residue Formula']
ASE_datei.drop(columns=columns_to_delete, inplace=True)
delete_columns = ["carbon", "hydrogen", "nitrogen", "oxygen", "sulfur"]
ASE_datei.drop(columns=delete_columns, inplace=True)

row_index_to_delete = 12
ASE_clear = ASE_datei.drop(row_index_to_delete)

# Creates a dictionary for each property mapping
property_maps = {}
for col in ASE_clear.columns:
    if col != "Letter":  # Skip the "Letter" column
        property_maps[col] = dict(zip(ASE_clear["Letter"].dropna(), ASE_clear[col].dropna()))

# Creates an empty DataFrame to store the results
results_df = pd.DataFrame(columns=["Sequence Name"] + list(property_maps.keys()))

# Calculates the sum of products for each property for each sequence
sequence_results = []
for sequence, sequence_name in zip(sequences, sequence_names):
    results = {}
    for prop, property_map in property_maps.items():
        result = 0
        for AA in property_map.keys():
            quantity = sequence.count(AA)
            result += property_map[AA] * quantity
        results[prop] = result

    # Creates a new DataFrame row for the current sequence
    sequence_results.append(pd.DataFrame({"Sequence Name": sequence_name, **results}, index=[0]))

# Concatenates all sequence results into the main results DataFrame
results_MuSe_ASE_Farben_df = pd.concat(sequence_results, ignore_index=True)

print(results_MuSe_ASE_Farben_df)

  Sequence Name  Molecular Weight  Residue Weight    pKa1     pKb2    pKx3  \
0           CFP          31179.39        26891.87  500.14  2141.73  757.39   
1           RFP          30196.58        26035.33  489.13  2084.61  780.22   
2         hfYFP          31189.45        26883.87  503.83  2141.18  739.16   
3         avGFP          31156.35        26868.83  497.94  2132.62  757.39   
4          mOFP          30971.22        26719.88  495.49  2107.59  777.99   

       pl4      H      VSC      P1      P2     SASA      NCISC  
0  1412.37  -9.29  16033.4  2004.1  38.561  409.268   9.510966  
1  1382.72  -3.91  15636.3  1900.4  38.150  399.483  10.010540  
2  1431.27 -14.06  16167.3  2012.9  39.008  411.644   9.863915  
3  1412.14  -9.55  15916.1  1997.9  38.263  406.900   9.487367  
4  1424.67 -21.32  15462.6  1989.4  37.212  400.781  10.699294  


In [16]:
from Bio import AlignIO, SeqIO
from Bio.Align import MultipleSeqAlignment
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Align.Applications import ClustalOmegaCommandline
import pandas as pd

unmutated_GFP_1 = "MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK"
eGFP = "MVSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLTYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITLGMDELYK"
sfGFP = "MSKGEELFTGVVPILVELDGDVNGHKFSVRGEGEGDATNGKLTLKFICTTGKLPVPWPTLVTTLTYGVQCFSRYPDHMKRHDFFKSAMPEGYVQERTISFKDDGTYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNFNSHNVYITADKQKNGIKANFKIRHNVEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSVLSKDPNEKRDHMVLLEFVTAAGITHGMDELYK"
alphaGFP = "MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTFSYGVQCFSRYPDHMKRHDFFKSAMPEGYVQERTISFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYITADKQKNGIKANFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK"
avGFP = "MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTFSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNY"

# Create SeqRecord objects for each sequence
sequences = [
    SeqRecord(Seq(unmutated_GFP_1), id="unmutated GFP"),
    SeqRecord(Seq(eGFP), id="eGFP"),
    SeqRecord(Seq(sfGFP), id="sfGFP"),
    SeqRecord(Seq(alphaGFP), id="alphaGFP"),
    SeqRecord(Seq(avGFP), id="avGFP")
]

# Write the sequences to a temporary file in FASTA format
temp_fasta_file = "sequences.fasta"
SeqIO.write(sequences, temp_fasta_file, "fasta")

# Performs MSA using the Clustal Omega command-line tool
clustalomega_cline = ClustalOmegaCommandline(infile=temp_fasta_file, outfile="aligned_sequences.fasta", verbose=True,
                                             auto=True)
clustalomega_cline

# Parses the aligned records
alignment = AlignIO.read("aligned_sequences.fasta", "fasta")

# Converts the alignment to a DataFrame
df_clusral_results = pd.DataFrame(alignment)
print(df_clustal_results)

# I have tried to work using a temporary fasta, but the code does not work because my computer does not identify the fasta.
# I was not able to fix the code, but shifted the focus, as the website mentioned above contains all the GFP variant information.
# I just wanted to learn how to run blast/clustal analysis in python.

FileNotFoundError: [Errno 2] No such file or directory: 'aligned_sequences.fasta'