In [1]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GroupShuffleSplit 
from sklearn.preprocessing import OneHotEncoder

from Bio import pairwise2
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

from sklearn.metrics import r2_score

from sklearn import preprocessing
from Bio.SeqUtils.ProtParam import ProteinAnalysis


from Bio import AlignIO
from Bio import SeqIO
from Bio.Align.Applications import MuscleCommandline
from Bio.Align import AlignInfo
import pandascharm as pc

from Bio.SubsMat.MatrixInfo import blosum62
from Bio.SubsMat.MatrixInfo import blosum45



In [2]:
def removeoutlier_col(df,cols):
    Q1 = df[cols].quantile(0.25)
    Q3 = df[cols].quantile(0.75)
    IQR = Q3 - Q1
    df_out = df[~((df[[cols]] < (Q1 - 1.5 * IQR)) |(df[[cols]] > (Q3 + 1.5 * IQR))).any(axis=1)]
    return df_out


def RF(X_train, y_train, X_val):
    
    rf = RandomForestRegressor()
    rf.fit(X_train,np.ravel(y_train))
    RF_pred = rf.predict(X_val)

    return RF_pred

def XGBR(X_train, y_train, X_val):
    
    model = xgb.XGBRegressor()
    model.fit(X_train, y_train)
    XGB_pred = model.predict(X_val)
    
    return XGB_pred

def SVM(X_train, y_train, X_val):
    
    model = SVR()
    model.fit(X_train, y_train)
    SVM_pred = model.predict(X_val)
    
    return SVM_pred



In [3]:
blosum62.update(((b,a),val) for (a,b),val in list(blosum62.items()))
blosum45.update(((b,a),val) for (a,b),val in list(blosum45.items()))

def score_pairwise(seq1, seq2, matrix, gap_s, gap_e, gap = True):
    for A,B in zip(seq1, seq2):
        diag = ('-'==A) or ('-'==B)
        yield (gap_e if gap else gap_s) if diag else matrix[(A,B)]
        gap = diag

In [4]:
def encode(encoding, output, df_clean, aln, esm1b, key = None):
    

    ClustalAlign = AlignIO.read(aln, 'clustal')
    summary_align = AlignInfo.SummaryInfo(ClustalAlign )
    dframe = pc.from_bioalignment(ClustalAlign).transpose()
    sequences = dframe.loc[df_clean.index]
    
    y = df_clean['Log'+output]

    scaler = preprocessing.StandardScaler()
    
    
    if encoding == 'One-Hot-Encoder':

        one_hot = OneHotEncoder()
        encoded = one_hot.fit(sequences)
        X = encoded.transform(sequences).toarray()
        X = np.array(X)
        scaler.fit(X)
        X_scaled = scaler.transform(X)
        
           
    if encoding == 'Bag-of-Words':

        X = pd.DataFrame([ProteinAnalysis(i).count_amino_acids() for i in df_clean['Sequence']])
        X = np.array(X)
        scaler.fit(X)
        X_scaled = scaler.transform(X)
        
    if encoding == 'bigram':
        
        X = df_clean['Sequence']

        example = df_clean['Sequence'][0]
        lst = ['E','G','L','Y','T','H','R','A','C','D','P','I','F','N','K','S','V','M','W','Q']
        all_dct = {}
        key = []
        for i in lst:
            for j in lst:
                st = i+j
                all_dct[st] = []

        for example, id in zip(X,range(len(X))):

            temp = list(example)
            temp_dct = dict.fromkeys(all_dct.keys(),0)
            for k in range(len(temp)-1):
                try:
                    check = temp[k] + temp[k+1]
                    temp_dct[check] += 1
                except:
                    pass
            for key, value in temp_dct.items():
                all_dct[key].append(value)
                
                
        X = pd.DataFrame.from_dict(all_dct).set_index(df_clean.index)
        X = np.array(X)
        scaler.fit(X)
        X_scaled = scaler.transform(X)
    
    if encoding == 'trigram':
        
        X = df_clean['Sequence']

        example = df_clean['Sequence'][0]
        lst = ['E','G','L','Y','T','H','R','A','C','D','P','I','F','N','K','S','V','M','W','Q']
        all_dct = {}
        key = []
        for i in lst:
            for j in lst:
                for k in lst:
                    st = i+j+k
                    all_dct[st] = []

        for example, id in zip(X,range(len(X))):

            temp = list(example)
            temp_dct = dict.fromkeys(all_dct.keys(),0)
            for k in range(len(temp)-2):
                try:
                    check = temp[k] + temp[k+1]+temp[k+2]
                    temp_dct[check] += 1
                except:
                    pass
            for key, value in temp_dct.items():
                all_dct[key].append(value)
                
        X = pd.DataFrame.from_dict(all_dct).set_index(df_clean.index)
        X = np.array(X)
        scaler.fit(X)
        X_scaled = scaler.transform(X)
        
        
    if encoding == 'quadrogram':
        
        X = df_clean['Sequence']

        example = df_clean['Sequence'][0]
        lst = ['E','G','L','Y','T','H','R','A','C','D','P','I','F','N','K','S','V','M','W','Q']
        all_dct = {}
        key = []
        for i in lst:
            for j in lst:
                for k in lst:
                    for l in lst:
                        st = i+j+k+l
                        all_dct[st] = []

        for example, id in zip(X,range(len(X))):

            temp = list(example)
            temp_dct = dict.fromkeys(all_dct.keys(),0)
            for k in range(len(temp)-3):
                try:
                    check = temp[k] + temp[k+1]+temp[k+2]+temp[k+3]
                    temp_dct[check] += 1
                except:
                    pass
            for key, value in temp_dct.items():
                all_dct[key].append(value)
                
        X = pd.DataFrame.from_dict(all_dct).set_index(df_clean.index)
        X = np.array(X)
        scaler.fit(X)
        X_scaled = scaler.transform(X)

    if encoding == 'BLOSUM62':

        n = len(sequences)
        enc_seq = np.zeros((n,n))

        i = 0

        for a in list(sequences.index):
            j = 0
            for b in list(sequences.index):
                enc_seq[i,j] = sum(score_pairwise(sequences.loc[a], sequences.loc[b], blosum62, -5, -1))
                j += 1
            i += 1
        
        X = np.array(enc_seq)
        scaler.fit(enc_seq)
        X_scaled = scaler.transform(enc_seq)
        
        
        
    if encoding == 'BLOSUM45':
        
        n = len(sequences)
        enc_seq = np.zeros((n,n))

        i = 0

        for a in list(sequences.index):
            j = 0
            for b in list(sequences.index):
                enc_seq[i,j] = sum(score_pairwise(sequences.loc[a], sequences.loc[b], blosum45, -5, -1))
                j += 1
            i += 1
        
        X = np.array(enc_seq)   
        scaler.fit(enc_seq)
        X_scaled = scaler.transform(enc_seq)
        
    if encoding == 'ESM1b':
        

        encoded = esm1b.loc[df_clean.index]
        X = np.array(encoded)
        scaler.fit(X)
        X_scaled = scaler.transform(X)
        
    return X, y, X_scaled, scaler


In [5]:
def evaluate_performance_identity_based(enzyme , df, output, aln, esm1b, methods, state):

    summary=pd.DataFrame()
    
    df['Log'+output]=np.log10(df[output])
    
    df_clean = removeoutlier_col(df,'Log'+output).copy()
    
    # Create a mapping of unique sequences to unique codes
    sequence_to_code = {seq: f"ENZYME_{i+1}" for i, seq in enumerate(df_clean['Sequence'].unique())}

    # Map these codes to a new column in the DataFrame using .loc
    df_clean.loc[:, 'Sequence Code'] = df_clean['Sequence'].map(sequence_to_code)
    
    df_clean = df_clean.set_index('Index')

        
    for method in methods:

        X, y, X_scaled, scaler = encode(method, output, df_clean, aln, esm1b, key=None)
        
        splitter = GroupShuffleSplit(test_size=.20, n_splits=10, random_state = state)

        split = splitter.split(df_clean, groups=df_clean['Sequence Code'])
        
        train_inds, val_inds = next(split)
        
        X_train = X[train_inds]
        y_train = y[train_inds]
        X_scaled_train = X_scaled[train_inds]
        
        
        X_val = X[val_inds]
        y_val = y[val_inds]
        X_scaled_val = X_scaled[val_inds]

        
        y_predicted_RF = RF(X_train, y_train, X_val)
        y_predicted_XGBR = XGBR(X_train, y_train, X_val)
        y_predicted_SVM = SVM(X_scaled_train, y_train, X_scaled_val)

        
        summary['RF with ' + method] = y_predicted_RF
        summary['XGB with ' + method] = y_predicted_XGBR
        summary['SVM with ' + method] = y_predicted_SVM


    X_train_seq = df_clean.loc[y_train.index]['Sequence']
    X_val_seq = df_clean.loc[y_val.index]['Sequence']
    max_list = []
    
    for sequence_val in X_val_seq:
        score_list=[]
        for sequence_train in X_train_seq:
            alignment = pairwise2.align.globalxx(sequence_val, sequence_train)[0]
            aligned_length = len(alignment.seqA)
            identical_positions = sum(a == b for a,b in zip(alignment.seqA, alignment.seqB))
            identity_score = (identical_positions / aligned_length)
            score_list.append(identity_score)
        max_list.append(max(score_list))

 
    # Define the ranges
    ranges = [(0, 0.7999), (0.80, 0.9999), (1, 1)]

    # Initialize a dictionary to hold the counts for each range
    counts = {r: 0 for r in ranges}

    # Iterate through each element and determine which range it falls into
    for element in max_list:
        for r in ranges:
            if r[0] <= element <= r[1]:
                counts[r] += 1

    # Print the counts for each range
    for r in ranges:
        print(f"Range {r}: {counts[r]}")

    summary['y_val'] = y_val.values
    summary['Sequence Identity'] = max_list
    summary['wild type or mutant'] = df_clean.loc[y_val.index]['wild type or mutant'].values
    summary=summary.set_index(y_val.index)

    identity_less100 = summary[summary['Sequence Identity'] < 1] 
    identity_80 = summary[(summary['Sequence Identity'] < 0.9999) & (summary['Sequence Identity'] > 0.8)]
    identity_0 = summary[summary['Sequence Identity'] < 0.7999]
    identity_80_99 = summary[(summary['Sequence Identity'] <= 0.99) & (summary['Sequence Identity'] > 0.8)]
    identity_99 = summary[summary['Sequence Identity'] > 0.99] 

    identity_less100_w = summary[(summary['Sequence Identity'] < 1) & (summary['wild type or mutant'] == 'wild')] 
    identity_80_w = summary[(summary['Sequence Identity'] < 0.9999) & (summary['Sequence Identity'] > 0.8) &
                                                               (summary['wild type or mutant'] == 'wild')]
    identity_0_w = summary[(summary['Sequence Identity'] < 0.7999) & (summary['wild type or mutant'] == 'wild')]
    identity_80_99_w = summary[(summary['Sequence Identity'] <= 0.99) & (summary['Sequence Identity'] > 0.8) &
                                                                      (summary['wild type or mutant'] == 'wild')]
    identity_99_w = summary[(summary['Sequence Identity'] > 0.99) & (summary['wild type or mutant'] == 'wild')]  


    identity_less100_m = summary[(summary['Sequence Identity'] < 1) & (summary['wild type or mutant'] == 'mutant')] 
    identity_80_m = summary[(summary['Sequence Identity'] < 0.9999) & (summary['Sequence Identity'] > 0.8) &
                                                                    (summary['wild type or mutant'] == 'mutant')]
    identity_99_m = summary[(summary['Sequence Identity'] > 0.99) & (summary['wild type or mutant'] == 'mutant')]  




    matrix = [identity_less100, identity_80 ,identity_0 ,identity_80_99, identity_99 ,
              identity_less100_w, identity_80_w, identity_0_w, identity_80_99_w , identity_99_w,
              identity_less100_m,identity_80_m,identity_99_m]
    list_R2=[]

    for identity in matrix:
        for model in summary.columns[:-1]:
            r2=r2_score(identity['y_val'], identity[model])
            list_R2.append(r2)



    x = np.reshape(list_R2, (len(matrix), len(summary.columns[:-1])))

    number_of_data = [len(a) for a in matrix]

    Results=pd.DataFrame(x, columns=summary.columns[:-1], index = ['<100%', '80-99.99%', '0-79.99%', '80-99%', '>99%',
                                                              '<100% wild type','80-99.99% wild type', '0-79.99% wild type', '80-99% wild type', '>99% wild type',
                                                              '<100% mutant type','80-99.99% mutant type', '>99% mutant type'])
    Results['Number of Data'] = number_of_data
    
    return summary, list_R2, Results, number_of_data



In [6]:
enzyme = 'betaGlucosidasewithMutants'

df = pd.read_excel('betaGlucosidasewithMutantsOptimumTemperature.xlsx')

output = 'pNP-Glc kcat/Km (1/smM)'
aln = enzyme + '.aln'

x = datetime.datetime.now()
date = str(x.year)+str(x.month)+str(x.day)


methods = ['ESM1b', 'BLOSUM45', 'BLOSUM62', 'bigram',  'quadrogram', 'Bag-of-Words']


esm1b = pd.read_excel(enzyme+'ESM1b_embeddings.xlsx', index_col = 0)

In [7]:
df_kcatKmTopt = df[df['Percentage Activity Depending on Optimum Temp']==1]

In [8]:
list_summary = []
list_list_R2 = []
list_Results = []
random_state = [202 , 1, 42, 101, 2022,5 , 10, 22, 1995, 0]

list_number_of_data = []

for state in random_state:
    summary, list_R2, Results, number_of_data = evaluate_performance_identity_based(enzyme , 
                                                                                  df_kcatKmTopt , output, aln, esm1b,
                                                                                  methods, state)
    print( "Run is completed")
    print(Results)
    list_summary.append(summary)
    list_list_R2.append(list_R2)
    list_Results.append(Results)
    list_number_of_data.append(number_of_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Log'+output]=np.log10(df[output])


Range (0, 0.7999): 20
Range (0.8, 0.9999): 34
Range (1, 1): 0
Run is completed
                       RF with ESM1b  XGB with ESM1b  SVM with ESM1b  \
<100%                       0.757423        0.711517        0.674277   
80-99.99%                   0.774069        0.804293        0.651761   
0-79.99%                    0.407795       -0.132825        0.468274   
80-99%                    -12.527854      -15.726311      -15.853964   
>99%                        0.779927        0.843900        0.633228   
<100% wild type             0.361266       -0.033160        0.333036   
80-99.99% wild type         0.288485        0.118005        0.123836   
0-79.99% wild type          0.407795       -0.132825        0.468274   
80-99% wild type         -181.591154     -226.787679     -227.255733   
>99% wild type              0.741174        0.690054        0.706960   
<100% mutant type           0.797538        0.865129        0.654311   
80-99.99% mutant type       0.797538        0.865129     

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Log'+output]=np.log10(df[output])


Range (0, 0.7999): 16
Range (0.8, 0.9999): 38
Range (1, 1): 0
Run is completed
                       RF with ESM1b  XGB with ESM1b  SVM with ESM1b  \
<100%                       0.641010        0.585977        0.609056   
80-99.99%                   0.722069        0.708799        0.645632   
0-79.99%                   -0.151627       -0.482331        0.017534   
80-99%                      0.575071        0.371604        0.480743   
>99%                        0.696371        0.702522        0.610327   
<100% wild type            -0.045262       -0.282816        0.100943   
80-99.99% wild type        -0.084307       -0.162482        0.043624   
0-79.99% wild type         -0.165332       -0.494569        0.006753   
80-99% wild type            0.377909        0.042472        0.209995   
>99% wild type             -1.730485       -0.899232       -0.554554   
<100% mutant type           0.778740        0.769152        0.683760   
80-99.99% mutant type       0.776371        0.767820     

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Log'+output]=np.log10(df[output])


Range (0, 0.7999): 23
Range (0.8, 0.9999): 31
Range (1, 1): 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Log'+output]=np.log10(df[output])


Run is completed
                       RF with ESM1b  XGB with ESM1b  SVM with ESM1b  \
<100%                       0.577325        0.601336        0.486214   
80-99.99%                   0.615154        0.689369        0.432182   
0-79.99%                    0.265767        0.210921        0.291961   
80-99%                     -0.338625        0.896242       -4.183500   
>99%                        0.550739        0.627573        0.369346   
<100% wild type             0.327647        0.290864        0.291865   
80-99.99% wild type         0.813650        0.921354        0.279478   
0-79.99% wild type          0.265767        0.210921        0.291961   
80-99% wild type                 NaN             NaN             NaN   
>99% wild type              0.804737        0.624782        0.476964   
<100% mutant type           0.561266        0.640164        0.388122   
80-99.99% mutant type       0.561266        0.640164        0.388122   
>99% mutant type            0.532540        0.6

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Log'+output]=np.log10(df[output])


Range (0, 0.7999): 20
Range (0.8, 0.9999): 34
Range (1, 1): 0
Run is completed
                       RF with ESM1b  XGB with ESM1b  SVM with ESM1b  \
<100%                       0.701155        0.661964        0.632480   
80-99.99%                   0.747207        0.723138        0.618149   
0-79.99%                    0.301773        0.183031        0.353624   
80-99%                      0.416726        0.449340        0.636495   
>99%                        0.719970        0.691029        0.563623   
<100% wild type             0.336737        0.231143        0.387651   
80-99.99% wild type         0.620143        0.623921        0.664111   
0-79.99% wild type          0.301773        0.183031        0.353624   
80-99% wild type           -0.389975       -0.414661        0.152873   
>99% wild type              0.899667        0.911375        0.805057   
<100% mutant type           0.719573        0.691016        0.565346   
80-99.99% mutant type       0.719573        0.691016     

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Log'+output]=np.log10(df[output])


Range (0, 0.7999): 19
Range (0.8, 0.9999): 35
Range (1, 1): 0
Run is completed
                       RF with ESM1b  XGB with ESM1b  SVM with ESM1b  \
<100%                       0.614492        0.596637        0.582473   
80-99.99%                   0.791175        0.801053        0.754846   
0-79.99%                    0.164077        0.076165        0.142424   
80-99%                      0.559376        0.725651        0.393890   
>99%                        0.805250        0.785912        0.788808   
<100% wild type             0.314339        0.324989        0.280584   
80-99.99% wild type         0.388310        0.622635        0.314747   
0-79.99% wild type          0.164077        0.076165        0.142424   
80-99% wild type            0.539728        0.784395        0.346789   
>99% wild type             -0.064880        0.141349        0.213101   
<100% mutant type           0.863875        0.820321        0.832197   
80-99.99% mutant type       0.863875        0.820321     

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Log'+output]=np.log10(df[output])


Range (0, 0.7999): 19
Range (0.8, 0.9999): 35
Range (1, 1): 0
Run is completed
                       RF with ESM1b  XGB with ESM1b  SVM with ESM1b  \
<100%                       0.772070        0.737332        0.681949   
80-99.99%                   0.840512        0.813014        0.707678   
0-79.99%                    0.419690        0.338967        0.359155   
80-99%                     -2.296514       -2.886404       -4.844256   
>99%                        0.821392        0.791410        0.665188   
<100% wild type             0.378123        0.257884        0.204292   
80-99.99% wild type         0.109332       -0.073421       -0.555004   
0-79.99% wild type          0.429620        0.321359        0.349836   
80-99% wild type           -2.433100       -3.115411       -5.198997   
>99% wild type              0.866804        0.828625        0.868125   
<100% mutant type           0.863117        0.858193        0.763888   
80-99.99% mutant type       0.859968        0.839243     

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Log'+output]=np.log10(df[output])


Range (0, 0.7999): 18
Range (0.8, 0.9999): 36
Range (1, 1): 0
Run is completed
                       RF with ESM1b  XGB with ESM1b  SVM with ESM1b  \
<100%                       0.647292        0.638777        0.611129   
80-99.99%                   0.811798        0.829279        0.699962   
0-79.99%                    0.095827        0.005795        0.289693   
80-99%                      0.312498        0.496910        0.064337   
>99%                        0.839487        0.830788        0.712699   
<100% wild type             0.227267        0.211489        0.289986   
80-99.99% wild type         0.372988        0.524488        0.147306   
0-79.99% wild type          0.095827        0.005795        0.289693   
80-99% wild type            0.305676        0.486760        0.044760   
>99% wild type              0.791785        0.751524        0.791677   
<100% mutant type           0.834037        0.827681        0.701121   
80-99.99% mutant type       0.834037        0.827681     

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Log'+output]=np.log10(df[output])


Range (0, 0.7999): 16
Range (0.8, 0.9999): 38
Range (1, 1): 0
Run is completed
                       RF with ESM1b  XGB with ESM1b  SVM with ESM1b  \
<100%                       0.662173        0.570579        0.603663   
80-99.99%                   0.778865        0.710380        0.708134   
0-79.99%                    0.302274        0.137820        0.275418   
80-99%                     -0.215487       -0.204363       -0.136923   
>99%                        0.708119        0.608678        0.603851   
<100% wild type             0.405094        0.275336        0.369913   
80-99.99% wild type         0.613161        0.624096        0.478373   
0-79.99% wild type          0.302274        0.137820        0.275418   
80-99% wild type           -0.123445       -0.146561       -0.186222   
>99% wild type              0.913566        0.938579        0.748275   
<100% mutant type           0.752058        0.668212        0.673351   
80-99.99% mutant type       0.752058        0.668212     

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Log'+output]=np.log10(df[output])


Range (0, 0.7999): 18
Range (0.8, 0.9999): 36
Range (1, 1): 0
Run is completed
                       RF with ESM1b  XGB with ESM1b  SVM with ESM1b  \
<100%                       0.626490        0.506039        0.602560   
80-99.99%                   0.798307        0.719653        0.757229   
0-79.99%                    0.235107        0.019352        0.250040   
80-99%                     -0.265396       -0.333533       -0.751107   
>99%                        0.876000        0.740664        0.897370   
<100% wild type             0.242223        0.061011        0.161246   
80-99.99% wild type         0.029482       -0.094326       -0.392953   
0-79.99% wild type          0.235107        0.019352        0.250040   
80-99% wild type           -0.324668       -0.455199       -0.910201   
>99% wild type              0.641685       -0.258316        0.684526   
<100% mutant type           0.905181        0.823457        0.925367   
80-99.99% mutant type       0.905181        0.823457     

In [9]:
df_average = pd.concat([list_Results[0],list_Results[1],list_Results[2],list_Results[3], list_Results[4],
                       list_Results[5],list_Results[6],list_Results[7],list_Results[8], list_Results[9]])

average = df_average.groupby(level=0, sort=False).mean()
std = df_average.groupby(level=0, sort=False).std()
list_Results.append(average)
list_Results.append(std)



In [10]:
df_number_of_data = pd.DataFrame(data = list_number_of_data, columns = ['<100%', '80-99.99%', '0-79.99%', '80-99%', '>99%',
                                                              '<100% wild type','80-99.99% wild type', '0-79.99% wild type', '80-99% wild type', '>99% wild type',
                                                              '<100% mutant type','80-99.99% mutant type', '>99% mutant type'])
list_Results.append(df_number_of_data)

In [11]:
names = ['Trial 1','Trial 2','Trial 3','Trial 4','Trial 5','Trial 6','Trial 7','Trial 8','Trial 9','Trial 10']

writer=pd.ExcelWriter(r"C:\Users\memre\Desktop\Research\Predicting Enzyme Properties Based on Various Organisms\Code\Temperature Profile Prediction\20240822 Predicting Maximum Activity betaGlucosidase - y_val values.xlsx")
_ = [A.to_excel(writer,sheet_name="{0}".format(names[i])) for i, A in enumerate(list_summary)]
writer.close()

In [13]:
names = ['Trial 1','Trial 2','Trial 3','Trial 4','Trial 5','Trial 6','Trial 7','Trial 8','Trial 9','Trial 10', "Average", "STD", 'Number of Data Points']
writer=pd.ExcelWriter(r"C:\Users\memre\Desktop\Research\Predicting Enzyme Properties Based on Various Organisms\Code\Temperature Profile Prediction\20240822 Predicting Maximum Activity betaGlucosidase - Summary based on Identity.xlsx")
_ = [A.to_excel(writer,sheet_name="{0}".format(names[i])) for i, A in enumerate(list_Results)]
writer.close()