# Calculate Average Functional Effects

In [17]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [18]:
def add_grouped_summary_stats(df, column_name, group_cols=('condition', 'site')):
    """
    Add grouped summary statistics (min, max, mean, median) for a specified column.

    The statistics are computed once per group defined by `group_cols` and then
    merged back onto `df`, so every row carries the statistics of its own group.

    Parameters:
        df (pd.DataFrame): The input DataFrame. It is not modified.
        column_name (str): The numeric column for which summary statistics are calculated.
        group_cols (str or sequence of str): Column name(s) to group by.
            Defaults to ('condition', 'site'). A single column name may be
            passed as a plain string.

    Returns:
        pd.DataFrame: A new DataFrame holding the columns of `df` followed by
        'min_<column_name>', 'max_<column_name>', 'mean_<column_name>' and
        'median_<column_name>'. Row order follows `df`; because the result
        comes from a column-based merge, the index of `df` is not preserved.

    Raises:
        AssertionError: If a group column or `column_name` is missing from
            `df`, or if `column_name` is not numeric.
    """
    stat_names = ['min', 'max', 'mean', 'median']

    # Normalise to a fresh list: a bare string means one column, and tuples or
    # other sequences must become a list for the concatenation below and so
    # that groupby does not treat a tuple as a single key.
    if isinstance(group_cols, str):
        group_cols = [group_cols]
    else:
        group_cols = list(group_cols)

    # AssertionError is kept so callers see the same exception type as before,
    # but it is raised explicitly so the checks still run under `python -O`.
    if not all(col in df.columns for col in group_cols):
        raise AssertionError("Group columns not found in DataFrame")
    if column_name not in df.columns:
        raise AssertionError("Column not found in DataFrame")
    if not pd.api.types.is_numeric_dtype(df[column_name]):
        raise AssertionError("Column is not numeric")

    # One row per group, with one column per statistic
    summary = df.groupby(group_cols)[column_name].agg(stat_names).reset_index()
    summary.columns = group_cols + [f"{stat}_{column_name}" for stat in stat_names]

    # Left merge broadcasts each group's statistics back to every row of df
    return pd.merge(df, summary, on=group_cols, how='left')


In [19]:
# Load the averaged entry functional effects for each cell-line condition and
# tag every row with the condition it came from.
C636_entry_func_effects = (
    pd.read_csv("../results/func_effects/averages/C636_entry_func_effects.csv")
    .assign(condition="C636")
)
Mxra8_entry_func_effects = (
    pd.read_csv("../results/func_effects/averages/293T-Mxra8_entry_func_effects.csv")
    .assign(condition="293T-Mxra8")
)
TIM1_entry_func_effects = (
    pd.read_csv("../results/func_effects/averages/293T-TIM1_entry_func_effects.csv")
    .assign(condition="293T-TIM1")
)

In [20]:
# Stack the per-condition tables row-wise into one combined data frame
all_functional_effect_raw = pd.concat(
    objs=[
        C636_entry_func_effects,
        Mxra8_entry_func_effects,
        TIM1_entry_func_effects,
    ],
    axis=0,
)

In [21]:
# Keep only rows that are seen at least twice, are not stop ("*") or gap ("-")
# mutants, and whose mutant differs from the wildtype.
all_functional_effect = (
    all_functional_effect_raw
        .loc[lambda d: d['times_seen'] >= 2]
        .loc[lambda d: ~d['mutant'].isin(["*", "-"])]
        .loc[lambda d: d['mutant'] != d['wildtype']]
        .reset_index(drop=True)
)
all_functional_effect.head()

Unnamed: 0,site,wildtype,mutant,effect,effect_std,times_seen,n_selections,condition
0,1,M,I,-5.7,0.07851,17.25,4,C636
1,1,M,T,-5.686,0.01939,5.0,4,C636
2,2,S,A,-0.8984,0.6183,10.5,4,C636
3,2,S,C,-0.651,0.05669,9.5,4,C636
4,2,S,D,0.4727,0.2227,8.0,4,C636


In [22]:
# Attach min/max/mean/median of 'effect' for each row's group (default grouping)
summarized_functional_effects = add_grouped_summary_stats(
    df=all_functional_effect,
    column_name='effect',
)
summarized_functional_effects.head()

Unnamed: 0,site,wildtype,mutant,effect,effect_std,times_seen,n_selections,condition,min_effect,max_effect,mean_effect,median_effect
0,1,M,I,-5.7,0.07851,17.25,4,C636,-5.7,-5.686,-5.693,-5.693
1,1,M,T,-5.686,0.01939,5.0,4,C636,-5.7,-5.686,-5.693,-5.693
2,2,S,A,-0.8984,0.6183,10.5,4,C636,-1.943,0.5547,-0.546497,-0.3563
3,2,S,C,-0.651,0.05669,9.5,4,C636,-1.943,0.5547,-0.546497,-0.3563
4,2,S,D,0.4727,0.2227,8.0,4,C636,-1.943,0.5547,-0.546497,-0.3563


In [23]:
# Write the summarized table to the current directory without the index column
summarized_functional_effects.to_csv(
    path_or_buf="./summarized_functional_effects.csv",
    index=False,
)