# AMP Dataset Analysis
By: George Phung
Date: 08-10-2025

In [41]:
# essentials
import pandas as pd
import numpy as np
# data viz packages
import matplotlib as plt
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
# stats packages
import scipy as stats
from scipy.stats import shapiro

# Questions
For each tested methanogen, which AMPs are successful?
1.	Data sources
    a.	Found in AMP data files (E. coli, M. boviskoreani, M. gottschalkii, M. ruminantium)
    b.	Look at the PNGs, AMPs (y-axis) with an x-axis value to the left of the red line are “successful” – to the right is “unsuccessful”
        i.	Note that there are multiple replicates; these will need to be averaged together
2.	Actions
    a.	Take avg of replicates in combined_amp_data
    b.	Create new column noting ‘successful’ or ‘unsuccessful’ for each methanogen 
3.	Outputs
    a.	Return 3 data files, one for each PNG’d species, with binary success column. Also return 1 file combining those 3

When looking at the methanogens that AMPs were successful against, what is common and what is distinct about the lipid compositions?
1.	Data sources
    a.	The output combined file from the above question
2.	Actions
    a.	Take the returned individual data files above (i.e., one per methanogen) and, for that methanogen, 
for each possible AMP at each possible concentration, create individual data frames that can be pulled as needed


In [42]:
# Reading the xlsx files
from pathlib import Path

# Folder that holds both input workbooks. It is the only machine-specific value in
# this cell: change it here to run the notebook from another checkout.
AMP_INPUT_DIR = Path(r"C:\Users\georg\Downloads\DataBridge_Fall2025\lipid_analysis_biomedit 2\lipid_analysis_biomedit\data_files\amp_data\input")
GROWTH_COL = 'percent_growth_inhibition_compared_to_no_addition_ctrl'

# First workbook (loading approach adapted from Jonathan's code).
df = pd.read_excel(
    AMP_INPUT_DIR / "Compiled AMP data - normalized.xlsx",
    sheet_name="AMP normalized - compiled",
    skiprows=4,
)
# drop() returns a new frame, so the raw `df` stays untouched.
# 'Assay_date' is dropped because it is not found in the second data file.
amp_data = df.drop(columns='Assay_date').assign(filesource="excel file 1")
# Divide by 100 so both files store the value as a decimal (which matches the PNGs).
amp_data[GROWTH_COL] = amp_data[GROWTH_COL] / 100

# Second workbook.
df2 = pd.read_excel(
    AMP_INPUT_DIR / "PercentGrowth_2nd_round_peptides.xlsx",
    sheet_name="PercentGrowth_2nd_round_peptide",
)
# 'index_ID' is dropped because it is not found in the first file.
# NOTE: the label says "txt" although the source is an .xlsx workbook; it is kept
# verbatim because a later cell filters on this exact string.
amp2_data = df2.drop(columns='index_ID').assign(filesource="txt file 2")

# Stack both sources into one frame with a fresh 0..n-1 index.
full_amp_data = pd.concat([amp_data, amp2_data], ignore_index=True)

In [43]:
full_amp_data

Unnamed: 0,AMP_ID,Methanogen_species,percent_growth_inhibition_compared_to_no_addition_ctrl,Concentration,Incubation_time,filesource
0,m3-m10,Mruminantium,0.939606,10uM,96h,excel file 1
1,m10-m3,Mruminantium,0.998813,10uM,96h,excel file 1
2,m3-m5,Mruminantium,0.909472,10uM,96h,excel file 1
3,m3-m6,Mruminantium,0.979233,10uM,96h,excel file 1
4,m3-m7,Mruminantium,1.004577,10uM,96h,excel file 1
...,...,...,...,...,...,...
524,ai21-m3,E.coli,-0.231120,10uM,T17,txt file 2
525,ai21-m10,E.coli,1.718674,10uM,T17,txt file 2
526,ai21-m10,E.coli,1.397523,10uM,T17,txt file 2
527,ai21-m10,E.coli,1.705712,10uM,T17,txt file 2


# Preliminary Checks

In [44]:
# Reformatting: bring species, incubation-time and AMP labels from both files
# onto one shared vocabulary. Work on a copy so full_amp_data stays as loaded.
full_amp_copy = full_amp_data.copy()

# raw label -> standard label (insertion order matters: later cells list the species in this order)
species_mapping = dict([
    ("Ecoli", "E.coli"),
    ("Mboviskoreani", "M.boviskoreani"),
    ("M.gottschalkii", "M.gottschalkii"),
    ("Mruminantium", "M.ruminantium"),
])

# "T<hours>" -> "<hours>h"
time_mapping = dict(T17="17h", T72="72h", T96='96h')

# standard AMP id -> every raw spelling seen for it
_amp_id_variants = {
    "CAP-18 WT":     ["CAP-18 WT 1uM", "CAP-18 WT 5uM", "CAP18_WT"],
    "ai21-3-10":     ["ai21-3-10 1 uM", "ai21-3-10 10 uM", "ai21-3-10 5 uM"],
    "ai21-m3":       ["ai21-m3 1 uM", "ai21-m3 10 uM", "ai21-m3 5 uM"],
    "combo-gr":      ["combogr 1uM", "combogr 5uM"],
    "m10-2":         ["m10-2 1uM", "m10-2 5uM"],
    "n7-trim-m10m3": ["n7-trim-m10m3 1uM", "n7-trim-m10m3 5uM"],
    "n7-trim-m3":    ["n7-trim-m3 1uM", "n7-trim-m3 5uM"],
    "n7-trim-m3m10": ["n7-trim-m3m10 1uM", "n7-trim-m3m10 5uM"],
    "r10g_n11r":     ["r10g_n11r 1uM", "r10g_n11r 5uM"],
}
# Flatten to the raw -> standard lookup that Series.replace expects.
amp_id_mapping = {
    raw_id: standard_id
    for standard_id, raw_ids in _amp_id_variants.items()
    for raw_id in raw_ids
}

# Apply each lookup to its column; values without an entry are left as they are.
for column, mapping in (
    ("Methanogen_species", species_mapping),
    ("Incubation_time", time_mapping),
    ("AMP_ID", amp_id_mapping),
):
    full_amp_copy[column] = full_amp_copy[column].replace(mapping)

# Splitting our dfs based on concentration

# probably wont use this for now
sort_keys = ["AMP_ID", "Methanogen_species"]
AMP1uM, AMP5uM, AMP10uM = (
    full_amp_copy[full_amp_copy["Concentration"] == level].sort_values(sort_keys, ascending=True)
    for level in ("1uM", "5uM", "10uM")
)

# TODO: make a tree map for each species and concentration to see which AMPs are successful compared with CAP-18 WT

In [58]:
# Cleaning our data out: keep the 10uM rows from the second file and score every
# AMP against the CAP-18 WT control of the same species.

VALUE_COL = "percent_growth_inhibition_compared_to_no_addition_ctrl"
CONTROL_AMP = "CAP-18 WT"


def add_delta_vs_control(frame, species_names, value_col=VALUE_COL, control_id=CONTROL_AMP):
    """Return a copy of ``frame`` with ``delta`` and ``sufficient?`` columns added.

    ``delta`` is the mean ``value_col`` of the ``control_id`` rows of a species
    minus the row's own ``value_col``. ``sufficient?`` is 1 where ``delta > 0``
    and 0 otherwise.

    Parameters
    ----------
    frame : DataFrame with ``AMP_ID``, ``Methanogen_species`` and ``value_col``.
    species_names : species to score; rows of any other species get a NaN delta.
    value_col : name of the measurement column.
    control_id : ``AMP_ID`` value that identifies the control rows.

    Notes
    -----
    A species without control rows gets a NaN ``delta``; because ``NaN > 0`` is
    False, its ``sufficient?`` flag is 0, i.e. indistinguishable from "not
    better than the control".
    """
    scored = frame.copy()
    control_rows = scored[scored["AMP_ID"].eq(control_id)]
    # One control mean per species, limited to the requested (de-duplicated) species.
    control_mean = (
        control_rows.groupby("Methanogen_species")[value_col]
        .mean()
        .reindex(list(dict.fromkeys(species_names)))
    )
    scored["delta"] = scored["Methanogen_species"].map(control_mean) - scored[value_col]
    scored["sufficient?"] = np.where(scored["delta"] > 0, 1, 0)
    return scored


# Same order of operations as before (filter concentration, sort, filter source).
# The trailing .copy() makes this an independent frame, so adding columns below
# does not write into a slice of full_amp_copy.
AMP10uM = full_amp_copy[full_amp_copy["Concentration"] == "10uM"].sort_values(["AMP_ID", "Methanogen_species"], ascending=True)
AMP10uM = AMP10uM[AMP10uM["filesource"] == "txt file 2"].copy()

# Replicate mean per (AMP, species), repeated on every replicate row.
AMP10uM["mean"] = AMP10uM.groupby(["AMP_ID", "Methanogen_species"])[VALUE_COL].transform("mean")

# Taken before the control rows are removed, so AMP_list still contains CAP-18 WT.
AMP_list = sorted(set(AMP10uM["AMP_ID"]))
species_list = list(species_mapping.values())

# adding the delta and sufficient? columns to the replicate-level rows
AMP10uM = add_delta_vs_control(AMP10uM, species_list)

# grouped: one row per AMP / species / concentration / incubation time / source,
# scored against the control's grouped mean.
AMP10uMGrouped = AMP10uM.groupby(
    ["AMP_ID", "Methanogen_species", "Concentration", "Incubation_time", "filesource"],
    as_index=False,
)[VALUE_COL].mean()
AMP10uMGrouped = add_delta_vs_control(AMP10uMGrouped, species_list)

# The control is only a reference: drop it from the replicate-level frame.
AMP10uM = AMP10uM[AMP10uM["AMP_ID"] != CONTROL_AMP].copy()


In [46]:
# Basic sanity checks on the cleaned 10uM frame: schema, duplicate rows, missing values.
print("general info:")
# DataFrame.info() writes its report itself and returns None, so wrapping it in
# print() would emit a stray "None" line. Call it directly instead.
AMP10uM.info()
print()
print("testing for duplicates, number of dupes:")
print(AMP10uM.duplicated().sum(), "\n")
print("null columns:")
print(AMP10uM.isna().sum())

general info:
<class 'pandas.core.frame.DataFrame'>
Index: 342 entries, 481 to 398
Data columns (total 9 columns):
 #   Column                                                  Non-Null Count  Dtype  
---  ------                                                  --------------  -----  
 0   AMP_ID                                                  342 non-null    object 
 1   Methanogen_species                                      342 non-null    object 
 2   percent_growth_inhibition_compared_to_no_addition_ctrl  342 non-null    float64
 3   Concentration                                           342 non-null    object 
 4   Incubation_time                                         342 non-null    object 
 5   filesource                                              342 non-null    object 
 6   mean                                                    342 non-null    float64
 7   delta                                                   0 non-null      float64
 8   sufficient?                   

In [53]:
# Checking for normality using the Shapiro–Wilk test, one test per (AMP, species) group
ALPHA = 0.05         # significance level for rejecting normality
MIN_SHAPIRO_N = 3    # Shapiro–Wilk is not defined for fewer than 3 observations

counter = 0   # number of groups actually tested
success = 0   # number of tested groups where normality was not rejected
for AMPs in AMP_list:
    for species in species_list:
        in_group = (AMP10uM["AMP_ID"] == AMPs) & (AMP10uM["Methanogen_species"] == species)
        values = AMP10uM.loc[in_group, "percent_growth_inhibition_compared_to_no_addition_ctrl"].tolist()
        if len(values) == 0:     # omitting the species not tested with the AMPs
            continue
        if len(values) < MIN_SHAPIRO_N:
            # Too few replicates to test: report and leave the group out of the ratio.
            print("Skipping " + str(AMPs) + " against " + str(species) + ": only "
                  + str(len(values)) + " value(s), Shapiro-Wilk needs at least " + str(MIN_SHAPIRO_N))
            continue
        counter = counter + 1
        stat, p = shapiro(values)
        print("Statstic for " + str(AMPs) + " against " + str(species) + ": " + str(stat))
        if p > ALPHA:
            print("Pval: " + str(p) + "\ndata looks normally distrubted")
            success = success + 1
        else:
            print("Pval:" + str(p) + "\ndata NOT looks normally distrubted")

# Guard the ratio: with no testable group the division would raise ZeroDivisionError.
if counter:
    print("percent of the groups tested is normal: " + str(success/counter))
else:
    print("percent of the groups tested is normal: n/a (no group had enough values)")

# Most groups look normal, but because the sample sizes are so small, we cannot assume that they are normally distributed.

Statstic for ai21-3-10 against E.coli: 0.945133138868581
Pval: 0.6858588105082515
data looks normally distrubted
Statstic for ai21-3-10 against M.boviskoreani: 0.9317235776388808
Pval: 0.6045662040324984
data looks normally distrubted
Statstic for ai21-3-10 against M.gottschalkii: 0.917045708323391
Pval: 0.5205146751366847
data looks normally distrubted
Statstic for ai21-3-10 against M.ruminantium: 0.9789529378125107
Pval: 0.8958504070359325
data looks normally distrubted
Statstic for ai21-3-10 against E.coli: 0.945133138868581
Pval: 0.6858588105082515
data looks normally distrubted
Statstic for ai21-3-10 against M.boviskoreani: 0.9317235776388808
Pval: 0.6045662040324984
data looks normally distrubted
Statstic for ai21-3-10 against M.gottschalkii: 0.917045708323391
Pval: 0.5205146751366847
data looks normally distrubted
Statstic for ai21-3-10 against M.ruminantium: 0.9789529378125107
Pval: 0.8958504070359325
data looks normally distrubted
Statstic for ai21-3-10 against E.coli: 0.94513

In [None]:
# Row counts per (species, incubation time) pair. A single incubation time per
# species in this table means the incubation time does not vary with the AMP.
incubation_keys = ["Methanogen_species", "Incubation_time"]
same = AMP10uM.groupby(incubation_keys)["Incubation_time"].agg("count")
same

Methanogen_species  Incubation_time
E.coli              17h                96
M.boviskoreani      96h                99
M.gottschalkii      72h                96
M.ruminantium       96h                51
Name: Incubation_time, dtype: int64

In [None]:
# Row counts per (species, concentration) pair, to confirm every species was
# tested at the same concentration.
concentration_keys = ["Methanogen_species", "Concentration"]
same = AMP10uM.groupby(concentration_keys)["Concentration"].agg("count")
same

Methanogen_species  Concentration
E.coli              10uM             96
M.boviskoreani      10uM             99
M.gottschalkii      10uM             96
M.ruminantium       10uM             51
Name: Concentration, dtype: int64

In [54]:
# Heatmap of the average delta for every AMP (rows) and species (columns).
heatmap_options = dict(
    x="Methanogen_species",
    y="AMP_ID",
    z="delta",
    histfunc="avg",   # average the replicate deltas that fall into each cell
    width=800,
    height=600,
    color_continuous_scale="RdBu",
)
fig = px.density_heatmap(AMP10uM, **heatmap_options)
fig.update_coloraxes(colorbar_title="Delta Proportion")
fig.update_layout(
    title="Difference of Effectiveness of AMPs vs CAP-18 WT against Methanogen species",
    yaxis_title="AMPs",
    xaxis_title="Methanogen species",
    # Fix the colour range to [-0.5, 0.5].
    coloraxis=dict(cmin=-0.5, cmax=0.5),
)
fig.show(renderer='vscode')

We can see that all samples of the AMPs "ai21-10-3", "ai21-m10", "m10-2", and "m10-7" were less effective than the control "CAP-18 WT" in E. coli. It is also interesting to note that many AMPs were less effective than CAP-18 WT against E. coli: around half are much less effective (a difference larger than 0.5), while the other half are only somewhat less effective.
On the flip side, only "ai21-m3" was more effective than the control in all species. It is also worth noting that m3-6 was less effective in E. coli but effective in all other species, with its values being more "polarized."
Another interesting observation is that M. ruminantium had more effective AMPs, but around half of them had the same effectiveness as "CAP-18 WT".

In [None]:
# Experimental: a separate figure holding one circle shape.
# It is built here but not displayed or combined with the chart below.
circle_spec = dict(
    type="circle",
    xref="x", yref="y",
    x0=1, y0=1, x1=5, y1=3,  # bounding box of the circle
    fillcolor="LightSkyBlue",
    line_color="RoyalBlue",
    opacity=0.7,
    layer="below",
)
shape = go.Figure()
shape.add_shape(**circle_spec)

# TODO: rework this chart.
# Mean delta per AMP, averaged over all species and replicates.
avg_by_species = AMP10uM.groupby("AMP_ID", as_index=False)["delta"].mean()

fig = px.line(
    avg_by_species,
    x="delta",
    y="AMP_ID",
    text="AMP_ID",
    markers=True,
    height=800,
)
# NOTE(review): the x axis sets both autorange="reversed" and an explicit range;
# confirm which of the two is intended.
fig.update_layout(
    title="Difference of Effectiveness of AMPs vs CAP-18 WT against all Methanogen species",
    xaxis_title="average effectiveness",
    xaxis=dict(autorange="reversed", range=[0.3, -0.3]),
    yaxis=dict(autorange="reversed", visible=False),
)
fig.update_traces(marker=dict(size=25, symbol="square"))
fig.add_vline(x=0, line_dash="dash")

# TODO: pill-shaped markers are not implemented yet.

We can see that the AMPs from the "n7-trim" group are all, on average, slightly better than the control "CAP-18 WT".

In [None]:
# Preparing the input for Spearman's rank correlation test:
# one row per AMP, one column per species, holding the 0/1 "sufficient?" flag
# (NaN where the AMP was not tested against that species).

import warnings
# Kept so the rest of the session behaves as before: FutureWarnings stay silenced.
warnings.simplefilter(action='ignore', category=FutureWarning)

# pivot() aligns values by AMP_ID / species label rather than by list position,
# and raises if an (AMP, species) pair occurs more than once in AMP10uMGrouped.
success_table = (
    AMP10uMGrouped
    .pivot(index="AMP_ID", columns="Methanogen_species", values="sufficient?")
    # Force the full AMP x species grid; combinations absent from the data become NaN.
    .reindex(index=AMP_list, columns=species_list)
    .astype(float)
)

# Flat layout used by the next cell: an "amps" column followed by one column per species.
listy_list = success_table.rename_axis(index="amps", columns=None).reset_index()

listy_list

Unnamed: 0,amps,E.coli,M.boviskoreani,M.gottschalkii,M.ruminantium
0,CAP-18 WT,0.0,0.0,0.0,0.0
1,ai21-10-3,0.0,0.0,0.0,
2,ai21-3-10,0.0,0.0,0.0,1.0
3,ai21-3-10m10,0.0,,,
4,ai21-m10,0.0,0.0,0.0,
5,ai21-m3,1.0,1.0,1.0,1.0
6,combo-gr,0.0,0.0,0.0,
7,m10-2,0.0,0.0,0.0,
8,m10-3,0.0,0.0,1.0,1.0
9,m10-4,0.0,0.0,0.0,1.0


In [111]:
# Pairwise Spearman rank correlation between the species' success columns.
# A non-parametric test is used because we cannot show that the samples are normally distributed.
from scipy.stats import spearmanr
import itertools

names = ['E.coli', 'M.boviskoreani', 'M.gottschalkii', 'M.ruminantium']
lists = [listy_list[name].tolist() for name in names]

for (n1, l1), (n2, l2) in itertools.combinations(zip(names, lists), 2):

    # dtype=float turns None into NaN, so np.isnan also works for a column
    # that would otherwise become an object array.
    l1 = np.array(l1, dtype=float)
    l2 = np.array(l2, dtype=float)

    # keep only the AMPs that were tested against both species
    mask = ~(np.isnan(l1) | np.isnan(l2))

    l1 = l1[mask]
    l2 = l2[mask]
    print("spearman's rank corrlation between " + n1 + " and " + n2)
    if l1.size < 2 or np.ptp(l1) == 0 or np.ptp(l2) == 0:
        # The coefficient is undefined with fewer than two shared AMPs or when
        # one side is constant; report NaN without calling spearmanr.
        correlation, p_value = np.nan, np.nan
    else:
        correlation, p_value = spearmanr(l1, l2)
    print(f"Spearman's Rank Correlation Coefficient: {correlation}")
    print(f"P-value: {p_value}" + "\n")

spearman's rank corrlation between E.coli and M.boviskoreani
Spearman's Rank Correlation Coefficient: 0.42640143271122094
P-value: 0.037727624056725056

spearman's rank corrlation between E.coli and M.gottschalkii
Spearman's Rank Correlation Coefficient: 0.2548235957188128
P-value: 0.2294814590497754

spearman's rank corrlation between E.coli and M.ruminantium
Spearman's Rank Correlation Coefficient: 0.19245008972987523
P-value: 0.5287571161014745

spearman's rank corrlation between M.boviskoreani and M.gottschalkii
Spearman's Rank Correlation Coefficient: 0.3850770231077027
P-value: 0.057320345818545906

spearman's rank corrlation between M.boviskoreani and M.ruminantium
Spearman's Rank Correlation Coefficient: 0.4000000000000001
P-value: 0.15644958191999983

spearman's rank corrlation between M.gottschalkii and M.ruminantium
Spearman's Rank Correlation Coefficient: 0.05504818825631804
P-value: 0.8517327152291266



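Between E. coli and M. boviskoreani, the correlation is statistically significant at the 5% threshold (p ≈ 0.038); no other pair of species falls below that threshold. Note that six pairwise tests were run without a multiple-comparison correction.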

Future questions that will be answered (will be more focused on the groups of AMPs):

Do different AMP groups have significantly different effectiveness against methanogens?

Do groups naturally cluster by performance or by affected species?

Do AMP groups differ in their average inhibition rate or success rate?

