In [None]:
# ! pip install pyopenms matchms networkx matplotlib pandas scipy seaborn

# Load libraries and define functions

In [None]:
import glob
import pandas as pd
import numpy as np
import seaborn as sns
from pyopenms import *
from scipy.spatial.distance import cosine
from matchms import Spectrum
from matchms.similarity import ModifiedCosine
import networkx as nx
import matplotlib.pyplot as plt

# Pick the spectrum whose retention time lies nearest to a requested value
def extract_spectrum_at_rt(exp, target_rt, rt_window=0.5):
    """
    Return the spectrum in ``exp`` whose retention time is nearest to ``target_rt``.

    Only spectra whose retention time differs from ``target_rt`` by strictly less
    than ``rt_window`` are considered. When several spectra are equally close, the
    one encountered first wins.

    :param exp: Iterable of spectra (e.g. an MSExperiment); each item must provide ``getRT()``.
    :param target_rt: Target retention time, in the same unit as ``getRT()``.
    :param rt_window: Maximum allowed absolute retention time difference (exclusive).
    :return: The closest spectrum, or ``None`` when no spectrum lies inside the window.
    """
    # Pair every in-window spectrum with its distance to the target
    candidates = []
    for scan in exp:
        offset = abs(scan.getRT() - target_rt)
        if offset < rt_window:
            candidates.append((offset, scan))

    if not candidates:
        return None

    # min() returns the first of several equally distant candidates
    return min(candidates, key=lambda pair: pair[0])[1]
    
# Compute similarity between spectra (cosine similarity over shared unit-mass bins)
def _binned_relative_intensities(spectrum):
    """Return sum-normalised intensities as a Series indexed by m/z rounded to the nearest integer."""
    mz, intensity = spectrum.get_peaks()
    mz = np.asarray(mz, dtype=float)
    intensity = np.asarray(intensity, dtype=float)
    total = intensity.sum()
    # No peaks, or nothing to normalise by: report an empty spectrum instead of NaN/inf values
    if mz.size == 0 or not total > 0:
        return pd.Series(dtype=float)
    binned = pd.Series(intensity / total, index=np.round(mz))
    # When several peaks round to the same bin, only the first one is kept
    return binned[~binned.index.duplicated(keep="first")]


def compute_similarity(spectrum1, spectrum2):
    """
    Cosine similarity between two spectra, evaluated on their shared integer m/z bins.

    Intensities are normalised to sum to 1 per spectrum and m/z values are rounded to
    the nearest integer. The two intensity vectors are aligned explicitly on the m/z
    bin, so the result does not depend on the order in which peaks are stored.

    Caveat: only bins present in BOTH spectra enter the calculation, so peaks that are
    unique to one spectrum do not lower the score (a single shared bin always yields 1.0).

    :param spectrum1: Object whose ``get_peaks()`` returns ``(mz_array, intensity_array)``.
    :param spectrum2: Same as ``spectrum1``.
    :return: Cosine similarity as a float; 0.0 when the spectra share no bin or when
             either shared-intensity vector is all zero.
    """
    binned1 = _binned_relative_intensities(spectrum1)
    binned2 = _binned_relative_intensities(spectrum2)

    shared_bins = binned1.index.intersection(binned2.index)
    if len(shared_bins) == 0:
        return 0.0

    # Selecting with the same index on both sides guarantees bin-by-bin alignment
    vector1 = binned1.loc[shared_bins].to_numpy()
    vector2 = binned2.loc[shared_bins].to_numpy()
    if not vector1.any() or not vector2.any():
        return 0.0

    # Simple cosine similarity - consider more sophisticated alignment and similarity for real applications
    return float(1 - cosine(vector1, vector2))

# Compute similarity between spectra using the matchms ModifiedCosine score
# https://matchms.readthedocs.io/en/stable/api/matchms.similarity.html#matchms.similarity.ModifiedCosine
def modified_cosine_similarity(spectrum1, spectrum2, tolerance=2, precursor_mz1=1.0, precursor_mz2=1.0):
    """
    Score two spectra with matchms' ``ModifiedCosine``.

    Each spectrum's intensities are normalised to sum to 1 and wrapped in a matchms
    ``Spectrum`` before scoring.

    :param spectrum1: Object whose ``get_peaks()`` returns ``(mz_array, intensity_array)``.
    :param spectrum2: Same as ``spectrum1``.
    :param tolerance: Peak-matching tolerance passed to ``ModifiedCosine``.
    :param precursor_mz1: Precursor m/z stored in the metadata of the first spectrum.
    :param precursor_mz2: Precursor m/z stored in the metadata of the second spectrum.
    :return: The result of ``ModifiedCosine.pair``; callers in this notebook read its
             ``'score'`` field.

    NOTE(review): both precursor m/z values default to the same placeholder (1.0).
    ModifiedCosine presumably derives its mass shift from the precursor m/z difference,
    which would make the shift zero here - pass real precursor values if a shift-aware
    comparison is intended.
    """
    mz1, intensity1 = spectrum1.get_peaks()
    mz2, intensity2 = spectrum2.get_peaks()

    intensity1 = (intensity1 / intensity1.sum()).astype(float)
    intensity2 = (intensity2 / intensity2.sum()).astype(float)

    spectrum_1 = Spectrum(mz=mz1.astype(float), intensities=intensity1, metadata={"precursor_mz": precursor_mz1})
    spectrum_2 = Spectrum(mz=mz2.astype(float), intensities=intensity2, metadata={"precursor_mz": precursor_mz2})

    # Build the similarity function with the requested matching tolerance
    modified_cosine = ModifiedCosine(tolerance=tolerance)

    return modified_cosine.pair(spectrum_1, spectrum_2)

In [None]:
# Load the metabolite table: a tab-separated file with (at least) a 'Name' column
metabolites_df = pd.read_csv("Draft/metabolite_data.txt", sep="\t")

# The retention time is the third '_'-separated field of 'Name'
name_fields = metabolites_df["Name"].str.split("_")
metabolites_df["rt"] = name_fields.str[2].astype(float)

metabolites_df.head(n=2)

In [None]:
# Lookup used to translate a column name into the raw file to load.
# NOTE(review): keys and file names mix the spellings "samle" and "sample" - confirm they match
# the real column and file names; Series.replace leaves values without a matching key untouched.
sample_dict = {'samle1': "sample1.mzML",
'samle2': "samle2.mzML",
'samle3': "samle3.mzML"}

# Columns of interest are recognised by the "MoG" tag in their name
mog_cols = [col for col in metabolites_df.columns if "MoG" in col]

# .copy() makes tmp_df an independent frame, so adding columns to it afterwards neither
# raises pandas' SettingWithCopyWarning nor depends on view-vs-copy semantics
tmp_df = metabolites_df[["Name", "rt", "Fold Change", "T-test", "class"] + mog_cols].copy()

# Helper: label of the entry holding a row's largest value
def find_max_column(row):
    """
    Return the label at which ``row`` reaches its maximum.

    :param row: pandas Series (one DataFrame row when applied with ``axis=1``).
    :return: Index label (the column name, for a DataFrame row) of the first
             occurrence of the maximum value.
    """
    label_of_max = row.idxmax()
    return label_of_max

# Row-wise (axis=1): find which MoG column holds the largest value for each metabolite
max_columns = tmp_df[mog_cols].apply(find_max_column, axis=1)
tmp_df['MaxValueColumn'] = max_columns

# Translate that column name into a raw file name via sample_dict
tmp_df['filename'] = max_columns.replace(sample_dict)

tmp_df.head(n=3)

In [None]:
# Load each raw file once and extract the spectrum nearest to every metabolite's retention time
spectra_dict = {}
missing_spectra = []  # metabolites for which no spectrum fell inside the RT window

# sort=False keeps the files in order of first appearance in tmp_df
for filename, file_rows in tmp_df.groupby("filename", sort=False):
    file_path = "mzML_converted/" + filename
    exp = MSExperiment()
    MzMLFile().load(file_path, exp)

    # Extract spectra for each metabolite measured in this file
    for name, rt in zip(file_rows["Name"], file_rows["rt"]):
        # rt is multiplied by 60 before it is compared with getRT()
        # (presumably minutes -> seconds; confirm the unit used in 'Name')
        spectrum = extract_spectrum_at_rt(exp, rt * 60)
        if spectrum is None:
            # Storing None would crash the similarity calculations further down
            missing_spectra.append(name)
            continue
        spectra_dict[name] = spectrum

if missing_spectra:
    print(f"No spectrum found within the RT window for {len(missing_spectra)} metabolite(s): {missing_spectra}")


# Regular Cosine

In [None]:
# Construct a similarity network
G = nx.Graph()

# Entries can be None when no spectrum was found near a metabolite's RT; those cannot be compared
named_spectra = [(name, spectrum) for name, spectrum in spectra_dict.items() if spectrum is not None]

for idx, (name1, spectrum1) in enumerate(named_spectra):
    # The graph is undirected, so each unordered pair only needs to be scored once
    for name2, spectrum2 in named_spectra[idx + 1:]:
        similarity = compute_similarity(spectrum1, spectrum2)
        if similarity > 0.5:  # Threshold for similarity to consider an edge
            G.add_edge(name1, name2, weight=similarity)

In [None]:
import matplotlib.patches as mpatches

# Work on a copy of G and keep only the strong edges (weight >= 0.75) for display
F = G.copy()
F.remove_edges_from([(n1, n2) for n1, n2, w in G.edges(data="weight") if w < 0.75])

# Visualize the network
pos = nx.kamada_kawai_layout(F)

# Assign class attributes to nodes in the graph
node_classes = tmp_df[["Name", "class"]].set_index("Name").to_dict()["class"]
nx.set_node_attributes(F, node_classes, 'class')

# Unique classes and their corresponding indices
unique_classes = tmp_df["class"].unique().tolist()
class_indices = {cls: idx for idx, cls in enumerate(unique_classes)}

# One colour per class, taken from 'tab20'.
# plt.get_cmap replaces plt.cm.get_cmap, which was removed in Matplotlib 3.9.
colormap = plt.get_cmap('tab20', len(unique_classes))

# Generate colors for each node based on its class
node_colors = [colormap(class_indices[F.nodes[node]['class']]) for node in F]

# Draw the graph
nx.draw(F, pos, with_labels=False, node_color=node_colors, node_size=100)

# Create a legend, placed outside the axes so it does not cover the network
legend_handles = [mpatches.Patch(color=colormap(i), label=cls) for cls, i in class_indices.items()]
plt.legend(handles=legend_handles, title="Node Classes", bbox_to_anchor=(1.04, 1), loc="upper left")

plt.show()

In [None]:
import matplotlib.patches as mpatches

# NOTE(review): this cell repeats the previous plotting cell (same graph G, same 0.75 cut-off);
# consider removing one of the two.

# Work on a copy of G and keep only the strong edges (weight >= 0.75) for display
F = G.copy()
F.remove_edges_from([(n1, n2) for n1, n2, w in G.edges(data="weight") if w < 0.75])

# Visualize the network
pos = nx.kamada_kawai_layout(F)

# Assign class attributes to nodes in the graph
node_classes = tmp_df[["Name", "class"]].set_index("Name").to_dict()["class"]
nx.set_node_attributes(F, node_classes, 'class')

# Unique classes and their corresponding indices
unique_classes = tmp_df["class"].unique().tolist()
class_indices = {cls: idx for idx, cls in enumerate(unique_classes)}

# One colour per class, taken from 'tab20'.
# plt.get_cmap replaces plt.cm.get_cmap, which was removed in Matplotlib 3.9.
colormap = plt.get_cmap('tab20', len(unique_classes))

# Generate colors for each node based on its class
node_colors = [colormap(class_indices[F.nodes[node]['class']]) for node in F]

# Draw the graph
nx.draw(F, pos, with_labels=False, node_color=node_colors, node_size=100)

# Create a legend, placed outside the axes so it does not cover the network
legend_handles = [mpatches.Patch(color=colormap(i), label=cls) for cls, i in class_indices.items()]
plt.legend(handles=legend_handles, title="Node Classes", bbox_to_anchor=(1.04, 1), loc="upper left")

plt.show()

# Modified Cosine

In [None]:
# Construct a similarity network
G_mod = nx.Graph()

# Entries can be None when no spectrum was found near a metabolite's RT; those cannot be compared
named_spectra = [(name, spectrum) for name, spectrum in spectra_dict.items() if spectrum is not None]

for idx, (name1, spectrum1) in enumerate(named_spectra):
    # The graph is undirected, so each unordered pair is scored once
    # (assumes the modified cosine score does not depend on argument order)
    for name2, spectrum2 in named_spectra[idx + 1:]:
        similarity = modified_cosine_similarity(spectrum1, spectrum2)
        score = float(similarity['score'])
        if score > 0.5:  # Threshold for similarity to consider an edge
            G_mod.add_edge(name1, name2, weight=score)

In [None]:
import matplotlib.patches as mpatches

# Work on a copy of G_mod and keep only the strong edges (weight >= 0.8) for display
F = G_mod.copy()
F.remove_edges_from([(n1, n2) for n1, n2, w in G_mod.edges(data="weight") if w < 0.8])

# Visualize the network
pos = nx.kamada_kawai_layout(F)

# Assign class attributes to nodes in the graph
node_classes = tmp_df[["Name", "class"]].set_index("Name").to_dict()["class"]
nx.set_node_attributes(F, node_classes, 'class')

# Unique classes and their corresponding indices
unique_classes = tmp_df["class"].unique().tolist()
class_indices = {cls: idx for idx, cls in enumerate(unique_classes)}

# One colour per class, taken from 'tab20'.
# plt.get_cmap replaces plt.cm.get_cmap, which was removed in Matplotlib 3.9.
colormap = plt.get_cmap('tab20', len(unique_classes))

# Generate colors for each node based on its class
node_colors = [colormap(class_indices[F.nodes[node]['class']]) for node in F]

# Draw the graph
nx.draw(F, pos, with_labels=False, node_color=node_colors, node_size=100)

# Create a legend, placed outside the axes so it does not cover the network
legend_handles = [mpatches.Patch(color=colormap(i), label=cls) for cls, i in class_indices.items()]
plt.legend(handles=legend_handles, title="Node Classes", bbox_to_anchor=(1.04, 1), loc="upper left")

plt.show()

In [None]:
# Export the edge list of F as a tab-separated file.
# F is whichever filtered graph was built last; when the notebook is run top to bottom
# that is the thresholded modified-cosine network.
edges = nx.to_pandas_edgelist(F)
edges.to_csv('edges.tsv', sep="\t", index=False)