In [35]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn import preprocessing
import plotly.express as px
import plotly.graph_objects as go
import numpy as np

In [36]:
# pip install -U kaleido

In [37]:
def normalizar(df):
  df_norm = df.apply(lambda a: a/a.sum(), axis = 1)
  return df_norm
def scaler(df):
  scaler = preprocessing.StandardScaler().fit(df)
  data_scaled = scaler.transform(df)
  return data_scaled

def get_confidence_ellipse(data, n_std=1.96):
  mean = np.mean(data, axis=0)
  mean = np.mean(data, axis=0)
  cov = np.cov(data, rowvar=False)
  # Use Singular Value Decomposition (SVD) para garantir valores válidos
  U, S, Vt = np.linalg.svd(cov)
  eigvals = S
  eigvecs = U
  width, height = 2 * n_std * np.sqrt(eigvals)
  angle = np.degrees(np.arctan2(*eigvecs[:, 0][::-1]))
  return mean, width, height, angle

def get_axis_limits_with_margin(data, margin=0.3):
    data_min, data_max = np.min(data), np.max(data)
    data_range = data_max - data_min
    return data_min - margin * data_range, data_max + margin * data_range

def plot2D(data, colors, dm, blank_tag):
    fig = go.Figure()
    for i, label in enumerate(labels.Method.unique()):
        print(label)
        print(colors[i])
        df = data[data.Method == label]
        fig.add_trace(go.Scatter(
        x = df.PC1.to_list(),
        y = df.PC2.to_list(),
        mode='markers+text',
        text = df.Sample.to_list(),
        textposition="top center",
        marker=dict(color=colors[i], size=8),
        name=f'{label}'
    ))
        try:
            mean, width, height, angle = get_confidence_ellipse(df[["PC1","PC2"]])
            ellipse_x = mean[0] + width * np.cos(np.linspace(0, 2 * np.pi, 100)) * np.cos(np.radians(angle)) - height * np.sin(np.linspace(0, 2 * np.pi, 100)) * np.sin(np.radians(angle))
            ellipse_y = mean[1] + width * np.cos(np.linspace(0, 2 * np.pi, 100)) * np.sin(np.radians(angle)) + height * np.sin(np.linspace(0, 2 * np.pi, 100)) * np.cos(np.radians(angle))

            fig.add_trace(go.Scatter(
            x=ellipse_x,
            y=ellipse_y,
            mode='lines',
            line=dict(color=colors[i]),
            showlegend=False

        ))
        except:
            pass
    fig.update_layout(
        title = f"PCA {table}",
        xaxis=dict(
            title = f"PC2 {round(dm.explained_variance_ratio_[0]*100)}%",
            showgrid=True,
            showline=True,
            linewidth=1,
            linecolor='black'
        ),
        yaxis=dict(
            title = f"PC2 {round(dm.explained_variance_ratio_[1]*100)}%",
            showgrid=True,
            showline=True,
            linewidth=1,
            linecolor='black'
        ),
        plot_bgcolor='white',
        paper_bgcolor='white',
        height=600,
        width=800
    )
    fig.show()
    fig.write_image(f"Figures/PCA-2D-{blank_tag}.png", format = "png")

def plot3D(data, colors, dm, blank_tag):

    x_limits = get_axis_limits_with_margin(data.PC1)
    y_limits = get_axis_limits_with_margin(data.PC2)
    z_limits = get_axis_limits_with_margin(data.PC3)

    fig = go.Figure()
    for i, label in enumerate(labels.Method.unique()):
        df = data[data.Method == label]

        fig.add_trace(go.Scatter3d(
            x=df.PC1.to_list(),
            y=df.PC2.to_list(),
            z=df.PC3.to_list(),
            mode='markers+text',
            text=df.Sample.to_list(),
            textposition="top center",
            marker=dict(color=colors[i], size=8),
            name=f'{label}'
        ))

    fig.update_layout(
        title=f"PCA {table}",
        scene=dict(
            xaxis=dict(
                title=f"PC1 {round(dm.explained_variance_ratio_[0]*100)}%",
                showgrid=True,
                showline=True,
                linewidth=1,
                linecolor='black',
                range = x_limits
            ),
            yaxis=dict(
                title=f"PC2 {round(dm.explained_variance_ratio_[1]*100)}%",
                showgrid=True,
                showline=True,
                linewidth=1,
                linecolor='black',
                range = y_limits
            ),
            zaxis=dict(
                title=f"PC3 {round(dm.explained_variance_ratio_[2]*100)}%",
                showgrid=True,
                showline=True,
                linewidth=1,
                linecolor='black',
                range = z_limits
            )
        ),
        plot_bgcolor='white',
        paper_bgcolor='white',
        height=600,
        width=800
    )

    fig.show()
    # fig.write_image(f"Figures/PCA-3D-{blank_tag}.png", format = "png")

In [38]:
table = "Quant_table_mzmine"
df = pd.read_excel(f"data/{table}.xlsx")
df.fillna(0.0, inplace = True)
labels = df[["Step","Method","Sample"]]
colors = ["black","blue","green","cyan","pink"]

In [39]:
df_with_blank = df.copy()
data_with_blank = df_with_blank.iloc[:,3:]
data_with_blank = scaler(data_with_blank)

In [40]:
df_no_blank = df.loc[df.Method != "Blank"].copy()
data_no_blank = df_no_blank.iloc[:,3:]
data_no_blank = scaler(data_no_blank)

In [41]:
pca_with_blank = PCA(n_components = 3)
pca_with_blank.fit(data_with_blank)
pca_with_blank_ = pca_with_blank.transform(data_with_blank)
pca_with_blank_df = pd.DataFrame({
    'PC1':pca_with_blank_[:,0],
    'PC2':pca_with_blank_[:,1],
    'PC3':pca_with_blank_[:,2],
    "Step": labels.Step.to_list(),
    "Method": labels.Method.to_list(),
    "Sample": labels.Sample.to_list()
})

pca_no_blank = PCA(n_components = 3)
pca_no_blank.fit(data_no_blank)
pca_no_blank_ = pca_no_blank.transform(data_no_blank)
pca_no_blank_df = pd.DataFrame({
    'PC1':pca_no_blank_[:,0],
    'PC2':pca_no_blank_[:,1],
    'PC3':pca_no_blank_[:,2],
    "Step":   labels[labels.Method != "Blank"].Step.to_list(),
    "Method": labels[labels.Method != "Blank"].Method.to_list(),
    "Sample": labels[labels.Method != "Blank"].Sample.to_list()
})

In [42]:
plot2D(pca_no_blank_df, colors, pca_no_blank, "noblank");

Blank
black
TRADITIONAL METHOD
blue
ALTERNATIVE METHOD
green
TRADITIONAL MODIFIED
cyan
ALTERNATIVE MODIFIED
pink



Mean of empty slice.


invalid value encountered in divide


Degrees of freedom <= 0 for slice


divide by zero encountered in divide


invalid value encountered in multiply


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). 

In [43]:
plot2D(pca_with_blank_df, colors, pca_with_blank, "blank");

Blank
black
TRADITIONAL METHOD
blue
ALTERNATIVE METHOD
green
TRADITIONAL MODIFIED
cyan
ALTERNATIVE MODIFIED
pink



Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as la

In [44]:
plot3D(pca_with_blank_df, colors, pca_with_blank, "blank");

In [45]:
plot3D(pca_no_blank_df, colors, pca_no_blank, "noblank");