In [2]:
import os
import requests
import json
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
pd.set_option("display.float_format", "{:.2f}".format)

In [None]:
# Input locations, resolved relative to the current working directory.
data_path = os.path.join(os.getcwd(), "data")
_input_dir = os.path.join(data_path, "input")
seller_items_path = os.path.join(_input_dir, "seller_items")  # raw per-seller JSON files
parquet_file_path = os.path.join(_input_dir, "sellers_items.parquet")  # cached unified table

In [None]:
# Build the unified item-level DataFrame. The parsed result is cached as Parquet so
# that later runs can skip re-reading every JSON file.
if os.path.exists(parquet_file_path):
    df = pd.read_parquet(parquet_file_path)
else:
    data = []  # one dict per parsed item line

    # Every sub-directory of seller_items_path is scanned and its name is stored on
    # each item as `subfolder_name`. Entries are sorted because os.listdir returns
    # them in arbitrary order, which would make the row order non-reproducible.
    for subfolder_name in sorted(os.listdir(seller_items_path)):
        subfolder_path = os.path.join(seller_items_path, subfolder_name)
        if not os.path.isdir(subfolder_path):
            continue

        for filename in sorted(os.listdir(subfolder_path)):
            if not filename.endswith("_items.json"):  # only seller item files
                continue
            file_path = os.path.join(subfolder_path, filename)

            # The files are read as JSON Lines: one JSON object per line.
            with open(file_path, "r", encoding="utf-8") as file:
                for line in file:
                    if not line.strip():
                        continue  # a blank line is not a record, so it is not reported as an error
                    try:
                        item_data = json.loads(line)
                    except json.JSONDecodeError as e:
                        print(f"Erro ao ler {filename}: {e}")
                        continue
                    item_data["subfolder_name"] = subfolder_name
                    data.append(item_data)

    df = pd.DataFrame(data)
    # Write to the same path that the cache check above reads from.
    df.to_parquet(parquet_file_path)

# Release the (potentially large) list of raw records.
data = None

In [None]:
# Show the frame's dimensions (rows, columns) and its first rows
print(df.shape)
display(df.head())

In [None]:
# ------- data quality -------
# Count the null values in each column
print(df.isnull().sum())

In [None]:
# Summary statistics of the frame, using pandas' default `describe` settings
df.describe()

In [None]:
# Print the number of distinct (non-null) values of every column
for column_name in df:
    distinct_count = df[column_name].nunique()
    print(f"{column_name}: {distinct_count}")


In [None]:
# Frequency of each distinct value of `available_qty`
df["available_qty"].value_counts()

In [None]:
# Numeric item columns (still in local currency) to inspect for outliers.
numeric_cols = [
    "price",
    "original_price",
    "discount",
    "available_qty",
    "installments",
    "num_attributes",
]

# One boxplot per column. The figure size is set on the figure that is actually
# drawn; previously a separate, empty 12x6 figure was created and never used.
for col in numeric_cols:
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.boxplot(x=df[col], ax=ax)
    ax.set_title(f"Boxplot de {col}")
    plt.show()
    plt.close(fig)  # avoid accumulating open figures across the loop

In [None]:


# Convert local-currency prices to US dollars.

# Folder / site code -> currency code. The keys are matched against the
# `subfolder_name` column when the conversion is applied.
currencies = {
    "MLB": "BRL",  # Brasil
    "MPE": "PEN",  # Perú
    "MBO": "BOB",  # Bolivia
    "MLU": "UYU",  # Uruguay
    "MLA": "ARS",  # Argentina
    "MPY": "PYG",  # Paraguay
    "MLM": "MXN",  # Mexico
}

# Fixed snapshot of the USD value of one unit of each currency. The rates are
# hard-coded (no network call is made), so they must be refreshed by hand.
_USD_RATES = {
    "BRL": 0.173,
    "PEN": 0.269,
    "BOB": 0.144,
    "UYU": 0.0229,
    "ARS": 0.000947,
    "PYG": 0.000126,
    "MXN": 0.0485,
}


def get_conversion_rate(currency):
    """Return the USD value of one unit of `currency`.

    Args:
        currency: Currency code, one of the keys of `_USD_RATES`.

    Returns:
        The hard-coded conversion rate (float) to USD.

    Raises:
        KeyError: If no rate is configured for `currency`.
    """
    try:
        return _USD_RATES[currency]
    except KeyError:
        raise KeyError(
            f"No USD conversion rate configured for currency {currency!r}"
        ) from None


def convert_to_usd(price, currency):
    """Convert `price`, expressed in `currency`, to US dollars."""
    return price * get_conversion_rate(currency)


# Currency code -> USD rate, for every currency listed in `currencies`
conversion_rates = {
    currency: get_conversion_rate(currency) for currency in currencies.values()
}
conversion_rates

In [None]:
# Attach each item's currency and USD rate, derived from its source folder
df["currency"] = df["subfolder_name"].map(currencies)
df["currency_rate"] = df["currency"].map(conversion_rates)

# Items without an original price fall back to their current price
df["original_price"] = df["original_price"].fillna(df["price"])

# USD-denominated versions of the monetary columns
for local_col in ("price", "original_price", "discount"):
    df[f"dollar_{local_col}"] = df[local_col] * df["currency_rate"]

display(df.head())

In [None]:
# Numeric item columns to inspect for outliers, with monetary values in USD.
numeric_cols = [
    "dollar_price",
    "dollar_original_price",
    "dollar_discount",
    "available_qty",
    "installments",
    "num_attributes",
]

# One boxplot per column. The figure size is set on the figure that is actually
# drawn; previously a separate, empty 12x6 figure was created and never used.
for col in numeric_cols:
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.boxplot(x=df[col], ax=ax)
    ax.set_title(f"Boxplot de {col}")
    plt.show()
    plt.close(fig)  # avoid accumulating open figures across the loop

In [None]:
# ------- data preparation -------
# Drop columns that are no longer needed:
#   - buying_mode: holds a single value (per the original note), so it carries no signal
#   - price / original_price / discount: superseded by their dollar_* counterparts
#   - currency / currency_rate: helper columns only used for the conversion
# errors="ignore" keeps the cell re-runnable after the columns are already gone.
df = df.drop(
    columns=[
        "buying_mode",
        "price",
        "original_price",
        "discount",
        "currency",
        "currency_rate",
    ],
    errors="ignore",
)

# Total USD value of the stock available for each item
df["available_amount"] = df["dollar_price"] * df["available_qty"]

In [None]:
# Sort items by USD price, highest first, and show the top rows
df_sorted = df.sort_values(by='dollar_price', ascending=False)
df_sorted.head()

In [None]:
# ---- Per-seller aggregation helpers ------------------------------------
# Each helper receives the Series of one column for one seller.


def _unique_values(values):
    """Distinct values of the group as a list (order not guaranteed)."""
    return list(set(values))


def _n_unique(values):
    """Number of distinct values in the group."""
    return len(set(values))


def _mode_or_none(values):
    """Most frequent value of the group, or None when no mode exists."""
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else None


def _dominant_share(values):
    """Share of the group's rows that belong to its most frequent value.

    The previous implementation returned the raw count of the most frequent
    value although the resulting column is named `dominant_category_ratio`.
    """
    if values.empty:
        return 0
    return values.value_counts().max() / len(values)


def _mean_abs_dev(values):
    """Mean absolute deviation around the mean (stored in the `*_mae` columns)."""
    return np.mean(np.abs(values - np.mean(values)))


def _iqr(values):
    """Interquartile range: 75th percentile minus 25th percentile."""
    return np.percentile(values, 75) - np.percentile(values, 25)


def _coef_var(values):
    """Coefficient of variation (std / mean); 0 when the mean is not positive.

    NOTE(review): np.std on a pandas Series may dispatch to Series.std, whose
    default is the sample standard deviation — confirm which one is intended.
    """
    mean_value = np.mean(values)
    return np.std(values) / mean_value if mean_value > 0 else 0


# One row per seller. The order of the entries below defines the order of the
# resulting columns, which are renamed positionally right after this statement.
df_grouped = df.groupby("seller_id").agg(
    {
        "category_id": [
            _unique_values,  # unique categories
            _n_unique,  # number of unique categories
            _mode_or_none,  # most frequent category
            _dominant_share,  # share of offers in the most frequent category
        ],
        "dollar_price": [
            "count",
            "mean",
            _mean_abs_dev,
            _mode_or_none,
            _iqr,
            _coef_var,
        ],
        "dollar_original_price": ["mean", _mean_abs_dev],
        "dollar_discount": ["mean", _mean_abs_dev],
        "available_qty": ["sum", "mean", _mean_abs_dev],
        "cataloged": "sum",
        "free_shipping": "sum",
        "condition": _mode_or_none,  # most frequent condition
        "installments": ["mean", _mean_abs_dev],
        "city": [_mode_or_none],  # most frequent city
        "state": _mode_or_none,  # most frequent state
        "has_gtin": "sum",
        "num_attributes": ["mean", _mean_abs_dev, "median"],
        "available_amount": ["mean", _mean_abs_dev, "median"],
    }
)

# Flatten the aggregated column index into plain names.
# The names are assigned positionally, so this list must stay in exactly the
# same order as the aggregations requested in the groupby/agg call above.
# The `*_mae` columns hold the mean absolute deviation around the mean.
df_grouped.columns = [
    "unique_categories",
    "num_categories",
    "mode_category",
    "dominant_category_ratio",
    "dollar_price_count",
    "dollar_price_mean",
    "dollar_price_mae",
    "dollar_price_mode",
    "dollar_price_iqr",
    "dollar_price_cv",
    "dollar_original_price_mean",
    "dollar_original_price_mae",
    "dollar_discount_mean",
    "dollar_discount_mae",
    "available_qty_sum",
    "available_qty_mean",
    "available_qty_mae",
    "cataloged_sum",
    "free_shipping_sum",
    "condition_mode",
    "installments_mean",
    "installments_mae",
    "most_frequent_city",
    "most_frequent_state",
    "has_gtin_sum",
    "num_attributes_mean",
    "num_attributes_mae",
    "num_attributes_median",
    "available_amount_mean",
    "available_amount_mae",
    "available_amount_median",
]

# Turn the seller_id group key back into a regular column
df_grouped = df_grouped.reset_index()

# ---- Derived per-seller features ---------------------------------------
# Per-seller statistics are attached by looking them up on seller_id rather
# than by positional `.values` assignment, so the result does not depend on
# df_grouped happening to be in the same row order as a fresh groupby.
by_seller = df.groupby("seller_id")
seller_ids = df_grouped["seller_id"]

# Number of offers (rows) per seller
df_grouped["total_offers"] = seller_ids.map(by_seller.size())

# Share of each seller's offers that are cataloged / ship free / carry a GTIN
for flag in ("cataloged", "free_shipping", "has_gtin"):
    df_grouped[f"proportion_{flag}"] = (
        df_grouped[f"{flag}_sum"] / df_grouped["total_offers"]
    )

# Number and share of offers whose condition is "new"
new_offers_per_seller = df["condition"].eq("new").groupby(df["seller_id"]).sum()
df_grouped["condition_new_count"] = seller_ids.map(new_offers_per_seller)
df_grouped["proportion_new"] = (
    df_grouped["condition_new_count"] / df_grouped["total_offers"]
)

# Average number of offers per distinct category
df_grouped["items_per_category"] = (
    df_grouped["total_offers"] / df_grouped["num_categories"]
)

# How often the seller's most frequent city appears in the whole item table
city_counts = df["city"].value_counts().to_dict()
df_grouped["city_frequency"] = df_grouped["most_frequent_city"].map(city_counts)

# How often the seller's most frequent category appears in the whole item table
category_counts = df["category_id"].value_counts().to_dict()
df_grouped["category_frequency"] = df_grouped["mode_category"].map(category_counts)

# Number of distinct cities each seller ships from
df_grouped["distinct_cities_count"] = seller_ids.map(by_seller["city"].nunique())

# Persist the seller-level table and preview it
df_grouped.to_parquet(os.path.join(data_path, "input", "sellers_items_grouped.parquet"))
display(df_grouped.head())

In [None]:
# Seller-level columns used as input features for scaling, PCA and clustering.
# `condition_mode` and `most_frequent_city` are non-numeric at this point.
feature_columns = [
    "num_categories",
    "category_frequency",
    "dominant_category_ratio",
    "dollar_price_mean",
    "dollar_price_mae",
    "dollar_price_mode",
    "dollar_price_iqr",
    "dollar_price_cv",
    "dollar_original_price_mean",
    "dollar_original_price_mae",
    "dollar_discount_mean",
    "dollar_discount_mae",
    "available_qty_mean",
    "available_qty_mae",
    "proportion_cataloged",
    "proportion_free_shipping",
    "condition_mode",
    "proportion_new",
    "installments_mean",
    "installments_mae",
    "most_frequent_city",
    "distinct_cities_count",
    "proportion_has_gtin",
    "num_attributes_mean",
    "num_attributes_mae",
    "num_attributes_median",
    "available_amount_mean",
    "available_amount_mae",
    "available_amount_median",
    "total_offers",
]

# Select the features and replace every missing value with 0
df_fields_ori = df_grouped[feature_columns].fillna(0)
display(df_fields_ori.head())

In [None]:
def is_numeric(col):
    """Return True when the Series `col` has a numeric dtype."""
    return pd.api.types.is_numeric_dtype(col)


# Frequency-encode every non-numeric column: each value is replaced by the
# number of rows that share it.
for column in df_fields_ori.columns:
    if not is_numeric(df_fields_ori[column]):
        freq_encoding = df_fields_ori[column].value_counts().to_dict()
        # Plain column assignment replaces the column, so the encoded counts get
        # a numeric dtype. `.loc[:, column] = ...` writes into the existing
        # column in place on recent pandas versions, which leaves it as `object`.
        df_fields_ori[column] = df_fields_ori[column].map(freq_encoding)

display(df_fields_ori.head())

In [None]:
# Number of principal components to keep, and the matching labels PC1..PCn
pca_components = 6
pca_col_list = [f"PC{idx}" for idx in range(1, pca_components + 1)]

In [None]:
# Step 1: work on a copy of the encoded seller features
df_fields = df_fields_ori.copy()

# Step 2: standardise every feature
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_fields)

# Step 3: project onto the principal components.
# The component count is taken from `pca_col_list` because the name
# `pca_components` is rebound to the loadings DataFrame further down the
# notebook, which would make this cell fail when it is re-run.
pca = PCA(n_components=len(pca_col_list))
# The `_3d` name is historical: the array has one column per kept component.
X_pca_3d = pca.fit_transform(X_scaled)
df_pca_normalized = pd.DataFrame(X_pca_3d, columns=pca_col_list)

# Step 4: tag every seller with the folder (country/site code) of its items.
# Rows of df_pca_normalized and df_grouped share the same positional index.
seller_to_country = df.set_index("seller_id")["subfolder_name"].to_dict()
df_pca_normalized["country_id"] = df_grouped["seller_id"].map(seller_to_country)

# Step 5: interactive 3D scatter of three of the components, coloured by country
fig = px.scatter_3d(
    df_pca_normalized,
    x="PC2",
    y="PC3",
    z="PC6",
    color="country_id",
    title="Clusters de Vendedores (PCA 3D - Normalizado)",
    color_continuous_scale="viridis",
)
fig.show()

In [None]:
# Remove the two most extreme sellers along PC6: the one with the largest score
# and then, among the remaining rows, the one with the smallest score.
#
# df_fields is rebuilt from the rows still present in df_pca_normalized so the
# two frames stay aligned even if this cell is executed more than once.
df_fields = df_fields_ori.loc[df_pca_normalized.index].copy()

outliers = []  # feature rows of the removed sellers, for inspection
for pick_extreme in (pd.Series.idxmax, pd.Series.idxmin):
    outlier_idx = pick_extreme(df_pca_normalized["PC6"])
    # Label-based lookup: the earlier positional slice [idx - 1:idx] returned the
    # row located before the outlier, and resetting the list dropped the first one.
    outliers.append(df_fields.loc[[outlier_idx]])
    df_fields = df_fields.drop(index=outlier_idx)
    df_pca_normalized = df_pca_normalized.drop(index=outlier_idx)

outliers

In [None]:
# Same 3D view as before, redrawn after the outlier rows were removed
scatter_options = dict(
    x="PC2",
    y="PC3",
    z="PC6",
    color="country_id",
    title="Clusters de Vendedores (PCA 3D - Normalizado)",
    color_continuous_scale="viridis",
)
fig = px.scatter_3d(df_pca_normalized, **scatter_options)

# Render the figure
fig.show()

In [None]:
# Elbow method: K-Means inertia for k = 1 .. 15 on all principal components.
inertia = []
K_range = range(1, 16)

# Select the PCA columns explicitly. Dropping only "country_id" would also feed
# the "cluster" column into K-Means once a later cell has added it and this
# cell is re-run.
pca_features = df_pca_normalized[pca_col_list]

for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(pca_features)
    inertia.append(kmeans.inertia_)

# Plot inertia against k
plt.figure(figsize=(8, 6))
plt.plot(K_range, inertia, marker="o", linestyle="-")
plt.title("Método do Cotovelo")
plt.xlabel("Número de Clusters (k)")
plt.ylabel("Inertia")
plt.show()

In [None]:
# Silhouette score of K-Means for every candidate k, on all principal components.
# Only the PCA columns are used, so a "cluster" column added by a later cell
# cannot leak into the features when this cell is re-run.
pca_features = df_pca_normalized[pca_col_list]

for k in K_range:
    if k < 2:
        # The silhouette score needs at least two clusters. This case used to be
        # hidden by a bare `except: pass`, which also hid every other error.
        continue
    kmeans = KMeans(n_clusters=k, random_state=42)
    candidate_labels = kmeans.fit_predict(pca_features)
    sil_score = silhouette_score(pca_features, candidate_labels)
    print(f"Silhouette Score: {sil_score:.4f} for k={k}")

In [None]:
# Fit K-Means with the chosen k on all principal components.
# The PCA columns are selected explicitly so that re-running this cell does not
# feed the previously assigned "cluster" column back in as a feature.
kmeans = KMeans(n_clusters=6, random_state=42)
df_fields["cluster"] = kmeans.fit_predict(df_pca_normalized[pca_col_list])
df_pca_normalized["cluster"] = df_fields["cluster"]

# Preview the first sellers together with their cluster
df_fields.head()

In [None]:
pca_col_list_c = ["PC1", "PC2", "PC3", "PC4", "PC5", "PC6", "cluster"]

# Interactive 3D scatter of PC2 / PC3 / PC6, coloured by cluster.
# The axis labels are keyed by the plotted columns; the previous dict labelled
# "PC5", which is not plotted, and left the PC3 axis without a label.
fig = px.scatter_3d(
    df_pca_normalized[pca_col_list_c],
    x="PC2",
    y="PC3",
    z="PC6",
    color="cluster",
    title="Clusters de Vendedores (PCA 3D - Normalizado)",
    labels={
        "PC2": "Componente Principal 2 (Normalizado)",
        "PC3": "Componente Principal 3 (Normalizado)",
        "PC6": "Componente Principal 6 (Normalizado)",
    },
    color_continuous_scale="viridis",
)

# Render the figure
fig.show()

In [None]:
# Number of sellers assigned to each cluster
print(df_fields["cluster"].value_counts())

In [None]:
# Keep only the three components used in the 3D plots (plus the cluster label)
# and relabel them PC1..PC3: original PC2 -> PC1, PC3 -> PC2, PC6 -> PC3.
selected_components = {"PC2": "PC1", "PC3": "PC2", "PC6": "PC3"}
df_best_pca = df_pca_normalized[[*selected_components, "cluster"]].rename(
    columns=selected_components
)

In [None]:
# Elbow method on the three selected components: K-Means inertia for k = 1 .. 15
K_range = range(1, 16)
best_pca_features = df_best_pca.drop(columns=["cluster"])

inertia = []
for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(best_pca_features)
    inertia.append(kmeans.inertia_)

# Plot inertia against k
plt.figure(figsize=(8, 6))
plt.plot(K_range, inertia, marker="o", linestyle="-")
plt.title("Método do Cotovelo")
plt.xlabel("Número de Clusters (k)")
plt.ylabel("Inertia")
plt.show()

In [None]:
# Silhouette score of K-Means for every candidate k, on the three selected components
best_pca_features = df_best_pca.drop(columns=["cluster"])

for k in K_range:
    if k < 2:
        # The silhouette score needs at least two clusters. This case used to be
        # hidden by a bare `except: pass`, which also hid every other error.
        continue
    kmeans = KMeans(n_clusters=k, random_state=42)
    candidate_labels = kmeans.fit_predict(best_pca_features)
    sil_score = silhouette_score(best_pca_features, candidate_labels)
    print(f"Silhouette Score: {sil_score:.4f} for k={k}")

In [None]:
# Fit K-Means with the chosen k on the three selected components and store the
# labels on both the feature table and the component table
best_pca_features = df_best_pca.drop(columns=["cluster"])
kmeans = KMeans(n_clusters=4, random_state=42)
df_fields["cluster"] = kmeans.fit_predict(best_pca_features)
df_best_pca["cluster"] = df_fields["cluster"]

# Preview the first sellers together with their cluster
df_fields.head()

In [None]:
# Interactive 3D scatter of the three selected components, coloured by cluster.
# df_best_pca's columns were renamed (original PC2 -> PC1, PC3 -> PC2, PC6 -> PC3),
# so the labels are keyed by the renamed columns and name the original component.
# The previous dict was keyed by the pre-rename names, which labelled the y axis
# as component 2 and left the other two axes without a label.
fig = px.scatter_3d(
    df_best_pca,
    x="PC1",
    y="PC2",
    z="PC3",
    color="cluster",
    title="Clusters de Vendedores (PCA 3D - Normalizado)",
    labels={
        "PC1": "Componente Principal 2 (Normalizado)",
        "PC2": "Componente Principal 3 (Normalizado)",
        "PC3": "Componente Principal 6 (Normalizado)",
    },
    color_continuous_scale="viridis",
)

# Render the figure
fig.show()

In [None]:
# Number of sellers per cluster, followed by every seller assigned to cluster 3
print(df_fields["cluster"].value_counts())
display(df_fields[df_fields["cluster"] == 3])

In [None]:
# Grid of 2D scatter plots: one row per cluster, one column per pair of components.
cluster_ids = sorted(df_best_pca["cluster"].unique())  # stable, ascending row order
component_pairs = [("PC1", "PC2"), ("PC2", "PC3"), ("PC1", "PC3")]


def _pair_label(x_col, y_col):
    """Readable pair name, e.g. ("PC1", "PC2") -> "PCA1 vs PCA2"."""
    return f"PCA{x_col[2:]} vs PCA{y_col[2:]}"


# One title per subplot, in row-major order. A list with only three titles
# would label the first row and leave every other row untitled.
fig = make_subplots(
    rows=len(cluster_ids),
    cols=len(component_pairs),
    subplot_titles=[
        f"Cluster {cluster}: {_pair_label(x_col, y_col)}"
        for cluster in cluster_ids
        for x_col, y_col in component_pairs
    ],
    specs=[[{"type": "scatter"} for _ in component_pairs] for _ in cluster_ids],
)

# Add one trace per (cluster, component pair)
for row_number, cluster in enumerate(cluster_ids, start=1):
    cluster_data = df_best_pca[df_best_pca["cluster"] == cluster]
    for col_number, (x_col, y_col) in enumerate(component_pairs, start=1):
        fig.add_trace(
            go.Scatter(
                x=cluster_data[x_col],
                y=cluster_data[y_col],
                mode="markers",
                marker=dict(color=cluster_data["cluster"], colorscale="viridis"),
                name=f"Cluster {cluster} ({_pair_label(x_col, y_col)})",
            ),
            row=row_number,
            col=col_number,
        )

fig.update_layout(
    title="Clusters de Vendedores (PCA 2D Comparisons)",
    height=3000,
    width=3000,
)

# Render the figure
fig.show()

In [None]:
# Build the PCA loadings matrix: one row per principal component, one column
# per original feature.
# NOTE(review): this rebinds `pca_components`, which an earlier cell defines as
# the integer number of components; renaming it would require updating the
# cells that read it afterwards.
pca_components = pd.DataFrame(
    pca.components_,  # PCA coefficients (loadings)
    columns=df_fields.drop(columns=['cluster']).columns,  # original feature names
    index=[f"PC{i+1}" for i in range(pca.n_components_)],  # component names
)

# Show the loadings of every component, one row per feature, sorted by PC1 weight (descending)
pca_components.T.sort_values(by="PC1", ascending=False)

In [None]:
# Loading thresholds: a feature is kept for a component when its weight is at
# least `upper_threshold` or at most `lower_threshold` (the bounds are asymmetric).
upper_threshold = 0.2
lower_threshold = -0.1

# Component name -> Series of the feature weights that pass the thresholds
loadings_by_feature = pca_components.T
relevant_fields = {}
for pc in pca_components.index:
    weights = loadings_by_feature[pc]
    keep_mask = (weights >= upper_threshold) | (weights <= lower_threshold)
    relevant_fields[pc] = weights[keep_mask]

# Interactive bar charts for the first three components, side by side
fig = make_subplots(
    rows=1,
    cols=3,
    subplot_titles=["Pesos do PCA1", "Pesos do PCA2", "Pesos do PCA3"],
)

bar_colors = {"PC1": "royalblue", "PC2": "orange", "PC3": "green"}
for col_number, (component, bar_color) in enumerate(bar_colors.items(), start=1):
    selected = relevant_fields[component]
    fig.add_trace(
        go.Bar(
            x=selected.index,  # relevant features
            y=selected.values,  # their weights
            name=component,
            marker_color=bar_color,
        ),
        row=1,
        col=col_number,
    )

# Rotate the feature names on every x axis so they stay readable
fig.update_layout(
    title="Pesos Relevantes dos Componentes Principais (PCA)",
    height=600,
    showlegend=False,
    xaxis=dict(tickangle=90),
    xaxis2=dict(tickangle=90),
    xaxis3=dict(tickangle=90),
    barmode="group",
)

# Render the figure
fig.show()

In [None]:
# Biplot of the first two principal components of the clustered feature table.
X = df_fields.drop(columns="cluster").values  # features
y = df_fields["cluster"].values  # cluster label of every row
feature_names = df_fields.columns.drop("cluster")  # one name per column of X

# Standardise the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Two-component PCA used only for this plot. It gets its own name so the
# multi-component `pca` model fitted earlier in the notebook is not overwritten.
pca_2d = PCA(n_components=2)
X_pca = pca_2d.fit_transform(X_scaled)


def plot_biplot(score, coeff, labels=None, colors=None, axis_titles=("PC1", "PC2")):
    """Draw a Plotly biplot: projected points plus one loading vector per feature.

    Args:
        score: 2D array; columns 0 and 1 are the x and y coordinates of the points.
        coeff: 2D array with one row per feature; columns 0 and 1 are that
            feature's loadings on the x and y components.
        labels: Optional sequence of feature names, indexable by row of `coeff`.
            Without it, features are named Var1, Var2, ...
        colors: Optional per-point values used to colour the markers
            (previously read from the global `y`).
        axis_titles: Titles of the x and y axes.
    """
    fig = go.Figure()

    # Projected data points
    marker = {}
    if colors is not None:
        marker = dict(color=colors, colorscale="Viridis", showscale=True)
    fig.add_trace(
        go.Scatter(
            x=score[:, 0],
            y=score[:, 1],
            mode="markers",
            marker=marker,
            name="PCA points",
        )
    )

    # One segment from the origin to each feature's loading, labelled at its tip
    for i in range(coeff.shape[0]):
        feature_label = labels[i] if labels is not None else f"Var{i + 1}"
        fig.add_trace(
            go.Scatter(
                x=[0, coeff[i, 0]],
                y=[0, coeff[i, 1]],
                mode="lines+text",
                line=dict(color="red", width=2),
                text=[None, feature_label],
                textposition="top center",
                name=f"Feature {labels[i]}" if labels is not None else f"Var{i + 1}",
            )
        )

    fig.update_layout(
        xaxis_title=axis_titles[0],
        yaxis_title=axis_titles[1],
        showlegend=True,
        template="plotly_dark",
    )
    fig.show()


# Points: PC1 vs PC2 scores. Vectors: each feature's PC1 / PC2 loadings.
plot_biplot(X_pca[:, 0:2], pca_2d.components_.T, labels=feature_names, colors=y)

In [None]:
# Biplot of the second and third principal components of the clustered feature table.
X = df_fields.drop(columns="cluster").values  # features
y = df_fields["cluster"].values  # cluster label of every row
feature_names = df_fields.columns.drop("cluster")  # one name per column of X

# Standardise the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Three-component PCA used only for this plot. It gets its own name so the
# multi-component `pca` model fitted earlier in the notebook is not overwritten.
pca_3d = PCA(n_components=3)
X_pca = pca_3d.fit_transform(X_scaled)


def plot_biplot(score, coeff, labels=None, colors=None, axis_titles=("PC2", "PC3")):
    """Draw a Plotly biplot: projected points plus one loading vector per feature.

    Args:
        score: 2D array; columns 0 and 1 are the x and y coordinates of the points.
        coeff: 2D array with one row per feature; columns 0 and 1 are that
            feature's loadings on the x and y components. It must describe the
            same two components as `score`.
        labels: Optional sequence of feature names, indexable by row of `coeff`.
            Without it, features are named Var1, Var2, ...
        colors: Optional per-point values used to colour the markers
            (previously read from the global `y`).
        axis_titles: Titles of the x and y axes.
    """
    fig = go.Figure()

    # Projected data points
    marker = {}
    if colors is not None:
        marker = dict(color=colors, colorscale="Viridis", showscale=True)
    fig.add_trace(
        go.Scatter(
            x=score[:, 0],
            y=score[:, 1],
            mode="markers",
            marker=marker,
            name="PCA points",
        )
    )

    # One segment from the origin to each feature's loading, labelled at its tip
    for i in range(coeff.shape[0]):
        feature_label = labels[i] if labels is not None else f"Var{i + 1}"
        fig.add_trace(
            go.Scatter(
                x=[0, coeff[i, 0]],
                y=[0, coeff[i, 1]],
                mode="lines+text",
                line=dict(color="red", width=2),
                text=[None, feature_label],
                textposition="top center",
                name=f"Feature {labels[i]}" if labels is not None else f"Var{i + 1}",
            )
        )

    fig.update_layout(
        xaxis_title=axis_titles[0],
        yaxis_title=axis_titles[1],
        showlegend=True,
        template="plotly_dark",
    )
    fig.show()


# Points: PC2 vs PC3 scores. The loadings are sliced to the same two components;
# passing the full matrix would draw the PC1 / PC2 loadings over PC2 / PC3 points.
plot_biplot(
    X_pca[:, 1:3],
    pca_3d.components_.T[:, 1:3],
    labels=feature_names,
    colors=y,
    axis_titles=("PC2", "PC3"),
)

In [None]:
# Mean of every feature within each cluster
df_cluster_summary = df_fields.groupby("cluster").mean()

# The summary's columns are the features to plot (everything except "cluster")
numeric_cols = df_cluster_summary.columns

# One interactive boxplot per feature, split and coloured by cluster
for col in numeric_cols:
    axis_labels = {"cluster": "Cluster", col: f"{col} Value"}
    fig = px.box(
        df_fields,
        x="cluster",
        y=col,
        title=f"Distribuição de {col} por Cluster",
        labels=axis_labels,
        color="cluster",
        boxmode="group",
    )
    fig.show()

In [None]:
# Transpose the per-cluster means so that each row is a feature and each column
# a cluster; the "cluster" row produced by reset_index is dropped.
df_cluster_summary_r = df_cluster_summary.reset_index()
df_cluster_summary_tr = df_cluster_summary_r.T.drop("cluster", axis = 0)
df_cluster_summary_tr

In [None]:
# Heatmap of the per-cluster feature means: one single-row heatmap per feature,
# each with its own colour range so features on different scales stay comparable.
df_heatmap = df_cluster_summary_tr
n_features = len(df_heatmap.index)

fig = make_subplots(
    rows=n_features,
    cols=1,
    shared_xaxes=True,
    vertical_spacing=0.02,
    subplot_titles=[],  # no subplot titles; the feature name is the y tick label
)

for row_number, feature in enumerate(df_heatmap.index, start=1):
    feature_means = df_heatmap.loc[feature]
    fig.add_trace(
        go.Heatmap(
            z=[feature_means.values],  # this feature's mean in every cluster
            x=df_heatmap.columns,  # cluster labels
            y=[feature],  # feature name
            colorscale="YlGnBu",
            colorbar=dict(title="Feature Value"),
            showscale=True,  # one colour bar per row
            zmin=feature_means.min(),  # colour range limited to this row
            zmax=feature_means.max(),
        ),
        row=row_number,
        col=1,
    )

fig.update_layout(
    height=1500,
    width=1500,
    showlegend=False,
)

# X-axis title on the last row only
fig.update_xaxes(
    title_text="Clusters",
    row=n_features,
    col=1,
)

# Y-axis title on the middle row
fig.update_yaxes(
    title_text="Features",
    row=n_features // 2,
    col=1,
)

# Render the figure
fig.show()