In [None]:
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

In [None]:
# Root data folder, resolved against the current working directory.
data_path = os.path.join(os.getcwd(), "data")
# Folder that is scanned for the per-seller "*_items.json" files.
seller_items_path = os.path.join(data_path, "input", "seller_items")

In [None]:
# Accumulates one dict per item, across every seller file.
data = []

# Walk the seller files in sorted order: os.listdir returns entries in arbitrary
# order, which would make the row order of the resulting frame vary between runs.
for filename in sorted(os.listdir(seller_items_path)):
    if not filename.endswith("_items.json"):  # Only seller item files
        continue
    file_path = os.path.join(seller_items_path, filename)

    # Each file holds one JSON document per line.
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            if not line.strip():
                # Blank lines (e.g. a trailing newline) carry no record; skip
                # them instead of reporting them as decode errors.
                continue
            try:
                item_data = json.loads(line)  # Parse the JSON line into a dict
                data.append(item_data)
            except json.JSONDecodeError as e:
                # Best effort: report the malformed line and keep loading.
                print(f"Erro ao ler {filename}: {e}")

In [None]:
# Build the item-level frame from the loaded records and derive, for every item,
# its unit price multiplied by its available quantity.
df = pd.DataFrame(data)
df["available_amount"] = df["price"].mul(df["available_qty"])

# Preview the first rows
display(df.head())

In [None]:
import numpy as np
import scipy.stats as stats

def _mode_or_none(values):
    """Return the first value reported by ``values.mode()``, or None when it is empty."""
    modes = values.mode()
    return None if modes.empty else modes.iloc[0]


# One row per seller. Named aggregation binds every output column name directly
# to its source column and function, so the labels cannot drift out of sync with
# the spec the way a separate, positional 30-name rename list can.
df_grouped = df.groupby("seller_id").agg(
    # --- Category mix ---
    unique_categories=("category_id", lambda x: list(set(x))),
    num_categories=("category_id", lambda x: len(set(x))),
    mode_category=("category_id", _mode_or_none),
    dominant_category_ratio=(
        "category_id",
        # Share of the seller's items that fall in their most common category
        lambda x: x.value_counts(normalize=True).max() if not x.empty else 0,
    ),
    # --- Price ---
    # "count" counts non-null prices; it is used as the item total downstream.
    total_items=("price", "count"),
    price_mean=("price", "mean"),
    price_std=("price", "std"),
    price_mode=("price", _mode_or_none),
    price_iqr=(
        "price",
        lambda x: np.percentile(x, 75) - np.percentile(x, 25),  # Interquartile range
    ),
    price_cv=(
        "price",
        # Coefficient of variation; 0 when the mean is not positive
        lambda x: np.std(x) / np.mean(x) if np.mean(x) > 0 else 0,
    ),
    original_price_mean=("original_price", "mean"),
    original_price_std=("original_price", "std"),
    # --- Discount ---
    discount_mean=("discount", "mean"),
    discount_std=("discount", "std"),
    discount_max=("discount", "max"),
    # --- Stock ---
    available_qty_sum=("available_qty", "sum"),
    available_qty_mean=("available_qty", "mean"),
    available_qty_std=("available_qty", "std"),
    # --- Listing flags (summed, turned into proportions later) ---
    cataloged_sum=("cataloged", "sum"),
    free_shipping_sum=("free_shipping", "sum"),
    # --- Listing characteristics ---
    condition_mode=("condition", _mode_or_none),
    installments_mean=("installments", "mean"),
    installments_std=("installments", "std"),
    buying_mode_mode=("buying_mode", _mode_or_none),
    # --- Location ---
    most_frequent_city=("city", _mode_or_none),
    most_frequent_state=("state", _mode_or_none),
    # --- Product detail ---
    has_gtin_sum=("has_gtin", "sum"),
    num_attributes_mean=("num_attributes", "mean"),
    num_attributes_std=("num_attributes", "std"),
    num_attributes_median=("num_attributes", "median"),
)

# Turn the seller_id index back into a regular column.
df_grouped = df_grouped.reset_index()

# Share of each seller's items that are cataloged, ship for free, or carry a GTIN:
# the per-seller flag sums divided by the per-seller item count.
df_grouped["proportion_cataloged"] = df_grouped["cataloged_sum"].div(
    df_grouped["total_items"]
)
df_grouped["proportion_free_shipping"] = df_grouped["free_shipping_sum"].div(
    df_grouped["total_items"]
)
df_grouped["proportion_has_gtin"] = df_grouped["has_gtin_sum"].div(
    df_grouped["total_items"]
)

# Per-seller counts of "new" items and of "buy_it_now" listings.
#
# df_grouped carries a 0..n-1 index at this point (seller_id is a regular column
# after reset_index), while a groupby result is indexed by seller_id. Assigning
# the groupby result directly would align on those mismatched indexes and yield
# NaN / wrong rows, so the counts are looked up through the seller_id column.
new_counts_by_seller = (df["condition"] == "new").groupby(df["seller_id"]).sum()
df_grouped["condition_new_count"] = df_grouped["seller_id"].map(new_counts_by_seller)

# Proportion of "new" items among each seller's items
df_grouped["proportion_new"] = (
    df_grouped["condition_new_count"] / df_grouped["total_items"]
)

buy_it_now_counts_by_seller = (
    (df["buying_mode"] == "buy_it_now").groupby(df["seller_id"]).sum()
)
df_grouped["buying_mode_buy_it_now_count"] = df_grouped["seller_id"].map(
    buy_it_now_counts_by_seller
)

# Proportion of "buy_it_now" listings among each seller's items
df_grouped["proportion_buy_it_now"] = (
    df_grouped["buying_mode_buy_it_now_count"] / df_grouped["total_items"]
)

# Average number of items per distinct category of each seller
df_grouped["items_per_category"] = df_grouped["total_items"].div(
    df_grouped["num_categories"]
)

# Relative frequency of every city across all items, then looked up for each
# seller's most frequent city
city_counts = df["city"].value_counts(normalize=True).to_dict()
df_grouped["city_frequency"] = df_grouped["most_frequent_city"].map(city_counts)

# Relative frequency of every category across all items, then looked up for each
# seller's most frequent category
category_counts = df["category_id"].value_counts(normalize=True).to_dict()
df_grouped["category_frequency"] = df_grouped["mode_category"].map(category_counts)

# Number of distinct cities per seller. The groupby result is re-indexed to
# 0..n-1 so that it is assigned to df_grouped row by row.
df_grouped["distinct_cities_count"] = (
    df.groupby("seller_id")["city"].nunique().reset_index(drop=True)
)

display(df_grouped.head())

In [None]:
# Select the per-seller columns used as clustering features. Commented-out names
# are columns of df_grouped that are deliberately left out of the selection.
# The order of this list defines the column order of df_fields.
df_fields = df_grouped[
    [
        # "unique_categories",
        "num_categories",
        # "mode_category",
        "category_frequency",
        "dominant_category_ratio",
        "total_items",
        "price_mean",
        "price_std",
        "price_mode",
        "price_iqr",
        "price_cv",
        "original_price_mean",
        "original_price_std",
        "discount_mean",
        "discount_std",
        "discount_max",
        # "available_qty_sum",
        "available_qty_mean",
        "available_qty_std",
        # "cataloged_sum",
        "proportion_cataloged",
        # "free_shipping_sum",
        "proportion_free_shipping",
        "condition_mode",
        "proportion_new",
        "installments_mean",
        "installments_std",
        "buying_mode_mode",
        "proportion_buy_it_now",
        "most_frequent_city",
        "distinct_cities_count",
        "proportion_has_gtin",
        "num_attributes_mean",
        "num_attributes_std",
        "num_attributes_median",
    ]
]
# Preview the selected features
display(df_fields.head())

In [None]:
# Numeric-dtype predicate used to decide which columns need encoding
def is_numeric(col):
    """Return whether ``col`` has a numeric dtype according to pandas."""
    dtype_check = pd.api.types.is_numeric_dtype
    return dtype_check(col)


# Frequency-encode every non-numeric column: each value is replaced by the share
# of rows (sellers) that carry that same value.
#
# df_fields was produced by selecting columns out of df_grouped, so take an
# independent copy first; writing into the selection directly can raise
# SettingWithCopyWarning.
df_fields = df_fields.copy()

for column in df_fields.columns:
    if is_numeric(df_fields[column]):
        continue
    freq_encoding = df_fields[column].value_counts(normalize=True).to_dict()
    # Plain column assignment replaces the column, so it takes the float dtype
    # of the mapped values. `.loc[:, column] = ...` writes into the existing
    # column in place, which on pandas >= 2.0 leaves it as object dtype.
    df_fields[column] = df_fields[column].map(freq_encoding)

display(df_fields.head())

In [None]:
# 🔹 Step 1: work on an independent copy of the selected features.
# Any "cluster" column left over from a previous run of the clustering cells is
# dropped first, so re-running this cell never scales and clusters on stale labels.
df_fields = df_fields.drop(columns="cluster", errors="ignore").copy()

# 🔹 Step 2: frequency encoding of categorical columns (disabled in this cell)
# categorical_cols = ["most_frequent_city"]
# for col in categorical_cols:
#     freq_map = df_fields[col].value_counts(normalize=True).to_dict()
#     df_fields[col] = df_fields[col].map(freq_map)

# Missing values (e.g. a std computed from a single item) are treated as 0.
df_fields = df_fields.fillna(0)

# # 🔹 Step 3: remove outliers with a Z-score filter (disabled)
# z_scores = np.abs(stats.zscore(df_fields))
# df_filtered = df_fields[
#     (z_scores < 3).all(axis=1)
# ]  # Keeps only rows within 3 standard deviations

# 🔹 Step 4: standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_fields)

# 🔹 Step 5: elbow method — record the K-Means inertia for k = 1 .. 15
inertia = []
K_range = range(1, 16)
for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X_scaled)
    inertia.append(kmeans.inertia_)

# 🔹 Step 6: plot inertia against k
plt.figure(figsize=(8, 6))
plt.plot(K_range, inertia, marker="o", linestyle="-")
plt.title("Método do Cotovelo")
plt.xlabel("Número de Clusters (k)")
plt.ylabel("Inertia")
plt.show()

In [None]:
# 🔹 Step 7: fit K-Means with the chosen number of clusters (k=8 here) and
# store each row's cluster label in df_fields
kmeans = KMeans(n_clusters=8, random_state=42)
df_fields["cluster"] = kmeans.fit_predict(X_scaled)

# 🔹 Step 8: preview the first sellers together with their cluster label
df_fields.head()

In [None]:
# 🔹 Step 1: project the scaled features onto three principal components
pca = PCA(n_components=3)
X_pca_3d = pca.fit_transform(X_scaled)

# 🔹 Step 2: standardize the principal components
scaler_pca = StandardScaler()
X_pca_3d_normalized = scaler_pca.fit_transform(X_pca_3d)

# 🔹 Step 3: put the components in a frame and attach the cluster labels
df_pca_normalized = pd.DataFrame(
    data=X_pca_3d_normalized, columns=["PC1", "PC2", "PC3"]
)
df_pca_normalized["cluster"] = df_fields["cluster"]

# Interactive 3D scatter, colored by cluster
fig = px.scatter_3d(
    data_frame=df_pca_normalized,
    x="PC1",
    y="PC2",
    z="PC3",
    color="cluster",
    color_continuous_scale="viridis",
    labels={
        "PC1": "Componente Principal 1 (Normalizado)",
        "PC2": "Componente Principal 2 (Normalizado)",
        "PC3": "Componente Principal 3 (Normalizado)",
    },
    title="Clusters de Vendedores (PCA 3D - Normalizado)",
)

# Render the figure
fig.show()

In [None]:
# Treat the 3 highest-scoring rows on each normalized principal component as
# outliers. argsort is ascending, so the order is reversed before taking 3.
top_indices_pca1 = np.argsort(X_pca_3d_normalized[:, 0])[::-1][:3]  # Top 3 on PC1
top_indices_pca2 = np.argsort(X_pca_3d_normalized[:, 1])[::-1][:3]  # Top 3 on PC2
top_indices_pca3 = np.argsort(X_pca_3d_normalized[:, 2])[::-1][:3]  # Top 3 on PC3

# Combine all top indices (a row may rank in the top 3 of more than one component)
top_indices = np.unique(
    np.concatenate([top_indices_pca1, top_indices_pca2, top_indices_pca3])
)

# Show the rows that are about to be removed *before* removing them: once the
# frame is filtered and re-indexed below, the values in top_indices point at
# different rows (or past the end of the frame).
display(df_fields.loc[top_indices])

# Remove the outlier rows and renumber the remaining ones.
# NOTE: this cell is not idempotent — every re-run removes further rows, because
# X_pca_3d_normalized still describes the frame as it was before the removal.
df_fields = df_fields.drop(top_indices).reset_index(drop=True)

In [None]:
# NOTE(review): df_fields has already been filtered and re-indexed by the
# preceding cell, so these positions no longer refer to the removed rows and may
# be out of bounds — confirm whether this display was meant to run before the drop.
display(df_fields.iloc[top_indices])

In [None]:
# Re-fit the scaler on the remaining rows, leaving the previous cluster labels out
X_scaled = scaler.fit_transform(df_fields.drop(columns="cluster"))

# 🔹 Step 5: elbow method — record the K-Means inertia for k = 1 .. 15
K_range = range(1, 16)
inertia = []
for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42).fit(X_scaled)
    inertia.append(kmeans.inertia_)

# 🔹 Step 6: plot inertia against k
plt.figure(figsize=(8, 6))
plt.plot(K_range, inertia, marker="o", linestyle="-")
plt.xlabel("Número de Clusters (k)")
plt.ylabel("Inertia")
plt.title("Método do Cotovelo")
plt.show()

In [None]:
# 🔹 Step 7: fit K-Means with the chosen number of clusters (k=5 here) and
# overwrite the cluster label stored in df_fields
kmeans = KMeans(n_clusters=5, random_state=42)
df_fields["cluster"] = kmeans.fit_predict(X_scaled)

# 🔹 Step 8: preview the first sellers together with their cluster label
df_fields.head()

In [None]:
# 🔹 Step 1: project the re-scaled features onto three principal components
pca = PCA(n_components=3)
X_pca_3d = pca.fit_transform(X_scaled)

# 🔹 Step 2: standardize the principal components
scaler_pca = StandardScaler()
X_pca_3d_normalized = scaler_pca.fit_transform(X_pca_3d)

# 🔹 Step 3: put the components in a frame and attach the cluster labels
df_pca_normalized = pd.DataFrame(
    data=X_pca_3d_normalized, columns=["PC1", "PC2", "PC3"]
)
df_pca_normalized["cluster"] = df_fields["cluster"]

# Interactive 3D scatter, colored by cluster
fig = px.scatter_3d(
    data_frame=df_pca_normalized,
    x="PC1",
    y="PC2",
    z="PC3",
    color="cluster",
    color_continuous_scale="viridis",
    labels={
        "PC1": "Componente Principal 1 (Normalizado)",
        "PC2": "Componente Principal 2 (Normalizado)",
        "PC3": "Componente Principal 3 (Normalizado)",
    },
    title="Clusters de Vendedores (PCA 3D - Normalizado)",
)

# Render the figure
fig.show()

In [None]:
# One interactive 3D scatter per cluster, each showing only that cluster's rows
for cluster in set(df_pca_normalized["cluster"]):
    cluster_rows = df_pca_normalized["cluster"] == cluster
    fig = px.scatter_3d(
        data_frame=df_pca_normalized[cluster_rows],
        x="PC1",
        y="PC2",
        z="PC3",
        color="cluster",
        color_continuous_scale="viridis",
        labels={
            "PC1": "Componente Principal 1 (Normalizado)",
            "PC2": "Componente Principal 2 (Normalizado)",
            "PC3": "Componente Principal 3 (Normalizado)",
        },
        title=f"Clusters de Vendedores (PCA 3D - Normalizado) - Cluster {cluster}",
    )

    # Render the figure
    fig.show()

In [None]:
# Loadings matrix: one row per principal component, one column per input feature
pca_components = pd.DataFrame(
    data=pca.components_,  # PCA coefficients
    index=[f"PC{i}" for i in range(1, pca.n_components_ + 1)],  # Component names
    columns=df_fields.drop(columns="cluster").columns,  # Original feature names
)

# Features as rows, ordered by their PC1 loading (largest first)
pca_components.transpose().sort_values(by="PC1", ascending=False)

In [None]:
# Inspect the shape of the fitted PCA's components array
pca.components_.shape

In [None]:
# Inspect the number of columns in df_fields (this count includes "cluster")
df_fields.columns.shape

In [None]:
# Inspect the column names of df_fields with the "cluster" label removed
df_fields.drop(columns="cluster").columns

In [None]:
# Mean of every column within each cluster (one row per cluster)
df_cluster_summary = df_fields.groupby("cluster").mean()
df_cluster_summary

In [None]:
# Transposed summary (features as rows, clusters as columns), largest values of
# the column labeled 0 first
df_cluster_summary.T.sort_values(
    by=0, ascending=False
)  # Sorted by the cluster 0 mean

In [None]:
# Columns to plot: every column present in the per-cluster summary
numeric_cols = df_cluster_summary.columns

# One boxplot per column, grouped by cluster. The notebook never imports
# seaborn, so the plots are drawn with pandas' matplotlib-backed boxplot rather
# than `sns.boxplot`, which would raise NameError.
for col in numeric_cols:
    plt.figure(figsize=(12, 6))
    df_fields.boxplot(column=col, by="cluster", ax=plt.gca())
    plt.suptitle("")  # Remove the automatic "Boxplot grouped by ..." header
    plt.title(f"Distribuição de {col} por Cluster")
    plt.xlabel("cluster")
    plt.ylabel(col)
    plt.show()

In [None]:
# Split df_fields into the feature matrix and the cluster labels
X = df_fields.drop(columns="cluster").to_numpy()  # Features
y = df_fields["cluster"].to_numpy()  # Cluster labels

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Reduce to 2 principal components for the biplot
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)


# Create a Plotly biplot
def plot_biplot(score, coeff, labels=None, colors=None):
    """Draw a PCA biplot with Plotly and show it.

    Parameters
    ----------
    score : 2-D array
        Projected points; column 0 is used as x and column 1 as y.
    coeff : 2-D array
        One row per feature; columns 0 and 1 give the end point of that
        feature's vector, which starts at the origin.
    labels : sequence, optional
        Feature names indexed like the rows of ``coeff``. When omitted the
        vectors are named ``Var1``, ``Var2``, ...
    colors : sequence, optional
        Per-point values used to color the markers. When omitted, the
        notebook-level variable ``y`` is used, which is what this function
        always did before the parameter existed.
    """
    xs = score[:, 0]
    ys = score[:, 1]
    n = coeff.shape[0]

    # Prefer explicitly passed colors; fall back to the global for old callers.
    marker_colors = y if colors is None else colors

    # Create the figure
    fig = go.Figure()

    # Scatter plot for the PCA scores (data points)
    fig.add_trace(
        go.Scatter(
            x=xs,
            y=ys,
            mode="markers",
            marker=dict(
                color=marker_colors, colorscale="Viridis", showscale=True
            ),  # Color by clusters
            name="PCA points",
        )
    )

    # One line per feature, from the origin to its (PC1, PC2) coefficients
    for i in range(n):
        fig.add_trace(
            go.Scatter(
                x=[0, coeff[i, 0]],  # Start from the origin
                y=[0, coeff[i, 1]],  # End at the component direction
                mode="lines+text",
                line=dict(color="red", width=2),
                text=[
                    None,  # No text at the origin end of the line
                    labels[i] if labels is not None else f"Var{i + 1}",
                ],  # Feature label
                textposition="top center",
                name=f"Feature {labels[i]}" if labels is not None else f"Var{i + 1}",
            )
        )

    # Layout settings
    fig.update_layout(
        title="PCA Biplot - First Two Principal Components",
        xaxis_title="PC1",
        yaxis_title="PC2",
        showlegend=True,
        template="plotly_dark",  # Dark theme (optional)
    )

    # Show the plot
    fig.show()


# Call the plotting function. The labels are taken from the same frame the
# feature matrix was built from (df_fields without "cluster"), so there is
# exactly one label per loading row regardless of where "cluster" sits.
plot_biplot(
    X_pca[:, 0:2],
    pca.components_.T,
    labels=df_fields.drop(columns="cluster").columns,
)  # Feature names as labels