In [3]:

# Punto 1: Descargar el archivo CSV desde el enlace proporcionado

import requests
import pandas as pd
import numpy as np
from scipy import stats
import plotly.express as px
import seaborn as sns


In [None]:

url = 'https://github.com/jdramirez/UCO_ML_AI/raw/bf4036cf62b1b620a1ab1dade46361941ee1b8f0/src/notebook_class/supervisado/data/amazon_product.csv'
response = requests.get(url)

with open('../datos-estadistica/amazon_product.csv', 'wb') as file:
    file.write(response.content)

print("Archivo descargado y guardado como 'amazon_product.csv'")

In [4]:
# Punto 2: Leer el archivo CSV y convertirlo a formato XLSX


# Leer el archivo CSV
df = pd.read_csv('../datos-estadistica/amazon_product.csv')

# Guardar el archivo como XLSX
df.to_excel('../datos-estadistica/amazon_product.xlsx', index=False)

print("Archivo convertido a formato 'amazon_product.xlsx'")


Archivo convertido a formato 'amazon_product.xlsx'


In [5]:
# Función para limpiar y convertir columnas que contienen símbolos monetarios
def clean_and_convert_to_numeric(column):
    return pd.to_numeric(column.replace('[\$,]', '', regex=True), errors='coerce')

# Limpiar y convertir columnas que contienen valores monetarios
for column in ['product_price', 'product_original_price', 'product_minimum_offer_price', 'unit_price']:
    if column in df.columns:
        df[column] = clean_and_convert_to_numeric(df[column])

# Asegúrate de que 'sales_volume' también sea numérico
df['sales_volume'] = pd.to_numeric(df['sales_volume'].replace('[\$,]', '', regex=True), errors='coerce')
# Mostrar las primeras filas para verificar
print(df.head())

   Unnamed: 0        asin                                      product_title  \
0           0  B0BQ118F2T  Moto G Play 2023 3-Day Battery Unlocked Made f...   
1           1  B0CTD47P22  SAMSUNG Galaxy A15 5G (SM-156M/DSN), 128GB 6GB...   
2           2  B0CHH6X6H2  Total by Verizon | Samsung Galaxy A03s | Locke...   
3           3  B0BZ9XNBRB  Google Pixel 7a - Unlocked Android Cell Phone ...   
4           4  B0CN1QSH8Q  SAMSUNG Galaxy A15 5G A Series Cell Phone, 128...   

   product_price  product_original_price currency  product_star_rating  \
0          99.99                  169.99      USD                  4.0   
1         149.74                  158.00      USD                  4.2   
2          49.88                     NaN      USD                  3.9   
3         335.00                  499.00      USD                  4.3   
4         199.99                     NaN      USD                  4.1   

   product_num_ratings                           product_url  \
0         

In [17]:
# Leer el archivo CSV
df = pd.read_csv('../datos-estadistica/amazon_product.csv')
df = df.drop(columns=['Unnamed: 0'])
# Elimina el signo de dólar y convierte a tipo numérico
df['product_price'] = df['product_price'].replace('[\$,]', '', regex=True).astype(float)
df['product_original_price'] = df['product_original_price'].replace('[\$,]', '', regex=True).astype(float)
df['product_minimum_offer_price'] = df['product_minimum_offer_price'].replace('[\$,]', '', regex=True).astype(float)
print(df.head())

         asin                                      product_title  \
0  B0BQ118F2T  Moto G Play 2023 3-Day Battery Unlocked Made f...   
1  B0CTD47P22  SAMSUNG Galaxy A15 5G (SM-156M/DSN), 128GB 6GB...   
2  B0CHH6X6H2  Total by Verizon | Samsung Galaxy A03s | Locke...   
3  B0BZ9XNBRB  Google Pixel 7a - Unlocked Android Cell Phone ...   
4  B0CN1QSH8Q  SAMSUNG Galaxy A15 5G A Series Cell Phone, 128...   

   product_price  product_original_price currency  product_star_rating  \
0          99.99                  169.99      USD                  4.0   
1         149.74                  158.00      USD                  4.2   
2          49.88                     NaN      USD                  3.9   
3         335.00                  499.00      USD                  4.3   
4         199.99                     NaN      USD                  4.1   

   product_num_ratings                           product_url  \
0                 2929  https://www.amazon.com/dp/B0BQ118F2T   
1                 

In [26]:


categorical_columns = df.select_dtypes(include=[np.object_]).columns

# Crear gráficos de barras para cada columna categórica
for column in categorical_columns:
    # Verificar si la columna está en el DataFrame
    if column in df.columns:
        # Contar la frecuencia de cada categoría
        freq = df[column].value_counts()
        
        # Crear un DataFrame para Plotly
        freq_df = pd.DataFrame({
            column: freq.index,
            'Count': freq.values
        })
        
        # Crear gráfico de barras
        fig = px.bar(freq_df, x=column, y='Count', title=f'Distribución de {column}', text='Count')
        
        # Mostrar gráfico
        fig.show()
    else:
        print(f"La columna '{column}' no está en el DataFrame.")

In [29]:


# Identificar variables numéricas

numerical_vars = df.select_dtypes(include=[np.number]).columns

# Crear histogramas para variables numéricas
for var in numerical_vars:
    fig = px.histogram(df, x=var, title=f'Histograma de {var}')
    fig.show()
    
    # Identificar outliers (más de 5 desviaciones estándar)
    mean = df[var].mean()
    std = df[var].std()
    outliers = df[np.abs(df[var] - mean) > 5 * std]
    print(outliers)

    # Test de normalidad
    stat, p_value = stats.shapiro(df[var].dropna())
    print(f'Test de Shapiro-Wilk para {var}: Estadístico={stat}, p-valor={p_value}')


Empty DataFrame
Columns: [Unnamed: 0, asin, product_title, product_price, product_original_price, currency, product_star_rating, product_num_ratings, product_url, product_photo, product_num_offers, product_minimum_offer_price, is_best_seller, is_amazon_choice, is_prime, climate_pledge_friendly, sales_volume, delivery, has_variations, product_availability, unit_price, unit_count]
Index: []

[0 rows x 22 columns]
Test de Shapiro-Wilk para Unnamed: 0: Estadístico=0.955129086971283, p-valor=0.020617350935935974


Empty DataFrame
Columns: [Unnamed: 0, asin, product_title, product_price, product_original_price, currency, product_star_rating, product_num_ratings, product_url, product_photo, product_num_offers, product_minimum_offer_price, is_best_seller, is_amazon_choice, is_prime, climate_pledge_friendly, sales_volume, delivery, has_variations, product_availability, unit_price, unit_count]
Index: []

[0 rows x 22 columns]
Test de Shapiro-Wilk para product_price: Estadístico=0.6241337060928345, p-valor=1.5730359723131748e-11


Empty DataFrame
Columns: [Unnamed: 0, asin, product_title, product_price, product_original_price, currency, product_star_rating, product_num_ratings, product_url, product_photo, product_num_offers, product_minimum_offer_price, is_best_seller, is_amazon_choice, is_prime, climate_pledge_friendly, sales_volume, delivery, has_variations, product_availability, unit_price, unit_count]
Index: []

[0 rows x 22 columns]
Test de Shapiro-Wilk para product_original_price: Estadístico=0.7093768119812012, p-valor=5.2496334319585e-06


Empty DataFrame
Columns: [Unnamed: 0, asin, product_title, product_price, product_original_price, currency, product_star_rating, product_num_ratings, product_url, product_photo, product_num_offers, product_minimum_offer_price, is_best_seller, is_amazon_choice, is_prime, climate_pledge_friendly, sales_volume, delivery, has_variations, product_availability, unit_price, unit_count]
Index: []

[0 rows x 22 columns]
Test de Shapiro-Wilk para product_star_rating: Estadístico=0.7836762070655823, p-valor=1.6984222384053282e-07


    Unnamed: 0        asin product_title  product_price  \
55          55  B07Y8SJGCV       Disney+            0.0   

    product_original_price currency  product_star_rating  product_num_ratings  \
55                     NaN      USD                  4.4              1015448   

                             product_url  \
55  https://www.amazon.com/dp/B07Y8SJGCV   

                                        product_photo  ...  is_best_seller  \
55  https://m.media-amazon.com/images/I/719t3jd2Ne...  ...           False   

   is_amazon_choice  is_prime  climate_pledge_friendly  sales_volume  \
55            False     False                    False          11.0   

                                     delivery  has_variations  \
55  Available instantly on compatible devices           False   

   product_availability  unit_price unit_count  
55                  NaN         NaN        NaN  

[1 rows x 22 columns]
Test de Shapiro-Wilk para product_num_ratings: Estadístico=0.37461602687835

Empty DataFrame
Columns: [Unnamed: 0, asin, product_title, product_price, product_original_price, currency, product_star_rating, product_num_ratings, product_url, product_photo, product_num_offers, product_minimum_offer_price, is_best_seller, is_amazon_choice, is_prime, climate_pledge_friendly, sales_volume, delivery, has_variations, product_availability, unit_price, unit_count]
Index: []

[0 rows x 22 columns]
Test de Shapiro-Wilk para product_num_offers: Estadístico=0.5428234338760376, p-valor=8.132128867868738e-13


    Unnamed: 0        asin                         product_title  \
54          54  B0CTWDLJ99  Mission: Impossible - Dead Reckoning   

    product_price  product_original_price currency  product_star_rating  \
54           5.99                     NaN      USD                  4.7   

    product_num_ratings                           product_url  \
54                 2440  https://www.amazon.com/dp/B0CTWDLJ99   

                                        product_photo  ...  is_best_seller  \
54  https://m.media-amazon.com/images/I/91r5EG+v1s...  ...           False   

   is_amazon_choice  is_prime  climate_pledge_friendly  sales_volume  \
54            False     False                    False        2023.0   

    delivery  has_variations product_availability  unit_price unit_count  
54       NaN           False                  NaN         NaN        NaN  

[1 rows x 22 columns]
Test de Shapiro-Wilk para sales_volume: Estadístico=0.4143970012664795, p-valor=1.932558713751975e-12


Empty DataFrame
Columns: [Unnamed: 0, asin, product_title, product_price, product_original_price, currency, product_star_rating, product_num_ratings, product_url, product_photo, product_num_offers, product_minimum_offer_price, is_best_seller, is_amazon_choice, is_prime, climate_pledge_friendly, sales_volume, delivery, has_variations, product_availability, unit_price, unit_count]
Index: []

[0 rows x 22 columns]
Test de Shapiro-Wilk para unit_count: Estadístico=0.6856882572174072, p-valor=0.006743819452822208


In [20]:
# Crear boxplots para variables categóricas
categorical_vars = ['is_best_seller', 'is_amazon_choice', 'is_prime', 'climate_pledge_friendly', 'delivery', 'has_variations', 'product_availability']

for var in categorical_vars:
    fig = px.box(df, x=var, y='sales_volume', title=f'Boxplot de {var} vs. Sales Volume')
    fig.show()

In [31]:
# Crear scatter plots para variables numéricas
numerical_vars = df.select_dtypes(include=[float, int]).columns

for var in numerical_vars:
    if var != 'sales_volume':  # Evitar graficar 'sales_volume' contra sí mismo
        fig = px.scatter(df, x=var, y='sales_volume', title=f'{var} vs. Sales Volume')
        fig.show()

In [34]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px

# Cargar el dataset
df = pd.read_csv('../datos-estadistica/amazon_product.csv')

# Convertir 'sales_volume' a numérico
df['sales_volume'] = df['sales_volume'].str.extract('(\d+)').astype(float)

# Convertir 'product_price' y 'product_original_price' a numérico
df['product_price'] = df['product_price'].str.replace('$', '').astype(float)
df['product_original_price'] = df['product_original_price'].str.replace('$', '').astype(float)

# Seleccionar las variables numéricas relevantes
numeric_vars = ['product_price', 'product_original_price', 'product_star_rating', 'product_num_ratings', 'sales_volume']
numeric_data = df[numeric_vars].dropna()

# Calcular la matriz de correlación para las variables numéricas
corr_matrix = numeric_data.corr()

# Crear un mapa de calor para la matriz de correlación
fig = go.Figure(data=go.Heatmap(
                z=corr_matrix.values,
                x=corr_matrix.columns,
                y=corr_matrix.columns,
                colorscale='RdBu',
                zmin=-1, zmax=1,
                text=corr_matrix.values,
                texttemplate='%{text:.2f}',
                textfont={"size":10},
                hoverongaps=False))

fig.update_layout(title='Matriz de Correlación (Variables Numéricas)', width=800, height=600)
fig.show()

print("Correlaciones con 'sales_volume':")
print(corr_matrix['sales_volume'].sort_values(ascending=False))

# Verificar cuántos valores únicos tiene cada variable categórica
categorical_columns = df.select_dtypes(include=['object']).columns
categorical_columns = categorical_columns.difference(['sales_volume', 'product_price', 'product_original_price'])

print("Valores únicos por cada columna categórica:")
for col in categorical_columns:
    print(f"{col}: {df[col].nunique()} valores únicos")

# Convertir variables categóricas en dummies automáticamente (solo si tienen más de un valor único)
df_dummies = pd.get_dummies(df, columns=[col for col in categorical_columns if df[col].nunique() > 1], drop_first=True)

# Asegurarse de que 'sales_volume' esté presente después de crear las dummies
if 'sales_volume' not in df_dummies.columns:
    df_dummies['sales_volume'] = df['sales_volume']

# Combinar variables numéricas y dummies
all_vars = numeric_vars + list(df_dummies.columns.difference(numeric_vars))
all_data = df_dummies[all_vars].dropna()

# Verificar si hay columnas constantes
constant_columns = [col for col in all_data.columns if all_data[col].nunique() <= 1]
if constant_columns:
    print("Columnas constantes (sin variabilidad):")
    print(constant_columns)
    all_data = all_data.drop(columns=constant_columns)

# Asegurarse de que 'sales_volume' esté en los datos después de eliminar columnas constantes
if 'sales_volume' not in all_data.columns:
    raise KeyError("'sales_volume' ha sido eliminada accidentalmente.")

# Calcular la nueva matriz de correlación
new_corr_matrix = all_data.corr()

# Crear un nuevo mapa de calor para la matriz de correlación con variables dummy
fig = go.Figure(data=go.Heatmap(
                z=new_corr_matrix.values,
                x=new_corr_matrix.columns,
                y=new_corr_matrix.columns,
                colorscale='RdBu',
                zmin=-1, zmax=1,
                text=new_corr_matrix.values,
                texttemplate='%{text:.2f}',
                textfont={"size":10},
                hoverongaps=False))

fig.update_layout(title='Nueva Matriz de Correlación (incluyendo Variables Dummy)', width=1000, height=800)
fig.show()

print("\nNuevas correlaciones con 'sales_volume':")
print(new_corr_matrix['sales_volume'].sort_values(ascending=False))

# Crear un gráfico de barras para visualizar las correlaciones con 'sales_volume'
correlations = new_corr_matrix['sales_volume'].sort_values(ascending=False)
fig = px.bar(x=correlations.index, y=correlations.values,
             labels={'x': 'Variables', 'y': 'Correlación con sales_volume'},
             title='Correlaciones con sales_volume')
fig.update_layout(xaxis_tickangle=-45, width=1000, height=600)
fig.show()

# Obtener la variable categórica con mayor correlación con 'sales_volume'
categorical_corr = new_corr_matrix['sales_volume'].drop(numeric_vars).sort_values(ascending=False)
max_corr_variable = categorical_corr.idxmax()
max_corr_value = categorical_corr.max()

print(f"\nLa variable categórica con mayor correlación es '{max_corr_variable}' con un valor de {max_corr_value:.4f}")



The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.



Correlaciones con 'sales_volume':
sales_volume              1.000000
product_num_ratings       0.352812
product_star_rating       0.088878
product_original_price   -0.279153
product_price            -0.307284
Name: sales_volume, dtype: float64
Valores únicos por cada columna categórica:
asin: 64 valores únicos
currency: 1 valores únicos
delivery: 23 valores únicos
product_availability: 1 valores únicos
product_minimum_offer_price: 49 valores únicos
product_photo: 64 valores únicos
product_title: 64 valores únicos
product_url: 64 valores únicos
unit_price: 5 valores únicos
Columnas constantes (sin variabilidad):
['product_price', 'product_original_price', 'product_star_rating', 'product_num_ratings', 'sales_volume', 'Unnamed: 0', 'asin_052802759X', 'asin_1368092209', 'asin_1649374186', 'asin_B005ZXWMUS', 'asin_B00FRSYS12', 'asin_B00XZFCVK4', 'asin_B0117U0G3M', 'asin_B01GGU0XWC', 'asin_B07RFSSYBH', 'asin_B07Y8SJGCV', 'asin_B086QB7WZ1', 'asin_B08BG9MPT9', 'asin_B09SM24S8C', 'asin_B09Z7J4L

KeyError: "'sales_volume' ha sido eliminada accidentalmente."