In [None]:
# Import the required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from scipy import stats
from sklearn.preprocessing import LabelEncoder
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Plot appearance: matplotlib's seaborn-flavoured style sheet plus the "husl" palette.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings from the libraries in use.
warnings.filterwarnings('ignore')

# Lift pandas' column-count and display-width limits so wide frames are shown in full.
for option_name in ('display.max_columns', 'display.width'):
    pd.set_option(option_name, None)

# Confirmation messages, one per line.
print(
    "‚úÖ Bibliotecas importadas correctamente",
    "üìä Configuraci√≥n de visualizaci√≥n establecida",
    sep="\n",
)


In [None]:
# Read the raw retail transactions; the path is relative to the notebook's folder.
df = pd.read_csv('../retail_sales_dataset.csv')

row_count, column_count = df.shape
section_rule = "=" * 50

print("‚úÖ Dataset cargado exitosamente")
print(f"üìä Dimensiones del dataset: {row_count} filas x {column_count} columnas")
print("\n" + section_rule)
print("üìã INFORMACI√ìN B√ÅSICA DEL DATASET")
print(section_rule)

# Headline figures: number of transactions, total revenue and span of the Date column.
# NOTE(review): no parse_dates is passed to read_csv, so 'Date' is presumably still
# text here and min()/max() would compare strings rather than dates - confirm the format.
total_revenue = df['Total Amount'].sum()
date_column = df['Date']
print(f"üî¢ N√∫mero total de transacciones: {row_count:,}")
print(f"üí∞ Valor total de ventas: ${total_revenue:,.2f}")
print(f"üìÖ Per√≠odo de datos: {date_column.min()} a {date_column.max()}")

# Preview the first rows; the trailing expression is rendered as the cell output.
print("\nüìã Primeras 5 filas del dataset:")
df.head()


In [None]:
# Column-level overview of the raw frame.
print("üìä INFORMACI√ìN DETALLADA DE LAS COLUMNAS")
print("="*50)
# DataFrame.info() writes its report to stdout itself and returns None, so it must
# not be wrapped in print(); doing so appended a stray "None" line to the output.
df.info()

# Summary statistics as produced by DataFrame.describe().
print("\nüìà ESTAD√çSTICAS DESCRIPTIVAS")
print("="*50)
print(df.describe())


In [None]:
# Data-quality report: missing values per column, then frequencies of the text columns.
print("üîç AN√ÅLISIS DE CALIDAD DE DATOS", "=" * 50, sep="\n")

missing_values = df.isnull().sum()
print("üìã Valores nulos por columna:")
print(missing_values)

total_missing = missing_values.sum()
print(f"\n‚úÖ Total de valores nulos: {total_missing}")

# Every object-dtype column is treated as categorical and listed with its value counts.
print("\nüìä Distribuci√≥n de variables categ√≥ricas:")
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    column_values = df[col]
    print(f"\n{col}:")
    print(column_values.value_counts())
    print(f"Valores √∫nicos: {column_values.nunique()}")


In [None]:
# Work on a copy so the raw frame `df` stays untouched.
df_work = df.copy()

# Parse the Date column into datetimes so the .dt accessor becomes available.
df_work['Date'] = pd.to_datetime(df_work['Date'])

# Calendar features derived from the transaction date.
df_work['Year'] = df_work['Date'].dt.year
df_work['Month'] = df_work['Date'].dt.month
# month_name()/day_name() return English names by default, independent of the process
# locale. strftime('%B') / strftime('%A') follow the locale instead, which would turn
# the later reindexing by hard-coded English month and day names into all-NaN results.
df_work['Month_Name'] = df_work['Date'].dt.month_name()
df_work['Day'] = df_work['Date'].dt.day
df_work['DayOfWeek'] = df_work['Date'].dt.dayofweek
df_work['DayName'] = df_work['Date'].dt.day_name()
df_work['Quarter'] = df_work['Date'].dt.quarter

# Sales classification (based on your earlier work)
def clasificador_ventas(amount):
    """Bucket a sale amount into a tier label.

    Returns 'Alta' for amounts of 1000 or more, 'Media' for amounts of 300 or
    more (below 1000), and 'Baja' for anything that meets neither threshold.
    """
    # Thresholds are checked from highest to lowest; the first one met wins.
    for threshold, label in ((1000, 'Alta'), (300, 'Media')):
        if amount >= threshold:
            return label
    return 'Baja'

# Label every transaction with its sales tier, element by element.
df_work['Sales_Category'] = df_work['Total Amount'].map(clasificador_ventas)

# Age classification (based on your earlier work)
def clasificador_edad(age):
    """Map an age to its group label.

    50 and above is 'Adulto Mayor', 30 up to (but not including) 50 is
    'Adulto', and anything else is 'Joven'.
    """
    if age >= 50:
        return 'Adulto Mayor'
    if 30 <= age < 50:
        return 'Adulto'
    return 'Joven'

# Label every transaction with the customer's age group.
df_work['Age_Group'] = df_work['Age'].apply(clasificador_edad)

# Min-max normalisation of the sale amount: (x - min) / (max - min).
min_sales = df_work['Total Amount'].min()
max_sales = df_work['Total Amount'].max()
sales_range = max_sales - min_sales
if sales_range == 0:
    # All amounts are identical: dividing by the zero range would fill the column
    # with NaN, so the constant column is mapped to 0.0 instead.
    df_work['Total_Amount_Normalized'] = 0.0
else:
    df_work['Total_Amount_Normalized'] = (df_work['Total Amount'] - min_sales) / sales_range

# Report how many columns were added on top of the raw frame.
print("‚úÖ Feature Engineering completado")
print(f"üìä Nuevas caracter√≠sticas creadas: {len(df_work.columns) - len(df.columns)}")
print(f"üìã Columnas totales: {len(df_work.columns)}")

# List the added columns alphabetically.
print("\nüÜï Nuevas caracter√≠sticas creadas:")
new_features = set(df_work.columns) - set(df.columns)
for feature in sorted(new_features):
    print(f"   ‚Ä¢ {feature}")

# Trailing expression: rendered as the cell output.
df_work.head()


In [None]:
# Gather the numeric columns that feed the correlation analysis.
numeric_cols = list(df_work.select_dtypes(include=[np.number]).columns)
print(f"üìä Columnas num√©ricas para an√°lisis de correlaci√≥n: {len(numeric_cols)}")
print(f"üìã Variables: {', '.join(numeric_cols)}")

# Pairwise correlations between the numeric columns (DataFrame.corr defaults).
correlation_matrix = df_work[numeric_cols].corr()

# Annotated heat map of the full matrix.
heatmap_options = dict(
    annot=True,
    cmap='RdYlBu_r',
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": .8},
    fmt='.2f',
)
plt.figure(figsize=(14, 10))
sns.heatmap(correlation_matrix, **heatmap_options)

plt.title('üî• MAPA DE CALOR - MATRIZ DE CORRELACI√ìN\nüìä An√°lisis de Relaciones entre Variables Num√©ricas',
          fontsize=16, fontweight='bold', pad=20)
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

# Rank the variable pairs by the strength of their correlation.
print("\nüîç CORRELACIONES M√ÅS SIGNIFICATIVAS")
print("="*50)

# Blank out the diagonal and the upper triangle so each pair is listed only once.
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
correlation_matrix_masked = correlation_matrix.mask(mask)

# Long format with the masked entries dropped, then ordered by absolute value.
corr_pairs = correlation_matrix_masked.unstack().dropna()
corr_pairs_abs = corr_pairs.abs().sort_values(ascending=False)

print("üí™ Top 10 correlaciones m√°s fuertes:")
for pair in corr_pairs_abs.index[:10]:
    var1, var2 = pair
    # Look the signed value back up: the ranking itself only holds magnitudes.
    print(f"   üìà {var1} ‚Üî {var2}: {corr_pairs.loc[pair]:.3f}")

# Correlations of every numeric variable with 'Total Amount', strongest first.
print("\nüí∞ CORRELACIONES CON 'Total Amount':")
print("="*40)
signed_total_amount_corr = correlation_matrix['Total Amount']
total_amount_corr = signed_total_amount_corr.abs().sort_values(ascending=False)
for var in total_amount_corr.index:
    if var == 'Total Amount':
        continue  # skip the variable's correlation with itself
    print(f"   üìä {var}: {signed_total_amount_corr[var]:.3f}")


In [None]:
# Function to detect outliers using the IQR rule
def detect_outliers_iqr(data, column, multiplier=1.5):
    """Return the rows of ``data`` whose ``column`` value lies outside the IQR fences.

    The fences are ``Q1 - multiplier * IQR`` and ``Q3 + multiplier * IQR``, where
    Q1 and Q3 are the 0.25 and 0.75 quantiles of the column and IQR = Q3 - Q1.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the observations.
    column : str
        Name of the column to inspect.
    multiplier : float, default 1.5
        Number of IQRs the fences sit beyond Q1 and Q3. The default reproduces
        the previously hard-coded value.

    Returns
    -------
    tuple
        ``(outliers, lower_bound, upper_bound)``: the subset of ``data`` strictly
        below the lower fence or strictly above the upper fence, and both fences.

    Raises
    ------
    ValueError
        If ``multiplier`` is negative.
    """
    if multiplier < 0:
        raise ValueError("multiplier must be non-negative")

    values = data[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - multiplier * iqr
    upper_bound = q3 + multiplier * iqr

    outliers = data[(values < lower_bound) | (values > upper_bound)]
    return outliers, lower_bound, upper_bound

# Run the IQR outlier check on the key numeric variables.
outlier_vars = ['Total Amount', 'Age', 'Quantity', 'Price per Unit']

print("üîç AN√ÅLISIS DE OUTLIERS")
print("="*50)

outlier_summary = {}
total_rows = len(df_work)

for var in outlier_vars:
    outliers, lower, upper = detect_outliers_iqr(df_work, var)
    n_outliers = len(outliers)
    # Keep count, share and fences per variable for the plot titles below.
    outlier_summary[var] = dict(
        count=n_outliers,
        percentage=(n_outliers / total_rows) * 100,
        lower_bound=lower,
        upper_bound=upper,
    )

    print(f"\nüìä {var}:")
    print(f"   üéØ Outliers detectados: {n_outliers} ({outlier_summary[var]['percentage']:.2f}%)")
    print(f"   üìâ L√≠mite inferior: {lower:.2f}")
    print(f"   üìà L√≠mite superior: {upper:.2f}")

# One boxplot per variable on a 2x2 grid, filled row by row.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('üìä AN√ÅLISIS DE OUTLIERS - BOXPLOTS\nüéØ Detecci√≥n de Valores At√≠picos',
             fontsize=16, fontweight='bold', y=0.98)

for ax, var in zip(axes.flat, outlier_vars):
    series = df_work[var]
    summary = outlier_summary[var]

    ax.boxplot(series, patch_artist=True,
               boxprops=dict(facecolor='lightblue', alpha=0.7),
               medianprops=dict(color='red', linewidth=2))

    ax.set_title(f'{var}\nüéØ Outliers: {summary["count"]} ({summary["percentage"]:.1f}%)',
                 fontweight='bold')
    ax.set_ylabel('Valor')
    ax.grid(True, alpha=0.3)

    # Label the quartiles and the median to the right of the box.
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    median = series.median()
    markers = (
        (f'Q1: {q1:.1f}', q1, 'blue'),
        (f'Mediana: {median:.1f}', median, 'red'),
        (f'Q3: {q3:.1f}', q3, 'blue'),
    )
    for label, level, arrow_color in markers:
        ax.annotate(label, xy=(1.1, level), xytext=(1.2, level),
                    arrowprops=dict(arrowstyle='->', color=arrow_color, alpha=0.7))

plt.tight_layout()
plt.show()

# Repeat the 'Total Amount' outlier check separately within each product category.
print("\nüè∑Ô∏è OUTLIERS POR CATEGOR√çA DE PRODUCTO:")
print("="*50)

for category in df_work['Product Category'].unique():
    in_category = df_work['Product Category'] == category
    category_data = df_work[in_category]
    outliers = detect_outliers_iqr(category_data, 'Total Amount')[0]
    share = len(outliers) / len(category_data) * 100
    print(f"   üì¶ {category}: {len(outliers)} outliers ({share:.1f}%)")


In [None]:
# 1. Sales analysis by product category: a 2x2 dashboard with totals, distributions,
#    the monthly trend and a category-by-weekday heat map.
fig, axes = plt.subplots(2, 2, figsize=(18, 14))
fig.suptitle('üìä AN√ÅLISIS COMPLETO POR CATEGOR√çA DE PRODUCTO\nüí∞ Ventas, Distribuci√≥n y Patrones Temporales', 
             fontsize=18, fontweight='bold', y=0.98)

# Aggregate once; the groupby index also fixes the category order used by BOTH the
# bar chart and the violin plot, so a category keeps the same position and colour.
category_sales = df_work.groupby('Product Category')['Total Amount'].agg(['sum', 'mean', 'count'])
categories = list(category_sales.index)
colors = ['#FF6B6B', '#4ECDC4', '#45B7D1']

# Subplot 1: total sales per category.
bars1 = axes[0,0].bar(category_sales.index, category_sales['sum'], 
                      color=colors, alpha=0.8)
axes[0,0].set_title('üí∞ Ventas Totales por Categor√≠a', fontweight='bold', fontsize=14)
axes[0,0].set_ylabel('Ventas Totales ($)')
axes[0,0].grid(True, alpha=0.3)

# Print the total above each bar.
for bar, value in zip(bars1, category_sales['sum']):
    axes[0,0].annotate(f'${value:,.0f}', 
                       xy=(bar.get_x() + bar.get_width()/2, bar.get_height()),
                       xytext=(0, 5), textcoords='offset points',
                       ha='center', va='bottom', fontweight='bold')

# Subplot 2: distribution of sale amounts per category (violin plot).
violin_parts = axes[0,1].violinplot(
    [df_work.loc[df_work['Product Category'] == cat, 'Total Amount'] for cat in categories],
    positions=range(len(categories)))
axes[0,1].set_title('üéª Distribuci√≥n de Ventas por Categor√≠a', fontweight='bold', fontsize=14)
axes[0,1].set_ylabel('Monto de Venta ($)')
axes[0,1].set_xticks(range(len(categories)))
axes[0,1].set_xticklabels(categories)
axes[0,1].grid(True, alpha=0.3)

# Colour the violins with the same palette as the bars.
for pc, color in zip(violin_parts['bodies'], colors):
    pc.set_facecolor(color)
    pc.set_alpha(0.7)

# Subplot 3: average sale per calendar month, in calendar order.
month_order = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December'
]
monthly_sales = df_work.groupby('Month_Name')['Total Amount'].mean().reindex(month_order)
axes[1,0].plot(range(len(monthly_sales)), monthly_sales.values, 
               marker='o', linewidth=3, markersize=8, color='#E74C3C')
axes[1,0].set_title('üìà Tendencia de Ventas Promedio por Mes', fontweight='bold', fontsize=14)
axes[1,0].set_ylabel('Venta Promedio ($)')
axes[1,0].set_xlabel('Mes')
axes[1,0].set_xticks(range(len(monthly_sales)))
axes[1,0].set_xticklabels([month[:3] for month in monthly_sales.index], rotation=45)
axes[1,0].grid(True, alpha=0.3)

# Annotate the months with the highest AND the lowest average sale.
# reindex() leaves NaN for months without data, so the NaN-aware argmax/argmin are
# used (plain argmax would point at a NaN); skip entirely if no month has data.
if monthly_sales.notna().any():
    max_month_idx = int(np.nanargmax(monthly_sales.values))
    min_month_idx = int(np.nanargmin(monthly_sales.values))
    axes[1,0].annotate(f'M√°ximo: ${monthly_sales.values[max_month_idx]:.0f}', 
                       xy=(max_month_idx, monthly_sales.values[max_month_idx]),
                       xytext=(max_month_idx, monthly_sales.values[max_month_idx] + 50),
                       arrowprops=dict(arrowstyle='->', color='green', lw=2),
                       fontweight='bold', color='green')
    axes[1,0].annotate(f'Mínimo: ${monthly_sales.values[min_month_idx]:.0f}', 
                       xy=(min_month_idx, monthly_sales.values[min_month_idx]),
                       xytext=(min_month_idx, monthly_sales.values[min_month_idx] - 50),
                       arrowprops=dict(arrowstyle='->', color='red', lw=2),
                       fontweight='bold', color='red')

# Subplot 4: heat map of the average sale per category and day of the week.
heatmap_data = df_work.pivot_table(values='Total Amount', 
                                  index='Product Category', 
                                  columns='DayName', 
                                  aggfunc='mean')
# Put the columns in weekday order (pivot_table sorts them alphabetically).
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
heatmap_data = heatmap_data.reindex(columns=day_order)

im = axes[1,1].imshow(heatmap_data.values, cmap='YlOrRd', aspect='auto')
axes[1,1].set_title('üî• Heatmap: Ventas Promedio por Categor√≠a y D√≠a', fontweight='bold', fontsize=14)
axes[1,1].set_xticks(range(len(heatmap_data.columns)))
axes[1,1].set_xticklabels([day[:3] for day in heatmap_data.columns])
axes[1,1].set_yticks(range(len(heatmap_data.index)))
axes[1,1].set_yticklabels(heatmap_data.index)

# Write the value inside each cell; cells without data (NaN) are left blank
# instead of being labelled "nan".
for i in range(len(heatmap_data.index)):
    for j in range(len(heatmap_data.columns)):
        cell_value = heatmap_data.iloc[i, j]
        if pd.isna(cell_value):
            continue
        axes[1,1].text(j, i, f'{cell_value:.0f}',
                       ha="center", va="center", color="black", fontweight='bold')

# Colour scale for the heat map.
plt.colorbar(im, ax=axes[1,1])

plt.tight_layout()
plt.show()


In [None]:
# 2. Customer demographics and behaviour: a 2x3 dashboard.
fig, axes = plt.subplots(2, 3, figsize=(20, 12))
fig.suptitle('üë• AN√ÅLISIS DEMOGR√ÅFICO Y COMPORTAMIENTO DEL CLIENTE\nüéØ Patrones de Compra por Edad, G√©nero y Categor√≠as', 
             fontsize=18, fontweight='bold', y=0.98)

# Subplot 1: share of transactions per gender.
gender_counts = df_work['Gender'].value_counts()
wedges, texts, autotexts = axes[0,0].pie(gender_counts.values, labels=gender_counts.index, 
                                         autopct='%1.1f%%', startangle=90,
                                         colors=['#FF9999', '#66B2FF'])
axes[0,0].set_title('üë• Distribuci√≥n por G√©nero', fontweight='bold', fontsize=14)

# Subplot 2: average sale per age group.
age_sales = df_work.groupby('Age_Group')['Total Amount'].agg(['mean', 'count'])
bars2 = axes[0,1].bar(age_sales.index, age_sales['mean'], 
                      color=['#FFB84D', '#FF6B6B', '#4ECDC4'], alpha=0.8)
axes[0,1].set_title('üí∞ Ventas Promedio por Grupo de Edad', fontweight='bold', fontsize=14)
axes[0,1].set_ylabel('Venta Promedio ($)')
axes[0,1].grid(True, alpha=0.3)

# Mean and group size above each bar. The label uses a real line break ('\n');
# the previous '\\n' rendered a literal backslash-n inside the figure.
for bar, mean_val, count in zip(bars2, age_sales['mean'], age_sales['count']):
    axes[0,1].annotate(f'${mean_val:.0f}\n(n={count})', 
                       xy=(bar.get_x() + bar.get_width()/2, bar.get_height()),
                       xytext=(0, 5), textcoords='offset points',
                       ha='center', va='bottom', fontweight='bold')

# Subplot 3: age histogram with mean and median markers.
axes[0,2].hist(df_work['Age'], bins=20, color='skyblue', alpha=0.7, edgecolor='black')
axes[0,2].axvline(df_work['Age'].mean(), color='red', linestyle='--', linewidth=2, 
                  label=f'Media: {df_work["Age"].mean():.1f} a√±os')
axes[0,2].axvline(df_work['Age'].median(), color='green', linestyle='--', linewidth=2, 
                  label=f'Mediana: {df_work["Age"].median():.1f} a√±os')
axes[0,2].set_title('üìä Distribuci√≥n de Edades', fontweight='bold', fontsize=14)
axes[0,2].set_xlabel('Edad')
axes[0,2].set_ylabel('Frecuencia')
axes[0,2].legend()
axes[0,2].grid(True, alpha=0.3)

# Subplot 4: number of transactions per sales tier (Alta, Media, Baja).
sales_category_counts = df_work['Sales_Category'].value_counts()
bars3 = axes[1,0].bar(sales_category_counts.index, sales_category_counts.values,
                      color=['#E74C3C', '#F39C12', '#27AE60'], alpha=0.8)
axes[1,0].set_title('üéØ Distribuci√≥n de Categor√≠as de Venta', fontweight='bold', fontsize=14)
axes[1,0].set_ylabel('N√∫mero de Transacciones')
axes[1,0].grid(True, alpha=0.3)

# Count and share above each bar, again with a real line break in the label.
total_transactions = len(df_work)
for bar, value in zip(bars3, sales_category_counts.values):
    percentage = (value / total_transactions) * 100
    axes[1,0].annotate(f'{value}\n({percentage:.1f}%)', 
                       xy=(bar.get_x() + bar.get_width()/2, bar.get_height()),
                       xytext=(0, 5), textcoords='offset points',
                       ha='center', va='bottom', fontweight='bold')

# Subplot 5: quantity against unit price, coloured by the transaction total.
scatter = axes[1,1].scatter(df_work['Quantity'], df_work['Price per Unit'], 
                           c=df_work['Total Amount'], cmap='viridis', 
                           alpha=0.6, s=50)
axes[1,1].set_title('üîó Relaci√≥n: Cantidad vs Precio por Unidad', fontweight='bold', fontsize=14)
axes[1,1].set_xlabel('Cantidad')
axes[1,1].set_ylabel('Precio por Unidad ($)')
axes[1,1].grid(True, alpha=0.3)
plt.colorbar(scatter, ax=axes[1,1], label='Total Amount ($)')

# Subplot 6: total sales per quarter.
quarterly_sales = df_work.groupby('Quarter')['Total Amount'].agg(['sum', 'mean'])
bars4 = axes[1,2].bar([f'Q{q}' for q in quarterly_sales.index], quarterly_sales['sum'],
                      color=['#3498DB', '#9B59B6', '#E67E22', '#1ABC9C'], alpha=0.8)
axes[1,2].set_title('üìà Ventas Totales por Trimestre', fontweight='bold', fontsize=14)
axes[1,2].set_ylabel('Ventas Totales ($)')
axes[1,2].grid(True, alpha=0.3)

# Print the total above each quarter's bar.
for bar, value in zip(bars4, quarterly_sales['sum']):
    axes[1,2].annotate(f'${value:,.0f}', 
                       xy=(bar.get_x() + bar.get_width()/2, bar.get_height()),
                       xytext=(0, 5), textcoords='offset points',
                       ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()


In [None]:
# Executive summary of the key findings, printed as text, followed by the export
# of the engineered frame for the next notebook.
print("üéØ INSIGHTS CLAVE DEL AN√ÅLISIS EXPLORATORIO")
print("="*60)

# 1. Sales per product category: total, mean, count and coefficient of variation.
print("\nüí∞ AN√ÅLISIS DE VENTAS POR CATEGOR√çA:")
print("-" * 40)
category_analysis = df_work.groupby('Product Category')['Total Amount'].agg(['sum', 'mean', 'count', 'std'])
for category in category_analysis.index:
    sales_total = category_analysis.loc[category, 'sum']
    sales_mean = category_analysis.loc[category, 'mean']
    transaction_count = category_analysis.loc[category, 'count']
    sales_std = category_analysis.loc[category, 'std']
    cv = sales_std / sales_mean  # coefficient of variation: spread relative to the mean
    
    print(f"üì¶ {category}:")
    print(f"   üíµ Ventas totales: ${sales_total:,.2f}")
    print(f"   üìä Venta promedio: ${sales_mean:.2f}")
    print(f"   üî¢ Transacciones: {transaction_count}")
    print(f"   üìà Variabilidad (CV): {cv:.2f}")

# 2. Temporal analysis: best/worst month and best weekday by total sales.
print(f"\nüìÖ AN√ÅLISIS TEMPORAL:")
print("-" * 40)
# Aggregate once per grouping instead of re-running the same groupby for every figure.
monthly_totals = df_work.groupby('Month_Name')['Total Amount'].sum()
best_month = monthly_totals.idxmax()
best_month_sales = monthly_totals.max()
worst_month = monthly_totals.idxmin()
worst_month_sales = monthly_totals.min()

print(f"üèÜ Mejor mes: {best_month} (${best_month_sales:,.2f})")
print(f"üìâ Peor mes: {worst_month} (${worst_month_sales:,.2f})")

daily_totals = df_work.groupby('DayName')['Total Amount'].sum()
best_day = daily_totals.idxmax()
best_day_sales = daily_totals.max()
print(f"üóìÔ∏è Mejor d√≠a de la semana: {best_day} (${best_day_sales:,.2f})")

# 3. Demographic analysis: average spend per age group and per gender.
print(f"\nüë• AN√ÅLISIS DEMOGR√ÅFICO:")
print("-" * 40)
age_group_analysis = df_work.groupby('Age_Group')['Total Amount'].agg(['mean', 'count'])
print("üí∞ Gasto promedio por grupo de edad:")
for age_group in age_group_analysis.index:
    avg_spending = age_group_analysis.loc[age_group, 'mean']
    count = age_group_analysis.loc[age_group, 'count']
    print(f"   üë§ {age_group}: ${avg_spending:.2f} (n={count})")

gender_analysis = df_work.groupby('Gender')['Total Amount'].agg(['mean', 'count'])
print(f"\nüí∞ Gasto promedio por g√©nero:")
for gender in gender_analysis.index:
    avg_spending = gender_analysis.loc[gender, 'mean']
    count = gender_analysis.loc[gender, 'count']
    print(f"   üë§ {gender}: ${avg_spending:.2f} (n={count})")

# 4. The five variables most strongly correlated with 'Total Amount'.
#    `correlation_matrix` comes from the correlation cell earlier in the notebook.
print(f"\nüîó CORRELACIONES PRINCIPALES:")
print("-" * 40)
# Drop the variable's correlation with itself BEFORE taking the top five; otherwise
# that entry occupies one of the five slots and only four variables get printed.
key_correlations = (
    correlation_matrix['Total Amount']
    .drop('Total Amount')
    .abs()
    .sort_values(ascending=False)
    .head(5)
)
for var in key_correlations.index:
    original_corr = correlation_matrix['Total Amount'][var]  # signed value
    print(f"   üìä {var}: {original_corr:.3f}")

# 5. Purchase patterns: average quantity, unit price and ticket size.
print(f"\nüõí PATRONES DE COMPRA:")
print("-" * 40)
avg_quantity = df_work['Quantity'].mean()
avg_price_per_unit = df_work['Price per Unit'].mean()
avg_total_amount = df_work['Total Amount'].mean()

print(f"üì¶ Cantidad promedio por transacci√≥n: {avg_quantity:.2f}")
print(f"üí∞ Precio promedio por unidad: ${avg_price_per_unit:.2f}")
print(f"üõçÔ∏è Monto promedio por transacci√≥n: ${avg_total_amount:.2f}")

# Share of transactions in each sales tier, as percentages.
sales_cat_dist = df_work['Sales_Category'].value_counts(normalize=True) * 100
print(f"\nüéØ Distribuci√≥n de categor√≠as de venta:")
for category, percentage in sales_cat_dist.items():
    print(f"   üìä {category}: {percentage:.1f}%")

# 6. Recommendations for the modelling stage.
# NOTE(review): these lines are fixed text, not derived from the figures computed
# above - re-check them whenever the data changes.
print(f"\nüí° RECOMENDACIONES PARA MACHINE LEARNING:")
print("-" * 50)
print("üéØ Variables m√°s importantes para predecir ventas:")
print("   1. Quantity y Price per Unit (alta correlaci√≥n)")
print("   2. Product Category (diferentes patrones de venta)")
print("   3. Caracter√≠sticas temporales (Month, DayOfWeek)")
print("   4. Age_Group (diferentes comportamientos de gasto)")
print("   5. Gender (ligeras diferencias en patrones)")

print(f"\n‚ö†Ô∏è CONSIDERACIONES PARA PREPROCESAMIENTO:")
print("   ‚Ä¢ Normalizar variables num√©ricas (diferentes escalas)")
print("   ‚Ä¢ Codificar variables categ√≥ricas")
print("   ‚Ä¢ Considerar crear features de interacci√≥n")
print("   ‚Ä¢ Evaluar tratamiento de outliers en Clothing category")

# Save the engineered frame (without the index) for the next notebook.
df_work.to_csv('../data_processed_eda.csv', index=False)
print(f"\n‚úÖ Dataset procesado guardado como 'data_processed_eda.csv'")
print(f"üìä Listo para el siguiente paso: Preprocesamiento para Machine Learning")
