Análisis de los resultados de 6estragia de trading entre benchmarking y la estrategia de trading



In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import scipy.stats as stats
from scipy.stats import gaussian_kde, shapiro, wilcoxon

In [2]:
results_df = pd.read_csv('backtest_results.csv')

In [3]:
# Columnas que quieres resumir
cols_to_summarize = [
    'total_return',
    'benchmark_return',
    'total_trades',
    'win_rate',
    'sharpe_ratio'
]

# Resumen estadístico
results_df[cols_to_summarize].describe()

Unnamed: 0,total_return,benchmark_return,total_trades,win_rate,sharpe_ratio
count,101.0,101.0,101.0,101.0,101.0
mean,163.382693,176.203949,92.316832,46.868596,0.821922
std,176.031798,336.198179,32.273497,5.388231,0.371603
min,-51.965007,-67.422748,22.0,34.453782,-0.172638
25%,54.779069,30.125025,65.0,43.333333,0.591274
50%,113.041212,81.977671,93.0,46.73913,0.837362
75%,206.800697,176.956726,112.0,49.541284,1.066529
max,1002.026788,2191.5,177.0,65.384615,1.662062


In [4]:
# Preparar datos en formato long para boxplot
df_plot = pd.melt(results_df,
                  id_vars=["ticker"],
                  value_vars=["total_return", "benchmark_return"],
                  var_name="Tipo",
                  value_name="Return")

In [5]:
# Shapiro-Wilk Test para normalidad (recomendado para n<5000)
stat_total, p_total = shapiro(results_df["total_return"])
stat_bench, p_bench = shapiro(results_df["benchmark_return"])

print(f"Total Return p-value: {p_total:.4f}")
print(f"Benchmark Return p-value: {p_bench:.4f}")

Total Return p-value: 0.0000
Benchmark Return p-value: 0.0000


Dado que el p-valor es menor que 0.05 con el test de Shapiro quiere decir que las distribuciones no se ajustan a la distribución normal por lo que realizamos un test no paramétrico de Wilcoxon de comparación de mediana

In [6]:
# Valores a graficar
total = results_df["total_return"]
benchmark = results_df["benchmark_return"]

# Estimar densidades
kde_total = gaussian_kde(total)
kde_benchmark = gaussian_kde(benchmark)

# Crear rango común de valores para evaluar ambas densidades
x_vals = np.linspace(
    min(total.min(), benchmark.min()),
    max(total.max(), benchmark.max()),
    500
)

# Evaluar las densidades
y_total = kde_total(x_vals)
y_benchmark = kde_benchmark(x_vals)

# Crear gráfico
fig = go.Figure()

# Densidad total_return
fig.add_trace(go.Scatter(
    x=x_vals,
    y=y_total,
    mode='lines',
    name='Total Return',
    line=dict(color='green'),
    fill='tozeroy',
    opacity=0.6
))

# Densidad benchmark_return
fig.add_trace(go.Scatter(
    x=x_vals,
    y=y_benchmark,
    mode='lines',
    name='Benchmark Return',
    line=dict(color='red'),
    fill='tozeroy',
    opacity=0.6
))

# Layout
fig.update_layout(
    title='Curvas de Densidad: Total Return vs Benchmark',
    xaxis_title='Return (%)',
    yaxis_title='Densidad',
    template='plotly_white',
    width=800,
    height=500
)

fig.show()

In [7]:
# Si p-valor <= 0.05 en cualquiera de los casos de arriba de Shapiro
# Wilcoxon signed-rank test (no paramétrico, pareado)
w_stat, p_value = wilcoxon(results_df["total_return"], results_df["benchmark_return"])
print(f"Wilcoxon p-value: {p_value:.4f}")

Wilcoxon p-value: 0.0067


Rechazamos H0 con una significancia del 0.05 por lo que hay una diferencia de medianas

In [8]:
# Calcular diferencias
diff = results_df["total_return"] - results_df["benchmark_return"]

# Test de Wilcoxon
w_stat, p_value = wilcoxon(results_df["total_return"], results_df["benchmark_return"])

# Mediana de la diferencia
median_diff = np.median(diff)

# Bootstrap para IC de la mediana
n_boot = 10000
boot_medians = []
rng = np.random.default_rng(seed=42)  # Semilla para reproducibilidad
for _ in range(n_boot):
    sample = rng.choice(diff, size=len(diff), replace=True)
    boot_medians.append(np.median(sample))

ci_lower, ci_upper = np.percentile(boot_medians, [2.5, 97.5])

print(f"Wilcoxon p-value: {p_value:.4f}")
print(f"Mediana de la diferencia: {median_diff:.4f}")
print(f"IC 95% de la mediana: [{ci_lower:.4f}, {ci_upper:.4f}]")

# Conclusión
if p_value <= 0.05:
    if median_diff > 0:
        conclusion = "La estrategia obtiene rendimientos significativamente superiores al benchmark."
    elif median_diff < 0:
        conclusion = "La estrategia obtiene rendimientos significativamente inferiores al benchmark."
    else:
        conclusion = "No hay una diferencia direccional clara, aunque es estadísticamente significativa."
else:
    conclusion = "No hay evidencia estadísticamente significativa de que la estrategia y el benchmark difieran en rendimiento."

print(conclusion)

Wilcoxon p-value: 0.0067
Mediana de la diferencia: 27.7167
IC 95% de la mediana: [15.4155, 38.6768]
La estrategia obtiene rendimientos significativamente superiores al benchmark.


In [9]:
# Reestructurar a formato largo
df_long = pd.melt(
    results_df,
    id_vars=['ticker'],
    value_vars=['total_return', 'benchmark_return'],
    var_name='Tipo',
    value_name='Return'
)

# Crear gráfico tipo violin plot
fig = px.violin(df_long,
                x='Tipo',
                y='Return',
                color='Tipo',
                box=True,          # Mostrar boxplot dentro del violín
                points='all',      # Mostrar todos los puntos individuales
                hover_data=['ticker'])

fig.update_layout(
    title='Distribución de Retornos: Estrategia vs Benchmark',
    yaxis_title='Return (%)',
    xaxis_title='Tipo de Retorno',
    legend_title='Tipo',
    width=900,
    height=500,
    template='plotly_white'
)

fig.show()

In [10]:
results_df["diff_return"] = results_df["total_return"] - results_df["benchmark_return"]

In [11]:
# Datos
total = results_df["total_return"]
benchmark = results_df["benchmark_return"]

# IC para total_return
mean_total = np.mean(total)
sem_total = stats.sem(total)
ci_total = stats.t.interval(0.95, df=len(total)-1, loc=mean_total, scale=sem_total)

# IC para benchmark_return
mean_benchmark = np.mean(benchmark)
sem_benchmark = stats.sem(benchmark)
ci_benchmark = stats.t.interval(0.95, df=len(benchmark)-1, loc=mean_benchmark, scale=sem_benchmark)

# Imprimir resultados
print(f"Total Return:     media = {mean_total:.2f}, 95% IC = ({ci_total[0]:.2f}, {ci_total[1]:.2f})")
print(f"Benchmark Return: media = {mean_benchmark:.2f}, 95% IC = ({ci_benchmark[0]:.2f}, {ci_benchmark[1]:.2f})")

Total Return:     media = 163.38, 95% IC = (128.63, 198.13)
Benchmark Return: media = 176.20, 95% IC = (109.83, 242.57)


In [12]:
# Diferencias
diffs = results_df["diff_return"]

# Media y IC
mean_diff = np.mean(diffs)
sem_diff = stats.sem(diffs)
ci_diff = stats.t.interval(0.95, len(diffs)-1, loc=mean_diff, scale=sem_diff)

# Test unilateral: H0: mu >= 0  vs  H1: mu > 0
t_stat, p_value_two_sided = stats.ttest_1samp(diffs, popmean=0)
if mean_diff > 0:
    p_value_one_sided = p_value_two_sided / 2
else:
    p_value_one_sided = 1 - (p_value_two_sided / 2)

print(f"Media diferencia: {mean_diff:.2f}")
print(f"IC 95%: ({ci_diff[0]:.2f}, {ci_diff[1]:.2f})")
print(f"T-statistic: {t_stat:.4f}")
print(f"P-valor (unilateral, H1: media > 0): {p_value_one_sided:.4f}")

Media diferencia: -12.82
IC 95%: (-71.35, 45.70)
T-statistic: -0.4346
P-valor (unilateral, H1: media > 0): 0.6676


In [13]:
# Crear una nueva columna con la categoría
results_df["diff_category"] = np.where(results_df["diff_return"] >= 0, ">= 0 (Gana o Empata)", "< 0 (Pierde)")
counts = results_df["diff_category"].value_counts().reset_index()
counts.columns = ["Categoria", "Cantidad"]

fig = px.bar(
    counts,
    x="Categoria",
    y="Cantidad",
    color="Categoria",
    text="Cantidad",
    title="Comparación: ¿La estrategia supera al benchmark?",
    labels={"Categoria": "Diferencia vs Benchmark", "Cantidad": "Nº de Tickers"},
    template="plotly_white"
)

fig.update_traces(textposition='outside')

# Aquí ajustamos el espaciado del eje Y
fig.update_layout(
    showlegend=False,
    yaxis=dict(tick0=0, dtick=5)
)

fig.show()

In [14]:
# Realizar un filtro donde return sea menor de 1,000 y se queden los mismos ticker de cada Tipo: total_return y benchmark_return
filtered_df = results_df[(results_df["total_return"] < 1000) & (results_df["benchmark_return"] < 1000)]
# Reestructurar a formato largo
df_long = pd.melt(
    filtered_df,
    id_vars=['ticker'],
    value_vars=['total_return', 'benchmark_return'],
    var_name='Tipo',
    value_name='Return'
)

# Crear gráfico tipo violin plot
fig = px.violin(df_long,
                x='Tipo',
                y='Return',
                color='Tipo',
                box=True,          # Mostrar boxplot dentro del violín
                points='all',      # Mostrar todos los puntos individuales
                hover_data=['ticker'])

fig.update_layout(
    title='Distribución de Retornos: Estrategia vs Benchmark',
    yaxis_title='Return (%)',
    xaxis_title='Tipo de Retorno',
    legend_title='Tipo',
    width=900,
    height=500,
    template='plotly_white'
)

fig.show()

In [15]:
filtered_df["diff_return"] = filtered_df["total_return"] - filtered_df["benchmark_return"]



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [16]:
# Crear violín (distribución + box + puntos)
fig = go.Figure()
diffs = filtered_df["diff_return"]
fig.add_trace(go.Violin(
    y=diffs,
    box_visible=True,
    meanline_visible=False,
    points='all',
    jitter=0.5,
    scalemode='count',
    line_color='skyblue',
    name='Diff Return',
    hovertext=filtered_df['ticker'],
    hoverinfo='text+y'
))

# Layout
fig.update_layout(
    title='Raincloud Plot: Diferencia Total Return - Benchmark',
    yaxis_title='Diff Return (%)',
    showlegend=True,
    width=600,
    height=500,
    template='plotly_white'
)

fig.show()

In [17]:
# Sacar la descriptiva estilo summary() de R con la mínima, media, primer cuartil, mediana, tercer cuartil, máximo y desviación estándar
filtered_df["diff_return"].describe()

Unnamed: 0,diff_return
count,98.0
mean,29.762798
std,108.429984
min,-260.083012
25%,-18.994672
50%,28.635418
75%,74.106355
max,506.479287


In [18]:
# Diferencias
diffs = filtered_df["diff_return"]

# Media y IC
mean_diff = np.mean(diffs)
sem_diff = stats.sem(diffs)
ci_diff = stats.t.interval(0.95, len(diffs)-1, loc=mean_diff, scale=sem_diff)

# Test bilateral: H0: mu = 0 vs H1: mu ≠ 0
t_stat, p_value = stats.ttest_1samp(diffs, popmean=0)

print(f"Media diferencia: {mean_diff:.2f}")
print(f"IC 95%: ({ci_diff[0]:.2f}, {ci_diff[1]:.2f})")
print(f"T-statistic: {t_stat:.4f}")
print(f"P-valor (bilateral, H1: media ≠ 0): {p_value:.4f}")

Media diferencia: 29.76
IC 95%: (8.02, 51.50)
T-statistic: 2.7173
P-valor (bilateral, H1: media ≠ 0): 0.0078
