From 8e5be48f9c16c791d41222134c665edd8fccab14 Mon Sep 17 00:00:00 2001 From: IsaacMtz19 Date: Wed, 9 Apr 2025 10:52:50 -0600 Subject: [PATCH 01/14] Fix link documentation --- experimental_design.py | 2 +- experimental_evaluation.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/experimental_design.py b/experimental_design.py index 623508a..2f82afc 100644 --- a/experimental_design.py +++ b/experimental_design.py @@ -29,7 +29,7 @@ color: #3e7cb1 !important; } - Murray Documentation + Murray Documentation """, unsafe_allow_html=True ) diff --git a/experimental_evaluation.py b/experimental_evaluation.py index 965d2d6..fe2663f 100644 --- a/experimental_evaluation.py +++ b/experimental_evaluation.py @@ -30,7 +30,7 @@ color: #3e7cb1 !important; } - Murray Documentation + Murray Documentation """, unsafe_allow_html=True ) From 888987df01c0d2cf5f2cf7ccde22df024ba7032f Mon Sep 17 00:00:00 2001 From: IsaacMtz19 Date: Mon, 14 Apr 2025 01:40:48 -0600 Subject: [PATCH 02/14] Fix bug in mmm_option --- experimental_evaluation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/experimental_evaluation.py b/experimental_evaluation.py index fe2663f..dc797c6 100644 --- a/experimental_evaluation.py +++ b/experimental_evaluation.py @@ -688,10 +688,10 @@ def reset_states(): - if mmm_option == "iROAS": + if mmm_option == "iCPA": st.session_state.metric_mmm = spend / st.session_state.incremental else: - st.session_state.metric_mmm = spend / st.session_state.incremental + st.session_state.metric_mmm = st.session_state.incremental / spend From ac56491301af80b18bb69913dabde7f9d5982624 Mon Sep 17 00:00:00 2001 From: IsaacMtz19 Date: Fri, 18 Apr 2025 09:28:34 -0600 Subject: [PATCH 03/14] Fix firt variable --- experimental_evaluation.py | 40 +++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/experimental_evaluation.py b/experimental_evaluation.py index dc797c6..c4e469a 100644 --- a/experimental_evaluation.py +++ b/experimental_evaluation.py @@ -42,8 +42,8 @@ def generate_pdf(treatment_group, control_group, holdout_percentage, impact_graph,percenge_lift,p_value,power,period, - permutation_test,treatment_day,firt_day,last_day, - col_target,metric_mmm,mmm_option,lift_total,firt_report_day,second_report_day, + permutation_test,treatment_day,first_day,last_day, + col_target,metric_mmm,mmm_option,lift_total,first_report_day,second_report_day, pre_treatment,pre_counterfactual,post_treatment,post_counterfactual,att,incremental,df,spend): """ Generates a PDF report with explanations for each aspect. @@ -94,7 +94,7 @@ def generate_pdf(treatment_group, control_group, holdout_percentage, pdf.set_font("Poppins", size=10) pdf.set_text_color(33, 31, 36) pdf.multi_cell(0,5 , f"This report provides information about the the results of the analysis of a treatment on the variable '{col_target}' with a duration of {period} days. " - f"The data included in the design have a period of {firt_day} to {last_day} where the treatment started on {firt_day} until {last_day}." + f"The data included in the design have a period of {first_day} to {last_day} where the treatment started on {first_day} until {last_day}." f"It includes information about the treatment group, control group, and the statistics results of the analysis.") @@ -187,8 +187,8 @@ def generate_pdf(treatment_group, control_group, holdout_percentage, header_texts = [ "Group", - f"Pre-treatment\n({firt_report_day} to {second_report_day})", - f"Post-treatment\n({firt_day} to {last_day})", + f"Pre-treatment\n({first_report_day} to {second_report_day})", + f"Post-treatment\n({first_day} to {last_day})", "Increment" ] @@ -440,10 +440,10 @@ def generate_pdf(treatment_group, control_group, holdout_percentage, st.session_state.incremental_report = None if 'last_day' not in st.session_state: st.session_state.last_day = None -if 'firt_day' not in st.session_state: - st.session_state.firt_day = None -if 'firt_report_day' not in st.session_state: - st.session_state.firt_report_day = None +if 'first_day' not in st.session_state: + st.session_state.first_day = None +if 'first_report_day' not in st.session_state: + st.session_state.first_report_day = None if 'second_report_day' not in st.session_state: st.session_state.second_report_day = None @@ -560,7 +560,7 @@ def reset_states(): st.subheader("3. Experimental evaluation") random_sate = data1['location'].unique()[0] filtered_data = data1[data1['location'] == random_sate] - firt_day = filtered_data['time'].min() + first_day = filtered_data['time'].min() last_day = filtered_data['time'].max() @@ -568,8 +568,8 @@ def reset_states(): st.text("Parameter configuration") - start_treatment = st.date_input("Treatment start date",min_value=firt_day,max_value=last_day,value=firt_day) - end_treatment = st.date_input("Treatment end date",min_value=firt_day,max_value=last_day,value=last_day) + start_treatment = st.date_input("Treatment start date",min_value=first_day,max_value=last_day,value=first_day) + end_treatment = st.date_input("Treatment end date",min_value=first_day,max_value=last_day,value=last_day) treatment_group = st.multiselect("Select treatment group", data1['location'].unique()) spend = st.number_input("Select spend") mmm_option = st.selectbox("Select the option to calculate the iROAS or iCPA", ["iROAS", "iCPA"]) @@ -656,7 +656,7 @@ def reset_states(): st.session_state.permutation_test_report = plot_permutation_test_report(results) st.session_state.period = period second_report_day = last_day - pd.Timedelta(days=period) - firt_report_day = last_day - pd.Timedelta(days=(period*2)-1) + first_report_day = last_day - pd.Timedelta(days=(period*2)-1) treatment_day = last_day - pd.Timedelta(days=period-1) @@ -711,12 +711,16 @@ def reset_states(): last_day = pd.to_datetime(last_day) treatment_day = last_day - pd.Timedelta(days=end_position_treatment - start_position_treatment) second_report_day = last_day - pd.Timedelta(days=st.session_state.period) - firt_report_day = last_day - pd.Timedelta(days=(st.session_state.period*2)-1) + first_report_day = last_day - pd.Timedelta(days=(st.session_state.period*2)-1) treatment_day = last_day - pd.Timedelta(days=st.session_state.period-1) last_day = last_day.strftime('%Y-%m-%d') - firt_day = firt_day.strftime('%Y-%m-%d') - firt_report_day = firt_report_day.strftime('%Y-%m-%d') + first_day = first_day.strftime('%Y-%m-%d') + first_report_day = first_report_day.strftime('%Y-%m-%d') second_report_day = second_report_day.strftime('%Y-%m-%d') + st.write(f"Last day: {last_day}") + st.write(f"First day: {first_day}") + st.write(f"First report day: {first_report_day}") + st.write(f"Second report day: {second_report_day}") @@ -772,13 +776,13 @@ def reset_states(): st.session_state.period, st.session_state.permutation_test_report, treatment_day, - firt_day, + first_day, last_day, col_target, st.session_state.metric_mmm, st.session_state.mmm_option, st.session_state.lift_total, - firt_report_day, + first_report_day, second_report_day, st.session_state.pre_treatment, st.session_state.pre_counterfactual, From 67ae5eeb9037fd159a5604370823a9d883c631fe Mon Sep 17 00:00:00 2001 From: IsaacMtz19 Date: Fri, 18 Apr 2025 09:45:15 -0600 Subject: [PATCH 04/14] Fix report dates --- experimental_evaluation.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/experimental_evaluation.py b/experimental_evaluation.py index c4e469a..fa76c41 100644 --- a/experimental_evaluation.py +++ b/experimental_evaluation.py @@ -94,7 +94,7 @@ def generate_pdf(treatment_group, control_group, holdout_percentage, pdf.set_font("Poppins", size=10) pdf.set_text_color(33, 31, 36) pdf.multi_cell(0,5 , f"This report provides information about the the results of the analysis of a treatment on the variable '{col_target}' with a duration of {period} days. " - f"The data included in the design have a period of {first_day} to {last_day} where the treatment started on {first_day} until {last_day}." + f"The data included in the design have a period of {first_day} to {last_day} where the treatment started on {treatment_day} until {last_day}." f"It includes information about the treatment group, control group, and the statistics results of the analysis.") @@ -188,7 +188,7 @@ def generate_pdf(treatment_group, control_group, holdout_percentage, header_texts = [ "Group", f"Pre-treatment\n({first_report_day} to {second_report_day})", - f"Post-treatment\n({first_day} to {last_day})", + f"Post-treatment\n({treatment_day} to {last_day})", "Increment" ] @@ -713,6 +713,7 @@ def reset_states(): second_report_day = last_day - pd.Timedelta(days=st.session_state.period) first_report_day = last_day - pd.Timedelta(days=(st.session_state.period*2)-1) treatment_day = last_day - pd.Timedelta(days=st.session_state.period-1) + treatment_day = treatment_day.strftime('%Y-%m-%d') last_day = last_day.strftime('%Y-%m-%d') first_day = first_day.strftime('%Y-%m-%d') first_report_day = first_report_day.strftime('%Y-%m-%d') @@ -721,6 +722,7 @@ def reset_states(): st.write(f"First day: {first_day}") st.write(f"First report day: {first_report_day}") st.write(f"Second report day: {second_report_day}") + st.write(f"Treatment day: {treatment_day}") From 7a416f24a69e73d7fbaebfa00f4e1c98b7df1dec Mon Sep 17 00:00:00 2001 From: IsaacMtz19 Date: Wed, 23 Apr 2025 12:27:10 -0600 Subject: [PATCH 05/14] Commented innecesary variables --- experimental_evaluation.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/experimental_evaluation.py b/experimental_evaluation.py index fa76c41..ae6e7c7 100644 --- a/experimental_evaluation.py +++ b/experimental_evaluation.py @@ -718,11 +718,11 @@ def reset_states(): first_day = first_day.strftime('%Y-%m-%d') first_report_day = first_report_day.strftime('%Y-%m-%d') second_report_day = second_report_day.strftime('%Y-%m-%d') - st.write(f"Last day: {last_day}") - st.write(f"First day: {first_day}") - st.write(f"First report day: {first_report_day}") - st.write(f"Second report day: {second_report_day}") - st.write(f"Treatment day: {treatment_day}") + # st.write(f"Last day: {last_day}") + # st.write(f"First day: {first_day}") + # st.write(f"First report day: {first_report_day}") + # st.write(f"Second report day: {second_report_day}") + # st.write(f"Treatment day: {treatment_day}") From d7be6b7659689980b873d361db858225e91e27a3 Mon Sep 17 00:00:00 2001 From: IsaacMtz19 Date: Wed, 23 Apr 2025 14:23:55 -0600 Subject: [PATCH 06/14] Update print_incremental_results --- Murray/plots.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/Murray/plots.py b/Murray/plots.py index 76758a2..5ea0d59 100644 --- a/Murray/plots.py +++ b/Murray/plots.py @@ -686,11 +686,26 @@ def print_incremental_results(geo_test, period, treatment_percentage): holdout_percentage = 100 - treatment_percentage title = "Incremental Results" att, incremental, fig = plot_impact_streamlit_app(geo_test, period, holdout_percentage) + + # Get the MDE from the sensitivity_results + sensitivity_results = geo_test['sensitivity_results'] + results_by_size = geo_test['simulation_results'] + + target_size_key = None + target_mde = None + for size_key, result in results_by_size.items(): + current_holdout = result['Holdout Percentage'] + if abs(current_holdout - holdout_percentage) < 0.01: + target_size_key = size_key + target_mde = sensitivity_results[size_key][period].get('MDE', None) + break + print("=" * 30) print(title.center(30)) print("=" * 30) print(f"ATT: {round(att,2)}") print(f"Lift total: {round(incremental,2)}") + print(f"MDE: {round(target_mde*100,2)}%") print("=" * 30) From b73f54a63ed9b235469918eece349469c81899fb Mon Sep 17 00:00:00 2001 From: IsaacMtz19 Date: Wed, 23 Apr 2025 14:25:56 -0600 Subject: [PATCH 07/14] Add install_requires --- setup.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 05f55de..7c4bf27 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,10 @@ "tqdm", "matplotlib", "seaborn", - "plotly" + "plotly", + "millify", + "statsmodels", + ], author="Entropy Team", author_email="dev@entropy.tech", From e1632f4ef6b5d0327e43de8b8659e23526b9f4de Mon Sep 17 00:00:00 2001 From: IsaacMtz19 Date: Wed, 23 Apr 2025 15:24:14 -0600 Subject: [PATCH 08/14] Add mde to hover on --- Murray/plots.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/Murray/plots.py b/Murray/plots.py index 5ea0d59..5c73c26 100644 --- a/Murray/plots.py +++ b/Murray/plots.py @@ -388,10 +388,19 @@ def calculate_penalty_score(mde, period_idx, total_periods, size, results_by_siz custom_data = [] for s in sorted_sizes: - custom_data.append([s] * len(periods)) + # Get the MDE for each size and period + mde_data = [] + for period in periods: + mde = sensitivity_results[s][period].get('MDE', None) + mde_data.append(f"{mde:.2%}" if mde is not None else "N/A") + custom_data.append(mde_data) fig.data[0].customdata = custom_data - fig.data[0].hovertemplate = "Treatment size: %{customdata}
" + fig.data[0].hovertemplate = ( + "Treatment size: %{customdata}
" + + "MDE: %{customdata}
" + + "" + ) fig.data[0].hoverinfo = "skip" return fig From afe37351f60cded4c9fd7fed3f670b894d1fde24 Mon Sep 17 00:00:00 2001 From: IsaacMtz19 Date: Wed, 23 Apr 2025 22:33:21 -0600 Subject: [PATCH 09/14] Converted treatment array to 1D --- Murray/plots.py | 42 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/Murray/plots.py b/Murray/plots.py index 5c73c26..0ae020c 100644 --- a/Murray/plots.py +++ b/Murray/plots.py @@ -409,6 +409,39 @@ def calculate_penalty_score(mde, period_idx, total_periods, size, results_by_siz + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + def print_weights(geo_test, treatment_percentage=None, num_locations=None): """ Extracts control group weights based on holdout percentage or number of locations. @@ -937,12 +970,19 @@ def plot_impact_evaluation(results_evaluation): treatment (array): Treatment group values period (int): Treatment period length """ + counterfactual = results_evaluation['predictions'] treatment = results_evaluation['treatment'] period = results_evaluation['period'] length_treatment = results_evaluation['length_treatment'] + if len(treatment.shape) > 1: + treatment = treatment.squeeze() + + if len(counterfactual.shape) > 1: + counterfactual = counterfactual.squeeze() + point_difference = treatment - counterfactual cumulative_effect = ([0] * (len(treatment) - period)) + (np.cumsum(point_difference[len(treatment)-period:])).tolist() @@ -1574,4 +1614,4 @@ def calculate_confidence_bands(data, alpha=0.05): lower_bound = data - margin upper_bound = data + margin - return lower_bound, upper_bound \ No newline at end of file + return lower_bound, upper_bound \ No newline at end of file From f0d459ac9837453ea7f413446e00f283d06d2868 Mon Sep 17 00:00:00 2001 From: IsaacMtz19 Date: Wed, 23 Apr 2025 23:23:33 -0600 Subject: [PATCH 10/14] Optimized gaussian_kde --- Murray/plots.py | 69 +++++++++++++++++++++---------------------------- 1 file changed, 30 insertions(+), 39 deletions(-) diff --git a/Murray/plots.py b/Murray/plots.py index 0ae020c..3556ed8 100644 --- a/Murray/plots.py +++ b/Murray/plots.py @@ -1182,94 +1182,85 @@ def plot_permutation_test(results_evaluation, Significance_level=0.1): Returns: fig: Plotly figure. """ - null_stats = results_evaluation['null_stats'] observed_stat = results_evaluation['observed_stat'] - + # Calcular percentiles para los límites upper_bound = np.percentile(null_stats, 100 * (1 - (Significance_level / 2))) lower_bound = np.percentile(null_stats, 100 * (Significance_level / 2)) - - + # Optimización: Reducir el número de puntos para el KDE + # Usar menos puntos para el histograma y el KDE kde = stats.gaussian_kde(null_stats) - x_kde = np.linspace(min(null_stats), max(null_stats), 300) + x_kde = np.linspace(min(null_stats), max(null_stats), 100) # Reducido de 300 a 100 y_kde = kde(x_kde) - - - max_hist_y = max(kde(null_stats)) - - + + # Calcular el máximo de densidad para la línea vertical + max_hist_y = max(y_kde) # Usar y_kde en lugar de kde(null_stats) + fig = go.Figure() - - + + # Histograma con menos bins fig.add_trace(go.Histogram( x=null_stats, - nbinsx=30, + nbinsx=20, # Reducido de 30 a 20 histnorm='probability density', name="Null Stats", - marker=dict(color=blue,line=dict(color="black",width=1)), + marker=dict(color=blue, line=dict(color="black", width=1)), opacity=0.6 )) - - + + # KDE plot fig.add_trace(go.Scatter( x=x_kde, y=y_kde, mode="lines", name="KDE Density", showlegend=False, - line=dict(color="darkblue", width=2) )) - - - + + # Línea vertical para el estadístico observado fig.add_trace(go.Scatter( x=[observed_stat, observed_stat], - y=[0, max_hist_y], + y=[0, max_hist_y], mode="lines", name="Observed Stat", line=dict(color="black", dash="dash", width=1.5) )) - + def hex_to_rgba(hex_color, alpha=0.4): - """Convierte un color HEX a RGBA con transparencia controlada.""" - hex_color = hex_color.lstrip("#") - r, g, b = tuple(int(hex_color[i:i+2], 16) for i in (0, 2, 4)) - return f"rgba({r},{g},{b},{alpha})" - - - + hex_color = hex_color.lstrip("#") + r, g, b = tuple(int(hex_color[i:i+2], 16) for i in (0, 2, 4)) + return f"rgba({r},{g},{b},{alpha})" + + # Zonas de significancia fig.add_trace(go.Scatter( x=[upper_bound, max(null_stats), max(null_stats), upper_bound], - - y=[0, 0, max_hist_y, max_hist_y], + y=[0, 0, max_hist_y, max_hist_y], fill="toself", - fillcolor=hex_to_rgba(purple_light, 0.3), + fillcolor=hex_to_rgba(purple_light, 0.3), line=dict(color="rgba(255,0,0,0)"), name="Upper Significance Zone" )) - + fig.add_trace(go.Scatter( x=[min(null_stats), lower_bound, lower_bound, min(null_stats), min(null_stats)], - y=[0, 0, max_hist_y, max_hist_y, 0], + y=[0, 0, max_hist_y, max_hist_y, 0], fill="toself", - fillcolor=hex_to_rgba(purple_light, 0.3), + fillcolor=hex_to_rgba(purple_light, 0.3), line=dict(color="rgba(255,0,0,0)"), name="Lower Significance Zone" )) - + fig.update_layout( title="Permutation Test", xaxis_title="Conformity Score", yaxis_title="Density", template="plotly_white", - bargap=0 - ) - + return fig From 252aff015d761855abeba314ef64dac373e2dcb7 Mon Sep 17 00:00:00 2001 From: IsaacMtz19 Date: Thu, 24 Apr 2025 04:00:58 -0600 Subject: [PATCH 11/14] Fix hover on --- Murray/plots.py | 82 +++++++++++++++++++++++++++---------------------- 1 file changed, 46 insertions(+), 36 deletions(-) diff --git a/Murray/plots.py b/Murray/plots.py index 3556ed8..d532622 100644 --- a/Murray/plots.py +++ b/Murray/plots.py @@ -385,20 +385,21 @@ def calculate_penalty_score(mde, period_idx, total_periods, size, results_by_siz tickfont=dict(size=12, color='black')) ) - custom_data = [] - for s in sorted_sizes: - # Get the MDE for each size and period + for i, s in enumerate(sorted_sizes): mde_data = [] for period in periods: mde = sensitivity_results[s][period].get('MDE', None) - mde_data.append(f"{mde:.2%}" if mde is not None else "N/A") + mde_data.append([ + s, # Treatment size + f"{mde:.2%}" if mde is not None else "N/A" # MDE + ]) custom_data.append(mde_data) fig.data[0].customdata = custom_data fig.data[0].hovertemplate = ( - "Treatment size: %{customdata}
" + - "MDE: %{customdata}
" + + "Treatment size: %{customdata[0]}
" + + "MDE: %{customdata[1]}
" + "" ) fig.data[0].hoverinfo = "skip" @@ -1182,85 +1183,94 @@ def plot_permutation_test(results_evaluation, Significance_level=0.1): Returns: fig: Plotly figure. """ + null_stats = results_evaluation['null_stats'] observed_stat = results_evaluation['observed_stat'] - # Calcular percentiles para los límites + upper_bound = np.percentile(null_stats, 100 * (1 - (Significance_level / 2))) lower_bound = np.percentile(null_stats, 100 * (Significance_level / 2)) - # Optimización: Reducir el número de puntos para el KDE - # Usar menos puntos para el histograma y el KDE + + kde = stats.gaussian_kde(null_stats) - x_kde = np.linspace(min(null_stats), max(null_stats), 100) # Reducido de 300 a 100 + x_kde = np.linspace(min(null_stats), max(null_stats), 300) y_kde = kde(x_kde) - - # Calcular el máximo de densidad para la línea vertical - max_hist_y = max(y_kde) # Usar y_kde en lugar de kde(null_stats) - + + + max_hist_y = max(kde(null_stats)) + + fig = go.Figure() - - # Histograma con menos bins + + fig.add_trace(go.Histogram( x=null_stats, - nbinsx=20, # Reducido de 30 a 20 + nbinsx=30, histnorm='probability density', name="Null Stats", - marker=dict(color=blue, line=dict(color="black", width=1)), + marker=dict(color=blue,line=dict(color="black",width=1)), opacity=0.6 )) - - # KDE plot + + fig.add_trace(go.Scatter( x=x_kde, y=y_kde, mode="lines", name="KDE Density", showlegend=False, + line=dict(color="darkblue", width=2) )) - - # Línea vertical para el estadístico observado + + + fig.add_trace(go.Scatter( x=[observed_stat, observed_stat], - y=[0, max_hist_y], + y=[0, max_hist_y], mode="lines", name="Observed Stat", line=dict(color="black", dash="dash", width=1.5) )) - + def hex_to_rgba(hex_color, alpha=0.4): - hex_color = hex_color.lstrip("#") - r, g, b = tuple(int(hex_color[i:i+2], 16) for i in (0, 2, 4)) - return f"rgba({r},{g},{b},{alpha})" - - # Zonas de significancia + """Convierte un color HEX a RGBA con transparencia controlada.""" + hex_color = hex_color.lstrip("#") + r, g, b = tuple(int(hex_color[i:i+2], 16) for i in (0, 2, 4)) + return f"rgba({r},{g},{b},{alpha})" + + + fig.add_trace(go.Scatter( x=[upper_bound, max(null_stats), max(null_stats), upper_bound], - y=[0, 0, max_hist_y, max_hist_y], + + y=[0, 0, max_hist_y, max_hist_y], fill="toself", - fillcolor=hex_to_rgba(purple_light, 0.3), + fillcolor=hex_to_rgba(purple_light, 0.3), line=dict(color="rgba(255,0,0,0)"), name="Upper Significance Zone" )) - + fig.add_trace(go.Scatter( x=[min(null_stats), lower_bound, lower_bound, min(null_stats), min(null_stats)], - y=[0, 0, max_hist_y, max_hist_y, 0], + y=[0, 0, max_hist_y, max_hist_y, 0], fill="toself", - fillcolor=hex_to_rgba(purple_light, 0.3), + fillcolor=hex_to_rgba(purple_light, 0.3), line=dict(color="rgba(255,0,0,0)"), name="Lower Significance Zone" )) - + fig.update_layout( title="Permutation Test", xaxis_title="Conformity Score", yaxis_title="Density", template="plotly_white", + bargap=0 + ) - + return fig From 68fa1609b48b0924017f5de0187e39efd4e32079 Mon Sep 17 00:00:00 2001 From: IsaacMtz19 Date: Thu, 24 Apr 2025 04:38:18 -0600 Subject: [PATCH 12/14] Optimized plot_permutation_test --- Murray/plots.py | 48 ++++++++++++++---------------------------------- 1 file changed, 14 insertions(+), 34 deletions(-) diff --git a/Murray/plots.py b/Murray/plots.py index d532622..fff71b2 100644 --- a/Murray/plots.py +++ b/Murray/plots.py @@ -1183,81 +1183,63 @@ def plot_permutation_test(results_evaluation, Significance_level=0.1): Returns: fig: Plotly figure. """ - null_stats = results_evaluation['null_stats'] observed_stat = results_evaluation['observed_stat'] - upper_bound = np.percentile(null_stats, 100 * (1 - (Significance_level / 2))) lower_bound = np.percentile(null_stats, 100 * (Significance_level / 2)) - - - kde = stats.gaussian_kde(null_stats) - x_kde = np.linspace(min(null_stats), max(null_stats), 300) + kde = stats.gaussian_kde(null_stats, bw_method='scott') + x_kde = np.linspace(min(null_stats), max(null_stats), 100) y_kde = kde(x_kde) - - max_hist_y = max(kde(null_stats)) - + max_hist_y = np.max(y_kde) * 1.1 fig = go.Figure() - fig.add_trace(go.Histogram( x=null_stats, - nbinsx=30, + nbinsx=20, histnorm='probability density', name="Null Stats", - marker=dict(color=blue,line=dict(color="black",width=1)), + marker=dict(color=blue, line=dict(color="black", width=1)), opacity=0.6 )) - fig.add_trace(go.Scatter( x=x_kde, y=y_kde, mode="lines", name="KDE Density", showlegend=False, - line=dict(color="darkblue", width=2) )) - - fig.add_trace(go.Scatter( x=[observed_stat, observed_stat], - y=[0, max_hist_y], + y=[0, max_hist_y], mode="lines", name="Observed Stat", line=dict(color="black", dash="dash", width=1.5) )) - def hex_to_rgba(hex_color, alpha=0.4): - """Convierte un color HEX a RGBA con transparencia controlada.""" - hex_color = hex_color.lstrip("#") - r, g, b = tuple(int(hex_color[i:i+2], 16) for i in (0, 2, 4)) - return f"rgba({r},{g},{b},{alpha})" - - + significance_color = f"rgba(187,178,199,0.3)" fig.add_trace(go.Scatter( x=[upper_bound, max(null_stats), max(null_stats), upper_bound], - - y=[0, 0, max_hist_y, max_hist_y], + y=[0, 0, max_hist_y, max_hist_y], fill="toself", - fillcolor=hex_to_rgba(purple_light, 0.3), - line=dict(color="rgba(255,0,0,0)"), + fillcolor=significance_color, + line=dict(width=0), name="Upper Significance Zone" )) fig.add_trace(go.Scatter( - x=[min(null_stats), lower_bound, lower_bound, min(null_stats), min(null_stats)], - y=[0, 0, max_hist_y, max_hist_y, 0], + x=[min(null_stats), lower_bound, lower_bound, min(null_stats)], + y=[0, 0, max_hist_y, max_hist_y], fill="toself", - fillcolor=hex_to_rgba(purple_light, 0.3), - line=dict(color="rgba(255,0,0,0)"), + fillcolor=significance_color, + line=dict(width=0), name="Lower Significance Zone" )) @@ -1266,9 +1248,7 @@ def hex_to_rgba(hex_color, alpha=0.4): xaxis_title="Conformity Score", yaxis_title="Density", template="plotly_white", - bargap=0 - ) return fig From 87f24e627d2391c7ffa269f9d2231647372ae473 Mon Sep 17 00:00:00 2001 From: IsaacMtz19 Date: Thu, 24 Apr 2025 05:44:17 -0600 Subject: [PATCH 13/14] update readme --- Murray/tests/__init__.py | 17 ---- Murray/tests/test_better_groups.py | 113 ----------------------- Murray/tests/test_market_correlations.py | 25 ----- Murray/tests/test_power_analysis.py | 90 ------------------ Murray/tests/test_run_geo_analysis.py | 38 -------- Murray/tests/test_run_geo_evaluation.py | 46 --------- Murray/tests/test_select_markets.py | 86 ----------------- Murray/tests/test_synthetic_control.py | 51 ---------- Murray/tests/test_upload_data.py | 27 ------ README.md | 2 +- 10 files changed, 1 insertion(+), 494 deletions(-) delete mode 100644 Murray/tests/__init__.py delete mode 100644 Murray/tests/test_better_groups.py delete mode 100644 Murray/tests/test_market_correlations.py delete mode 100644 Murray/tests/test_power_analysis.py delete mode 100644 Murray/tests/test_run_geo_analysis.py delete mode 100644 Murray/tests/test_run_geo_evaluation.py delete mode 100644 Murray/tests/test_select_markets.py delete mode 100644 Murray/tests/test_synthetic_control.py delete mode 100644 Murray/tests/test_upload_data.py diff --git a/Murray/tests/__init__.py b/Murray/tests/__init__.py deleted file mode 100644 index 05cc6ef..0000000 --- a/Murray/tests/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -from Murray.main import run_geo_analysis -from Murray.post_analysis import run_geo_evaluation -from Murray.auxiliary import cleaned_data,market_correlations -from Murray.plots import ( - plot_geodata, - print_locations, - print_weights, - plot_impact_graphs, - print_incremental_results, - plot_metrics, - plot_impact_graphs_evaluation, - print_incremental_results_evaluation, - plot_permutation_test -) - -__version__ = "1.1.0" - diff --git a/Murray/tests/test_better_groups.py b/Murray/tests/test_better_groups.py deleted file mode 100644 index ba805f8..0000000 --- a/Murray/tests/test_better_groups.py +++ /dev/null @@ -1,113 +0,0 @@ -import pytest -import numpy as np -import pandas as pd -from sklearn.preprocessing import MinMaxScaler -from Murray.main import BetterGroups, SyntheticControl, select_treatments, select_controls -from Murray.auxiliary import market_correlations, cleaned_data - -@pytest.fixture(scope="module") -def cleaned_dataframe(): - """Fixture that creates synthetic test data""" - np.random.seed(42) - - dates = pd.date_range(start='2023-01-01', periods=100) - regions = ['Region_A', 'Region_B', 'Region_C', 'Region_D', 'Region_E'] - - data = [] - for region in regions: - base_value = np.random.randint(50, 100) - for date in dates: - value = base_value + np.sin(date.day/15) * 10 + np.random.normal(0, 2) - data.append({ - 'date': date, - 'region': region, - 'add_to_carts': max(0, int(value)) - }) - - df = pd.DataFrame(data) - return cleaned_data(df, "add_to_carts", "region", "date") - -@pytest.fixture(scope="module") -def correlation_matrix(cleaned_dataframe): - """Fixture that generates the correlation matrix from synthetic data""" - return market_correlations(cleaned_dataframe) - -@pytest.fixture(scope="module") -def similarity_matrix(correlation_matrix): - """Fixture to generate a similarity matrix""" - return correlation_matrix.copy() - -@pytest.fixture -def test_data(cleaned_dataframe): - """Fixture to generate test data""" - return cleaned_dataframe.copy() - - -def test_better_groups_valid(similarity_matrix, correlation_matrix, test_data): - results = BetterGroups( - similarity_matrix=similarity_matrix, - excluded_locations=[], - data=test_data, - correlation_matrix=correlation_matrix, - maximum_treatment_percentage=0.50 - ) - - assert isinstance(results, dict), "The result must be a dictionary" - assert len(results) > 0, "There must be at least one evaluated treatment group" - for size, result in results.items(): - assert "Best Treatment Group" in result, "Missing treatment group" - assert "Control Group" in result, "Missing control group" - assert "MAPE" in result, "Missing MAPE metric" - assert "SMAPE" in result, "Missing SMAPE metric" - assert "Holdout Percentage" in result, "Missing holdout percentage" - assert result["MAPE"] >= 0, "MAPE must be a positive number" - assert 0 <= result["Holdout Percentage"] <= 100, "Holdout must be between 0 and 100" - - -def test_better_groups_no_valid_treatments(similarity_matrix, correlation_matrix, test_data): - test_data = test_data[test_data["location"].isin(["X", "Y"])] - print(f"test data: {test_data}") - results = BetterGroups( - similarity_matrix=similarity_matrix, - excluded_locations=[], - data=test_data, - correlation_matrix=correlation_matrix, - maximum_treatment_percentage=0.50 - ) - - assert results is None, "If there are no valid locations, the result must be None" - - -def test_better_groups_scaled_data(similarity_matrix, correlation_matrix, test_data): - scaler = MinMaxScaler() - test_data["Y"] = scaler.fit_transform(test_data["Y"].values.reshape(-1, 1)) - - results = BetterGroups( - similarity_matrix=similarity_matrix, - excluded_locations=[], - data=test_data, - correlation_matrix=correlation_matrix, - maximum_treatment_percentage=0.50 - ) - - assert isinstance(results, dict), "The result must be a dictionary" - assert all(isinstance(result["MAPE"], (float, int)) for result in results.values()), "MAPE must be a number" - - -def test_better_groups_no_control(monkeypatch, similarity_matrix, correlation_matrix, test_data): - - def fake_select_controls(correlation_matrix, treatment_group, min_correlation): - return [] - - monkeypatch.setattr("Murray.main.select_controls", fake_select_controls) - - results = BetterGroups( - similarity_matrix=similarity_matrix, - excluded_locations=[], - data=test_data, - correlation_matrix=correlation_matrix, - maximum_treatment_percentage=0.50 - ) - - for result in results.values(): - assert result["MAPE"] == float('inf'), "If there are no controls, MAPE must be infinite" diff --git a/Murray/tests/test_market_correlations.py b/Murray/tests/test_market_correlations.py deleted file mode 100644 index 8bcc404..0000000 --- a/Murray/tests/test_market_correlations.py +++ /dev/null @@ -1,25 +0,0 @@ -import os -import pandas as pd -import pytest -from Murray.auxiliary import market_correlations,cleaned_data - - -DATA_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "data")) - - -tests = [ - (os.path.join(DATA_DIR, "data1.csv"), "add_to_carts", "region", "date"), - (os.path.join(DATA_DIR, "data2.csv"), "sessions", "location", "day"), -] - -@pytest.fixture -def cleaned_data_fixture(dataset_path, col_target, col_locations, col_dates): - df = pd.read_csv(dataset_path) - return cleaned_data(df, col_target, col_locations, col_dates) - - -@pytest.mark.parametrize("dataset_path, col_target, col_locations, col_dates", tests) -def test_market_correlations(cleaned_data_fixture): - - correlation_matrix = market_correlations(cleaned_data_fixture) - assert isinstance(correlation_matrix, pd.DataFrame), "market_correlations should return a DataFrame" diff --git a/Murray/tests/test_power_analysis.py b/Murray/tests/test_power_analysis.py deleted file mode 100644 index ca1f409..0000000 --- a/Murray/tests/test_power_analysis.py +++ /dev/null @@ -1,90 +0,0 @@ -import pytest -import numpy as np -import pandas as pd -from Murray.main import apply_lift, calculate_conformity, simulate_power, run_simulation, evaluate_sensitivity - -@pytest.fixture -def synthetic_series(): - """Fixture to generate a synthetic time series.""" - np.random.seed(42) - y = np.random.rand(100) * 100 - return y - - -def test_apply_lift(synthetic_series): - y = synthetic_series.copy() - y_lifted = apply_lift(y, delta=0.1, start_treatment=50, end_treatment=70) - - assert np.all(y_lifted[:50] == y[:50]), "Values before the treatment should not change" - assert np.all(y_lifted[70:] == y[70:]), "Values after the treatment should not change" - assert np.all(y_lifted[50:70] == y[50:70] * 1.1), "The lift should be applied in the treatment period" - - -def test_calculate_conformity(synthetic_series): - y_real = synthetic_series.copy() - y_control = synthetic_series.copy() * 0.9 - - conformity = calculate_conformity(y_real, y_control, start_treatment=50, end_treatment=70) - - expected_conformity = np.mean(y_real[50:70]) - np.mean(y_control[50:70]) - assert np.isclose(conformity, expected_conformity), "The calculated conformity should match the expected value" - - -def test_simulate_power(synthetic_series): - y_real = synthetic_series.copy() - y_control = synthetic_series.copy() * 0.95 - - delta, power, y_lifted = simulate_power( - y_real=y_real, - y_control=y_control, - delta=0.1, - period=20, - n_permutations=100, - significance_level=0.05 - ) - - assert isinstance(delta, float), "Delta must be a float" - assert isinstance(power, float), "Statistical power must be a float" - assert isinstance(y_lifted, np.ndarray), "The adjusted series must be a NumPy array" - assert len(y_lifted) == len(y_real), "The adjusted series must have the same length as the original" - - -def test_run_simulation(synthetic_series): - y_real = synthetic_series.copy() - y_control = synthetic_series.copy() * 0.98 - - delta, power, y_lifted = run_simulation( - delta=0.2, - y_real=y_real, - y_control=y_control, - period=20, - n_permutations=100, - significance_level=0.05 - ) - - assert isinstance(delta, float), "Delta must be a float" - assert isinstance(power, float), "Statistical power must be a float" - assert isinstance(y_lifted, np.ndarray), "The adjusted series must be a NumPy array" - - -def test_evaluate_sensitivity(): - """Test the sensitivity evaluation function""" - results_by_size = { - 50: {"Actual Target Metric (y)": np.random.rand(100) * 100, "Predictions": np.random.rand(100) * 100} - } - deltas = [0.05, 0.1, 0.2] - periods = [10, 20, 30] - n_permutations = 50 - - sensitivity_results, lift_series = evaluate_sensitivity( - results_by_size=results_by_size, - deltas=deltas, - periods=periods, - n_permutations=n_permutations, - significance_level=0.05 - ) - - assert isinstance(sensitivity_results, dict), "The result must be a dictionary" - assert isinstance(lift_series, dict), "The lift series must be a dictionary" - assert all(isinstance(v, dict) for v in sensitivity_results.values()), "Each value in sensitivity_results must be a dictionary" - assert all(isinstance(v, np.ndarray) for v in lift_series.values()), "Each value in lift_series must be a NumPy array" diff --git a/Murray/tests/test_run_geo_analysis.py b/Murray/tests/test_run_geo_analysis.py deleted file mode 100644 index 5d621cf..0000000 --- a/Murray/tests/test_run_geo_analysis.py +++ /dev/null @@ -1,38 +0,0 @@ -import pytest -import numpy as np -import pandas as pd -from Murray.main import run_geo_analysis_streamlit_app -from Murray.auxiliary import market_correlations, cleaned_data - -@pytest.fixture -def sample_data(): - """Fixture that generates a test DataFrame with synthetic data.""" - np.random.seed(42) - data = pd.DataFrame({ - "time": np.tile(pd.date_range("2023-01-01", periods=100, freq="D"), 10), - "location": np.repeat([f"Location_{i}" for i in range(10)], 100), - "Y": np.random.rand(1000) * 100 - }) - return data - - -def test_run_geo_analysis(sample_data): - """Checks that the analysis function runs correctly.""" - results = run_geo_analysis_streamlit_app( - data=sample_data, - maximum_treatment_percentage=0.50, - significance_level=0.05, - deltas_range=(0.05, 0.2, 0.05), - periods_range=(10, 30, 10), - excluded_locations=["Location_1"], - n_permutations=100 - ) - - assert isinstance(results, dict), "The result must be a dictionary" - assert "simulation_results" in results, "Missing 'simulation_results' in the results" - assert "sensitivity_results" in results, "Missing 'sensitivity_results' in the results" - assert "series_lifts" in results, "Missing 'series_lifts' in the results" - - assert isinstance(results["simulation_results"], dict), "simulation_results must be a dictionary" - assert isinstance(results["sensitivity_results"], dict), "sensitivity_results must be a dictionary" - assert isinstance(results["series_lifts"], dict), "series_lifts must be a dictionary" diff --git a/Murray/tests/test_run_geo_evaluation.py b/Murray/tests/test_run_geo_evaluation.py deleted file mode 100644 index ed6b022..0000000 --- a/Murray/tests/test_run_geo_evaluation.py +++ /dev/null @@ -1,46 +0,0 @@ -import pytest -import numpy as np -import pandas as pd -from Murray.post_analysis import run_geo_evaluation -from Murray.auxiliary import market_correlations, cleaned_data - -@pytest.fixture -def sample_data(): - """Fixture that generates a test DataFrame with fictitious data""" - np.random.seed(42) - data = pd.DataFrame({ - "time": np.tile(pd.date_range("2023-01-01", periods=100, freq="D"), 10), - "location": np.repeat([f"Location_{i}" for i in range(10)], 100), - "Y": np.random.rand(1000) * 100 - }) - return data - - -def test_run_geo_evaluation(sample_data): - """Checks that the geographic evaluation function runs correctly""" - results = run_geo_evaluation( - data_input=sample_data, - start_treatment="2023-03-01", - end_treatment="2023-03-10", - treatment_group=["Location_0", "Location_1"], - spend=50000, - n_permutations=100, - inference_type="iid", - significance_level=0.05 - ) - - assert isinstance(results, dict), "The result must be a dictionary" - expected_keys = [ - "MAPE", "SMAPE", "predictions", "treatment", "p_value", "power", - "percenge_lift", "control_group", "observed_stat", - "null_stats", "weights", "period", "spend", "length_treatment" - ] - for key in expected_keys: - assert key in results, f"Missing the key '{key}' in the results" - - assert isinstance(results["MAPE"], float), "MAPE must be a float" - assert isinstance(results["p_value"], float), "p_value must be a float" - assert isinstance(results["power"], float), "Power must be a float" - assert isinstance(results["control_group"], list), "Control group must be a list" - assert 0 <= results["power"] <= 1, "Power must be between 0 and 1" - assert 0 <= results["p_value"] <= 1, "p_value must be between 0 and 1" diff --git a/Murray/tests/test_select_markets.py b/Murray/tests/test_select_markets.py deleted file mode 100644 index 25ca78b..0000000 --- a/Murray/tests/test_select_markets.py +++ /dev/null @@ -1,86 +0,0 @@ -import pytest -import numpy as np -import pandas as pd -from Murray.main import select_treatments, select_controls -from Murray.auxiliary import market_correlations, cleaned_data - -@pytest.fixture(scope="module") -def cleaned_dataframe(): - """Fixture that creates synthetic test data""" - np.random.seed(42) - - dates = pd.date_range(start='2023-01-01', periods=100) - regions = ['Region_A', 'Region_B', 'Region_C', 'Region_D', 'Region_E'] - - data = [] - for region in regions: - base_value = np.random.randint(50, 100) - for date in dates: - value = base_value + np.sin(date.day/15) * 10 + np.random.normal(0, 2) - data.append({ - 'date': date, - 'region': region, - 'add_to_carts': max(0, int(value)) - }) - - df = pd.DataFrame(data) - return cleaned_data(df, "add_to_carts", "region", "date") - -@pytest.fixture(scope="module") -def correlation_matrix(cleaned_dataframe): - """Fixture that generates the correlation matrix""" - return market_correlations(cleaned_dataframe) - -def test_select_treatments_valid(cleaned_dataframe, correlation_matrix): - """Test to verify that treatments are correctly selected with a randomly excluded location""" - excluded_location = np.random.choice(cleaned_dataframe["location"].unique()) - treatments = select_treatments(correlation_matrix, treatment_size=2, excluded_locations=[excluded_location]) - - assert isinstance(treatments, list), "The result must be a list" - assert all(isinstance(group, list) for group in treatments), "Each combination must be a list" - assert all(len(group) == 2 for group in treatments), "Each combination must have 2 treatments" - assert excluded_location not in [loc for group in treatments for loc in group], "The excluded location must not appear in the treatments" - -def test_select_treatments_invalid_location(correlation_matrix): - """Should raise a KeyError if an excluded location is not in the matrix""" - with pytest.raises(KeyError, match="not present in the similarity matrix"): - select_treatments(correlation_matrix, treatment_size=2, excluded_locations=["X", "Y"]) - -def test_select_treatments_treatment_size_too_large(correlation_matrix): - """Should raise ValueError if treatment_size is greater than the number of available columns""" - with pytest.raises(ValueError, match="The treatment size .* exceeds the available number of columns"): - select_treatments(correlation_matrix, treatment_size=100, excluded_locations=[]) - -def test_select_treatments_treatment_size_equals_columns(correlation_matrix): - """Should return only one combination when treatment_size is equal to the available columns""" - num_columns = correlation_matrix.shape[1] - treatments = select_treatments(correlation_matrix, treatment_size=num_columns, excluded_locations=[]) - - assert len(treatments) == 1, "There must be only one possible combination" - assert set(treatments[0]) == set(correlation_matrix.columns), "It must contain all possible locations" - - -def test_select_controls_valid(cleaned_dataframe, correlation_matrix): - """Test to verify that controls are correctly selected based on treatments""" - excluded_location = np.random.choice(cleaned_dataframe["location"].unique()) - treatments = select_treatments(correlation_matrix, treatment_size=2, excluded_locations=[excluded_location]) - - for treatment_group in treatments: - controls = select_controls(correlation_matrix, treatment_group) - assert isinstance(controls, list), "The result must be a list" - assert len(controls) > 0, "There must be at least one control available" - assert all(loc not in treatment_group for loc in controls), "Controls must not be in the treatment group" - -def test_select_controls_invalid_treatments(correlation_matrix): - """Should handle nonexistent treatments without failing""" - fake_treatment_group = ["X", "Y", "Z"] - controls = select_controls(correlation_matrix, fake_treatment_group) - assert controls == [], "If the treatment does not exist, the output must be an empty list" - -def test_select_controls_fallback(correlation_matrix,cleaned_dataframe): - """Should select the `fallback_n` most correlated if no locations meet the min_correlation""" - treatment_group = np.random.choice(cleaned_dataframe["location"].unique()) - treatment_group = [treatment_group] - controls = select_controls(correlation_matrix, treatment_group, min_correlation=0.99, fallback_n=3) - - assert len(controls) == 3, "It should select 3 fallback controls" diff --git a/Murray/tests/test_synthetic_control.py b/Murray/tests/test_synthetic_control.py deleted file mode 100644 index a25e6f7..0000000 --- a/Murray/tests/test_synthetic_control.py +++ /dev/null @@ -1,51 +0,0 @@ -import pytest -import numpy as np -import pandas as pd -from Murray.main import SyntheticControl -from Murray.auxiliary import cleaned_data, market_correlations - -@pytest.fixture(scope="module") -def synthetic_data(): - """Fixture that creates synthetic test data""" - np.random.seed(42) - X = np.random.rand(100, 3) - y = X @ np.array([0.3, 0.5, 0.2]) + np.random.normal(0, 0.1, 100) - - return X, y - -@pytest.fixture(scope="module") -def correlation_matrix(synthetic_data): - """Fixture that generates correlation matrix from synthetic data""" - return market_correlations(synthetic_data) - -@pytest.fixture(scope="module") -def synthetic_control(): - """Fixture that creates a synthetic control instance""" - return SyntheticControl( - regularization_strength_l1=0.1, - regularization_strength_l2=0.1, - seasonality=None, - delta=1.0 - ) - -def test_synthetic_control_fit(synthetic_control, synthetic_data): - """Test that synthetic control can fit the data""" - X, y = synthetic_data - synthetic_control.fit(X, y) - - assert hasattr(synthetic_control, 'is_fitted_') - assert hasattr(synthetic_control, 'w_') - assert isinstance(synthetic_control.w_, np.ndarray) - assert len(synthetic_control.w_) == X.shape[1] - -def test_synthetic_control_predict(synthetic_control, synthetic_data): - """Test that synthetic control can make predictions""" - X, y = synthetic_data - synthetic_control.fit(X, y) - predictions, weights = synthetic_control.predict(X) - - assert isinstance(predictions, np.ndarray) - assert len(predictions) == len(y) - assert not np.isnan(predictions).any() - assert isinstance(weights, np.ndarray) - assert len(weights) == X.shape[1] diff --git a/Murray/tests/test_upload_data.py b/Murray/tests/test_upload_data.py deleted file mode 100644 index 08f044d..0000000 --- a/Murray/tests/test_upload_data.py +++ /dev/null @@ -1,27 +0,0 @@ -import os -import pandas as pd -import pytest -import Murray as mp - - -DATA_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "data")) - - -tests = [ - (os.path.join(DATA_DIR, "data1.csv"), "add_to_carts", "region", "date"), - (os.path.join(DATA_DIR, "data2.csv"), "sessions", "location", "day"), -] - - - -@pytest.mark.parametrize("dataset_path, col_target, col_locations, col_dates", tests) -def test_cleaned_data(dataset_path, col_target, col_locations, col_dates): - - assert os.path.exists(dataset_path), f"File {dataset_path} not found" - df = pd.read_csv(dataset_path) - df_cleaned = mp.cleaned_data(df, col_target, col_locations, col_dates) - - assert isinstance(df_cleaned, pd.DataFrame), "Output is not a DataFrame" - assert df_cleaned.isnull().sum().sum() == 0, "Cleaned data contains NaN values" - - diff --git a/README.md b/README.md index be78874..cfe945b 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,7 @@ data = pd.DataFrame({ ```python results = run_geo_analysis( data = data, - excluded_locations = ['mexico city', 'méxico'], + excluded_locations = ['mexico city', 'mexico'], maximum_treatment_percentage=0.30, significance_level = 0.1, deltas_range = (0.01, 0.3, 0.02), From 55bee5dfe5a60db1d32516112c6b40443cd7dd61 Mon Sep 17 00:00:00 2001 From: IsaacMtz19 Date: Thu, 24 Apr 2025 05:55:33 -0600 Subject: [PATCH 14/14] Add test files --- Murray/tests/__init__.py | 17 ++++ Murray/tests/test_better_groups.py | 113 +++++++++++++++++++++++ Murray/tests/test_market_correlations.py | 25 +++++ Murray/tests/test_power_analysis.py | 90 ++++++++++++++++++ Murray/tests/test_run_geo_analysis.py | 38 ++++++++ Murray/tests/test_run_geo_evaluation.py | 46 +++++++++ Murray/tests/test_select_markets.py | 86 +++++++++++++++++ Murray/tests/test_synthetic_control.py | 51 ++++++++++ Murray/tests/test_upload_data.py | 27 ++++++ 9 files changed, 493 insertions(+) create mode 100644 Murray/tests/__init__.py create mode 100644 Murray/tests/test_better_groups.py create mode 100644 Murray/tests/test_market_correlations.py create mode 100644 Murray/tests/test_power_analysis.py create mode 100644 Murray/tests/test_run_geo_analysis.py create mode 100644 Murray/tests/test_run_geo_evaluation.py create mode 100644 Murray/tests/test_select_markets.py create mode 100644 Murray/tests/test_synthetic_control.py create mode 100644 Murray/tests/test_upload_data.py diff --git a/Murray/tests/__init__.py b/Murray/tests/__init__.py new file mode 100644 index 0000000..05cc6ef --- /dev/null +++ b/Murray/tests/__init__.py @@ -0,0 +1,17 @@ +from Murray.main import run_geo_analysis +from Murray.post_analysis import run_geo_evaluation +from Murray.auxiliary import cleaned_data,market_correlations +from Murray.plots import ( + plot_geodata, + print_locations, + print_weights, + plot_impact_graphs, + print_incremental_results, + plot_metrics, + plot_impact_graphs_evaluation, + print_incremental_results_evaluation, + plot_permutation_test +) + +__version__ = "1.1.0" + diff --git a/Murray/tests/test_better_groups.py b/Murray/tests/test_better_groups.py new file mode 100644 index 0000000..ba805f8 --- /dev/null +++ b/Murray/tests/test_better_groups.py @@ -0,0 +1,113 @@ +import pytest +import numpy as np +import pandas as pd +from sklearn.preprocessing import MinMaxScaler +from Murray.main import BetterGroups, SyntheticControl, select_treatments, select_controls +from Murray.auxiliary import market_correlations, cleaned_data + +@pytest.fixture(scope="module") +def cleaned_dataframe(): + """Fixture that creates synthetic test data""" + np.random.seed(42) + + dates = pd.date_range(start='2023-01-01', periods=100) + regions = ['Region_A', 'Region_B', 'Region_C', 'Region_D', 'Region_E'] + + data = [] + for region in regions: + base_value = np.random.randint(50, 100) + for date in dates: + value = base_value + np.sin(date.day/15) * 10 + np.random.normal(0, 2) + data.append({ + 'date': date, + 'region': region, + 'add_to_carts': max(0, int(value)) + }) + + df = pd.DataFrame(data) + return cleaned_data(df, "add_to_carts", "region", "date") + +@pytest.fixture(scope="module") +def correlation_matrix(cleaned_dataframe): + """Fixture that generates the correlation matrix from synthetic data""" + return market_correlations(cleaned_dataframe) + +@pytest.fixture(scope="module") +def similarity_matrix(correlation_matrix): + """Fixture to generate a similarity matrix""" + return correlation_matrix.copy() + +@pytest.fixture +def test_data(cleaned_dataframe): + """Fixture to generate test data""" + return cleaned_dataframe.copy() + + +def test_better_groups_valid(similarity_matrix, correlation_matrix, test_data): + results = BetterGroups( + similarity_matrix=similarity_matrix, + excluded_locations=[], + data=test_data, + correlation_matrix=correlation_matrix, + maximum_treatment_percentage=0.50 + ) + + assert isinstance(results, dict), "The result must be a dictionary" + assert len(results) > 0, "There must be at least one evaluated treatment group" + for size, result in results.items(): + assert "Best Treatment Group" in result, "Missing treatment group" + assert "Control Group" in result, "Missing control group" + assert "MAPE" in result, "Missing MAPE metric" + assert "SMAPE" in result, "Missing SMAPE metric" + assert "Holdout Percentage" in result, "Missing holdout percentage" + assert result["MAPE"] >= 0, "MAPE must be a positive number" + assert 0 <= result["Holdout Percentage"] <= 100, "Holdout must be between 0 and 100" + + +def test_better_groups_no_valid_treatments(similarity_matrix, correlation_matrix, test_data): + test_data = test_data[test_data["location"].isin(["X", "Y"])] + print(f"test data: {test_data}") + results = BetterGroups( + similarity_matrix=similarity_matrix, + excluded_locations=[], + data=test_data, + correlation_matrix=correlation_matrix, + maximum_treatment_percentage=0.50 + ) + + assert results is None, "If there are no valid locations, the result must be None" + + +def test_better_groups_scaled_data(similarity_matrix, correlation_matrix, test_data): + scaler = MinMaxScaler() + test_data["Y"] = scaler.fit_transform(test_data["Y"].values.reshape(-1, 1)) + + results = BetterGroups( + similarity_matrix=similarity_matrix, + excluded_locations=[], + data=test_data, + correlation_matrix=correlation_matrix, + maximum_treatment_percentage=0.50 + ) + + assert isinstance(results, dict), "The result must be a dictionary" + assert all(isinstance(result["MAPE"], (float, int)) for result in results.values()), "MAPE must be a number" + + +def test_better_groups_no_control(monkeypatch, similarity_matrix, correlation_matrix, test_data): + + def fake_select_controls(correlation_matrix, treatment_group, min_correlation): + return [] + + monkeypatch.setattr("Murray.main.select_controls", fake_select_controls) + + results = BetterGroups( + similarity_matrix=similarity_matrix, + excluded_locations=[], + data=test_data, + correlation_matrix=correlation_matrix, + maximum_treatment_percentage=0.50 + ) + + for result in results.values(): + assert result["MAPE"] == float('inf'), "If there are no controls, MAPE must be infinite" diff --git a/Murray/tests/test_market_correlations.py b/Murray/tests/test_market_correlations.py new file mode 100644 index 0000000..8bcc404 --- /dev/null +++ b/Murray/tests/test_market_correlations.py @@ -0,0 +1,25 @@ +import os +import pandas as pd +import pytest +from Murray.auxiliary import market_correlations,cleaned_data + + +DATA_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "data")) + + +tests = [ + (os.path.join(DATA_DIR, "data1.csv"), "add_to_carts", "region", "date"), + (os.path.join(DATA_DIR, "data2.csv"), "sessions", "location", "day"), +] + +@pytest.fixture +def cleaned_data_fixture(dataset_path, col_target, col_locations, col_dates): + df = pd.read_csv(dataset_path) + return cleaned_data(df, col_target, col_locations, col_dates) + + +@pytest.mark.parametrize("dataset_path, col_target, col_locations, col_dates", tests) +def test_market_correlations(cleaned_data_fixture): + + correlation_matrix = market_correlations(cleaned_data_fixture) + assert isinstance(correlation_matrix, pd.DataFrame), "market_correlations should return a DataFrame" diff --git a/Murray/tests/test_power_analysis.py b/Murray/tests/test_power_analysis.py new file mode 100644 index 0000000..ca1f409 --- /dev/null +++ b/Murray/tests/test_power_analysis.py @@ -0,0 +1,90 @@ +import pytest +import numpy as np +import pandas as pd +from Murray.main import apply_lift, calculate_conformity, simulate_power, run_simulation, evaluate_sensitivity + +@pytest.fixture +def synthetic_series(): + """Fixture to generate a synthetic time series.""" + np.random.seed(42) + y = np.random.rand(100) * 100 + return y + + +def test_apply_lift(synthetic_series): + y = synthetic_series.copy() + y_lifted = apply_lift(y, delta=0.1, start_treatment=50, end_treatment=70) + + assert np.all(y_lifted[:50] == y[:50]), "Values before the treatment should not change" + assert np.all(y_lifted[70:] == y[70:]), "Values after the treatment should not change" + assert np.all(y_lifted[50:70] == y[50:70] * 1.1), "The lift should be applied in the treatment period" + + +def test_calculate_conformity(synthetic_series): + y_real = synthetic_series.copy() + y_control = synthetic_series.copy() * 0.9 + + conformity = calculate_conformity(y_real, y_control, start_treatment=50, end_treatment=70) + + expected_conformity = np.mean(y_real[50:70]) - np.mean(y_control[50:70]) + assert np.isclose(conformity, expected_conformity), "The calculated conformity should match the expected value" + + +def test_simulate_power(synthetic_series): + y_real = synthetic_series.copy() + y_control = synthetic_series.copy() * 0.95 + + delta, power, y_lifted = simulate_power( + y_real=y_real, + y_control=y_control, + delta=0.1, + period=20, + n_permutations=100, + significance_level=0.05 + ) + + assert isinstance(delta, float), "Delta must be a float" + assert isinstance(power, float), "Statistical power must be a float" + assert isinstance(y_lifted, np.ndarray), "The adjusted series must be a NumPy array" + assert len(y_lifted) == len(y_real), "The adjusted series must have the same length as the original" + + +def test_run_simulation(synthetic_series): + y_real = synthetic_series.copy() + y_control = synthetic_series.copy() * 0.98 + + delta, power, y_lifted = run_simulation( + delta=0.2, + y_real=y_real, + y_control=y_control, + period=20, + n_permutations=100, + significance_level=0.05 + ) + + assert isinstance(delta, float), "Delta must be a float" + assert isinstance(power, float), "Statistical power must be a float" + assert isinstance(y_lifted, np.ndarray), "The adjusted series must be a NumPy array" + + +def test_evaluate_sensitivity(): + """Test the sensitivity evaluation function""" + results_by_size = { + 50: {"Actual Target Metric (y)": np.random.rand(100) * 100, "Predictions": np.random.rand(100) * 100} + } + deltas = [0.05, 0.1, 0.2] + periods = [10, 20, 30] + n_permutations = 50 + + sensitivity_results, lift_series = evaluate_sensitivity( + results_by_size=results_by_size, + deltas=deltas, + periods=periods, + n_permutations=n_permutations, + significance_level=0.05 + ) + + assert isinstance(sensitivity_results, dict), "The result must be a dictionary" + assert isinstance(lift_series, dict), "The lift series must be a dictionary" + assert all(isinstance(v, dict) for v in sensitivity_results.values()), "Each value in sensitivity_results must be a dictionary" + assert all(isinstance(v, np.ndarray) for v in lift_series.values()), "Each value in lift_series must be a NumPy array" diff --git a/Murray/tests/test_run_geo_analysis.py b/Murray/tests/test_run_geo_analysis.py new file mode 100644 index 0000000..5d621cf --- /dev/null +++ b/Murray/tests/test_run_geo_analysis.py @@ -0,0 +1,38 @@ +import pytest +import numpy as np +import pandas as pd +from Murray.main import run_geo_analysis_streamlit_app +from Murray.auxiliary import market_correlations, cleaned_data + +@pytest.fixture +def sample_data(): + """Fixture that generates a test DataFrame with synthetic data.""" + np.random.seed(42) + data = pd.DataFrame({ + "time": np.tile(pd.date_range("2023-01-01", periods=100, freq="D"), 10), + "location": np.repeat([f"Location_{i}" for i in range(10)], 100), + "Y": np.random.rand(1000) * 100 + }) + return data + + +def test_run_geo_analysis(sample_data): + """Checks that the analysis function runs correctly.""" + results = run_geo_analysis_streamlit_app( + data=sample_data, + maximum_treatment_percentage=0.50, + significance_level=0.05, + deltas_range=(0.05, 0.2, 0.05), + periods_range=(10, 30, 10), + excluded_locations=["Location_1"], + n_permutations=100 + ) + + assert isinstance(results, dict), "The result must be a dictionary" + assert "simulation_results" in results, "Missing 'simulation_results' in the results" + assert "sensitivity_results" in results, "Missing 'sensitivity_results' in the results" + assert "series_lifts" in results, "Missing 'series_lifts' in the results" + + assert isinstance(results["simulation_results"], dict), "simulation_results must be a dictionary" + assert isinstance(results["sensitivity_results"], dict), "sensitivity_results must be a dictionary" + assert isinstance(results["series_lifts"], dict), "series_lifts must be a dictionary" diff --git a/Murray/tests/test_run_geo_evaluation.py b/Murray/tests/test_run_geo_evaluation.py new file mode 100644 index 0000000..ed6b022 --- /dev/null +++ b/Murray/tests/test_run_geo_evaluation.py @@ -0,0 +1,46 @@ +import pytest +import numpy as np +import pandas as pd +from Murray.post_analysis import run_geo_evaluation +from Murray.auxiliary import market_correlations, cleaned_data + +@pytest.fixture +def sample_data(): + """Fixture that generates a test DataFrame with fictitious data""" + np.random.seed(42) + data = pd.DataFrame({ + "time": np.tile(pd.date_range("2023-01-01", periods=100, freq="D"), 10), + "location": np.repeat([f"Location_{i}" for i in range(10)], 100), + "Y": np.random.rand(1000) * 100 + }) + return data + + +def test_run_geo_evaluation(sample_data): + """Checks that the geographic evaluation function runs correctly""" + results = run_geo_evaluation( + data_input=sample_data, + start_treatment="2023-03-01", + end_treatment="2023-03-10", + treatment_group=["Location_0", "Location_1"], + spend=50000, + n_permutations=100, + inference_type="iid", + significance_level=0.05 + ) + + assert isinstance(results, dict), "The result must be a dictionary" + expected_keys = [ + "MAPE", "SMAPE", "predictions", "treatment", "p_value", "power", + "percenge_lift", "control_group", "observed_stat", + "null_stats", "weights", "period", "spend", "length_treatment" + ] + for key in expected_keys: + assert key in results, f"Missing the key '{key}' in the results" + + assert isinstance(results["MAPE"], float), "MAPE must be a float" + assert isinstance(results["p_value"], float), "p_value must be a float" + assert isinstance(results["power"], float), "Power must be a float" + assert isinstance(results["control_group"], list), "Control group must be a list" + assert 0 <= results["power"] <= 1, "Power must be between 0 and 1" + assert 0 <= results["p_value"] <= 1, "p_value must be between 0 and 1" diff --git a/Murray/tests/test_select_markets.py b/Murray/tests/test_select_markets.py new file mode 100644 index 0000000..25ca78b --- /dev/null +++ b/Murray/tests/test_select_markets.py @@ -0,0 +1,86 @@ +import pytest +import numpy as np +import pandas as pd +from Murray.main import select_treatments, select_controls +from Murray.auxiliary import market_correlations, cleaned_data + +@pytest.fixture(scope="module") +def cleaned_dataframe(): + """Fixture that creates synthetic test data""" + np.random.seed(42) + + dates = pd.date_range(start='2023-01-01', periods=100) + regions = ['Region_A', 'Region_B', 'Region_C', 'Region_D', 'Region_E'] + + data = [] + for region in regions: + base_value = np.random.randint(50, 100) + for date in dates: + value = base_value + np.sin(date.day/15) * 10 + np.random.normal(0, 2) + data.append({ + 'date': date, + 'region': region, + 'add_to_carts': max(0, int(value)) + }) + + df = pd.DataFrame(data) + return cleaned_data(df, "add_to_carts", "region", "date") + +@pytest.fixture(scope="module") +def correlation_matrix(cleaned_dataframe): + """Fixture that generates the correlation matrix""" + return market_correlations(cleaned_dataframe) + +def test_select_treatments_valid(cleaned_dataframe, correlation_matrix): + """Test to verify that treatments are correctly selected with a randomly excluded location""" + excluded_location = np.random.choice(cleaned_dataframe["location"].unique()) + treatments = select_treatments(correlation_matrix, treatment_size=2, excluded_locations=[excluded_location]) + + assert isinstance(treatments, list), "The result must be a list" + assert all(isinstance(group, list) for group in treatments), "Each combination must be a list" + assert all(len(group) == 2 for group in treatments), "Each combination must have 2 treatments" + assert excluded_location not in [loc for group in treatments for loc in group], "The excluded location must not appear in the treatments" + +def test_select_treatments_invalid_location(correlation_matrix): + """Should raise a KeyError if an excluded location is not in the matrix""" + with pytest.raises(KeyError, match="not present in the similarity matrix"): + select_treatments(correlation_matrix, treatment_size=2, excluded_locations=["X", "Y"]) + +def test_select_treatments_treatment_size_too_large(correlation_matrix): + """Should raise ValueError if treatment_size is greater than the number of available columns""" + with pytest.raises(ValueError, match="The treatment size .* exceeds the available number of columns"): + select_treatments(correlation_matrix, treatment_size=100, excluded_locations=[]) + +def test_select_treatments_treatment_size_equals_columns(correlation_matrix): + """Should return only one combination when treatment_size is equal to the available columns""" + num_columns = correlation_matrix.shape[1] + treatments = select_treatments(correlation_matrix, treatment_size=num_columns, excluded_locations=[]) + + assert len(treatments) == 1, "There must be only one possible combination" + assert set(treatments[0]) == set(correlation_matrix.columns), "It must contain all possible locations" + + +def test_select_controls_valid(cleaned_dataframe, correlation_matrix): + """Test to verify that controls are correctly selected based on treatments""" + excluded_location = np.random.choice(cleaned_dataframe["location"].unique()) + treatments = select_treatments(correlation_matrix, treatment_size=2, excluded_locations=[excluded_location]) + + for treatment_group in treatments: + controls = select_controls(correlation_matrix, treatment_group) + assert isinstance(controls, list), "The result must be a list" + assert len(controls) > 0, "There must be at least one control available" + assert all(loc not in treatment_group for loc in controls), "Controls must not be in the treatment group" + +def test_select_controls_invalid_treatments(correlation_matrix): + """Should handle nonexistent treatments without failing""" + fake_treatment_group = ["X", "Y", "Z"] + controls = select_controls(correlation_matrix, fake_treatment_group) + assert controls == [], "If the treatment does not exist, the output must be an empty list" + +def test_select_controls_fallback(correlation_matrix,cleaned_dataframe): + """Should select the `fallback_n` most correlated if no locations meet the min_correlation""" + treatment_group = np.random.choice(cleaned_dataframe["location"].unique()) + treatment_group = [treatment_group] + controls = select_controls(correlation_matrix, treatment_group, min_correlation=0.99, fallback_n=3) + + assert len(controls) == 3, "It should select 3 fallback controls" diff --git a/Murray/tests/test_synthetic_control.py b/Murray/tests/test_synthetic_control.py new file mode 100644 index 0000000..a25e6f7 --- /dev/null +++ b/Murray/tests/test_synthetic_control.py @@ -0,0 +1,51 @@ +import pytest +import numpy as np +import pandas as pd +from Murray.main import SyntheticControl +from Murray.auxiliary import cleaned_data, market_correlations + +@pytest.fixture(scope="module") +def synthetic_data(): + """Fixture that creates synthetic test data""" + np.random.seed(42) + X = np.random.rand(100, 3) + y = X @ np.array([0.3, 0.5, 0.2]) + np.random.normal(0, 0.1, 100) + + return X, y + +@pytest.fixture(scope="module") +def correlation_matrix(synthetic_data): + """Fixture that generates correlation matrix from synthetic data""" + return market_correlations(synthetic_data) + +@pytest.fixture(scope="module") +def synthetic_control(): + """Fixture that creates a synthetic control instance""" + return SyntheticControl( + regularization_strength_l1=0.1, + regularization_strength_l2=0.1, + seasonality=None, + delta=1.0 + ) + +def test_synthetic_control_fit(synthetic_control, synthetic_data): + """Test that synthetic control can fit the data""" + X, y = synthetic_data + synthetic_control.fit(X, y) + + assert hasattr(synthetic_control, 'is_fitted_') + assert hasattr(synthetic_control, 'w_') + assert isinstance(synthetic_control.w_, np.ndarray) + assert len(synthetic_control.w_) == X.shape[1] + +def test_synthetic_control_predict(synthetic_control, synthetic_data): + """Test that synthetic control can make predictions""" + X, y = synthetic_data + synthetic_control.fit(X, y) + predictions, weights = synthetic_control.predict(X) + + assert isinstance(predictions, np.ndarray) + assert len(predictions) == len(y) + assert not np.isnan(predictions).any() + assert isinstance(weights, np.ndarray) + assert len(weights) == X.shape[1] diff --git a/Murray/tests/test_upload_data.py b/Murray/tests/test_upload_data.py new file mode 100644 index 0000000..08f044d --- /dev/null +++ b/Murray/tests/test_upload_data.py @@ -0,0 +1,27 @@ +import os +import pandas as pd +import pytest +import Murray as mp + + +DATA_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "data")) + + +tests = [ + (os.path.join(DATA_DIR, "data1.csv"), "add_to_carts", "region", "date"), + (os.path.join(DATA_DIR, "data2.csv"), "sessions", "location", "day"), +] + + + +@pytest.mark.parametrize("dataset_path, col_target, col_locations, col_dates", tests) +def test_cleaned_data(dataset_path, col_target, col_locations, col_dates): + + assert os.path.exists(dataset_path), f"File {dataset_path} not found" + df = pd.read_csv(dataset_path) + df_cleaned = mp.cleaned_data(df, col_target, col_locations, col_dates) + + assert isinstance(df_cleaned, pd.DataFrame), "Output is not a DataFrame" + assert df_cleaned.isnull().sum().sum() == 0, "Cleaned data contains NaN values" + +