In [78]:
import pandas as pd

from typing import Tuple, List, Optional

import pandas as pd

from variatio.data_validation import validate_data
from variatio.metrics import Metric, MetricType, MetricParams, MetricResult, AggregationOperation
from variatio.stat_significance import StatTests
from variatio.vizualizer import format_metrics_to_html


from variatio.analyzer import *


import numpy as np

In [79]:
def create_dataframes(df):
    # Create 'event_data' DataFrame simulating purchase events using 'value'
    event_data1 = pd.DataFrame({
        "timestamp": pd.to_datetime(["2022-12-10"] * len(df)),
        "userid": df["userid"],
        "event_name": ["purchase"] * len(df),
        "purchase_value": df["value"]
    })

    event_data2 = pd.DataFrame({
        "timestamp": pd.to_datetime(["2022-12-01"] * len(df)),
        "userid": df["userid"],
        "event_name": ["purchase"] * len(df),
        "purchase_value": df["pre_test_value"]
    })
    
    # Create 'user_allocations' DataFrame
    user_allocations = pd.DataFrame({
        "timestamp": pd.to_datetime(["2022-12-05"] * len(df)),
        "userid": df["userid"],
        "abgroup": df["abgroup"]
    })
    
    # Create 'user_properties' DataFrame
    user_properties = pd.DataFrame({
        "userid": df["userid"],
        "age": df["age"],
        "country": df["country"],
        "device_type": df["platform"],  # Simplified mapping
        "membership_status": ["Free"] * len(df)  # Placeholder value
    })
    event_data = pd.concat([event_data1, event_data2])
    event_data['timestamp'] = pd.to_datetime(event_data['timestamp'])
    
    return event_data, user_allocations, user_properties


In [81]:
np.corrcoef(df['value'], df['pre_test_value'])

array([[1.        , 0.09386146],
       [0.09386146, 1.        ]])

In [68]:
df = pd.read_csv("0_generated_data.csv")
print(len(df))
df = df.iloc[: 20000]

100000


In [69]:
df.head()

Unnamed: 0,userid,country,platform,user_segment,abgroup,age,engagement_score,country_idx,platform_idx,segment_idx,per_user_component,value,pre_test_value
0,1,JP,iOS,Segment_2,a2,33,5.119263,0,0,1,1.042619,18.82795,19.827209
1,2,FR,Android,Segment_1,b,40,9.991298,0,1,0,0.940335,2.74952,19.118973
2,3,IN,Android,Segment_4,a2,25,3.712433,1,1,3,1.051917,38.103762,20.542199
3,4,AU,Web,Segment_2,a2,57,0.093958,2,0,1,0.989763,164.068516,22.442407
4,5,US,Android,Segment_1,b,33,8.16048,0,1,0,1.03939,120.973807,23.418094


In [70]:
set(df['abgroup'])

{'a1', 'a2', 'b'}

In [71]:
event_data, user_allocations, user_properties = create_dataframes(df)


In [72]:
event_data

Unnamed: 0,timestamp,userid,event_name,purchase_value
0,2022-12-10,1,purchase,18.827950
1,2022-12-10,2,purchase,2.749520
2,2022-12-10,3,purchase,38.103762
3,2022-12-10,4,purchase,164.068516
4,2022-12-10,5,purchase,120.973807
...,...,...,...,...
19995,2022-12-01,19996,purchase,13.992818
19996,2022-12-01,19997,purchase,13.441233
19997,2022-12-01,19998,purchase,19.057411
19998,2022-12-01,19999,purchase,15.670144


In [73]:
from variatio import VariatioAnalyzer
import pandas as pd
analyzer = VariatioAnalyzer(event_data, user_allocations, "a1", user_properties)

In [74]:
analyzer.calculate_event_attribute_sum_per_user('purchase', 'purchase_value', "no_enhancement")


merged pretest
       abgroup  purchase_value
userid                        
1           a2       19.827209
2            b       19.118973
3           a2       20.542199
4           a2       22.442407
5            b       23.418094
...        ...             ...
19996        b       13.992818
19997       a1       13.441233
19998       a1       19.057411
19999        b       15.670144
20000       a1       21.961628

[20000 rows x 2 columns]
result_intest
       abgroup  purchase_value
userid                        
1           a2       18.827950
2            b        2.749520
3           a2       38.103762
4           a2      164.068516
5            b      120.973807
...        ...             ...
19996        b      112.079070
19997       a1       60.601378
19998       a1       36.445046
19999        b       70.913497
20000       a1       93.500610

[20000 rows x 2 columns]
use_enhansement False


(         purchase_value
 abgroup                
 a1           127.659983
 a2           128.211856
 b            136.537016,
 StatSignificanceResult(method_used=StatSignificanceMethod.CATBOOST_CUPED_T_TEST, p_values=[0.7481045752403999, 3.729498973391336e-07]))

In [75]:
analyzer.calculate_event_attribute_sum_per_user('purchase', 'purchase_value', "cuped")


merged pretest
       abgroup  purchase_value
userid                        
1           a2       19.827209
2            b       19.118973
3           a2       20.542199
4           a2       22.442407
5            b       23.418094
...        ...             ...
19996        b       13.992818
19997       a1       13.441233
19998       a1       19.057411
19999        b       15.670144
20000       a1       21.961628

[20000 rows x 2 columns]
result_intest
       abgroup  purchase_value
userid                        
1           a2       18.827950
2            b        2.749520
3           a2       38.103762
4           a2      164.068516
5            b      120.973807
...        ...             ...
19996        b      112.079070
19997       a1       60.601378
19998       a1       36.445046
19999        b       70.913497
20000       a1       93.500610

[20000 rows x 2 columns]
[[23.53881343]
 [18.50525087]
 [13.62103584]
 ...
 [13.44123271]
 [19.05741055]
 [21.96162845]] [ 72.1768745   57

(         purchase_value
 abgroup                
 a1           127.659983
 a2           128.211856
 b            136.537016,
 StatSignificanceResult(method_used=StatSignificanceMethod.PURE_CUPED_T_TEST, p_values=[0.8024137272477632, 4.2898694799530873e-07]))

In [76]:
analyzer.calculate_event_attribute_sum_per_user('purchase', 'purchase_value', "catboost_cuped")


merged pretest
       abgroup  purchase_value
userid                        
1           a2       19.827209
2            b       19.118973
3           a2       20.542199
4           a2       22.442407
5            b       23.418094
...        ...             ...
19996        b       13.992818
19997       a1       13.441233
19998       a1       19.057411
19999        b       15.670144
20000       a1       21.961628

[20000 rows x 2 columns]
result_intest
       abgroup  purchase_value
userid                        
1           a2       18.827950
2            b        2.749520
3           a2       38.103762
4           a2      164.068516
5            b      120.973807
...        ...             ...
19996        b      112.079070
19997       a1       60.601378
19998       a1       36.445046
19999        b       70.913497
20000       a1       93.500610

[20000 rows x 2 columns]
use_enhansement True
      userid  purchase_value  age country device_type membership_status
0          9       2

(         purchase_value
 abgroup                
 a1           127.659983
 a2           128.211856
 b            136.537016,
 StatSignificanceResult(method_used=StatSignificanceMethod.CATBOOST_CUPED_T_TEST, p_values=[0.48990671318935286, 3.0054448102662935e-10]))