## Summary

Walkthrough of GeoLift to check that PyGeoLift works correctly.

In [1]:
import sys
import os
sys.path.append("..")
os.environ["R_HOME"] = "/Library/Frameworks/R.framework/Resources"

In [2]:
%reload_ext autoreload
%autoreload 2
%aimport pygeolift

In [3]:
import pygeolift.data

In [4]:
from rpy2.robjects.packages import importr, data as r_package_data
from rpy2.robjects import pandas2ri, default_converter, numpy2ri, NULL
from rpy2.robjects.conversion import localconverter

In [5]:
from pygeolift import geolift

Load example package data, `GeoLift_PreTest`.

In [6]:
geo_lift_pre_test = pygeolift.data.load_GeoLift_PreTest()

In [7]:
geo_lift_pre_test.head()

Unnamed: 0,location,Y,date
0,new york,3300,2021-01-01
1,new york,3202,2021-01-02
2,new york,4138,2021-01-03
3,new york,3716,2021-01-04
4,new york,3270,2021-01-05


In [8]:
geo_test_data_pre_test =  geolift.geo_data_read(geo_lift_pre_test,
                                        date_id = "date",
                                        location_id = "location",
                                        Y_id = "Y",
                                        X = [], #empty list as we have no covariates
                                        format = "yyyy-mm-dd",
                                        summary = True)

R[write to console]: ##################################
#####       Summary       #####
##################################

* Raw Number of Locations: 40
* Time Periods: 90
* Final Number of Locations (Complete): 40



[{'location': 'new york', 'Y': 3300, 'date': '2021-01-01'},
 {'location': 'new york', 'Y': 3202, 'date': '2021-01-02'},
 {'location': 'new york', 'Y': 4138, 'date': '2021-01-03'},
 {'location': 'new york', 'Y': 3716, 'date': '2021-01-04'},
 {'location': 'new york', 'Y': 3270, 'date': '2021-01-05'},
 {'location': 'new york', 'Y': 3260, 'date': '2021-01-06'},
 {'location': 'new york', 'Y': 3438, 'date': '2021-01-07'},
 {'location': 'new york', 'Y': 3427, 'date': '2021-01-08'},
 {'location': 'new york', 'Y': 3930, 'date': '2021-01-09'},
 {'location': 'new york', 'Y': 4939, 'date': '2021-01-10'},
 {'location': 'new york', 'Y': 4368, 'date': '2021-01-11'},
 {'location': 'new york', 'Y': 3812, 'date': '2021-01-12'},
 {'location': 'new york', 'Y': 3523, 'date': '2021-01-13'},
 {'location': 'new york', 'Y': 3444, 'date': '2021-01-14'},
 {'location': 'new york', 'Y': 3378, 'date': '2021-01-15'},
 {'location': 'new york', 'Y': 3403, 'date': '2021-01-16'},
 {'location': 'new york', 'Y': 3760, 'da

In [21]:
input_data = [MarketSelectionInputDataRow(**row) for row in geo_lift_pre_test.to_dict(orient='records')]
df = pd.DataFrame.from_records(row.dict() for row in input_data)
df['date'] = [d.strftime("%Y-%m-%d") for d in df['date']]
data_clean = geo_data_read(
    df,
    date_id = "date",
    Y_id = "Y",
    location_id = "location",
    format = "yyyy-mm-dd",
    keep_unix_time=True,
    summary = False
)


Unnamed: 0,location,time,date_unix,Y
1,atlanta,1,1.609459e+09,3384.0
2,atlanta,2,1.609546e+09,3904.0
3,atlanta,3,1.609632e+09,5734.0
4,atlanta,4,1.609718e+09,4311.0
5,atlanta,5,1.609805e+09,3686.0
...,...,...,...,...
3596,washington,86,1.616803e+09,3037.0
3597,washington,87,1.616890e+09,4453.0
3598,washington,88,1.616976e+09,3923.0
3599,washington,89,1.617062e+09,2144.0


In [30]:
import numpy as np
market_selections = geolift.geo_lift_market_selection(
                                        data = data_clean, #geo_test_data_pre_test,
                                        treatment_periods = [10,15],
                                        N = [2,3,4,5],
                                        Y_id = "Y",
                                        location_id = "location",
                                        time_id = "time",
                                        effect_size = list(np.arange(-0.25, 0.25, 0.05)),
                                        lookback_window = 1, 
                                        include_markets = ["chicago"],
                                        exclude_markets = ["honolulu"],
                                        cpic = 7.50,
                                        budget = 100000,
                                        alpha = 0.1,
                                        Correlations = True,
                                        fixed_effects = True,
                                        side_of_test = "one_sided"
                                        )



R[write to console]: Setting up cluster.

R[write to console]: Importing functions into cluster.

R[write to console]: 
Deterministic setup with 2 locations in treatment.

R[write to console]: 
Deterministic setup with 3 locations in treatment.

R[write to console]: 
Deterministic setup with 4 locations in treatment.

R[write to console]: 
Deterministic setup with 5 locations in treatment.



  ID                                           location duration EffectSize
1  1            atlanta, chicago, las vegas, saint paul       15       0.05
2  2                                  chicago, portland       15       0.05
3  3             chicago, cincinnati, houston, portland       15       0.05
4  4                  chicago, houston, nashville, reno       15       0.05
5  5       atlanta, chicago, cleveland, las vegas, reno       10       0.05
6  6 atlanta, chicago, cleveland, las vegas, saint paul       10       0.05
  Power AvgScaledL2Imbalance Investment   AvgATT Average_MDE ProportionTotal_Y
1     1            0.5558341   85305.00 189.2567  0.04991417        0.08767192
2     1            0.1738778   32281.87 146.5321  0.05111983        0.03306537
3     1            0.1971864   74118.37 159.3627  0.04829913        0.07576405
4     1            0.3321341   75556.12 174.0647  0.05193036        0.07816073
5     1            0.4536741   69300.00 195.0787  0.05292824        0.107

In [72]:
from typing import List, Optional
import pandas as pd
from pydantic import BaseModel, Field, confloat, PositiveInt, PositiveFloat, root_validator, validator
from pandas import DataFrame
from typing import TypedDict
import datetime
from enum import Enum

from pygeolift.geolift import geo_lift_market_selection, geo_data_read

class EnumStringPrint:
    def __str__(self):
        return str(self.value)    

class TestSidedness(EnumStringPrint, str, Enum):
    one_sided = "one_sided"
    two_sided = "two_sided"

class MarketSelectionInputDataRow(BaseModel):
    location: str
    date: datetime.date
    Y: float

class PowerCurveValue(BaseModel):
    """Power curve value in the market selection API results."""
    location: str = Field(..., description="Location id")
    duration: int = Field(..., description="Length of experiment assignment (in days)")
    EffectSize: float = Field(..., description="Effect size used in the simulation (relative)")
    power: float = Field(..., description="Power (proportion of stat sig simulations)")
    Investment: float = Field(..., description="Investment, equal to CPIC * conversions)")
    AvgATT: float = Field(..., description="Average treatment effect for the treated units")
    AvgDetectedLift: float = Field(..., description="Average detected lift.")
    

class LocationAssignment(BaseModel):
    location: List[str] = Field(..., description="Sorted list of location identifiers")
    duration: int = Field(..., description="Length of experiment assignment (in days)")
    EffectSize: float = Field(..., description="Smallest effect size for that (location combination, duration) where power is at least 80%.")
    Power: float = Field(..., description="Power at the smallest effect size")
    AvgScaledL2Imbalance: float = Field(..., description="Average scaled L2 imbalance")
    Investment: float   = Field(..., description="Estimated marketing budget for this")
    AvgATT: float  = Field(..., description="Average ATT estimate in simulations")
    Average_MDE: float  = Field(..., description="Average MDE in simulations")
    ProportionTotal_Y: float  = Field(..., description="Proportion of total Y")
    abs_lift_in_zero: float  = Field(..., description="Estimated lift when there is no treatment effect. This should be close to 0.")
    Holdout: float = Field(...)
    rank: int = Field(..., description="Ranking of best designs. This the average rank of the ranks of (TODO).")
    correlation: Optional[float]  = Field(None)
    power_curve: List[PowerCurveValue] = Field(..., description="A data frame with the results for all effect sizes that were estimated")
    
    class Config:
        arbitrary_types_allowed = True

class MarketSelectionResult(BaseModel):
    __root__: List[LocationAssignment] = Field(..., description="List of assignments and information about those assignments.")

class MarketSelectionInput(BaseModel):
    data: List[MarketSelectionInputDataRow] = Field(..., description="Data ")
    treatment_periods: List[PositiveInt] = Field(..., description="List of the number of experiment lengths (in days) to simulation.")
    num_locations: List[PositiveInt] = Field(...)
    effect_sizes: List[float] = Field([0, 0.05, 0.10, 0.15, 0.20, 0.25])
    lookback_window: PositiveInt = Field(1)
    include_locations: List[str] = Field([])
    exclude_locations: List[str] = Field([])
    cpic: PositiveFloat = Field(1)
    budget: Optional[PositiveFloat] = Field(None)
    side_of_test: TestSidedness = Field(TestSidedness.one_sided)
    fixed_effects: bool = True
    alpha: confloat(gt=0, lt=1) = Field(0.05, description="Statistical significance value.")
    
    @validator('include_locations', 'exclude_locations', 'effect_sizes', pre=True)
    def uniquify_list(cls, v):
        """Ensure locations are always unique."""
        return list(set(v))
    
    # @validator('effect_sizes')
    # def check_effect_sizes(cls, v):
    #     # check that all effect sizes are in the same direction
    #     if sum(x >= 0 for x in v) != len(v):
    #         raise ValueError("Effect sizes must be either all positive or all negative.")
    #     return v   
            
    @root_validator(pre=False)
    def check_include_locations(cls, values):
        include_locations = values.get('include_locations')
        if include_locations:
            locations = set(row.location for row in values.get('data'))
            missing_locations = set(include_locations) - locations
            if len(missing_locations) > 0:
                raise ValueError('Some locations in `include_locations` are not in `data`: {}'.format(','.join(missing_locations)))
        return values
    
    @root_validator(pre=False)
    def check_exclude_locations(cls, values):
        exclude_locations = values.get('exclude_locations')
        if exclude_locations:
            locations = set(row.location for row in values.get('data'))
            missing_locations = set(exclude_locations) - locations
            if len(missing_locations) > 0:
                raise ValueError('Some locations in `exclude_locations` are not in `data`: {}'.format(','.join(missing_locations)))
        return values    

def market_selection(input: MarketSelectionInput) -> MarketSelectionResult:
    # preprocess and clean data
    df = pd.DataFrame.from_records([row.dict() for row in input.data])
    df['date'] = [d.strftime("%Y-%m-%d") for d in df['date']]
    data_clean = geo_data_read(
        df,
        date_id = "date",
        Y_id = "Y",
        location_id = "location",
        format = "yyyy-mm-dd",
        keep_unix_time=True,
        summary = False
    )

    # Run main function
    results = geo_lift_market_selection(
        data = data_clean,
        treatment_periods = input.treatment_periods,
        N = input.num_locations,
        #X = tuple(),
        effect_size = input.effect_sizes,
        lookback_window = input.lookback_window,
        Y_id = "Y",
        location_id = "location",
        time_id = "time",
        include_markets = input.include_locations,
        exclude_markets = input.exclude_locations,
        cpic = input.cpic,
        budget = input.budget,
        alpha = input.alpha,
        fixed_effects = input.fixed_effects,
        side_of_test = str(input.side_of_test),
        # These should be set internally
        Correlations = True,
        parallel = False,
        parallel_setup = "sequential",
        ProgressBar = False,
        print_ = False
    )

    #results.PowerCurves['duration'] = results.PowerCurves['duration'].astype(int)
    power_curves = dict(list(results.PowerCurves.groupby(["location", "duration"])))
    output = []
    # Post process results
    for row in results.BestMarkets.sort_values(["rank"]).itertuples(index=False):
        d = row._asdict()
        d['power_curve'] = power_curves[(row.location, row.duration)].to_dict(orient='records')
        d['location'] = d['location'].split(',')
        d['duration'] = int(d['duration'])
        output.append(d)
    # TODO: Should the original parameters be included in the results
    return MarketSelectionResult(__root__=output)


df = [MarketSelectionInputDataRow(**row) for row in geo_lift_pre_test.to_dict(orient='records')]
api_input = MarketSelectionInput(
    data = df,
    treatment_periods=[10, 15],
    num_locations=[2, 3, 4, 5],
    effect_sizes = list(np.arange(-0.25, 0.25, 0.05)),
    lookback_window = 1,
    include_locations=["chicago"],
    exclude_locations=["honolulu"],
    cpic = 7.50,
    budget = 100_000,
    alpha = 0.1,
    side_of_test="one_sided",
    fixed_effects=True,
)
with open("market_selection_api_input.json", "w") as f:
    f.write(api_input.json())
res = market_selection(api_input)
with open("market_selection_api_output.json", "w") as f:
    f.write(res.json())

# market_selections = geolift.geo_lift_market_selection(
#                                         data = data_clean, #geo_test_data_pre_test,
#                                         treatment_periods = input.treatment_periods,
#                                         N = input.num_locations,
#                                         Y_id = "Y",
#                                         location_id = "location",
#                                         time_id = "time",
#                                         effect_size = input.effect_sizes,
#                                         lookback_window = 1, 
#                                         include_markets = input.include_locations,
#                                         exclude_markets = input.exclude_locations,
#                                         cpic = 7.50,
#                                         budget = 100000,
#                                         alpha = 0.1,
#                                         Correlations = True,
#                                         fixed_effects = True,
#                                         side_of_test = "one_sided"
#                                         )

# MarketSelections <- GeoLiftMarketSelection(data = GeoTestData_PreTest,
#                                           treatment_periods = c(10,15),
#                                           N = c(2,3,4,5),
#                                           Y_id = "Y",
#                                           location_id = "location",
#                                           time_id = "time",
#                                           effect_size = seq(-0.25, 0.25, 0.05),
#                                           lookback_window = 1, 
#                                           include_markets = c("chicago"),
#                                           exclude_markets = c("honolulu"),
#                                           cpic = 7.50,
#                                           budget = 100000,
#                                           alpha = 0.1,
#                                           Correlations = TRUE,
#                                           fixed_effects = TRUE,
#                                           side_of_test = "one_sided")

R[write to console]: 
Deterministic setup with 2 locations in treatment.

R[write to console]: 
Deterministic setup with 3 locations in treatment.

R[write to console]: 
Deterministic setup with 4 locations in treatment.

R[write to console]: 
Deterministic setup with 5 locations in treatment.



In [69]:
res.dict()

{'__root__': [{'location': ['chicago', ' portland'],
   'duration': 15,
   'EffectSize': 0.04999999999999993,
   'Power': 1.0,
   'AvgScaledL2Imbalance': 0.17387779481518423,
   'Investment': 32281.874999999956,
   'AvgATT': 146.53208139442432,
   'Average_MDE': 0.05111983233411362,
   'ProportionTotal_Y': 0.03306537043498415,
   'abs_lift_in_zero': 0.001,
   'Holdout': 0.9669346295650159,
   'rank': 1,
   'correlation': None,
   'power_curve': [{'location': 'chicago, portland',
     'duration': 15,
     'EffectSize': -0.25,
     'power': 1.0,
     'Investment': 161409.375,
     'AvgATT': -714.3179186055752,
     'AvgDetectedLift': -0.2492001197613473},
    {'location': 'chicago, portland',
     'duration': 15,
     'EffectSize': -0.2,
     'power': 1.0,
     'Investment': 129127.5,
     'AvgATT': -570.8429186055752,
     'AvgDetectedLift': -0.19914679441210378},
    {'location': 'chicago, portland',
     'duration': 15,
     'EffectSize': -0.15000000000000002,
     'power': 1.0,
     

[-0.25, -0.2, -0.15, -0.1, -0.05, -0.0, 0.05, 0.1, 0.15, 0.2]

In [None]:
geo_test_data_test = geolift.geo_data_read(data = GeoLift_Test,
                                    date_id = "date",
                                    location_id = "location",
                                    Y_id = "Y",
                                    format = "yyyy-mm-dd",
                                    summary = True)

NameError: name 'GeoLift_Test' is not defined

In [None]:
geo_test_data_test.head()

NameError: name 'df' is not defined

Experiment create info

- name
- locations and variants
- experiment start
- experiment end
- metrics
- time unit: date
- additional matching metrics (TBD)
- model used to analyze it ... ? 

TODO: the data and metrics will include data prior to the experiment start. How should that be handled? 

In [None]:
geo_lift_test_dates = GeoLift_Test['date'].unique()
geo_lift_test_dates.sort()
geo_lift_test_dates[[90, 104]]

array(['2021-04-01', '2021-04-15'], dtype=object)

In [None]:
geo_test = geolift.geo_lift(
                   data = geo_test_data_test,
                   locations = ["chicago", "portland"],
                   treatment_start_time = 91, 
                   treatment_end_time = 105,
                   Y_id = "Y",
                   location_id = "location",
                   time_id = "time")


experiment: 
  - id
  - name
  - pre_period_start
  - pre_period_end
  - treatment_end
  - treatment_start
  - alpha

experiment_variants:
  - variant: str

experiment_assignments:
  - variant: str
  - location: str

results:
  - experiment_id
  - blob of stuff


['results',
 'inference',
 'data',
 'y_obs',
 'y_hat',
 'ATT',
 'ATT_se',
 'TreatmentStart',
 'TreatmentEnd',
 'test_id',
 'incremental',
 'Y_id',
 'summary',
 'ConfidenceIntervals',
 'lower_bound',
 'upper_bound',
 'df_weights',
 'stat_test']


In [None]:
# 
# treated_time = 15
# untreated time = 90
# geo_test_data_test['time'].max(). Max 105 time periods
# treated locations = 2
# geo_test_data_test['location'].unique().size = 40 unique periods

40

Note: Augsynth fits the per-period "mean" of all the treated units!

In [None]:
from pygeolift.rpy2_utils import r_df_to_pandas
print(geo_test.rx2("test_id"))
results = geo_test.rx2("results")
results.rx2("weights")

  loc_id     name
1      6  chicago
2     32 portland



0,1,2,3,4,5,6
0.0,0.046525,-0.0,...,0.0,0.0,0.0


In [None]:
# Experiment
import pandas as pd
name = 'Example 1'
variants = ['treatment', 'control']
treatment_locations = ['chicago', 'portland']
start_date = '2021-04-01'
end_date = '2021-04-15'

locations = pd.DataFrame({'location': GeoLift_Test['location'].unique()})
locations['variant'] = 'control'
locations[locations['location'].isin(treatment_locations)] = 'treatment'


array([-2147483648], dtype=int32)