## Summary

Walkthrough of GeoLift to check that PyGeoLift works correctly.

In [3]:
# Add the parent directory to the module search path (presumably so the
# in-repo `pygeolift` package can be imported without installing it).
import sys
sys.path.append("..")

In [4]:
# Enable IPython's autoreload extension so edits to imported modules are
# picked up before each cell runs, without restarting the kernel.
%reload_ext autoreload
%autoreload 2
# Import pygeolift and register it with autoreload.
%aimport pygeolift

In [5]:
import pygeolift.data

In [6]:
from rpy2.robjects.packages import importr, data as r_package_data
from rpy2.robjects import pandas2ri, default_converter, numpy2ri, NULL
from rpy2.robjects.conversion import localconverter

In [7]:
from pygeolift import geolift

Load example package data, `GeoLift_PreTest`.

In [8]:
# Raw pre-test example panel; its "date", "location" and "Y" columns are the
# ones handed to geo_data_read in the next cell.
geo_lift_pre_test = pygeolift.data.load_GeoLift_PreTest()

In [10]:
# Convert the raw pre-test panel into the layout GeoLift works with.
# summary=True makes R print the location and time-period counts.
geo_test_data_pre_test = geolift.geo_data_read(
    geo_lift_pre_test,
    date_id="date",
    location_id="location",
    Y_id="Y",
    X=[],  # empty list: this dataset has no covariates
    format="yyyy-mm-dd",
    summary=True,
)

R[write to console]: ##################################
#####       Summary       #####
##################################

* Raw Number of Locations: 40
* Time Periods: 90
* Final Number of Locations (Complete): 40



In [11]:
import numpy as np

# Simulated lift levels: -25% to +25% in 5-point steps with BOTH endpoints
# included (the equivalent of R's seq(-0.25, 0.25, 0.05)).
# np.arange with a float step leaves out the upper endpoint (0.25) and is
# prone to floating-point drift, so the grid is built with linspace, rounded
# to the step precision and converted to plain Python floats before it is
# handed to R.
effect_sizes = np.round(np.linspace(-0.25, 0.25, 11), 2).tolist()

# Rank candidate sets of treatment markets for a future test.
market_selections = geolift.geo_lift_market_selection(
    data=geo_test_data_pre_test,
    treatment_periods=[10, 15],       # candidate test durations
    N=[2, 3, 4, 5],                   # candidate numbers of treatment locations
    Y_id="Y",
    location_id="location",
    time_id="time",
    effect_size=effect_sizes,
    lookback_window=1,
    include_markets=["chicago"],      # must be part of every treatment set
    exclude_markets=["honolulu"],     # never eligible for treatment
    cpic=7.50,
    budget=100000,
    alpha=0.1,
    Correlations=True,
    fixed_effects=True,
    side_of_test="one_sided",
)



R[write to console]: Setting up cluster.

R[write to console]: Importing functions into cluster.

R[write to console]: Attempting to load the environment ‘package:dplyr’

R[write to console]: 
Attaching package: ‘dplyr’


R[write to console]: The following objects are masked from ‘package:stats’:

    filter, lag


R[write to console]: The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


R[write to console]: 
Deterministic setup with 2 locations in treatment.

R[write to console]: 
Deterministic setup with 3 locations in treatment.

R[write to console]: 
Deterministic setup with 4 locations in treatment.

R[write to console]: 
Deterministic setup with 5 locations in treatment.



  ID                                           location duration EffectSize
1  1            atlanta, chicago, las vegas, saint paul       15       0.05
2  2                                  chicago, portland       15       0.05
3  3             chicago, cincinnati, houston, portland       15       0.05
4  4                  chicago, houston, nashville, reno       15       0.05
5  5       atlanta, chicago, cleveland, las vegas, reno       10       0.05
6  6 atlanta, chicago, cleveland, las vegas, saint paul       10       0.05
  Power AvgScaledL2Imbalance Investment   AvgATT Average_MDE ProportionTotal_Y
1     1            0.5558341   85305.00 189.2567  0.04991417        0.08767192
2     1            0.1738778   32281.87 146.5321  0.05111983        0.03306537
3     1            0.1971864   74118.37 159.3627  0.04829913        0.07576405
4     1            0.3321341   75556.12 174.0647  0.05193036        0.07816073
5     1            0.4536741   69300.00 195.0787  0.05292824        0.107

type(geolift.rpackage)

In [10]:
# Check what the wrapper hands back: an rpy2 ListVector, not a pandas object.
type(market_selections)

rpy2.robjects.vectors.ListVector

In [15]:
# Example dataset for the test phase (105 time periods per the summary printed
# by geo_data_read, versus 90 in the pre-test data).
GeoLift_Test = pygeolift.data.load_GeoLift_Test()

In [13]:
# Same conversion as for the pre-test panel, now on the full test dataset.
# No covariate argument is passed here.
geo_test_data_test = geolift.geo_data_read(
    data=GeoLift_Test,
    date_id="date",
    location_id="location",
    Y_id="Y",
    format="yyyy-mm-dd",
    summary=True,
)




R[write to console]: ##################################
#####       Summary       #####
##################################

* Raw Number of Locations: 40
* Time Periods: 105
* Final Number of Locations (Complete): 40



In [14]:
# Peek at the converted panel: one row per (location, time) with outcome Y,
# where `time` is a 1-based period index rather than a calendar date.
geo_test_data_test.head()

Unnamed: 0,location,time,Y
1,atlanta,1,3384
2,atlanta,2,3904
3,atlanta,3,5734
4,atlanta,4,4311
5,atlanta,5,3686


Information needed to create an experiment:

- name
- locations and variants
- experiment start
- experiment end
- metrics
- time unit: date
- additional matching metrics (TBD)
- model used to analyze it ... ? 

TODO: the data and metrics will include data prior to the experiment start. How should that be handled? 

In [23]:
# Distinct calendar dates in ascending order, as a NumPy array.
geo_lift_test_dates = (
    GeoLift_Test['date']
    .drop_duplicates()
    .sort_values()
    .to_numpy()
)
# Zero-based positions 90 and 104 are the 91st and 105th dates, i.e. the
# first and last day of the treatment window used in the geo_lift call.
geo_lift_test_dates[[90, 104]]

array(['2021-04-01', '2021-04-15'], dtype=object)

In [15]:
# Fit GeoLift with chicago and portland as the treated locations.
# The treatment window is time periods 91-105 (1-based), which correspond to
# 2021-04-01 through 2021-04-15 in the date lookup above.
geo_test = geolift.geo_lift(
    data=geo_test_data_test,
    locations=["chicago", "portland"],
    treatment_start_time=91,
    treatment_end_time=105,
    Y_id="Y",
    location_id="location",
    time_id="time",
)


experiment: 
  - id
  - name
  - pre_period_start
  - pre_period_end
  - treatment_end
  - treatment_start
  - alpha

experiment_variants:
  - variant: str

experiment_assignments:
  - variant: str
  - location: str

results:
  - experiment_id
  - blob of stuff


['results',
 'inference',
 'data',
 'y_obs',
 'y_hat',
 'ATT',
 'ATT_se',
 'TreatmentStart',
 'TreatmentEnd',
 'test_id',
 'incremental',
 'Y_id',
 'summary',
 'ConfidenceIntervals',
 'lower_bound',
 'upper_bound',
 'df_weights',
 'stat_test']


In [54]:
# Experiment dimensions:
# treated time periods = 15 (periods 91-105)
# untreated (pre-treatment) time periods = 90
# geo_test_data_test['time'].max() -> 105 time periods in total
# treated locations = 2
# geo_test_data_test['location'].unique().size -> 40 unique locations

40

Note: Augsynth fits the per-period "mean" of all the treated units!

In [94]:
# NOTE(review): r_df_to_pandas is imported but not used in this cell.
from pygeolift.rpy2_utils import r_df_to_pandas
# Show the id -> name mapping of the treated locations.
print(geo_test.rx2("test_id"))
# Pull the nested "results" element out of the GeoLift result list ...
results = geo_test.rx2("results")
# ... and display its "weights" element.
results.rx2("weights")

  loc_id     name
1      6  chicago
2     32 portland



0,1,2,3,4,5,6
0.0,0.046525,-0.0,...,0.0,0.0,0.0


In [36]:
# Experiment
# Sketch of the metadata an experiment record would carry, plus the
# location -> variant assignment table.
import pandas as pd
name = 'Example 1'
variants = ['treatment', 'control']
treatment_locations = ['chicago', 'portland']
start_date = '2021-04-01'
end_date = '2021-04-15'

# One row per location; every location starts out in the control variant.
locations = pd.DataFrame({'location': GeoLift_Test['location'].unique()})
locations['variant'] = 'control'
# Flip only the `variant` column for the treated locations. Assigning through
# a bare boolean mask (locations[mask] = ...) would overwrite every column of
# the matching rows, replacing the location names with 'treatment' as well.
is_treated = locations['location'].isin(treatment_locations)
locations.loc[is_treated, 'variant'] = 'treatment'


In [None]:
# NOTE(review): the saved output for this cell is array([-2147483648], dtype=int32),
# i.e. the int32 minimum -- likely R's NA_integer_ leaking through the
# conversion rather than a real bias estimate. The cell also has no execution
# count, so the output may be stale. TODO confirm.
geo_test.summary.bias_est

array([-2147483648], dtype=int32)