## Summary

Walkthrough of GeoLift to check that PyGeoLift works correctly.

In [4]:
import sys
# Add the parent directory to the module search path. Presumably this is where
# the in-repo `pygeolift` package lives when the notebook is run from its own
# folder -- confirm against the repository layout.
sys.path.append("..")

In [2]:
# Development convenience: have IPython re-import edited modules before each
# cell runs (autoreload mode 2) and register `pygeolift` with autoreload.
%reload_ext autoreload
%autoreload 2
%aimport pygeolift

In [3]:
import pygeolift.data

In [4]:
from rpy2.robjects.packages import importr, data as r_package_data
from rpy2.robjects import pandas2ri, default_converter, numpy2ri, NULL
from rpy2.robjects.conversion import localconverter

In [5]:
from pygeolift import geolift

Load example package data, `GeoLift_PreTest`.

In [6]:
# Example pre-test dataset shipped with pygeolift; it is fed to
# `geolift.geo_data_read` in the next cell.
geo_lift_pre_test = pygeolift.data.load_GeoLift_PreTest()

In [7]:
# Read the pre-test data through `geolift.geo_data_read`, naming the date,
# location and outcome columns. `summary=True` makes GeoLift print the
# locations / time-periods summary shown in the cell output.
geo_test_data_pre_test = geolift.geo_data_read(
    geo_lift_pre_test,
    date_id="date",
    location_id="location",
    Y_id="Y",
    X=[],  # empty list as we have no covariates
    format="yyyy-mm-dd",
    summary=True,
)

R[write to console]: ##################################
#####       Summary       #####
##################################

* Raw Number of Locations: 40
* Time Periods: 90
* Final Number of Locations (Complete): 40



In [11]:
base = importr('base')
from rpy2 import robjects
import numpy as np
import pandas as pd

# Simulated effect sizes: -25% up to +20% in 5-point steps (ten values; as with
# the half-open np.arange(-0.25, 0.25, 0.05) this replaces, the stop value is
# not included). The grid is built from integers so every entry is the exact
# two-decimal value. In particular the sixth entry is exactly 0.0, whereas the
# float-step arange produced roughly -5.55e-17 there. GeoLift's documentation
# for `effect_size` asks that the sequence include zero.
effect_sizes = [pct / 100 for pct in range(-25, 25, 5)]

# Run GeoLift's market selection on the pre-test data. Candidate treatment
# groups of 2-5 locations are evaluated for test lengths of 10 and 15 periods;
# "chicago" is forced into every candidate and "honolulu" is kept out.
market_selections = geolift.geo_lift_market_selection(
    data=geo_test_data_pre_test,
    treatment_periods=[10, 15],
    N=[2, 3, 4, 5],
    Y_id="Y",
    location_id="location",
    time_id="time",
    effect_size=effect_sizes,
    lookback_window=1,
    include_markets=["chicago"],
    exclude_markets=["honolulu"],
    cpic=7.50,
    budget=100000,
    alpha=0.1,
    Correlations=True,
    fixed_effects=True,
    side_of_test="one_sided",
)



R[write to console]: Setting up cluster.

R[write to console]: Importing functions into cluster.

R[write to console]: Attempting to load the environment ‘package:dplyr’

R[write to console]: 
Attaching package: ‘dplyr’


R[write to console]: The following objects are masked from ‘package:stats’:

    filter, lag


R[write to console]: The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


R[write to console]: 
Deterministic setup with 2 locations in treatment.

R[write to console]: 
Deterministic setup with 3 locations in treatment.

R[write to console]: 
Deterministic setup with 4 locations in treatment.

R[write to console]: 
Deterministic setup with 5 locations in treatment.



  ID                                           location duration EffectSize
1  1            atlanta, chicago, las vegas, saint paul       15       0.05
2  2                                  chicago, portland       15       0.05
3  3             chicago, cincinnati, houston, portland       15       0.05
4  4                  chicago, houston, nashville, reno       15       0.05
5  5       atlanta, chicago, cleveland, las vegas, reno       10       0.05
6  6 atlanta, chicago, cleveland, las vegas, saint paul       10       0.05
  Power AvgScaledL2Imbalance Investment   AvgATT Average_MDE ProportionTotal_Y
1     1            0.5558341   85305.00 189.2567  0.04991417        0.08767192
2     1            0.1738778   32281.87 146.5321  0.05111983        0.03306537
3     1            0.1971864   74118.37 159.3627  0.04829913        0.07576405
4     1            0.3321341   75556.12 174.0647  0.05193036        0.07816073
5     1            0.4536741   69300.00 195.0787  0.05292824        0.107

In [9]:
# Plain-text dump of the market-selection result computed above.
print(market_selections)

    ID                                           location  duration  \
1    1            atlanta, chicago, las vegas, saint paul      15.0   
2    2                                  chicago, portland      15.0   
3    3             chicago, cincinnati, houston, portland      15.0   
4    4                  chicago, houston, nashville, reno      15.0   
5    5       atlanta, chicago, cleveland, las vegas, reno      10.0   
6    6  atlanta, chicago, cleveland, las vegas, saint ...      10.0   
7    7            atlanta, chicago, las vegas, saint paul      10.0   
8    8             chicago, cincinnati, houston, portland      10.0   
9    9                                  chicago, portland      10.0   
10  10                         chicago, houston, portland      10.0   
11  11                         chicago, houston, portland      15.0   
12  12                                   atlanta, chicago      15.0   
13  13                  atlanta, chicago, las vegas, reno      10.0   
14  14

In [10]:
## Example - Analyzing the Test Results

In [11]:
# Example test-period dataset shipped with pygeolift; it is read into GeoLift's
# format in the next cell.
GeoLift_Test = pygeolift.data.load_GeoLift_Test()

In [12]:
# Same read step as for the pre-test data, this time on the test dataset.
# No `X` argument is passed here; `summary=True` prints the summary block
# shown in the cell output.
geo_test_data_test = geolift.geo_data_read(
    data=GeoLift_Test,
    date_id="date",
    location_id="location",
    Y_id="Y",
    format="yyyy-mm-dd",
    summary=True,
)




R[write to console]: ##################################
#####       Summary       #####
##################################

* Raw Number of Locations: 40
* Time Periods: 105
* Final Number of Locations (Complete): 40



In [13]:
# Peek at the first rows of the converted frame (columns: location, time, Y).
geo_test_data_test.head()

Unnamed: 0,location,time,Y
1,atlanta,1,3384
2,atlanta,2,3904
3,atlanta,3,5734
4,atlanta,4,4311
5,atlanta,5,3686


In [14]:
# Keep a handle on `geolift.rpackage` -- presumably the rpy2 wrapper around the
# R GeoLift package; confirm in pygeolift's source.
geo_lift_r = geolift.rpackage

In [15]:
from rpy2.robjects import local_context, conversion, default_converter, pandas2ri, StrVector

# Convert the pandas frame to its R counterpart explicitly. The pandas
# conversion rules are layered on top of rpy2's defaults and are only active
# inside this `with` block.
with localconverter(default_converter + pandas2ri.converter):
  geo_test_data_test_r = conversion.py2rpy(geo_test_data_test)

In [16]:
# Sanity check: the converted object should be an rpy2 R DataFrame.
type(geo_test_data_test_r)

rpy2.robjects.vectors.DataFrame

In [36]:
# Fit GeoLift on the test data with "chicago" and "portland" as the treated
# locations and time periods 91 through 105 as the treatment window.
geo_test = geolift.geo_lift(
    data=geo_test_data_test_r,
    locations=["chicago", "portland"],
    treatment_start_time=91,
    treatment_end_time=105,
    Y_id="Y",
    location_id="location",
    time_id="time",
)


In [39]:
# Print the text summary of the fitted GeoLift model.
# NOTE(review): in the recorded output the confidence interval has identical
# lower and upper bounds and "Average estimated bias" is blank. Both look like
# values lost or mis-converted on the way back from R -- verify in pygeolift.
print(geo_test.summarize())


Statistics
----------
Average ATT: 155.556
Percent Lift: 5.40%
Incremental Y: 4666.67
P-value: 0.02
90.0% Confidence Interval: (-6535.290, -6535.290)

Balance
-------
L2 Imbalance: 909.489
Scaled L2 Imbalance: 0.1636
Percent improvement from naive model: 84%
Average estimated bias: 

Model Weights
-------------
Prognostic function: none




In [24]:
# Inspect the raw bias estimate behind the blank "Average estimated bias" line.
# NOTE(review): the recorded value, -2147483648 with dtype int32, is the
# smallest int32. That is how R's NA_integer_ appears once converted to NumPy,
# so this is presumably a missing value rather than a real estimate -- confirm.
geo_test.summary.bias_est

array([-2147483648], dtype=int32)