In [1]:
import sys
import os
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

import logging
import numpy as np

sys.path.append(os.path.dirname(os.path.abspath('')))
from stratification.params import SplitBuilderParams
from prepilot.params import PrepilotParams
from prepilot.prepilot_split_builder import PrepilotSplitBuilder
from prepilot.prepilot_experiment_builder import PrepilotExperimentBuilder

logging.basicConfig(level = logging.INFO)

%load_ext autoreload
%autoreload 2

In [2]:
# Create (or reuse) a SparkSession that runs on a local master.
session_builder = SparkSession.builder.master("local").appName("gbc_ab_pyspark")
# Add postgres jar
# session_builder = session_builder.config("spark.driver.extraClassPath", "/home/jovyan/work/jars/postgresql-9.4.1207.jar")
spark = session_builder.getOrCreate()

# SparkContext that backs the session.
sc = spark.sparkContext

22/05/04 19:16:51 WARN Utils: Your hostname, MacBook-Air-Egor.local resolves to a loopback address: 127.0.0.1; using 192.168.31.194 instead (on interface en0)
22/05/04 19:16:51 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/04 19:16:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Settings for the pre-pilot experiment. The meaning of each field is defined
# by PrepilotParams in prepilot.params, which is not visible in this notebook.
prepilot_params = PrepilotParams(
    metrics_names=["pre_gp_orders"],
    # NOTE(review): presumably the effect multipliers to inject; the same
    # values appear in the MDE column of the beta table — confirm.
    injects=[1.01, 1.03, 1.04, 1.05],
    # NOTE(review): presumably the group-size grid (min..max by step) — confirm.
    min_group_size=50000,
    max_group_size=150000,
    step=50000,
    bootstrap_metric=np.median,
    iterations_number=10,
    n_buckets=1000,
    max_beta_score=0.2,
    min_beta_score=0.05,
)

In [4]:
# Settings for the stratified split. The meaning of each field is defined by
# SplitBuilderParams in stratification.params, which is not visible here.
split_builder_params = SplitBuilderParams(
    map_group_names_to_sizes={"control": 10000, "target": 10000},
    region_col="moda_city",
    split_metric_col="pre_gp_orders",
    customer_col="customer_id",
    cols=[],
    # No categorical columns are passed; previously tried and now disabled:
    #   'offer_rk_goal', 'offer_rk_campaign'
    cat_cols=[],
    pvalue=0.05,
    n_top_cat=100,
    stat_test="ttest_ind",
)

In [5]:
# Load TLO.csv: semicolon-separated, first row is the header, and Spark infers
# the column types from the data.
csv_reader = spark.read.format("csv").options(
    header=True,
    inferSchema=True,
    sep=";",
)
df = csv_reader.load("TLO.csv")

                                                                                

In [6]:
# Build the split collector over the loaded frame for two pairs of sizes.
# NOTE(review): each tuple presumably holds the two group sizes of one split — confirm.
prepilot_guests_collector = PrepilotSplitBuilder(
    spark,
    df,
    [(10000, 10000), (20000, 20000)],
    split_builder_params,
    3,  # NOTE(review): meaning of this positional argument is not visible here — confirm
)

In [7]:
# Run the split builder and keep its result for inspection in the next cell.
splited_df = prepilot_guests_collector.collect()



In [8]:
# Print the first 10 rows of the split result.
# NOTE(review): the recorded run of this cell was stopped with a
# KeyboardInterrupt, so no rows are shown in the saved output.
splited_df.show(10)

ERROR:root:KeyboardInterrupt while sending command.                 (0 + 1) / 1]
Traceback (most recent call last):
  File "/Users/egorshishkovets/opt/anaconda3/lib/python3.9/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/Users/egorshishkovets/opt/anaconda3/lib/python3.9/site-packages/py4j/clientserver.py", line 475, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/Users/egorshishkovets/opt/anaconda3/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt
INFO:py4j.clientserver:Closing down clientserver connection


KeyboardInterrupt: 

In [9]:
# Assemble the pre-pilot experiment from the loaded frame and both parameter sets.
prepilot = PrepilotExperimentBuilder(
    spark,
    df,
    prepilot_params,
    split_builder_params,
)

In [10]:
# Run the pre-pilot experiment; the result is unpacked into two tables that
# are displayed with .show() in the following cells.
# NOTE(review): presumably type II (beta) and type I (alpha) error estimates — confirm.
beta, alpha = prepilot.collect()

                                                                                

In [11]:
# Print the beta table: one row per (metric, MDE), one column per group-size pair.
beta.show()



+-------------+----+-------------+-------------+-------------------+
|       metric| MDE|100000_100000|150000_150000|        50000_50000|
+-------------+----+-------------+-------------+-------------------+
|pre_gp_orders|1.04|          0.0|          0.0|                0.0|
|pre_gp_orders|1.03|          0.0|          0.0|0.19999999999999996|
|pre_gp_orders|1.05|          0.0|          0.0|                0.0|
|pre_gp_orders|1.01|          0.9|          0.6|                1.0|
+-------------+----+-------------+-------------+-------------------+



                                                                                

In [13]:
# Print the alpha table: one row per metric, one column per group-size pair.
alpha.show()



+-------------+-------------+-------------+-----------+
|       metric|100000_100000|150000_150000|50000_50000|
+-------------+-------------+-------------+-----------+
|pre_gp_orders|          0.0|          0.1|        0.0|
+-------------+-------------+-------------+-----------+



22/05/04 20:50:07 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 437912 ms exceeds timeout 120000 ms
22/05/04 20:50:07 WARN SparkContext: Killing executors is not supported by current scheduler.


In [13]:
# Stop the Spark session (and its underlying SparkContext) now that the run is done.
spark.stop()