In [None]:
%matplotlib inline
import math
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import statsmodels.api as sm
import seaborn as sns
import math as math
import matplotlib.pyplot as plt
import seaborn as sns
#import statsmodels.formula.api as smf
#from statsmodels.regression.quantile_regression import QuantReg

In [None]:
import pyspark
import numpy as np
import pandas as pd
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.sql import types, functions as F

In [None]:
# Random seed for reproducibility.
RANDOM_SEED = 15

# pandas display settings. Fully qualified option names are used so the
# lookup does not depend on pandas' regex-based option matching.
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 2500)
pd.set_option('display.max_colwidth', 200)

# Disable auto-closing brackets and quotes in code cells. This is a purely
# cosmetic editor preference, so it is skipped (instead of aborting the cell)
# when the `notebook.services.config` module cannot be imported.
try:
    from notebook.services.config import ConfigManager
except ImportError:
    ConfigManager = None

c = ConfigManager() if ConfigManager is not None else None
if c is not None:
    c.update('notebook', {"CodeCell": {"cm_config": {"autoCloseBrackets": False}}})

# Name under which the Spark application is registered.
app_name = 'reseller_master_latlong'

In [None]:
# Spark configuration for this session.
# Every key is set exactly once: the original chain repeated several keys
# (parallelism, shuffle partitions, driver memory, vmem check) with the same
# values, which made it easy to edit one occurrence and have the later
# duplicate silently override it.
conf = (SparkConf()
         .setMaster('yarn-client')  # NOTE(review): newer Spark releases expect master 'yarn' plus deploy mode 'client' - confirm cluster version
         .setAppName(app_name)
         .set("spark.yarn.queue", "root.hue_dmp")
         .set("spark.yarn.maxAppAttempts", "2")
         # --- memory / cores ---
         .set("spark.executor.memory", "16G")
         .set("spark.executor.cores", "4")
         .set("spark.driver.memory", "16G")
         .set("spark.driver.memoryOverhead", "4096")
         .set("spark.yarn.driver.memoryOverhead", "4096")
         .set("spark.yarn.executor.memoryOverhead", "4096")
         .set("spark.testing.memory", "2147480000")
         .set("spark.driver.maxResultSize", "0")
         # --- parallelism / dynamic allocation ---
         .set("spark.default.parallelism", "8")
         .set("spark.sql.shuffle.partitions", "1000")
         .set("spark.shuffle.service.enabled", "true")
         .set("spark.dynamicAllocation.enabled", "true")
         .set("spark.dynamicAllocation.minExecutors", "1")
         .set("spark.dynamicAllocation.maxExecutors", "20")
         .set("spark.dynamicAllocation.initialExecutors", "1")
         # --- timeouts ---
         .set("spark.executor.heartbeatInterval", "20s")
         .set("spark.network.timeout", "800s")
         .set("spark.sql.broadcastTimeout", "1200")
         # --- SQL / storage ---
         .set("spark.sql.parquet.compression.codec", "snappy")
         .set("spark.sql.parquet.binaryAsString", "true")
         .set("spark.sql.hive.convertMetastoreParquet", "false")
         .set("spark.sql.hive.verifyPartitionPath", "false")
         # 400 MiB, passed as a string like every other value in this chain.
         .set("spark.sql.autoBroadcastJoinThreshold", str(400 * 1024 * 1024))
         .set("spark.hadoop.fs.permissions.umask-mode", "002")
         # NOTE(review): this is a YARN NodeManager key, not a spark.* key;
         # presumably it has no effect when set here - confirm.
         .set("yarn.nodemanager.vmem-check-enabled", "false")
         )

In [None]:
# Start (or reuse) the Spark context with the configuration built above and
# wrap it in a SparkSession for SQL access.
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession(sc)

# Distinct (outlet_id, regional) pairs observed in the table during January 2020.
table = 'dmp_remote.outlet_dim_dd'
outlet_region_query = f"""
        SELECT DISTINCT
            outlet_id,
            regional
        FROM {table}
        WHERE
            load_date BETWEEN '2020-01-01' AND '2020-01-31'
    """

# Run the query on the cluster and collect the result into a pandas DataFrame.
outlet_region_sdf = spark.sql(outlet_region_query)
source_df = outlet_region_sdf.toPandas()

In [None]:
# Keep an independent snapshot of the query result. A plain alias
# (`aa = source_df`) would share the same object and be modified by the
# in-place column casts that follow, so it could not serve as a backup.
aa = source_df.copy()
# To restore the original query result without re-running Spark:
#source_df = aa.copy()
source_df.head()

In [None]:
# Cast the join key to int64 and the region label to pandas' nullable string dtype.
source_df['outlet_id'] = source_df['outlet_id'].astype('int64')
source_df['regional'] = source_df['regional'].astype('string')

# Only the last expression of a cell is displayed, so the dtypes are printed
# explicitly; the row count remains the cell's output.
print(source_df.dtypes)
len(source_df)

In [None]:
# Load the reseller master table from the project data directory (relative path).
master_table_path = '../../../data/reseller/05_model_input/master_table/ra_master_table.csv'
master_df = pd.read_csv(master_table_path)

In [None]:
# Master-table columns kept for twin matching: the outlet key, the two
# categorical attributes, and the mean cashflow feature.
main_features = [
    'outlet_id',
    'fea_outlet_string_type',
    'fea_outlet_string_classification',
    'fea_outlet_decimal_total_cashflow_mkios_mean',
]

In [None]:
# Restrict the master table to the columns listed in `main_features`.
master_relevant_df = master_df[main_features]
master_relevant_df.head()

In [None]:
# Row count of the trimmed master table.
master_relevant_df.shape[0]

In [None]:
# Attach the region to every master-table outlet (left join on outlet_id).
# NOTE(review): if an outlet appears with more than one `regional` value in
# source_df, its master row is repeated once per region - confirm this is intended.
region_by_outlet = source_df.set_index('outlet_id')
master_relevant_2_df = master_relevant_df.join(region_by_outlet, on='outlet_id')
master_relevant_2_df.head()

In [None]:
# Row count after the region join (compare with the count before the join).
master_relevant_2_df.shape[0]

In [None]:
# Load the pilot group and normalise its key column name so it can be joined
# with the master table. The rename is chained instead of done in place, so
# re-running the cell always starts from the file contents.
test_group = (
    pd.read_csv('Promotors pilot control group.csv')
    .rename(columns={'Outlet Id': 'outlet_id'})
)

# Only the last expression of a cell is displayed, so the row count is printed.
print(len(test_group))
test_group.head()

In [None]:
# Enrich the pilot outlets with their master-table attributes and region, then
# drop rows with any missing value and exact duplicate rows.
master_by_outlet = master_relevant_2_df.set_index('outlet_id')
test_group = (
    test_group
    .join(master_by_outlet, on='outlet_id')
    .dropna()
    .drop_duplicates()
)
test_group

In [None]:
# Distinct outlet ids in the pilot group; used later to exclude them from the
# pool of twin candidates.
pilot_outlet_ids = test_group['outlet_id'].unique()
test_group_outlets = list(pilot_outlet_ids)
len(test_group_outlets)

In [None]:
# Outlet types that occur in the pilot group.
pilot_types = test_group['fea_outlet_string_type'].unique()
unique_type = list(pilot_types)
unique_type

In [None]:
# Outlet classifications that occur in the pilot group.
pilot_classes = test_group['fea_outlet_string_classification'].unique()
unique_class = list(pilot_classes)
unique_class

In [None]:
# Regions that occur in the pilot group.
pilot_regions = test_group['regional'].unique()
unique_regions = list(pilot_regions)
unique_regions

In [None]:
# Candidate pool, step 1: remove outlets that are themselves in the pilot group.
is_pilot_outlet = master_relevant_2_df['outlet_id'].isin(test_group_outlets)
master_relevant_3_df = master_relevant_2_df[~is_pilot_outlet]
master_relevant_3_df.shape[0]

In [None]:
# Candidate pool, step 2: keep only outlet types that occur in the pilot group.
has_pilot_type = master_relevant_3_df['fea_outlet_string_type'].isin(unique_type)
master_relevant_4_df = master_relevant_3_df[has_pilot_type]
master_relevant_4_df.shape[0]

In [None]:
# Candidate pool, step 3: keep only classifications that occur in the pilot group.
has_pilot_class = master_relevant_4_df['fea_outlet_string_classification'].isin(unique_class)

# .copy() makes this an independent frame: later cells assign columns on
# master_relevant_5_df, which on a filtered slice raises SettingWithCopyWarning.
master_relevant_5_df = master_relevant_4_df[has_pilot_class].copy()

# Only the last expression of a cell is displayed, so the row count is printed.
print(len(master_relevant_5_df))
master_relevant_5_df.head()

In [None]:
# Distinct combinations of the three columns after outlet_id, i.e. type,
# classification and mean cashflow (the columns at positions 1-3).
# NOTE(review): the twin matching keys are type, classification and `regional`;
# this selection takes the cashflow column instead of `regional` - confirm intent.
combination_columns = main_features[1:4]
master_table_unique_combinations = master_relevant_5_df[combination_columns].drop_duplicates()
master_table_unique_combinations.shape[0]

In [None]:
# Inspect column dtypes of the pilot group before casting the categorical
# columns (swap in master_relevant_5_df to inspect the candidate pool).
test_group.dtypes

In [None]:
# Cast the categorical matching columns to pandas' nullable string dtype in
# both frames so that they compare consistently during twin matching.
# `astype` with a mapping returns a new frame, which avoids assigning columns
# on a filtered slice (SettingWithCopyWarning) and replaces four
# near-identical assignment lines. outlet_id is deliberately left numeric.
string_columns = {
    'fea_outlet_string_type': 'string',
    'fea_outlet_string_classification': 'string',
}
test_group = test_group.astype(string_columns)
master_relevant_5_df = master_relevant_5_df.astype(string_columns)
test_group.dtypes

In [None]:
# Drop incomplete rows FIRST and renumber afterwards. In the original order
# (reset_index, then dropna) any dropped row left a gap in the index labels,
# so label i no longer matched position i - which the matching cell relies on
# when it mixes `.iloc[i]` reads with `.at[i, ...]` writes.
test_group = test_group.dropna().reset_index(drop=True)
master_relevant_5_df = master_relevant_5_df.dropna().reset_index(drop=True)
master_relevant_5_df.head()

In [None]:
from collections import defaultdict, deque

# Columns that must be identical between a pilot outlet and its twin.
TWIN_MATCH_KEYS = ['fea_outlet_string_type', 'fea_outlet_string_classification', 'regional']
TWIN_VALUE_COL = 'fea_outlet_decimal_total_cashflow_mkios_mean'
NO_TWIN = -1  # sentinel written when no unused candidate exists


def _assign_twins(targets, pool, keys=TWIN_MATCH_KEYS, id_col='outlet_id', value_col=TWIN_VALUE_COL):
    """Greedily assign each row of `targets` a distinct twin from `pool`.

    For every target row (in row order) the first pool row (in pool order)
    that has the same values in `keys` and whose `id_col` has not already been
    handed out is chosen. This is the same selection rule as a nested scan
    over the pool, but each pool row is visited at most once.

    Parameters
    ----------
    targets : DataFrame containing the `keys` columns.
    pool : DataFrame containing the `keys`, `id_col` and `value_col` columns.
    keys : column names that must match exactly.
    id_col : column identifying a pool candidate; each id is used at most once.
    value_col : pool column whose value is copied alongside the twin id.

    Returns
    -------
    (twin_ids, twin_values) : two lists aligned with the rows of `targets`.
        Rows without an available candidate get NO_TWIN / float(NO_TWIN).
    """
    # Per-key FIFO queues of candidates, preserving pool order.
    candidates = defaultdict(deque)
    pool_keys = zip(*(pool[k] for k in keys))
    for key, cand_id, cand_value in zip(pool_keys, pool[id_col], pool[value_col]):
        candidates[key].append((cand_id, cand_value))

    used_ids = set()
    twin_ids, twin_values = [], []
    for key in zip(*(targets[k] for k in keys)):
        queue = candidates.get(key)  # .get avoids creating empty queues
        chosen_id, chosen_value = NO_TWIN, float(NO_TWIN)
        while queue:
            cand_id, cand_value = queue.popleft()
            # The same id may sit in several queues if the pool contains
            # repeated ids; skip it once it has been handed out.
            if cand_id not in used_ids:
                chosen_id, chosen_value = cand_id, cand_value
                used_ids.add(cand_id)
                break
        twin_ids.append(chosen_id)
        twin_values.append(chosen_value)
    return twin_ids, twin_values


_twin_ids, _twin_values = _assign_twins(test_group, master_relevant_5_df)

# Assigning lists is positional, so the result does not depend on the index
# labels of test_group (the original mixed positional reads with label writes).
test_group['twin_id'] = _twin_ids
test_group['twin_fea_outlet_decimal_total_cashflow_mkios_mean'] = _twin_values

# Ids that were actually handed out, in assignment order.
twin_id = [assigned for assigned in _twin_ids if assigned != NO_TWIN]

# The original scan ran past the end of the pool (IndexError) when no
# candidate was left; report unmatched outlets instead.
_unmatched = len(_twin_ids) - len(twin_id)
if _unmatched:
    print(f'{_unmatched} of {len(test_group)} test outlets have no available twin (left as {NO_TWIN})')

In [None]:
# Write the pilot group with its assigned twins twice: the same CSV content
# under a .csv and a .txt file name.
output_paths = (
    'Promotor pilot control group twin ids.csv',
    'Promotor pilot control group twin ids.txt',
)
for output_path in output_paths:
    test_group.to_csv(output_path)