### Load packages

In [None]:
%matplotlib inline
from causalml.dataset import synthetic_data
import math
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import r2_score, classification_report, confusion_matrix, accuracy_score
import statsmodels.api as sm
import seaborn as sns
import math as math
import matplotlib.pyplot as plt
import seaborn as sns

### Load data and organize it

#### Load experiment data

In [None]:
# Read the pilot experiment table and expose its outlet key under the
# lower-case name `outlet_id`, which is the column name the master table
# lookups and the join further down use.
PP_experiment_df = pd.read_csv('Promoters Pilot input table v2.txt').rename(
    columns={'Outlet_id': 'outlet_id'}
)
len(PP_experiment_df)

In [None]:
# Discard the two raw cash-flow columns, then preview the first rows.
PP_experiment_df = PP_experiment_df.drop(
    ['Cashflow_feb', 'Cashflow_march_1_29'],
    axis=1,
)
PP_experiment_df.head()

In [None]:
# Define outlets list: every outlet_id value of the experiment table, in row
# order (repeated ids, if any, are kept). Used below to filter the master table.
outlets_list = PP_experiment_df['outlet_id'].tolist()
len(outlets_list)

#### Load master table

In [None]:
# Load the reseller master table; the path is relative to the current
# working directory, so this must run from the notebook's own folder.
master_df = pd.read_csv('../../../data/reseller/05_model_input/master_table/ra_master_table.csv')
#master_df.head()

In [None]:
# Remove unused columns: 'month' and 'fea_outlet_string_location_bts'.
master_df = master_df.drop(columns=['month','fea_outlet_string_location_bts'])

In [None]:
# Size checks: row count, number of distinct outlet_id values, column count.
# NOTE(review): these are bare expressions; when run as a notebook cell only
# the last one is echoed. Wrap them in print() to see all three.
len(master_df)
len(master_df.outlet_id.unique())
len(master_df.columns)

### Check for experiment outlets not in master table

In [None]:
# Distinct outlet ids present in the master table.
master_outlets_list = master_df.outlet_id.unique().tolist()
len(master_outlets_list)

In [None]:
# Experiment rows whose outlet_id also appears in the master table; comparing
# this count with len(PP_experiment_df) shows how many outlets have no features.
Exp_outlets_in_master_df = PP_experiment_df[PP_experiment_df['outlet_id'].isin(master_outlets_list)]
len(Exp_outlets_in_master_df)

In [None]:
# Matched experiment rows with Treatment == 1.
Exp_outlets_in_master_1_df = Exp_outlets_in_master_df.loc[Exp_outlets_in_master_df['Treatment'] == 1]
len(Exp_outlets_in_master_1_df)

In [None]:
# Matched experiment rows with Treatment == 0.
Exp_outlets_in_master_0_df = Exp_outlets_in_master_df.loc[Exp_outlets_in_master_df['Treatment'] == 0]
len(Exp_outlets_in_master_0_df)

### Filter master table by outlets list

In [None]:
# Keep only the master-table rows that belong to experiment outlets.
# NOTE(review): only the last bare expression is echoed in a notebook cell.
PP_features_df = master_df[master_df['outlet_id'].isin(outlets_list)]
len(PP_features_df)
len(PP_features_df.columns)

In [None]:
# Number of distinct outlets left after filtering; if this is smaller than
# len(PP_features_df), some outlets occupy more than one row.
len(PP_features_df.outlet_id.unique())
#PP_features_df

In [None]:
# Drop every feature column that contains at least one missing value
# (axis=1 removes columns, not rows), then report how many columns remain.
PP_features_df = PP_features_df.dropna(axis=1)
len(PP_features_df.columns)

### Join tables

In [None]:
# Attach the feature columns to the experiment rows by outlet_id.
# DataFrame.join defaults to a left join, so every experiment row is kept and
# outlets without a match get NaN in the feature columns.
# NOTE(review): if PP_features_df holds several rows for one outlet_id, the
# experiment row is repeated once per match — confirm outlet_id is unique there.
PP_master_df = PP_experiment_df.join(PP_features_df.set_index('outlet_id'), on='outlet_id')
#PP_master_df.head()
len(PP_master_df)

In [None]:
# Drop every row that has at least one missing value; this removes, among
# others, experiment outlets that found no match in the join above.
PP_master_df = PP_master_df.dropna()
len(PP_master_df)

### Create target variable and features

In [None]:
# Create the target column with the placeholder value -1; the real classes
# (1-4) are computed from 'Delta_feb_mar' in the cells below.
PP_master_df['target_class'] = -1
len(PP_master_df['target_class'])

In [None]:
# Summary statistics of the outcome column.
# NOTE(review): the class thresholds hard-coded in the next cell are
# presumably read off this output — confirm whenever the data changes.
PP_master_df['Delta_feb_mar'].describe()
#PP_master_df['Percentage_change_feb_mar'].describe()

In [None]:
# Bin the outcome 'Delta_feb_mar' into four ordered classes:
#   4: delta >  upper_cut
#   3: middle_cut < delta <= upper_cut
#   2: lower_cut  < delta <= middle_cut
#   1: delta <= lower_cut
# NOTE(review): the three cut-points are hard-coded; presumably they are the
# 25% / 50% / 75% values from describe() in the previous cell — confirm and
# recompute if the input data changes.
lower_cut, middle_cut, upper_cut = -3.792500e+06, -1.230000e+06, 7.240000e+05

cash = PP_master_df['Delta_feb_mar'].tolist()
target = PP_master_df['target_class'].tolist()
for row, delta in enumerate(cash):
    if delta > upper_cut:
        target[row] = 4
    elif delta > middle_cut:
        target[row] = 3
    elif delta > lower_cut:
        target[row] = 2
    elif delta <= lower_cut:
        target[row] = 1
    else:
        # Only reachable when no comparison holds (e.g. NaN); the entry keeps
        # whatever value 'target_class' already had for that row.
        print('Error')
#cash = PP_master_df['Percentage_change_feb_mar'].tolist()
#target = PP_master_df['target_class'].tolist()
#for i in range(0,len(cash)):
#    if cash[i] > 5.109689:
#        target[i] = 4
#    elif cash[i] <= 5.109689 and cash[i] > -7.770335:
#        target[i] = 3
#    elif cash[i] <= -7.770335 and cash[i] > -19.744084:  
#        target[i] = 2
#    elif cash[i] <= -19.744084:
#        target[i] = 1
#    else:
#        print('Error')

In [None]:
# Write the computed classes back into the frame (positional assignment:
# `target` was built in row order from this same frame).
PP_master_df['target_class'] = target
# NOTE(review): this bare expression is not echoed in a notebook cell because
# it is not the last statement.
PP_master_df['target_class']
#len(PP_master_df['target_class'])
#PP_master_df
len(PP_master_df.columns)

In [None]:
# Treatment indicator per row (compared against 0 and 1 earlier in the notebook).
treatment = PP_master_df['Treatment']

In [None]:
# Classification target for the random forest; `y` is rebound to the raw
# 'Delta_feb_mar' outcome later, in the uplift section.
y = PP_master_df['target_class']

In [None]:
# Number of labelled rows.
len(y)

In [None]:
#PP_master_df.columns[1053:1400].tolist() # At 347 start geospatial features

In [None]:
# Keep the columns at positions 347 through 1052 as candidate features.
# NOTE(review): these positional bounds are hard-coded, so a change in the
# master table's column layout silently selects different columns. They were
# presumably picked from the column listing in the commented-out cell above —
# confirm, or select by column name/prefix instead.
PP_master_filtered_df = PP_master_df.iloc[:,347:1053]

In [None]:
# Column counts before and after the positional slice (only the second value
# is echoed when run as a notebook cell).
len(PP_master_df.columns)
len(PP_master_filtered_df.columns)

In [None]:
# Feature matrix: the sliced columns minus one cash-flow feature.
# NOTE(review): presumably excluded to avoid leaking the cash-flow outcome
# into the features — confirm.
x = PP_master_filtered_df.drop(columns=['fea_outlet_decimal_total_cashflow_mkios_pv_mean_past_3m'])
#x = PP_master_filtered_df
#x

In [None]:
# Inspect the dtypes container and how many feature columns there are
# (only the second value is echoed when run as a notebook cell).
type(x.dtypes)
len(x.dtypes)

In [None]:
# Collect the names of the numeric (float64 / int64) feature columns.
# Names and dtypes are paired with zip() instead of being looked up as
# x.dtypes[j]: x.dtypes is indexed by column name, and integer keys on a
# label-indexed Series are deprecated as positional lookups in recent pandas.
columns = [
    name
    for name, dtype in zip(x.columns, x.dtypes)
    if dtype == 'float64' or dtype == 'int64'
]
print(len(columns))
#columns

In [None]:
# Restrict the feature matrix to the numeric columns collected above.
x = x[columns]
#x = x.dropna(axis=1)
#x

### Random forest model to calculate feature importances

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=42)

In [None]:
# Fit a random-forest classifier on the binned target; its feature importances
# drive the top-50 feature selection below. Despite the variable name this is
# a classifier (the name is kept because later cells refer to it).
# The seed matches the one used for the train/test split so the importance
# ranking, and therefore the selected features, are reproducible across runs.
regressor = RandomForestClassifier(random_state=42)
regressor.fit(x_train,y_train)

In [None]:
# Predicted classes for the held-out rows.
y_pred = regressor.predict(x_test)

In [None]:
# Confusion matrix on the held-out rows (true labels first, predictions second).
print(confusion_matrix(y_test,y_pred))

In [None]:
# Tabulate each feature name next to the importance score of the fitted
# forest, and keep a copy sorted from most to least important.
Variables, Feature_importances = (
    pd.Series(x.columns),
    pd.Series(regressor.feature_importances_),
)
Feature_importances_dic = dict(
    Variable=Variables,
    Feature_importance=Feature_importances,
)
Feature_importances_df = pd.DataFrame(data=Feature_importances_dic)
Feature_importance_sorted_df = Feature_importances_df.sort_values(
    "Feature_importance",
    ascending=False,
)
#Feature_importances_df

In [None]:
# Rank the features by descending importance and show the 100 strongest.
fis = Feature_importances_df.sort_values(
    "Feature_importance",
    ascending=False,
)
fis.head(100)
#fis['Variable'].tolist()

In [None]:
# Feature names in descending order of importance; the first 50 are used as
# the inputs of the uplift models.
vars_ranked = fis['Variable'].tolist()
top_vars = vars_ranked[0:50]
#top_vars

In [None]:
# Uplift-model feature matrix: the 50 top-ranked numeric features.
X = x[top_vars]
#X

### Uplift modeling

In [None]:
from causalml.inference.meta import BaseTRegressor
from xgboost import XGBRegressor
from causalml.inference.meta import XGBTRegressor

# From here on `y` is the raw 'Delta_feb_mar' outcome rather than the binned
# class that was used for the random forest.
y = PP_master_df['Delta_feb_mar']

# Outcome, treatment flag and the selected features side by side in one frame.
outcome_and_treatment = pd.DataFrame({"y": y, "treatment": treatment})
data = pd.concat([outcome_and_treatment, pd.DataFrame(X)], axis=1)
#data

#### Model 1

In [None]:
# Model 1: T-learner built from a seeded XGBRegressor, fitted on all rows.
# NOTE(review): the following cells predict on the same X used for fitting,
# so the resulting uplift estimates are in-sample.
xgb_tlearner = BaseTRegressor(learner=XGBRegressor(random_state=42))

xgb_tlearner.fit(X=X, y=y, treatment=treatment)

In [None]:
# Shape of the array returned by predict for the full feature matrix.
xgb_tlearner.predict(X=X).shape

In [None]:
# With return_components=True predict returns three values, unpacked here as
# the uplift estimate plus the control-arm and treatment-arm components.
uplift, outcome_c, outcome_t = xgb_tlearner.predict(X=X, return_components=True)

In [None]:
# Number of rows with an uplift estimate.
len(uplift)

#### Model 2

In [None]:
# 70/30 split of features, outcome and treatment flag with a fixed seed.
# NOTE: this rebinds y_train / y_test from the classification split above.
X_train, X_test, y_train, y_test, treatment_train, treatment_test = train_test_split(X, y, treatment, test_size=0.30, random_state=42)

In [None]:
## Training T-learner on train
# XGBTRegressor builds its own XGBRegressor from the keyword arguments it
# receives, so hyper-parameters are passed to it directly. Passing
# `learner=XGBRegressor(...)` would forward `learner` to that internal
# XGBRegressor as an unknown parameter and the seed would never reach the
# fitted models.
learner_t = XGBTRegressor(random_state=42)
learner_t.fit(X=X_train, treatment=treatment_train, y=y_train)

## Get predictions, on the test set
# One predict call returns the uplift together with its per-arm components;
# t_pred is kept as an alias because the dataframe below reads from it.
uplift, outcome_c, outcome_t = learner_t.predict(X=X_test, return_components=True)
t_pred = uplift

## Aggregating everything on a dataframe
# Columns: observed outcome, treatment flag, and the model's uplift score
# flattened to one dimension.
df = pd.DataFrame({'y': y_test,
                   'w': treatment_test,
                   'T-Learner': t_pred.reshape(-1)
                   #'Actual': tau_test
                  })

In [None]:
# Test-set uplift summary: row count, number of non-negative estimates, and
# the sum of all estimates (only the sum is echoed when run as a notebook cell).
len(uplift)
len(uplift[uplift >=0])
np.sum(uplift)

In [None]:
from causalml.metrics import plot
## Plot the Qini curve for the held-out rows, using 'y' as the outcome and
## 'w' as the treatment flag.
## No `treatment_effect_col` is passed because the true uplift is unknown here.
## NOTE(review): presumably every remaining column of df (here 'T-Learner') is
## treated as a model score to rank by — confirm against the installed causalml.
plot(df,kind='qini', outcome_col='y', treatment_col='w',figsize=(10, 3.3))