In [41]:
#
##
### LOAD DATA

import pandas as pd
import numpy as np

def _optimize_numeric_dtypes(df):
    import pandas as pd
    float_cols = df.select_dtypes("float").columns
    int_cols = df.select_dtypes("integer").columns
    df[float_cols] = df[float_cols].\
        apply(pd.to_numeric, downcast="float")
    df[int_cols] = df[int_cols].\
        apply(pd.to_numeric, downcast="integer")
    return df

data = _optimize_numeric_dtypes(
    pd.read_parquet("../data/customer_model/retailrocket/"))
data["target_cap"] = data["target_cap"].clip(-1000,25000)

#
##
### CONSTRUCT PROFIT TARGET
# NOTE: encapsulate this
config = {
    #"gamma":{"alpha":22.3, "beta":200},
    "gamma":{"alpha":20.5, "beta":116.167},
    "delta":500,
    "psi":{"alpha":9, "beta":1},
    "n_iter":1000,
    "seed":1}

gamma = config["gamma"]
delta = config["delta"]
psi = config["psi"]
n_iter = config["n_iter"]
seed = config["seed"] 

n_users = data.user_id.nunique()
sp = []
for i in range(n_iter):
    gamma_psi = pd.DataFrame.from_dict({
        "user_id":data.user_id.unique(),
        "gamma":np.random.beta(gamma["alpha"], gamma["beta"], size=n_users),
        "psi":np.random.beta(psi["alpha"], psi["beta"], size=n_users)})
    temp = data.merge(gamma_psi, on=["user_id"])
    temp["acp"] = (temp["target_event"]*temp["gamma"]*(temp["target_cap"]-delta)
        + (1-temp["target_event"])*(-temp["psi"]*delta))
    sp.append(temp.loc[:,["user_id", "week_step", "acp"]])
sp = pd.concat(sp)

In [42]:
train = data[data.week_step>1]
test = data[data.week_step==1]

out_cols = ["user_id", "row_id", "target_event",
    "target_revenue", "week_step", "target_cap",
    "cap", "cap_month_lag0", "cap_month_lag1",
    "cap_month_lag2", "cap_month_lag3", "cap_month_ma3"]

feat_cols = [c for c in train.columns if c not in set(out_cols)]


In [46]:
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier

lol = data.merge(sp.groupby(["user_id","week_step"], as_index=False).acp.mean(), on=["user_id","week_step"])
trf = lol.week_step>2
tef = lol.week_step==2

target_cols = ["target_event", "acp"]
X_train, y_train = lol.loc[trf,feat_cols], lol.loc[trf,target_cols]
X_test, y_test = lol.loc[tef,feat_cols], lol.loc[tef,target_cols]

# train split
X_reg, X_clf, y_reg, y_clf = train_test_split(X_train, y_train, test_size=.8, stratify=y_train.loc[:,["target_event"]])

clf = LGBMClassifier()
clf.fit(X_clf, y_clf.loc[:,["target_event"]])
X_reg["proba"] = clf.predict_proba(X_reg)[:,1]
X_test["proba"] = clf.predict_proba(X_test)[:,1]

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [47]:
from sklearn.metrics import r2_score
from imblearn.pipeline import Pipeline
from lightgbm import LGBMRegressor, LGBMClassifier

reg = LGBMRegressor()
reg.fit(X_reg, y_reg.loc[:,["acp"]])
r2_score(y_test.loc[:,["acp"]], reg.predict(X_test))

0.43809135311597847

In [None]:
import matplotlib.pyplot as plt
from sklearn.inspection import permutation_importance

results = permutation_importance(reg, X_test, y_test)
lgbm_mu = pd.Series(results.importances_mean, index=X_train.columns)
lgbm_std = pd.Series(results.importances_std, index=X_train.columns)

fig, ax = plt.subplots(figsize=(15,10))
ind = lgbm_mu.sort_values().tail(100).index
lgbm_mu[ind].plot.bar(yerr=lgbm_std[ind], ax=ax)
ax.set_title("Feature importances using permutation on full model")
ax.set_ylabel("Mean accuracy decrease")
fig.tight_layout()
plt.show()

In [None]:
acp = sp.groupby(["user_id","week_step"], as_index=False).mean()
acp = acp.sort_values("week_step", ascending=False)
acp["prev_acp"] = acp.groupby("user_id", as_index=False).acp.shift(1)
reg_data = data.merge(acp, on=["user_id", "week_step"], how="inner")

from sklearn.pipeline import Pipeline
from sklearn.feature_selection import VarianceThreshold, SelectKBest
from sklearn.preprocessing import PolynomialFeatures
from lightgbm import LGBMRegressor, LGBMClassifier
from category_encoders.target_encoder import TargetEncoder

pipe_reg = Pipeline([("vt", TargetEncoder(cols=list(range(100)), min_samples_leaf=1, smoothing=10.0)),
    ("reg", LGBMRegressor(learning_rate=0.01))])

pipe_clf = Pipeline([("vt", TargetEncoder(cols=list(range(100)), min_samples_leaf=1, smoothing=10.0)),
    ("reg", LGBMClassifier(learning_rate=0.01))])


train_filter = (reg_data.week_step>2) & (reg_data.week_step<10)
test_filter = (reg_data.week_step==2)
X_train = reg_data.loc[train_filter,feat_cols]
X_train_reg = np.column_stack([lgbm.predict(X_train, pred_leaf=True), lgbm.predict_proba(X_train)[:,1]])

#X_train["proba"] = lgbm.predict_proba(X_train)[:,1]
#X_train["prev_cap"] = reg_data.loc[train_filter,"prev_target_cap"]
#X_train["prev_acp"] = reg_data.loc[train_filter,"prev_acp"]
y_train = reg_data.loc[train_filter,"acp"].clip(-1000,50000)

X_test = reg_data.loc[test_filter,feat_cols]
#X_test["proba"] = lgbm.predict_proba(X_test)[:,1]
#X_test["prev_cap"] = reg_data.loc[test_filter,"prev_target_cap"]
#X_test["prev_acp"] = reg_data.loc[test_filter,"prev_acp"]
X_test_reg = np.column_stack([lgbm.predict(X_test, pred_leaf=True), lgbm.predict_proba(X_test)[:,1]])
y_test = reg_data.loc[test_filter,"acp"].clip(-1000,50000)
pipe_reg.fit(X_train_reg, y_train)

from sklearn.metrics import r2_score, mean_squared_error
r2_score(y_test, pipe_reg.predict(X_test_reg))

In [None]:
y_train = (reg_data.loc[reg_data.week_step>2,"acp"]>0).astype("int")
y_test = (reg_data.loc[reg_data.week_step==2,"acp"]>0).astype("int")
pipe_clf.fit(X_train_reg, y_train)

from sklearn.metrics import f1_score
f1_score(y_test, pipe_clf.predict(X_test_reg))

In [None]:
meh = pd.DataFrame.from_dict({"y":reg_data.loc[reg_data.week_step==2,"acp"].clip(-1000,25000), "y_pred":pipe_clf.predict_proba(X_test_reg)[:,1]*pipe_reg.predict(X_test_reg)})
meh = meh.sort_values("y_pred", ascending=False).reset_index(drop=True)
meh.y.cumsum()[meh.y_pred.cumsum().idxmax()]