Este notebook tem o intuito de comparar a inferência das métricas da última oferta feita ao cliente com a de outras ofertas que poderiam ser válidas

# Setting up

In [14]:
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.pandas import merge_asof
from pyspark.pandas import DataFrame as ps
from pyspark.sql import Window

from pathlib import Path

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_auc_score, confusion_matrix
import pickle
# Repository root, taken as the parent of the current working directory
# (presumably the notebook is run from a folder one level below the root).
repo_path = Path().resolve().parent

# Local Spark session on all available cores; an already-running session is reused.
spark = (
    SparkSession.builder
    .appName('Spark Demo')
    .master('local[*]')
    .getOrCreate()
)

In [60]:
# Load the fitted model artefact.
# NOTE(review): pickle executes arbitrary code on load; only open artefacts
# produced by this repository.
with open((repo_path / 'models_artefact' / 'model.pkl').as_posix(), 'rb') as model_file:
    model = pickle.load(model_file)

# Modelling dataset, read with Spark and collected into a pandas DataFrame.
df = (
    spark.read
    .json((repo_path / 'data' / 'processed' / 'modelling_dataset').as_posix())
    .toPandas()
)

# Index labels of the last offer of each account: rows are ordered by
# `time_since_test_start` within each account and the final row per account is kept.
last_offers_idx = (
    df
    .sort_values(['account_id', 'time_since_test_start'])
    .groupby('account_id')
    .tail(1)
    .index
)

# Test set = last offer of every account; train set = every remaining row.
df_test = df.loc[last_offers_idx]
df_train = df.drop(last_offers_idx)

# The full frames (target column included) are handed to the model as features;
# presumably the model selects the columns it needs -- TODO confirm.
X_train, y_train = df_train, df_train['target']
X_test, y_test = df_test, df_test['target']

# Predicted labels.
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Predicted probability taken from column 1 of `predict_proba`
# (assumed to be the positive class -- TODO confirm against model.classes_).
y_train_proba = model.predict_proba(X_train)[:, 1]
y_test_proba = model.predict_proba(X_test)[:, 1]

# Construindo base

Temos que pensar aqui no modelo em produção: tendo as variáveis já agrupadas e calculadas, qual das ofertas seria a mais interessante a se fazer? Ela equivale à que temos na base?

In [166]:
# Columns kept from each account's last-offer row. No offer attribute
# (offer id, channels, values) is selected here.
customer_feature_columns = [
    "account_id",
    "age",
    "credit_card_limit",
    "event",
    "gender",
    "month_registered",
    "num_past_offers",
    "num_past_viewed",
    "registered_on_cos",
    "registered_on_seno",
    "time_since_last_offer",
    "time_since_test_start",
    "total_past_amount",
    "total_past_reward",
    "year_registered",
]

# Validation base: one row per test row, restricted to the columns above.
base_validacao = df_test[customer_feature_columns]

In [167]:
# Raw offer catalogue, read with Spark and collected into a pandas DataFrame.
offers_path = repo_path / 'data' / 'raw' / 'offers.json'
offers = spark.read.json(offers_path.as_posix()).toPandas()


In [168]:
# Display the offer catalogue loaded from data/raw/offers.json.
offers

Unnamed: 0,channels,discount_value,duration,id,min_value,offer_type
0,"[email, mobile, social]",10,7.0,ae264e3637204a6fb9bb56bc8210ddfd,10,bogo
1,"[web, email, mobile, social]",10,5.0,4d5c57ea9a6940dd891ad53e9dbe8da0,10,bogo
2,"[web, email, mobile]",0,4.0,3f207df678b143eea3cee63160fa8bed,0,informational
3,"[web, email, mobile]",5,7.0,9b98b8c7a33c4b65b9aebfe6a799e6d9,5,bogo
4,"[web, email]",5,10.0,0b1e1539f2cc45b7b9fa7c272da2e1d7,20,discount
5,"[web, email, mobile, social]",3,7.0,2298d6c36e964ae4a3e7e9706d1fb8c2,7,discount
6,"[web, email, mobile, social]",2,10.0,fafdcd668e3743c1bb461111dcafc2a4,10,discount
7,"[email, mobile, social]",0,3.0,5a8bc65990b245e5a138643cd4eb9837,0,informational
8,"[web, email, mobile, social]",5,5.0,f19421c1d4aa40978ebb69ca19b0e20d,5,bogo
9,"[web, email, mobile]",2,7.0,2906b810c7d4411798c6938adc9daaa5,10,discount


Construindo uma base com todas as ofertas por customer e reconstruindo as features

In [169]:
# Pair every row of the validation base with every offer in the catalogue
# (cartesian product), then expose the catalogue `id` under the name `offer_id`.
# NOTE: this cell reassigns `base_validacao`; re-running it without re-running
# the cell that builds the base would cross-join a second time.
base_validacao = (
    base_validacao
    .merge(offers, how='cross')
    .assign(offer_id=lambda frame: frame['id'])
)

In [170]:
# One boolean column per channel: True when the channel name is present in the
# offer's `channels` collection.
for channel_name in ('email', 'web', 'mobile', 'social'):
    base_validacao[channel_name] = base_validacao['channels'].apply(
        lambda offer_channels, wanted=channel_name: wanted in offer_channels
    )

# Number of channels of the offer.
base_validacao['qtd_canais'] = base_validacao['channels'].apply(len)

In [171]:
# Maximum `target` observed in the training rows for each (account, offer) pair.
past_offer_transaction = (
    df_train
    .groupby(["account_id", "offer_id"])['target']
    .max()
    .reset_index(name='past_offer_conversion')
)

# Attach it to every customer x offer pair; pairs with no training row get 0.
base_validacao = (
    base_validacao
    .merge(past_offer_transaction, how='left', on=['account_id', 'offer_id'])
    .assign(past_offer_conversion=lambda frame: frame['past_offer_conversion'].fillna(0))
)

In [172]:
# Preview the first 9 rows of the customer x offer base.
base_validacao.head(9)

Unnamed: 0,account_id,age,credit_card_limit,event,gender,month_registered,num_past_offers,num_past_viewed,registered_on_cos,registered_on_seno,...,id,min_value,offer_type,offer_id,email,web,mobile,social,qtd_canais,past_offer_conversion
0,0009655768c64bdeb2e877511632db8f,33.0,72000.0,offer received,M,4,4,4.0,-0.333467,0.942762,...,ae264e3637204a6fb9bb56bc8210ddfd,10,bogo,ae264e3637204a6fb9bb56bc8210ddfd,True,False,True,True,3,0.0
1,0009655768c64bdeb2e877511632db8f,33.0,72000.0,offer received,M,4,4,4.0,-0.333467,0.942762,...,4d5c57ea9a6940dd891ad53e9dbe8da0,10,bogo,4d5c57ea9a6940dd891ad53e9dbe8da0,True,True,True,True,4,0.0
2,0009655768c64bdeb2e877511632db8f,33.0,72000.0,offer received,M,4,4,4.0,-0.333467,0.942762,...,3f207df678b143eea3cee63160fa8bed,0,informational,3f207df678b143eea3cee63160fa8bed,True,True,True,False,3,0.0
3,0009655768c64bdeb2e877511632db8f,33.0,72000.0,offer received,M,4,4,4.0,-0.333467,0.942762,...,9b98b8c7a33c4b65b9aebfe6a799e6d9,5,bogo,9b98b8c7a33c4b65b9aebfe6a799e6d9,True,True,True,False,3,0.0
4,0009655768c64bdeb2e877511632db8f,33.0,72000.0,offer received,M,4,4,4.0,-0.333467,0.942762,...,0b1e1539f2cc45b7b9fa7c272da2e1d7,20,discount,0b1e1539f2cc45b7b9fa7c272da2e1d7,True,True,False,False,2,0.0
5,0009655768c64bdeb2e877511632db8f,33.0,72000.0,offer received,M,4,4,4.0,-0.333467,0.942762,...,2298d6c36e964ae4a3e7e9706d1fb8c2,7,discount,2298d6c36e964ae4a3e7e9706d1fb8c2,True,True,True,True,4,0.0
6,0009655768c64bdeb2e877511632db8f,33.0,72000.0,offer received,M,4,4,4.0,-0.333467,0.942762,...,fafdcd668e3743c1bb461111dcafc2a4,10,discount,fafdcd668e3743c1bb461111dcafc2a4,True,True,True,True,4,1.0
7,0009655768c64bdeb2e877511632db8f,33.0,72000.0,offer received,M,4,4,4.0,-0.333467,0.942762,...,5a8bc65990b245e5a138643cd4eb9837,0,informational,5a8bc65990b245e5a138643cd4eb9837,True,False,True,True,3,1.0
8,0009655768c64bdeb2e877511632db8f,33.0,72000.0,offer received,M,4,4,4.0,-0.333467,0.942762,...,f19421c1d4aa40978ebb69ca19b0e20d,5,bogo,f19421c1d4aa40978ebb69ca19b0e20d,True,True,True,True,4,1.0


Fazendo a inferência dos dados de customerXoffer

In [173]:
# Score every customer x offer pair. Column 1 of `predict_proba` is kept
# (assumed to be the positive class -- TODO confirm against model.classes_).
customer_offer_proba = model.predict_proba(base_validacao)
base_validacao['proba'] = customer_offer_proba[:, 1]

Pegando a melhor offer de cada customer

In [174]:
# Best offer per customer: the row with the highest predicted probability
# within each account.
#
# `idxmax` replaces the previous `groupby(...).apply(sort_values(...).head(1))`:
# that form read the grouping column inside `apply`, which pandas deprecates
# (the later column selection on 'account_id' breaks once grouping columns are
# excluded), and it sorted every group just to keep one row.
# Ties now resolve deterministically to the first row of the group.
# Relies on `base_validacao` having unique index labels.
best_offer_row_labels = base_validacao.groupby('account_id')['proba'].idxmax()

# Result: one row per account, ordered by account_id, with columns
# account_id / offer_modelo / proba and a fresh 0..n-1 index.
best_offers_by_customer = (
    base_validacao
    .loc[best_offer_row_labels, ['account_id', 'offer_id', 'proba']]
    .rename(columns={'offer_id': 'offer_modelo'})
    .reset_index(drop=True)
)

  best_offers_by_customer = base_validacao.groupby(['account_id']).apply(lambda x: x.sort_values('proba', ascending=False).head(1))


Mergeando nos dados de test para comparar e agregar valores

In [175]:
# Attach the model's preferred offer (and its probability) to each test row.
# NOTE: this cell reassigns `df_test`; re-running it alone would merge the
# same columns a second time.
df_test = pd.merge(df_test, best_offers_by_customer, how='left', on='account_id')

In [176]:
# True when the offer picked by the model is the offer recorded on the test row.
df_test['offer_iguais'] = df_test['offer_modelo'].eq(df_test['offer_id'])

In [177]:
# Count how many test rows do / do not match the model's preferred offer.
df_test['offer_iguais'].value_counts()

offer_iguais
False    15083
True      1911
Name: count, dtype: int64

In [178]:
# Largest `total_past_amount` seen in the training rows of each account.
valor_transacionado_passado = (
    df_train
    .groupby('account_id')['total_past_amount']
    .max()
    .reset_index(name='amount_passado')
)

In [180]:
# Attach the account's training-set maximum and take the difference to the
# test row's `total_past_amount`.
# Accounts with no training rows get NaN here (left join) and are therefore
# ignored by later sums.
# NOTE(review): if `total_past_amount` is the spend accumulated *before* each
# offer, this difference is the spend between the previous offer and the last
# one, not the spend generated by the last offer -- confirm the intended meaning.
# NOTE: this cell reassigns `df_test`; re-running it alone would merge again.
df_test = (
    df_test
    .merge(valor_transacionado_passado, how='left', on='account_id')
    .assign(
        amount_transacao_offer=lambda frame: frame['total_past_amount'] - frame['amount_passado']
    )
)

In [182]:
# Total `amount_transacao_offer` over the rows where the model's offer matches
# the recorded offer (NaN amounts are skipped by `sum`).
df_test.loc[df_test['offer_iguais'], 'amount_transacao_offer'].sum()

29070.660000000003

Temos 1911 offers em que a escolha do modelo coincide com a oferta efetivamente enviada ao customer; elas somam cerca de 29.070 reais em transações. Para os casos em que o modelo escolheu uma oferta diferente, não temos como medir a diferença entre o ganho que ela traria e o que de fato aconteceu, pois essas ofertas não foram enviadas ao customer; para isso, teríamos de ter um modelo diferente.