In [None]:
## data
import pandas as pd
import numpy as np
import datetime as dt
from dateutil.relativedelta import relativedelta
from unicodedata import normalize

## vis
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
# contagem de aging
from collections import Counter

from IPython.display import display, HTML

## modelo
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, precision_recall_curve
from sklearn.metrics import average_precision_score, mean_absolute_error, roc_curve, auc, roc_auc_score
from scikitplot.metrics import plot_roc
from sklearn.model_selection import cross_val_score

# check xgboost version
from xgboost import XGBClassifier, XGBRegressor

import dill as pickle

## alertas
import warnings
warnings.filterwarnings("ignore")

In [None]:
# %pip install xgboost==1.5.0

***
### Funções

In [None]:
def aucur(y_test, preds):
    """Print the ROC AUC of ``preds`` against ``y_test`` and plot the ROC curve.

    Parameters
    ----------
    y_test : array-like
        True labels, forwarded unchanged to ``roc_auc_score`` and ``roc_curve``.
    preds : array-like
        Predicted scores (or labels), forwarded unchanged to the same functions.
        NOTE(review): passing hard 0/1 labels yields a curve with a single
        operating point; scores/probabilities give a meaningful curve.

    Returns
    -------
    None
        The AUC is printed and the figure is shown with ``plt.show()``.
    """
    # Named differently from the function so the local does not shadow it.
    auc_value = roc_auc_score(y_test, preds)
    print('AUC: %.3f' % auc_value)

    # Only the rates are plotted; the thresholds are not needed.
    fpr, tpr, _ = roc_curve(y_test, preds)

    # Diagonal = no-skill reference line.
    plt.plot([0, 1], [0, 1], linestyle='--', label='No skill')
    plt.plot(fpr, tpr, marker='.', label='Model')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve')
    plt.legend()
    plt.show()

In [None]:
def just_check_nan(df):
    """Print how many NaN values each column of ``df`` holds.

    Only columns with at least one NaN are listed. The listing is followed by
    the shape of that filtered Series. Nothing is returned.
    """
    nan_totals = df.isna().sum()
    affected = nan_totals.loc[nan_totals > 0]

    # Header, the per-column counts, then a blank separator line.
    for item in ('Dados com NaN:', affected, ''):
        print(item)
    print('columns_with_nan.shape', affected.shape)
    print()

In [None]:
def plot_feature_freq(df, feature):
    """Plot the relative frequency (in %) of every distinct value of ``df[feature]``.

    NaN values are dropped before counting. One bar is drawn per distinct
    value and each bar is annotated with its percentage.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to summarise.
    feature : str
        Name of the column whose value frequencies are plotted.

    Raises
    ------
    ValueError
        If the column has no non-NaN values.

    Notes
    -----
    The percentage base is the total of all categories, so any number of
    distinct values is supported. Only the percentage bars are drawn: a second
    layer of raw-count bars on the same axes would hide them and would be
    annotated as percentages.
    """
    labels, counts = np.unique(df[feature].dropna(), return_counts=True)
    if counts.size == 0:
        raise ValueError("column %r has no non-NaN values to plot" % (feature,))

    percentages = counts / counts.sum() * 100

    plt.figure(figsize=(10, 7))
    ax = sns.barplot(x=labels, y=percentages)

    # Centre each label horizontally on its bar, just above the bar top.
    for p in ax.patches:
        ax.annotate('{:.2f}%'.format(p.get_height()),
                    (p.get_x() + p.get_width() / 2, p.get_height()),
                    ha='center', va='bottom', rotation=0)

    ax.set(xlabel=feature, ylabel='Porcentagem (%)')
    ax.xaxis.label.set_size(20)
    ax.yaxis.label.set_size(20)
    plt.show()

In [None]:
def plot_feature_freq_mes(df, feature):
    """Plot the relative frequency (in %) of every distinct value of ``df[feature]``.

    NaN values are dropped before counting. One bar is drawn per distinct
    value and each bar is annotated with its percentage.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to summarise.
    feature : str
        Name of the column whose value frequencies are plotted.

    Raises
    ------
    ValueError
        If the column has no non-NaN values.

    Notes
    -----
    The percentage base is the total of all categories. Summing a fixed
    number of leading categories would overstate every percentage when more
    categories are present and would fail when fewer are. Only the percentage
    bars are drawn: a second layer of raw-count bars on the same axes would
    hide them and would be annotated as percentages.
    """
    labels, counts = np.unique(df[feature].dropna(), return_counts=True)
    if counts.size == 0:
        raise ValueError("column %r has no non-NaN values to plot" % (feature,))

    percentages = counts / counts.sum() * 100

    plt.figure(figsize=(10, 7))
    ax = sns.barplot(x=labels, y=percentages)

    # Centre each label horizontally on its bar, just above the bar top.
    for p in ax.patches:
        ax.annotate('{:.2f}%'.format(p.get_height()),
                    (p.get_x() + p.get_width() / 2, p.get_height()),
                    ha='center', va='bottom', rotation=0)

    ax.set(xlabel=feature, ylabel='Porcentagem (%)')
    ax.xaxis.label.set_size(20)
    ax.yaxis.label.set_size(20)
    plt.show()

***
## Coleta de dados

In [None]:
# Input CSV for this notebook (read further down with ';' as separator).
# NOTE(review): absolute Kaggle path; adjust when running elsewhere.
FILE = "/kaggle/input/bigdata/pca_tem_clientes_devicemanger_bat2_prod.csv"

In [None]:
# Accumulates the de-duplicated chunks produced by the loading loop.
client_chunks = list()

# With ``chunksize`` set, read_csv hands back an iterator of DataFrames
# (100,000 rows at a time) instead of a single DataFrame.
csv_options = dict(
    sep=";",
    header=0,
    index_col=False,
    names=None,
    low_memory=False,
    chunksize=100_000,
)
dados_teste = pd.read_csv(FILE, **csv_options)

In [None]:
# Every item yielded by the reader is already a DataFrame. De-duplicating it
# right away keeps the list of chunks held in memory as small as possible.
for chunk in dados_teste:
    client_chunks.append(chunk.drop_duplicates())

# Per-chunk de-duplication cannot see a row repeated in two different chunks,
# so a final pass over the concatenated frame is needed; otherwise the result
# would depend on the chunk size.
# NOTE(review): this pass needs memory proportional to the full frame.
dados = (
    pd.concat(client_chunks)
    .drop_duplicates()
    .reset_index(drop=True)
)

In [None]:
# Strip the ``a.`` alias prefix from the exported column names. Two columns
# also get a different name: ``a.aging_year`` -> ``cat_aparelho`` and
# ``a.age`` -> ``idade``.
# NOTE(review): the first key has a leading space (' a.msisdn'); it is kept
# exactly as it was -- confirm it matches the CSV header.
column_renames = {
    ' a.msisdn': 'msisdn',
    'a.imsi': 'imsi',
    'a.tac': 'tac',
    'a.dt_inicio': 'dt_inicio',
    'a.dt_fim': 'dt_fim',
    'a.aging_aparelho': 'aging_aparelho',
    'a.aging_year': 'cat_aparelho',
    'a.trocou_aparelho': 'trocou_aparelho',
    'a.fabricante': 'fabricante',
    'a.modelo': 'modelo',
    'a.imei': 'imei',
    'a.ranking': 'ranking',
    'a.cpfidentifier': 'cpfidentifier',
    'a.subscriberkey': 'subscriberkey',
    'a.customerkey': 'customerkey',
    'a.customersubtypedesc': 'customersubtypedesc',
    'a.subscriberstatuskey': 'subscriberstatuskey',
    'a.offertechnology': 'offertechnology',
    'a.citydesc': 'citydesc',
    'a.statedesc': 'statedesc',
    'a.subscribermainofferdesc': 'subscribermainofferdesc',
    'a.assignedchargeamount': 'assignedchargeamount',
    'a.fl_3g': 'fl_3g',
    'a.fl_4g': 'fl_4g',
    'a.fl_5g': 'fl_5g',
    'a.dt_ano_stts': 'dt_ano_stts',
    'a.age': 'idade',
    'a.avg_msisdn': 'avg_msisdn',
    'a.avg_cpf': 'avg_cpf',
}

# A single rename copies the frame once instead of once per column. Keys that
# are not present are ignored, so re-running this cell is harmless.
dados = dados.rename(columns=column_renames)

dados

In [None]:
# Show the column names to check the result of the rename.
dados.columns

### Limpeza dos dados

In [None]:
# Rows whose 'peso_aparelho' equals one of these device names are discarded,
# and so are rows whose offer technology is VOIP. Missing values in either
# column are kept.
# NOTE(review): 'peso_aparelho' is compared against model names -- confirm
# this is the intended column.
excluded_devices = [
    'IPHONE4',
    'IPHONE3G',
    'IPHONE',
    'IPHONE3GS',
    'DIGIPHONE K6700I',
    'GENIPHONE 9 MINI',
]
is_excluded_device = dados['peso_aparelho'].isin(excluded_devices)
is_voip = dados['offertechnology'] == 'VOIP'
dados = dados[~is_excluded_device & ~is_voip]

In [None]:
# Keep only rows with aging_aparelho of at least 12.
# NOTE(review): unit is presumably months (the predicted aging is later added
# to a monthly period) -- confirm.
dados = dados[dados['aging_aparelho'] >= 12]

In [None]:
# Keep only rows with aging_aparelho of at most 48.
dados = dados[dados['aging_aparelho'] <= 48]

In [None]:
# Report which columns still contain NaN after the row filters.
just_check_nan(dados)

In [None]:
# A row is kept only when every one of these columns is filled in.
required_columns = [
    'imsi',
    'statedesc',
    'subscribermainofferdesc',
    'assignedchargeamount',
    'fl_3g',
    'fl_4g',
    'dt_ano_stts',
    'idade',
    'customersubtypedesc',
]
# The index is rebuilt so it runs 0..n-1 again after rows were removed.
dados_droped = dados.dropna(subset=required_columns).reset_index(drop=True)

In [None]:
# Report the NaN counts that remain after dropping incomplete rows.
just_check_nan(dados_droped)

### Mudando tipagem

In [None]:
# Class balance of the target before the dtype conversions.
plot_feature_freq(dados_droped, 'trocou_aparelho')

In [None]:
# Integer dtypes for the columns that were guaranteed non-null by the dropna.
# fl_5g is not converted here (it is not part of the dropna subset either).
small_int_dtypes = {'idade': 'int16', 'fl_3g': 'int8', 'fl_4g': 'int8'}

dados_droped['dt_ano_stts'] = dados_droped['dt_ano_stts'].astype(np.int64).reset_index(drop=True)
for column, dtype in small_int_dtypes.items():
    dados_droped[column] = np.asarray(dados_droped[column], dtype=dtype)

dados_droped

In [None]:
# Class balance of the target again, after the dtype conversions.
plot_feature_freq(dados_droped, 'trocou_aparelho')

In [None]:
# Distinct values of dt_ano_stts after the cast to int64.
dados_droped['dt_ano_stts'].unique()

***
### Load Model

In [None]:
# Load the persisted "trocou_aparelho" pipeline so it can score the new data.
# SECURITY: ``pickle`` is dill in this notebook (``import dill as pickle``).
# Unpickling can execute arbitrary code, so only load model files that come
# from a trusted source.
filename = '/kaggle/input/models/model_trocou_aparelho_apple.pk'
with open(filename, mode='rb') as model_file:
    pipeline_ta = pickle.load(model_file)

In [None]:
# Target: whether the device was switched.
y_ta = dados_droped['trocou_aparelho']

# Model input: every remaining column of dados_droped.
X_ta = dados_droped.drop(columns='trocou_aparelho')

In [None]:
# Predict trocou_aparelho for every row with the loaded pipeline.
# NOTE(review): X_ta carries every non-target column, identifiers included;
# presumably the pipeline selects the columns it needs -- confirm.
preds_ta = pipeline_ta.predict(X_ta)

In [None]:
# Number of rows per actual class.
c_val_ta = Counter(y_ta)
c_val_ta

In [None]:
# Number of rows per predicted class.
c_preds_ta = Counter(preds_ta)
c_preds_ta

***
## Métricas

In [None]:
# ROC/AUC need a continuous score: with hard class predictions the curve has
# a single operating point and the AUC understates the model's ranking quality.
# Column 1 of predict_proba is the second entry of the classifier's classes_,
# assumed to be label 1 (the "switched" class) -- confirm.
scores_ta = pipeline_ta.predict_proba(X_ta)[:, 1]
aucur(y_ta, scores_ta)

In [None]:
# Per-class precision / recall / F1 of the trocou_aparelho predictions.
print(classification_report(y_ta, preds_ta))

In [None]:
# The matrix has one row and one column per label found in EITHER the actual
# or the predicted values. Using that same sorted union for the matrix, the
# index and the columns keeps the frame valid when a class is missing from one
# of the two (labelling rows from y and columns from preds separately would
# raise a shape mismatch).
labels_ta = np.union1d(y_ta, preds_ta)
cm_df_ta = pd.DataFrame(
    confusion_matrix(y_ta, preds_ta, labels=labels_ta),
    index=labels_ta,      # rows: actual class
    columns=labels_ta,    # columns: predicted class
)
cm_df_ta

In [None]:
# Plot the confusion matrix twice: raw counts, then normalised per true class.
titles_options = [
    ("Confusion matrix, without normalization", None),
    ("Normalized confusion matrix", "true"),
]
# The loop variable is not called ``normalize`` so it does not overwrite the
# ``normalize`` imported from unicodedata in the first cell.
for title, normalize_mode in titles_options:
    # The figure size must be set before drawing; a plt.figure() call made
    # after the plots would only create an extra, empty figure.
    fig, ax = plt.subplots(figsize=(18, 18))
    # Reuse the predictions already computed instead of running the pipeline
    # over the whole base again for each plot.
    disp = ConfusionMatrixDisplay.from_predictions(
        y_ta,
        preds_ta,
        display_labels=['Não Trocou', 'Trocou'],
        cmap=plt.cm.Blues,
        normalize=normalize_mode,
        ax=ax,
    )
    disp.ax_.set_title(title)

    print(title)
    print(disp.confusion_matrix)

plt.show()

***
***
***
## Captação dos clientes que foram previstos como Propensos a Fazer uma Compra

Base Original

In [None]:
# Put the actual target back next to the features.
# NOTE: from here on X_ta is no longer a pure feature matrix.
X_ta['trocou_aparelho'] = y_ta

In [None]:
# Class balance of the actual target. plot_feature_freq opens its own figure,
# so creating one here beforehand would only leave an empty figure behind.
plot_feature_freq(X_ta, 'trocou_aparelho')

Base Prevista

In [None]:
# Store the predicted class next to the actual one.
X_ta['trocou_aparelho_previsto'] = preds_ta

In [None]:
# Class balance of the predictions. plot_feature_freq opens its own figure,
# so creating one here beforehand would only leave an empty figure behind.
plot_feature_freq(X_ta, 'trocou_aparelho_previsto')

In [None]:
# Copy the customer identifier from the cleaned base.
# NOTE(review): X_ta was built from dados_droped by dropping only the target,
# so it should already hold this column; this looks redundant unless the
# pipeline modified X_ta in place -- confirm.
X_ta['cpfidentifier'] = dados_droped['cpfidentifier']

In [None]:
# Copy the period start date from the cleaned base.
# NOTE(review): likely already present in X_ta (same origin frame) -- confirm.
X_ta['dt_inicio'] = dados_droped['dt_inicio']

In [None]:
# Columns copied over from the cleaned base, in this order. fl_5g is left out.
# NOTE(review): X_ta was derived from dados_droped by dropping only the
# target, so these columns should already be there -- confirm whether the
# copy is still needed.
carried_columns = [
    'dt_fim',
    'modelo',
    'subscribermainofferdesc',
    'assignedchargeamount',
    'offertechnology',
    'statedesc',
    'fl_3g',
    'fl_4g',
    'idade',
    'avg_msisdn',
    'avg_cpf',
]
for column in carried_columns:
    X_ta[column] = dados_droped[column]

X_ta

In [None]:
# Share of rows whose predicted class equals the actual class.
# NOTE(review): labelled "Validation", but y_ta / preds_ta cover the whole
# loaded base, not a held-out split.
val_acc_ta = accuracy_score(y_ta, preds_ta)
print('Validation Acuracy:', val_acc_ta)

***
***
***
## Separando as bases para outro modelo

In [None]:
# Only the customers predicted to switch device (class 1) go on to the aging
# model; the index is rebuilt so it runs 0..n-1 for the selected rows.
predicted_switch = X_ta['trocou_aparelho_previsto'] == 1
base_to_pred_month = X_ta.loc[predicted_switch].reset_index(drop=True)
base_to_pred_month

***
***
***
## Definição do Target aging_aparelho

In [None]:
# Target of the second model: the device aging of the predicted switchers.
y = base_to_pred_month['aging_aparelho']

In [None]:
# Number of rows per aging value. Counter is already imported in the
# notebook's import cell, so it is not imported again here.
c = Counter(y)
c

In [None]:
# Distribution of the aging target. plot_feature_freq_mes opens its own
# figure, so creating one here beforehand would only leave an empty figure.
plot_feature_freq_mes(base_to_pred_month, 'aging_aparelho')

***
### Load Model

In [None]:
# Load the persisted aging pipeline so it can score the selected customers.
# SECURITY: ``pickle`` is dill in this notebook (``import dill as pickle``).
# Unpickling can execute arbitrary code, so only load model files that come
# from a trusted source.
filename_ag = '/kaggle/input/models/model_aging_apple.pk'
with open(filename_ag, mode='rb') as model_file:
    pipeline = pickle.load(model_file)

In [None]:
# Target: the device aging.
y = base_to_pred_month['aging_aparelho']

# Model input: every remaining column of base_to_pred_month.
X = base_to_pred_month.drop(columns='aging_aparelho')

In [None]:
# Predict the aging class for every selected customer.
# NOTE(review): X still carries the actual and predicted trocou_aparelho
# columns; presumably the pipeline selects the columns it needs -- confirm.
preds = pipeline.predict(X)

In [None]:
# Number of rows per actual aging value.
c_val = Counter(y)
c_val

In [None]:
# Number of rows per predicted aging value.
c_preds = Counter(preds)
c_preds

***
## Métricas

In [None]:
# Per-class ROC curves of the aging model; micro/macro averages are switched off.
# NOTE(review): assumes the columns of y_probas line up with the classes
# present in y -- confirm against pipeline.classes_.
y_probas = pipeline.predict_proba(X)
roc_options = dict(plot_micro=False, plot_macro=False, figsize=(8, 8))
plot_roc(y, y_probas, **roc_options)
plt.show()

In [None]:
# Per-class precision / recall / F1 of the aging predictions.
print(classification_report(y, preds))

In [None]:
# The matrix has one row and one column per label found in EITHER the actual
# or the predicted values. With many aging classes it is likely that some
# appear in only one of the two; using the same sorted union for the matrix,
# the index and the columns keeps the frame valid (labelling rows from y and
# columns from preds separately would raise a shape mismatch).
aging_labels = np.union1d(y, preds)
cm_df = pd.DataFrame(
    confusion_matrix(y, preds, labels=aging_labels),
    index=aging_labels,      # rows: actual aging
    columns=aging_labels,    # columns: predicted aging
)
cm_df

## Plot confusion matrices (raw counts and normalized)

In [None]:
# Distinct aging values in the target, in order of first appearance (not sorted).
y.unique()

In [None]:
# Plot the confusion matrix twice: raw counts, then normalised per true class.
titles_options = [
    ("Confusion matrix, without normalization", None),
    ("Normalized confusion matrix", "true"),
]
# Matrix rows/columns follow the SORTED union of actual and predicted labels.
# The tick labels must use that same order; y.unique() is in order of first
# appearance and would mislabel the axes.
aging_plot_labels = np.union1d(y, preds)

# The loop variable is not called ``normalize`` so it does not overwrite the
# ``normalize`` imported from unicodedata in the first cell.
for title, normalize_mode in titles_options:
    # The figure size must be set before drawing; a plt.figure() call made
    # after the plots would only create an extra, empty figure.
    fig, ax = plt.subplots(figsize=(25, 25))
    # Reuse the predictions already computed instead of running the pipeline
    # again for each plot.
    disp = ConfusionMatrixDisplay.from_predictions(
        y,
        preds,
        labels=aging_plot_labels,
        display_labels=aging_plot_labels,
        cmap=plt.cm.Blues,
        normalize=normalize_mode,
        ax=ax,
    )
    disp.ax_.set_title(title)

    print(title)
    print(disp.confusion_matrix)

plt.show()

***
## Mesclando resultados com tabela

### Com a base de validação

In [None]:
# Put the actual aging back next to the features.
# NOTE: from here on X is no longer a pure feature matrix.
X['aging_aparelho'] = y

In [None]:
# Store the predicted aging next to the actual one.
X['previsto'] = preds

In [None]:
# Copy the customer identifier from the selected base.
# NOTE(review): X was built from base_to_pred_month by dropping only the
# target, so it should already hold this column -- confirm.
X['cpfidentifier'] = base_to_pred_month['cpfidentifier']

In [None]:
# Copy the period start date from the selected base.
# NOTE(review): likely already present in X (same origin frame) -- confirm.
X['dt_inicio'] = base_to_pred_month['dt_inicio']

In [None]:
# Copy the period end date from the selected base.
# NOTE(review): likely already present in X (same origin frame) -- confirm.
X['dt_fim'] = base_to_pred_month['dt_fim']

***
## Calculando data prevista

In [None]:
# Month in which each device period started.
start_month = pd.to_datetime(X['dt_inicio']).dt.to_period('M')

# Predicted end month = start month shifted by the predicted aging.
# NOTE(review): assumes 'previsto' holds integer month counts -- confirm.
X['dt_fim_previsto'] = start_month + X['previsto']
X

In [None]:
# Share of rows whose predicted aging equals the actual aging.
# NOTE(review): labelled "Validation", but y / preds cover every customer
# selected by the first model, not a held-out split.
val_acc = accuracy_score(y, preds)
print('Validation Acuracy:', val_acc)