In [32]:
## data
import pandas as pd
import numpy as np
import datetime as dt
from dateutil.relativedelta import relativedelta
from unicodedata import normalize

## vis
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
# contagem de aging
from collections import Counter

from IPython.display import display, HTML

## modelo
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, precision_recall_curve
from sklearn.metrics import average_precision_score, mean_absolute_error, roc_curve, auc, roc_auc_score
from scikitplot.metrics import plot_roc
from sklearn.model_selection import cross_val_score

# check xgboost version
from xgboost import XGBClassifier, XGBRegressor

import dill as pickle

## alertas
import warnings
warnings.filterwarnings("ignore")

***
### Funções

In [33]:
def aucur(y_test, preds):
    """Print the ROC AUC of ``preds`` against ``y_test`` and plot the ROC curve.

    Parameters
    ----------
    y_test : array-like
        True labels, passed unchanged to ``roc_auc_score`` and ``roc_curve``.
    preds : array-like
        Scores or predictions, passed unchanged to the same two functions.

    Returns
    -------
    None
        The score is printed and the figure is shown with ``plt.show()``.
    """
    # Area under the ROC curve, reported with three decimals.
    auc_score = roc_auc_score(y_test, preds)
    print('AUC: %.3f' % auc_score)

    # Points of the ROC curve; the thresholds are not needed for the plot.
    false_pos_rate, true_pos_rate, _ = roc_curve(y_test, preds)

    # Dashed diagonal = "no skill" reference line.
    plt.plot([0, 1], [0, 1], linestyle='--')
    # ROC curve of the model, one marker per point.
    plt.plot(false_pos_rate, true_pos_rate, marker='.')
    plt.show()

In [34]:
def just_check_nan(df):
    """Print which columns of ``df`` contain missing values, and how many.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.

    Returns
    -------
    None
        Prints a header, the per-column NaN counts (only columns with at
        least one NaN), and the shape of that filtered result.
    """
    # Number of missing entries in every column.
    nan_per_column = df.isna().sum()
    # Keep only the columns that actually have something missing.
    has_nan = nan_per_column > 0
    affected = nan_per_column[has_nan]

    # Header, the filtered counts, then a blank line.
    print('Dados com NaN:', affected, '', sep='\n')
    # Shape of the filtered counts, followed by a blank line.
    print('columns_with_nan.shape', affected.shape, end='\n\n')

In [35]:
def plot_feature_freq(df, feature):
    """Bar plot of the relative frequency (in %) of each value of a column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``feature``.
    feature : str
        Name of the column to summarise. Missing values are dropped before
        counting.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.

    Raises
    ------
    ValueError
        If the column has no non-missing values, so there is nothing to plot.
    """
    labels, counts = np.unique(df[feature].dropna(), return_counts=True)
    if counts.size == 0:
        raise ValueError("column %r has no non-missing values to plot" % (feature,))

    # Sum over every category instead of assuming a fixed number of them, so
    # the percentages always add up to 100 whatever the column's cardinality.
    percentages = counts / counts.sum() * 100

    plt.figure(figsize=(10, 7))
    # Only the percentage bars are drawn: overlaying the raw counts on the same
    # axes would rescale the y-axis to the counts and make the loop below
    # annotate raw counts as if they were percentages.
    ax = sns.barplot(x=labels, y=percentages)

    # Write the percentage slightly above each bar.
    for p in ax.patches:
        ax.annotate('{:.2f}%'.format(p.get_height()), (p.get_x() + 0.3, p.get_height() + 0.6), rotation=0)

    # Give the axes labels so that the size settings below have a visible effect.
    ax.set(xlabel=feature, ylabel='Porcentagem')
    ax.xaxis.label.set_size(20)
    ax.yaxis.label.set_size(20)
    plt.show()

In [36]:
def plot_feature_freq_mes(df, feature):
    """Bar plot of the relative frequency (in %) of each value of a column.

    Earlier versions summed exactly seven category counts; the total is now
    taken over all categories, so columns with any number of distinct values
    are supported and seven-category columns give the same percentages.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``feature``.
    feature : str
        Name of the column to summarise. Missing values are dropped before
        counting.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.

    Raises
    ------
    ValueError
        If the column has no non-missing values, so there is nothing to plot.
    """
    labels, counts = np.unique(df[feature].dropna(), return_counts=True)
    if counts.size == 0:
        raise ValueError("column %r has no non-missing values to plot" % (feature,))

    # Total over every category present (no fixed category count assumed).
    percentages = counts / counts.sum() * 100

    plt.figure(figsize=(10, 7))
    # Only the percentage bars are drawn: overlaying the raw counts on the same
    # axes would rescale the y-axis to the counts and make the loop below
    # annotate raw counts as if they were percentages.
    ax = sns.barplot(x=labels, y=percentages)

    # Write the percentage slightly above each bar.
    for p in ax.patches:
        ax.annotate('{:.2f}%'.format(p.get_height()), (p.get_x() + 0.3, p.get_height() + 0.6), rotation=0)

    # Give the axes labels so that the size settings below have a visible effect.
    ax.set(xlabel=feature, ylabel='Porcentagem')
    ax.xaxis.label.set_size(20)
    ax.yaxis.label.set_size(20)
    plt.show()

***
## Coleta de dados

In [37]:
# Location of the CSV file that is loaded below with pd.read_csv.
FILE = "/kaggle/input/datasetsnew/ta_predicted.csv"

In [38]:
# Accumulator for the processed chunks; it is filled by the loop that
# iterates over `dados_teste`.
client_chunks = []

# Because `chunksize` is given, read_csv returns an iterator that yields
# DataFrames of up to 100,000 rows instead of loading the whole file at once.
dados_teste = pd.read_csv(
    FILE,
    sep=",",
    header=0,           # first line of the file holds the column names
    index_col=False,    # do not use any column as the index
    names=None,
    low_memory=False,
    chunksize=100_000,
)

In [39]:
# Consume the chunk iterator, removing exact duplicate rows inside each chunk.
# NOTE(review): duplicates are dropped per chunk only; identical rows that fall
# in different chunks survive the concat below — confirm this is intended.
for chunk in dados_teste:
    # (Optional down-sampling of each chunk was disabled here: sample(frac=0.5).)
    new_clients = pd.DataFrame(chunk).drop_duplicates()
    client_chunks.append(new_clients)

# Stack every chunk into a single frame with a fresh 0..n-1 index.
dados = pd.concat(client_chunks).reset_index(drop=True)

In [40]:
# Remove the leftover index column written by a previous to_csv call.
# errors='ignore' keeps this cell re-runnable: on a second execution the
# column is already gone and a plain drop would raise KeyError.
dados = dados.drop(columns=['Unnamed: 0'], errors='ignore')
dados

Unnamed: 0,msisdn,imsi,tac,dt_inicio,dt_fim,cat_aparelho,fabricante,modelo,imei,ranking,...,fl_4g,dt_ano_stts,idade,avg_msisdn,avg_cpf,trocou_aparelho,trocou_aparelho_previsto,aging_aparelho,aging_aparelho_previsto,dt_fim_previsto
0,5588988039599,724118596208177,35946008,2021-05-22,2022-08-06,1,APPLE,IPHONE 7 (A1660),359460083674180,1,...,1,2017,39,14.0,14.0,1,1,14,14,2022-07
1,5598984843818,724119893014054,35301109,2021-07-11,2022-09-15,1,APPLE,IPHONE 8 PLUS (A1864),353011090395470,1,...,1,2017,39,14.0,14.0,1,1,14,14,2022-09
2,5543991172181,724064190886669,35923309,2021-08-23,2022-09-10,1,SAMSUNG,SM-J810M DS,359233092097610,1,...,1,2018,44,12.0,12.0,1,1,12,12,2022-08
3,5551999774680,724065191700977,35616011,2021-04-14,2022-08-20,1,Samsung Korea,GALAXY A31,356160114319960,1,...,1,2020,43,16.0,16.0,1,1,16,16,2022-08
4,5565996867917,724066613532716,86830805,2021-07-06,2022-08-04,1,XIAOMI,REDMI 9A (M2006C3LG),868308057222690,1,...,1,2020,67,12.0,12.0,1,1,12,12,2022-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1003070,5527997164405,724113002524393,35687011,2021-10-20,2022-11-14,1,APPLE,IPHONE 11 (A2111),356870110599300,1,...,1,2020,66,12.0,12.0,1,1,12,12,2022-10
1003071,5551996583605,724065010778518,35715510,2021-02-12,2022-07-31,1,LG Electronics Inc.,LM-X210BMW,357155106284230,1,...,1,2019,43,17.0,17.0,1,1,17,17,2022-07
1003072,5527998979742,724112706384504,35411310,2021-02-17,2022-07-26,1,Multilaser Industrial S.A.,VITA 3G,354113101160480,1,...,1,2018,54,17.0,17.0,1,1,17,17,2022-07
1003073,5554996750264,724065191279991,35556211,2021-04-05,2022-08-26,1,MOTOROLA,FIJI (XT2053-2),355562113384250,1,...,1,2020,39,16.0,16.0,1,1,16,16,2022-08


***
### Tabela Verdade

#### Comparando a tabela verdade de 'avg_msisdn', 'avg_cpf' e 'previsto' com 'aging_aparelho' para as classes

In [76]:
# Per-class precision/recall/F1 with 'aging_aparelho' as the true labels and
# 'aging_aparelho_previsto' (converted to an integer array) as the predictions.
print(classification_report(dados['aging_aparelho'], np.asarray(dados['aging_aparelho_previsto'], dtype = 'int')))

              precision    recall  f1-score   support

          12       0.95      0.87      0.91    182635
          13       0.91      0.86      0.88    163261
          14       0.87      0.87      0.87    150271
          15       0.86      0.87      0.86    136197
          16       0.83      0.88      0.86    128731
          17       0.85      0.89      0.87    122457
          18       0.87      0.92      0.89    119523

    accuracy                           0.88   1003075
   macro avg       0.88      0.88      0.88   1003075
weighted avg       0.88      0.88      0.88   1003075



In [77]:
# Same report, using 'avg_msisdn' (converted to an integer array) as the predictions.
# NOTE(review): if 'avg_msisdn' holds non-integer averages, the int conversion
# truncates toward zero rather than rounding — confirm this is intended.
print(classification_report(dados['aging_aparelho'], np.asarray(dados['avg_msisdn'], dtype = 'int')))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         0
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         0
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         0
          10       0.00      0.00      0.00         0
          11       0.00      0.00      0.00         0
          12       1.00      0.72      0.84    182635
          13       1.00      0.72      0.83    163261
          14       1.00      0.73      0.84    150271
          15       1.00      0.75      0.86    136197
          16       1.00      0.74      0.85    128731
          17       1.00    

In [78]:
# Same report, using 'avg_cpf' (converted to an integer array) as the predictions.
# NOTE(review): if 'avg_cpf' holds non-integer averages, the int conversion
# truncates toward zero rather than rounding — confirm this is intended.
print(classification_report(dados['aging_aparelho'], np.asarray(dados['avg_cpf'], dtype = 'int')))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         0
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         0
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         0
          10       0.00      0.00      0.00         0
          11       0.00      0.00      0.00         0
          12       0.92      0.64      0.76    182635
          13       0.92      0.64      0.76    163261
          14       0.91      0.65      0.76    150271
          15       0.89      0.67      0.76    136197
          16       0.90      0.66      0.76    128731
          17       0.92    