In [1]:
## data
import pandas as pd
import numpy as np
import datetime as dt
from dateutil.relativedelta import relativedelta
from unicodedata import normalize

## vis
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
# contagem de aging
from collections import Counter

from IPython.display import display, HTML

## modelo
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, precision_recall_curve
from sklearn.metrics import average_precision_score, mean_absolute_error, roc_curve, auc, roc_auc_score
from scikitplot.metrics import plot_roc
from sklearn.model_selection import cross_val_score

# check xgboost version
from xgboost import XGBClassifier, XGBRegressor

import dill as pickle

## alertas
import warnings
warnings.filterwarnings("ignore")

***
### Funções

In [2]:
def aucur(y_test, preds):
    """Print the ROC AUC for a set of predictions and plot the ROC curve.

    Parameters
    ----------
    y_test : array-like
        True labels, forwarded to ``roc_auc_score`` and ``roc_curve``.
    preds : array-like
        Predicted scores, forwarded to the same two functions.

    Returns
    -------
    None
        The AUC is printed and the figure is rendered with ``plt.show()``.
    """
    # Report the area under the curve with three decimals.
    auc_value = roc_auc_score(y_test, preds)
    print('AUC: %.3f' % auc_value)

    # The thresholds returned alongside the curve points are not used here.
    false_pos_rate, true_pos_rate, _ = roc_curve(y_test, preds)

    # Dashed diagonal: reference line of a no-skill classifier.
    plt.plot([0, 1], [0, 1], linestyle='--')
    # ROC curve of the evaluated predictions.
    plt.plot(false_pos_rate, true_pos_rate, marker='.')
    plt.show()

In [3]:
def just_check_nan(df):
    """Print the columns of ``df`` that contain missing values.

    The report lists the NaN count of every column with at least one missing
    value, followed by the shape of that listing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    None
        The report is written to standard output.
    """
    # Per-column number of missing entries.
    nan_counts = df.isna().sum()
    # Keep only the columns that actually have gaps.
    nan_columns = nan_counts.loc[nan_counts.gt(0)]

    print('Dados com NaN:')
    # end='\n\n' emits the blank separator line after each section.
    print(nan_columns, end='\n\n')
    print('columns_with_nan.shape', nan_columns.shape, end='\n\n')

In [4]:
def plot_feature_freq(df, feature):
    """Plot the relative frequency (in percent) of each value of a column.

    Missing values are dropped before counting. Every bar is annotated with
    its percentage, formatted with two decimals.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the column to summarise.
    feature : str
        Name of the column whose distinct values are counted.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    labels, counts = np.unique(df[feature].dropna(), return_counts=True)
    # Sum every category so the percentages are correct for any number of
    # distinct values, not only for binary features.
    total = counts.sum()
    percentages = (counts / total) * 100

    plt.figure(figsize=(10, 7))
    # Draw only the percentage bars: a second bar plot of raw counts on the
    # same axes would also be picked up by the annotation loop below and be
    # labelled as percentages.
    ax = sns.barplot(x=labels, y=percentages)

    for p in ax.patches:
        ax.annotate('{:.2f}%'.format(p.get_height()), (p.get_x() + 0.3, p.get_height() + 0.6), rotation=0)

    ax.xaxis.label.set_size(20)
    ax.yaxis.label.set_size(20)
    plt.show()

In [5]:
def plot_feature_freq_mes(df, feature):
    """Plot the relative frequency (in percent) of each value of a column.

    Missing values are dropped before counting. Every bar is annotated with
    its percentage, formatted with two decimals. Works for any number of
    distinct values in the column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the column to summarise.
    feature : str
        Name of the column whose distinct values are counted.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    labels, counts = np.unique(df[feature].dropna(), return_counts=True)
    # Sum all counts instead of indexing a fixed number of categories, which
    # raised IndexError with fewer values and under-counted with more.
    total = counts.sum()
    percentages = (counts / total) * 100

    plt.figure(figsize=(10, 7))
    # Draw only the percentage bars: a second bar plot of raw counts on the
    # same axes would also be picked up by the annotation loop below and be
    # labelled as percentages.
    ax = sns.barplot(x=labels, y=percentages)

    for p in ax.patches:
        ax.annotate('{:.2f}%'.format(p.get_height()), (p.get_x() + 0.3, p.get_height() + 0.6), rotation=0)

    ax.xaxis.label.set_size(20)
    ax.yaxis.label.set_size(20)
    plt.show()

***
## Coleta de dados

In [6]:
# Path of the input CSV read by the cells below (a Kaggle input mount, judging
# by the prefix — adjust when running elsewhere).
FILE = "/kaggle/input/datasetsnew/ta_predicted_apple.csv"

In [7]:
# Collects the chunks produced while the CSV is streamed in the next cell.
client_chunks = []

# Chunked reader: rows are parsed 100,000 at a time as the reader is iterated.
dados_teste = pd.read_csv(
    FILE,
    sep=",",
    header=0,
    names=None,
    index_col=False,
    low_memory=False,
    chunksize=100_000,
)

In [8]:
# Deduplicate each chunk as it streams in so fewer rows are held in memory.
for chunk in dados_teste:
    client_chunks.append(chunk.drop_duplicates())

# Per-chunk deduplication cannot see rows repeated across chunk boundaries,
# so deduplicate once more on the combined frame before renumbering the index.
dados = (
    pd.concat(client_chunks)
    .drop_duplicates()
    .reset_index(drop=True)
)

In [9]:
# Drop the 'Unnamed: 0' column (presumably the row index saved with the CSV).
# errors='ignore' keeps this cell re-runnable after the column is already gone.
# NOTE(review): the frame displayed below includes subscriber identifiers
# (msisdn, imsi, imei) — clear or redact the output before sharing.
dados = dados.drop(columns=['Unnamed: 0'], errors='ignore')
dados

Unnamed: 0,msisdn,imsi,tac,dt_inicio,dt_fim,cat_aparelho,fabricante,modelo,imei,ranking,...,idade,avg_msisdn,avg_cpf,peso_aparelho,update_software,trocou_aparelho,trocou_aparelho_previsto,aging_aparelho,aging_aparelho_previsto,dt_fim_previsto
0,5551995977894,724065005997730,35291811,2020-08-07,2022-09-12,2,APPLE,IPHONE 11 (A2221),3.529181e+14,1,...,43,25.0,25.0,2221,1,1,1,25,25,2022-09
1,5521999048600,724112005300513,35735809,2019-01-13,2022-10-09,3,APPLE,IPHONE XR (A2105),3.573581e+14,1,...,54,44.0,12.0,2105,1,1,1,44,44,2022-09
2,5537999859602,724233608728525,35926706,2020-01-29,2022-10-21,2,APPLE,IPHONE 5S (A1457),3.592671e+14,1,...,58,32.0,32.0,1457,0,1,1,32,32,2022-09
3,5551998169677,724065007435855,35307409,2018-12-12,2022-07-15,3,APPLE,IPHONE 7 (A1778),3.530741e+14,1,...,40,22.5,22.5,1778,0,1,1,43,45,2022-09
4,5554999161144,724065303571480,35676308,2019-06-12,2022-10-05,3,APPLE,IPHONE 8 (A1905),3.567631e+14,1,...,55,39.0,39.0,1905,1,1,1,39,39,2022-09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
414301,5511971419892,724101002363049,35330207,2018-10-14,2022-09-12,3,APPLE,IPHONE 6S PLUS TD-LTE (A1634),3.533021e+14,1,...,68,46.0,46.0,1634,0,1,1,46,46,2022-08
414302,5531971280177,724234290903081,35645210,2020-10-10,2022-07-24,1,APPLE,IPHONE XR (A2105),3.564521e+14,1,...,59,21.0,21.0,2105,1,1,1,21,21,2022-07
414303,5511981118011,724101703573155,35298911,2020-08-27,2022-08-12,1,APPLE,IPHONE 11 (A2221),3.529891e+14,1,...,70,23.0,23.0,2221,1,1,1,23,23,2022-07
414304,5527996100063,724112703043033,35491409,2020-07-16,2022-08-04,2,APPLE,IPHONE 7 (A1660),3.549141e+14,1,...,56,24.0,24.0,1660,0,1,1,24,24,2022-07


***
### Tabela Verdade

#### Comparando a tabela verdade de 'avg_msisdn', 'avg_cpf' e 'aging_aparelho_previsto' com 'aging_aparelho' para as classes

In [10]:
# Per-class precision/recall of the predicted aging against the observed aging.
print(
    classification_report(
        y_true=dados['aging_aparelho'],
        y_pred=np.asarray(dados['aging_aparelho_previsto'], dtype=int),
    )
)

              precision    recall  f1-score   support

          12       0.88      0.95      0.91     23936
          13       0.88      0.92      0.90     20839
          14       0.87      0.93      0.90     20158
          15       0.88      0.94      0.90     18777
          16       0.85      0.94      0.89     17369
          17       0.87      0.94      0.90     16961
          18       0.88      0.93      0.90     16225
          19       0.91      0.93      0.92     16019
          20       0.89      0.95      0.92     16525
          21       0.92      0.93      0.92     16577
          22       0.90      0.96      0.93     15558
          23       0.94      0.95      0.94     14344
          24       0.94      0.86      0.90     12366
          25       0.96      0.94      0.95     11819
          26       0.98      0.87      0.93     11126
          27       0.96      0.95      0.95      9440
          28       0.98      0.87      0.92      8966
          29       0.97    

In [11]:
# Per-class precision/recall of 'avg_msisdn' used as a prediction of the aging.
# NOTE(review): the integer cast truncates fractional averages (e.g. 22.5 -> 22);
# confirm that truncation rather than rounding is intended.
print(
    classification_report(
        y_true=dados['aging_aparelho'],
        y_pred=np.asarray(dados['avg_msisdn'], dtype=int),
    )
)

              precision    recall  f1-score   support

           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         0
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         0
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         0
          10       0.00      0.00      0.00         0
          11       0.00      0.00      0.00         0
          12       0.86      0.82      0.84     23936
          13       0.86      0.82      0.84     20839
          14       0.88      0.82      0.85     20158
          15       0.87      0.83      0.85     18777
          16       0.84      0.83      0.84     17369
          17       0.87      0.84      0.85     16961
          18       0.90    

In [12]:
# Per-class precision/recall of 'avg_cpf' used as a prediction of the aging.
# NOTE(review): the integer cast truncates fractional averages (e.g. 22.5 -> 22);
# confirm that truncation rather than rounding is intended.
print(
    classification_report(
        y_true=dados['aging_aparelho'],
        y_pred=np.asarray(dados['avg_cpf'], dtype=int),
    )
)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         0
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         0
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         0
          10       0.00      0.00      0.00         0
          11       0.00      0.00      0.00         0
          12       0.72      0.65      0.68     23936
          13       0.71      0.67      0.69     20839
          14       0.71      0.67      0.69     20158
          15       0.69      0.67      0.68     18777
          16       0.67      0.66      0.67     17369
          17       0.69    