In [1]:
## data
import pandas as pd
import numpy as np
import datetime as dt
from collections import Counter
from dateutil.relativedelta import relativedelta
from unicodedata import normalize

## vis
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns

from IPython.display import display, HTML

## model
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

## warnings
import warnings
# NOTE(review): this silences every warning for the rest of the session, which
# can hide pandas chained-assignment warnings and library deprecation notices
# raised by later cells — consider narrowing the filter by category or module.
warnings.filterwarnings("ignore")

***
## Coleta de dados

In [2]:
# Directory holding the input spreadsheets; relative, so it is resolved against
# the working directory the notebook is run from.
PATH = "../datasets/"

In [3]:
# Workbook to load from PATH.
# NOTE(review): the numeric suffix of the file name equals the a.cpfidentifier
# value shown in later outputs — presumably a customer CPF (personal data);
# confirm before sharing the notebook or its outputs.
FILE = "tabletest39043283878.xlsx"
dados = pd.read_excel(PATH + FILE)

# Display the loaded frame
dados

Unnamed: 0,a.msisdn,a.imsi,a.tac,a.dt_inicio,a.dt_fim,a.aging_aparelho,a.cat_aparelho,a.trocou_aparelho,a.fabricante,a.modelo,...,a.subscriberstatuskey,a.offertechnology,a.citydesc,a.statedesc,a.subscribermainofferdesc,a.assignedchargeamount,a.fl_3g,a.fl_4g,a.fl_5g,a.dt_ano_stts
0,5511910004238,724102902565740,35414810,2021-10-29,2022-04-28,5,0,1,"Motorola Mobility LLC, a Lenovo Company",TROIKA,...,A,Controle,,SAO PAULO,VIVO CONTROLE 4GB IV,32.49,0,1,0,2019
1,5511910951409,724101996552176,35421087,2022-01-30,,8,0,0,GREYMARKET,HANDSET,...,A,Controle,,SAO PAULO,VIVO CONTROLE 4GB,37.61,0,1,0,2021
2,5511910004238,724102302106781,35421087,2022-04-28,2022-05-24,0,0,1,GREYMARKET,HANDSET,...,A,Controle,,SAO PAULO,VIVO CONTROLE 4GB IV,32.49,0,1,0,2021
3,5511910004238,724102302106781,86583405,2022-05-24,,4,0,1,XIAOMI,REDMI 9C (M2006C3MG),...,A,Controle,,SAO PAULO,VIVO CONTROLE 4GB IV,32.49,0,1,0,2020


### Limpeza dos dados

In [4]:
def just_check_nan(df):
    """Print a report of the columns in ``df`` that contain missing values.

    The report lists each affected column with its count of missing entries
    (as detected by ``DataFrame.isna``), followed by the shape of that listing.
    Nothing is returned; the function only writes to standard output.
    """
    # Per-column count of missing entries, reduced to columns that have any
    nan_totals = df.isna().sum()
    columns_with_nan = nan_totals.loc[nan_totals > 0]

    # Header, the per-column counts, and a blank separator line
    print('Dados com NaN:', columns_with_nan, '', sep='\n')
    # Shape line followed by one blank line
    print('columns_with_nan.shape', columns_with_nan.shape, end='\n\n')

In [5]:
# Report columns with missing values in the raw frame.
# NOTE(review): the report comes back empty although a.dt_fim and a.citydesc
# look blank in the displayed table — presumably they hold blank strings, which
# isna() does not flag; verify against the spreadsheet.
just_check_nan(dados)

Dados com NaN:
Series([], dtype: int64)

columns_with_nan.shape (0,)



***
## Análise dos dados

In [6]:
# List the column labels. They carry leading/trailing spaces, so every later
# cell has to spell them exactly as shown here.
dados.columns

Index(['    a.msisdn    ', '      a.imsi      ', '   a.tac   ',
       ' a.dt_inicio  ', '  a.dt_fim   ', ' a.aging_aparelho  ',
       ' a.cat_aparelho  ', ' a.trocou_aparelho  ',
       '               a.fabricante               ', '       a.modelo        ',
       '      a.imei      ', ' a.ranking  ', ' a.cpfidentifier  ',
       ' a.subscriberkey  ', ' a.customerkey  ', ' a.customersubtypedesc  ',
       ' a.subscriberstatuskey  ', ' a.offertechnology  ', ' a.citydesc  ',
       ' a.statedesc  ', ' a.subscribermainofferdesc  ',
       ' a.assignedchargeamount  ', ' a.fl_3g  ', ' a.fl_4g  ', ' a.fl_5g  ',
       ' a.dt_ano_stts  '],
      dtype='object')

In [7]:
# Optional profiling report, currently disabled.
# NOTE(review): ProfileReport is not imported in the import cell; it needs an
# import before these lines can be re-enabled.
# dados_analysis = ProfileReport(dados, title="Profiling Report", progress_bar=False, infer_dtypes=False)
# profile_td_html = dados_analysis.to_html()

In [8]:
# display(HTML(profile_td_html))

***
## Exploração dos dados

### Seleção de features

In [9]:
# define features
X = dados[[' a.cat_aparelho  ', ' a.trocou_aparelho  ',
           '               a.fabricante               ', '       a.modelo        ',
           ' a.ranking  ', 
           ' a.customersubtypedesc  ', ' a.subscriberstatuskey  ', 
           ' a.offertechnology  ', 
           ' a.citydesc  ', ' a.statedesc  ', ' a.subscribermainofferdesc  ',
           ' a.assignedchargeamount  ', ' a.fl_3g  ', ' a.fl_4g  ', ' a.fl_5g  ',
           ' a.dt_ano_stts  ']]
X

Unnamed: 0,a.cat_aparelho,a.trocou_aparelho,a.fabricante,a.modelo,a.ranking,a.customersubtypedesc,a.subscriberstatuskey,a.offertechnology,a.citydesc,a.statedesc,a.subscribermainofferdesc,a.assignedchargeamount,a.fl_3g,a.fl_4g,a.fl_5g,a.dt_ano_stts
0,0,1,"Motorola Mobility LLC, a Lenovo Company",TROIKA,1,Pessoa Física,A,Controle,,SAO PAULO,VIVO CONTROLE 4GB IV,32.49,0,1,0,2019
1,0,0,GREYMARKET,HANDSET,1,Pessoa Física,A,Controle,,SAO PAULO,VIVO CONTROLE 4GB,37.61,0,1,0,2021
2,0,1,GREYMARKET,HANDSET,2,Pessoa Física,A,Controle,,SAO PAULO,VIVO CONTROLE 4GB IV,32.49,0,1,0,2021
3,0,1,XIAOMI,REDMI 9C (M2006C3MG),3,Pessoa Física,A,Controle,,SAO PAULO,VIVO CONTROLE 4GB IV,32.49,0,1,0,2020


#### Variáveis Categóricas

In [10]:
# Collect the labels of the object-dtype (categorical) feature columns
s = X.dtypes.eq('object')                     # boolean mask, one entry per column
object_cols = s.index[s.to_numpy()].tolist()  # labels where the mask is True
print("Categorical variables: ", len(object_cols))
print(object_cols)

Categorical variables:  8
['               a.fabricante               ', '       a.modelo        ', ' a.customersubtypedesc  ', ' a.subscriberstatuskey  ', ' a.offertechnology  ', ' a.citydesc  ', ' a.statedesc  ', ' a.subscribermainofferdesc  ']


#### Variáveis Numéricas

In [11]:
# Collect the labels of the int64/float64 (numeric) feature columns
numerical_cols = [
    cname
    for cname, col_dtype in X.dtypes.items()
    if col_dtype in ['int64', 'float64']
]
print("Numerical variables: ", len(numerical_cols))
print(numerical_cols)

Numerical variables:  8
[' a.cat_aparelho  ', ' a.trocou_aparelho  ', ' a.ranking  ', ' a.assignedchargeamount  ', ' a.fl_3g  ', ' a.fl_4g  ', ' a.fl_5g  ', ' a.dt_ano_stts  ']


***
## Mostrando a correlação entre os dados numéricos

In [12]:
# Correlation heatmap of the features, currently disabled.
# NOTE(review): X also holds object-dtype columns (see the categorical list
# above); recent pandas versions presumably need X[numerical_cols].corr() or
# numeric_only=True here — verify before re-enabling.
# corr = X.corr()

# fig, ax = plt.subplots(figsize=(20,10))
# sns.heatmap(corr, xticklabels = corr.columns, yticklabels = corr.columns, annot = True)

***
## Definição do Target

In [13]:
# Regression target: the a.aging_aparelho column of the raw frame
# (padded label, exactly as it appears in dados.columns).
y = dados[' a.aging_aparelho  ']
y

0    5
1    8
2    0
3    4
Name:  a.aging_aparelho  , dtype: int64

***
## Modelagem

In [14]:
# Frequency of each aging value in the target.
# Counter is imported in the notebook's import cell rather than here, so all
# dependencies are declared in one place.
c = Counter(y)
c

Counter({5: 1, 8: 1, 0: 1, 4: 1})

In [15]:
# Feature columns that go into training
X.columns

Index([' a.cat_aparelho  ', ' a.trocou_aparelho  ',
       '               a.fabricante               ', '       a.modelo        ',
       ' a.ranking  ', ' a.customersubtypedesc  ', ' a.subscriberstatuskey  ',
       ' a.offertechnology  ', ' a.citydesc  ', ' a.statedesc  ',
       ' a.subscribermainofferdesc  ', ' a.assignedchargeamount  ',
       ' a.fl_3g  ', ' a.fl_4g  ', ' a.fl_5g  ', ' a.dt_ano_stts  '],
      dtype='object')

In [16]:
# Split features and target: 70% for training, 30% for validation.
# The fixed random_state makes the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.30, random_state=2048
)

In [17]:
# Preprocessing for numeric columns: standardise each feature
numerical_transformer = Pipeline([('scaler', StandardScaler())])

In [18]:
# Preprocessing for categorical columns: dense one-hot encoding that ignores
# categories not seen during fit.
#
# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name. Try the current keyword first and fall back to
# the legacy one on older releases (an unknown keyword raises TypeError), so
# the notebook runs on either side of the rename.
try:
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

categorical_transformer = Pipeline([
                                    ('onehot', onehot_encoder)
                                   ])

In [19]:
# Combined preprocessing: numeric columns go through the scaler pipeline,
# categorical columns through the one-hot pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, object_cols),
    ],
    n_jobs=-1,
)

In [20]:
# Full modelling pipeline: preprocessing followed by a random-forest regressor
# (seeded for reproducibility).
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', RandomForestRegressor(random_state=2048, n_jobs=-1)),
    ]
)

In [21]:
# Fit the preprocessing steps and the model on the training split
pipeline.fit(X_train, y_train)

In [22]:
# Predict on the validation split. The cast to int drops the fractional part
# of each regression output (truncation, not rounding).
# NOTE(review): confirm truncation is intended rather than rounding to nearest.
preds = pipeline.predict(X_val).astype('int')

In [23]:
# Show the integer predictions for the validation rows
preds

array([5, 6])

***
## Métricas

***
## Mesclando resultados com tabela

### Com a base de validação

In [24]:
# Attach the true target to the validation features for side-by-side comparison.
# NOTE(review): this mutates X_val in place and uses an unpadded label
# ('a.aging_aparelho'), unlike the padded source column ' a.aging_aparelho  '.
X_val['a.aging_aparelho'] = y_val

In [25]:
# Add the model's integer predictions next to the true values, then show the frame
X_val['previsto'] = preds
X_val

Unnamed: 0,a.cat_aparelho,a.trocou_aparelho,a.fabricante,a.modelo,a.ranking,a.customersubtypedesc,a.subscriberstatuskey,a.offertechnology,a.citydesc,a.statedesc,a.subscribermainofferdesc,a.assignedchargeamount,a.fl_3g,a.fl_4g,a.fl_5g,a.dt_ano_stts,a.aging_aparelho,previsto
0,0,1,"Motorola Mobility LLC, a Lenovo Company",TROIKA,1,Pessoa Física,A,Controle,,SAO PAULO,VIVO CONTROLE 4GB IV,32.49,0,1,0,2019,5,5
2,0,1,GREYMARKET,HANDSET,2,Pessoa Física,A,Controle,,SAO PAULO,VIVO CONTROLE 4GB IV,32.49,0,1,0,2021,0,6


In [26]:
# Bring the customer identifier back from the raw frame. The assignment aligns
# on the row index, so only the validation rows are picked up.
# NOTE(review): a.cpfidentifier looks like personal data (CPF) — confirm before
# sharing this output.
X_val[' a.cpfidentifier  '] = dados[' a.cpfidentifier  ']
X_val

Unnamed: 0,a.cat_aparelho,a.trocou_aparelho,a.fabricante,a.modelo,a.ranking,a.customersubtypedesc,a.subscriberstatuskey,a.offertechnology,a.citydesc,a.statedesc,a.subscribermainofferdesc,a.assignedchargeamount,a.fl_3g,a.fl_4g,a.fl_5g,a.dt_ano_stts,a.aging_aparelho,previsto,a.cpfidentifier
0,0,1,"Motorola Mobility LLC, a Lenovo Company",TROIKA,1,Pessoa Física,A,Controle,,SAO PAULO,VIVO CONTROLE 4GB IV,32.49,0,1,0,2019,5,5,39043283878
2,0,1,GREYMARKET,HANDSET,2,Pessoa Física,A,Controle,,SAO PAULO,VIVO CONTROLE 4GB IV,32.49,0,1,0,2021,0,6,39043283878


In [27]:
# Bring the start date back from the raw frame (aligned on the row index)
X_val[' a.dt_inicio  '] = dados[' a.dt_inicio  ']
X_val

Unnamed: 0,a.cat_aparelho,a.trocou_aparelho,a.fabricante,a.modelo,a.ranking,a.customersubtypedesc,a.subscriberstatuskey,a.offertechnology,a.citydesc,a.statedesc,a.subscribermainofferdesc,a.assignedchargeamount,a.fl_3g,a.fl_4g,a.fl_5g,a.dt_ano_stts,a.aging_aparelho,previsto,a.cpfidentifier,a.dt_inicio
0,0,1,"Motorola Mobility LLC, a Lenovo Company",TROIKA,1,Pessoa Física,A,Controle,,SAO PAULO,VIVO CONTROLE 4GB IV,32.49,0,1,0,2019,5,5,39043283878,2021-10-29
2,0,1,GREYMARKET,HANDSET,2,Pessoa Física,A,Controle,,SAO PAULO,VIVO CONTROLE 4GB IV,32.49,0,1,0,2021,0,6,39043283878,2022-04-28


In [28]:
# Bring the actual end date back from the raw frame (aligned on the row index)
X_val['  a.dt_fim   '] = dados['  a.dt_fim   ']
X_val

Unnamed: 0,a.cat_aparelho,a.trocou_aparelho,a.fabricante,a.modelo,a.ranking,a.customersubtypedesc,a.subscriberstatuskey,a.offertechnology,a.citydesc,a.statedesc,...,a.assignedchargeamount,a.fl_3g,a.fl_4g,a.fl_5g,a.dt_ano_stts,a.aging_aparelho,previsto,a.cpfidentifier,a.dt_inicio,a.dt_fim
0,0,1,"Motorola Mobility LLC, a Lenovo Company",TROIKA,1,Pessoa Física,A,Controle,,SAO PAULO,...,32.49,0,1,0,2019,5,5,39043283878,2021-10-29,2022-04-28
2,0,1,GREYMARKET,HANDSET,2,Pessoa Física,A,Controle,,SAO PAULO,...,32.49,0,1,0,2021,0,6,39043283878,2022-04-28,2022-05-24


## Calculando data prevista

In [29]:
# Predicted end date: reduce a.dt_inicio to its month, add `previsto` months,
# then convert back to a timestamp at the start of the resulting month.
# NOTE(review): the day of month is discarded (e.g. 2021-10-29 with previsto=5
# gives 2022-03-01) — confirm this is intended.
X_val['a.dt_fim_previsto'] = ((pd.to_datetime(X_val[' a.dt_inicio  ']).dt.to_period('M')) + X_val['previsto']).dt.to_timestamp()
X_val

Unnamed: 0,a.cat_aparelho,a.trocou_aparelho,a.fabricante,a.modelo,a.ranking,a.customersubtypedesc,a.subscriberstatuskey,a.offertechnology,a.citydesc,a.statedesc,...,a.fl_3g,a.fl_4g,a.fl_5g,a.dt_ano_stts,a.aging_aparelho,previsto,a.cpfidentifier,a.dt_inicio,a.dt_fim,a.dt_fim_previsto
0,0,1,"Motorola Mobility LLC, a Lenovo Company",TROIKA,1,Pessoa Física,A,Controle,,SAO PAULO,...,0,1,0,2019,5,5,39043283878,2021-10-29,2022-04-28,2022-03-01
2,0,1,GREYMARKET,HANDSET,2,Pessoa Física,A,Controle,,SAO PAULO,...,0,1,0,2021,0,6,39043283878,2022-04-28,2022-05-24,2022-10-01


***
## Teste com a base toda

In [30]:
# Predict on the full dataset. X includes the rows the model was trained on,
# so these values are not an out-of-sample estimate. The cast to int drops the
# fractional part of each prediction (truncation, not rounding).
preds_Test = pipeline.predict(X).astype('int')
preds_Test

array([5, 7, 6, 5])

In [31]:
# Attach the true target to the full feature frame for comparison.
# NOTE(review): this mutates X in place. From here on X also holds non-feature
# columns, so re-running the earlier cells that derive column lists or the
# train/validation split from X would pick them up — re-run from the top instead.
X['a.aging_aparelho'] = y

In [32]:
# Add the full-dataset integer predictions next to the true values, then show the frame
X['previsto'] = preds_Test
X

Unnamed: 0,a.cat_aparelho,a.trocou_aparelho,a.fabricante,a.modelo,a.ranking,a.customersubtypedesc,a.subscriberstatuskey,a.offertechnology,a.citydesc,a.statedesc,a.subscribermainofferdesc,a.assignedchargeamount,a.fl_3g,a.fl_4g,a.fl_5g,a.dt_ano_stts,a.aging_aparelho,previsto
0,0,1,"Motorola Mobility LLC, a Lenovo Company",TROIKA,1,Pessoa Física,A,Controle,,SAO PAULO,VIVO CONTROLE 4GB IV,32.49,0,1,0,2019,5,5
1,0,0,GREYMARKET,HANDSET,1,Pessoa Física,A,Controle,,SAO PAULO,VIVO CONTROLE 4GB,37.61,0,1,0,2021,8,7
2,0,1,GREYMARKET,HANDSET,2,Pessoa Física,A,Controle,,SAO PAULO,VIVO CONTROLE 4GB IV,32.49,0,1,0,2021,0,6
3,0,1,XIAOMI,REDMI 9C (M2006C3MG),3,Pessoa Física,A,Controle,,SAO PAULO,VIVO CONTROLE 4GB IV,32.49,0,1,0,2020,4,5


In [33]:
# Bring the customer identifier back from the raw frame (aligned on the row index).
# NOTE(review): a.cpfidentifier looks like personal data (CPF) — confirm before
# sharing this output.
X[' a.cpfidentifier  '] = dados[' a.cpfidentifier  ']
X

Unnamed: 0,a.cat_aparelho,a.trocou_aparelho,a.fabricante,a.modelo,a.ranking,a.customersubtypedesc,a.subscriberstatuskey,a.offertechnology,a.citydesc,a.statedesc,a.subscribermainofferdesc,a.assignedchargeamount,a.fl_3g,a.fl_4g,a.fl_5g,a.dt_ano_stts,a.aging_aparelho,previsto,a.cpfidentifier
0,0,1,"Motorola Mobility LLC, a Lenovo Company",TROIKA,1,Pessoa Física,A,Controle,,SAO PAULO,VIVO CONTROLE 4GB IV,32.49,0,1,0,2019,5,5,39043283878
1,0,0,GREYMARKET,HANDSET,1,Pessoa Física,A,Controle,,SAO PAULO,VIVO CONTROLE 4GB,37.61,0,1,0,2021,8,7,39043283878
2,0,1,GREYMARKET,HANDSET,2,Pessoa Física,A,Controle,,SAO PAULO,VIVO CONTROLE 4GB IV,32.49,0,1,0,2021,0,6,39043283878
3,0,1,XIAOMI,REDMI 9C (M2006C3MG),3,Pessoa Física,A,Controle,,SAO PAULO,VIVO CONTROLE 4GB IV,32.49,0,1,0,2020,4,5,39043283878


In [34]:
# Bring the start date back from the raw frame (aligned on the row index)
X[' a.dt_inicio  '] = dados[' a.dt_inicio  ']
X

Unnamed: 0,a.cat_aparelho,a.trocou_aparelho,a.fabricante,a.modelo,a.ranking,a.customersubtypedesc,a.subscriberstatuskey,a.offertechnology,a.citydesc,a.statedesc,a.subscribermainofferdesc,a.assignedchargeamount,a.fl_3g,a.fl_4g,a.fl_5g,a.dt_ano_stts,a.aging_aparelho,previsto,a.cpfidentifier,a.dt_inicio
0,0,1,"Motorola Mobility LLC, a Lenovo Company",TROIKA,1,Pessoa Física,A,Controle,,SAO PAULO,VIVO CONTROLE 4GB IV,32.49,0,1,0,2019,5,5,39043283878,2021-10-29
1,0,0,GREYMARKET,HANDSET,1,Pessoa Física,A,Controle,,SAO PAULO,VIVO CONTROLE 4GB,37.61,0,1,0,2021,8,7,39043283878,2022-01-30
2,0,1,GREYMARKET,HANDSET,2,Pessoa Física,A,Controle,,SAO PAULO,VIVO CONTROLE 4GB IV,32.49,0,1,0,2021,0,6,39043283878,2022-04-28
3,0,1,XIAOMI,REDMI 9C (M2006C3MG),3,Pessoa Física,A,Controle,,SAO PAULO,VIVO CONTROLE 4GB IV,32.49,0,1,0,2020,4,5,39043283878,2022-05-24


In [35]:
# Bring the actual end date back from the raw frame (aligned on the row index);
# it is blank for rows 1 and 3 in the displayed output.
X['  a.dt_fim   '] = dados['  a.dt_fim   ']
X

Unnamed: 0,a.cat_aparelho,a.trocou_aparelho,a.fabricante,a.modelo,a.ranking,a.customersubtypedesc,a.subscriberstatuskey,a.offertechnology,a.citydesc,a.statedesc,...,a.assignedchargeamount,a.fl_3g,a.fl_4g,a.fl_5g,a.dt_ano_stts,a.aging_aparelho,previsto,a.cpfidentifier,a.dt_inicio,a.dt_fim
0,0,1,"Motorola Mobility LLC, a Lenovo Company",TROIKA,1,Pessoa Física,A,Controle,,SAO PAULO,...,32.49,0,1,0,2019,5,5,39043283878,2021-10-29,2022-04-28
1,0,0,GREYMARKET,HANDSET,1,Pessoa Física,A,Controle,,SAO PAULO,...,37.61,0,1,0,2021,8,7,39043283878,2022-01-30,
2,0,1,GREYMARKET,HANDSET,2,Pessoa Física,A,Controle,,SAO PAULO,...,32.49,0,1,0,2021,0,6,39043283878,2022-04-28,2022-05-24
3,0,1,XIAOMI,REDMI 9C (M2006C3MG),3,Pessoa Física,A,Controle,,SAO PAULO,...,32.49,0,1,0,2020,4,5,39043283878,2022-05-24,


## Calculando data prevista

In [37]:
# Predicted end date: reduce a.dt_inicio to its month, add `previsto` months,
# then convert back to a timestamp at the start of the resulting month.
# NOTE(review): the day of month is discarded (e.g. 2022-01-30 with previsto=7
# gives 2022-08-01) — confirm this is intended.
X['a.dt_fim_previsto'] = ((pd.to_datetime(X[' a.dt_inicio  ']).dt.to_period('M')) + X['previsto']).dt.to_timestamp()
X

Unnamed: 0,a.cat_aparelho,a.trocou_aparelho,a.fabricante,a.modelo,a.ranking,a.customersubtypedesc,a.subscriberstatuskey,a.offertechnology,a.citydesc,a.statedesc,...,a.fl_3g,a.fl_4g,a.fl_5g,a.dt_ano_stts,a.aging_aparelho,previsto,a.cpfidentifier,a.dt_inicio,a.dt_fim,a.dt_fim_previsto
0,0,1,"Motorola Mobility LLC, a Lenovo Company",TROIKA,1,Pessoa Física,A,Controle,,SAO PAULO,...,0,1,0,2019,5,5,39043283878,2021-10-29,2022-04-28,2022-03-01
1,0,0,GREYMARKET,HANDSET,1,Pessoa Física,A,Controle,,SAO PAULO,...,0,1,0,2021,8,7,39043283878,2022-01-30,,2022-08-01
2,0,1,GREYMARKET,HANDSET,2,Pessoa Física,A,Controle,,SAO PAULO,...,0,1,0,2021,0,6,39043283878,2022-04-28,2022-05-24,2022-10-01
3,0,1,XIAOMI,REDMI 9C (M2006C3MG),3,Pessoa Física,A,Controle,,SAO PAULO,...,0,1,0,2020,4,5,39043283878,2022-05-24,,2022-10-01
