In [1]:
## data
import datetime as dt
from collections import Counter
from unicodedata import normalize

import numpy as np
import pandas as pd
from dateutil.relativedelta import relativedelta

## vis
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns

from IPython.display import display, HTML

## modelo
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

## alertas
import warnings
warnings.filterwarnings("ignore")

***
## Coleta de dados

In [None]:
# Directory containing the input CSV; a relative path, so it is resolved against
# the kernel's current working directory.
PATH = "../datasets/"

In [None]:
FILE = "batdtfim_finaledited.csv"

# Table qualifier that prefixes the column names in the export.
SOURCE_PREFIX = "batdtfim_anlz6."

# Columns whose qualifier is stripped. Listing them explicitly (rather than stripping
# the prefix from every column) keeps the rename limited to exactly these names.
RENAMED_COLUMNS = [
    'msisdn', 'imsi', 'tac',
    'dt_inicio', 'dt_fim',
    'aging_aparelho', 'cat_aparelho', 'trocou_aparelho',
    'fabricante', 'modelo', 'imei', 'ranking',
    'cpfidentifier', 'subscriberkey', 'customerkey',
    'customersubtypedesc', 'subscriberstatuskey',
    'offertechnology', 'citydesc', 'statedesc',
    'subscribermainofferdesc', 'assignedchargeamount',
    'fl_3g', 'fl_4g', 'fl_5g',
    'dt_ano_stts',
]

# Semicolon-separated file in ISO-8859-1, parsed with the Python engine.
dados = pd.read_csv(PATH + FILE, sep=';', encoding='ISO-8859-1', engine='python')

# One rename with a full mapping replaces 26 chained .rename() calls, each of which
# produced an intermediate copy of the whole frame. Keys absent from the frame are
# ignored by rename, exactly as in the chained version.
dados = dados.rename(columns={SOURCE_PREFIX + name: name for name in RENAMED_COLUMNS})


dados

### Limpeza dos dados

In [None]:
def just_check_nan(df):
    """Print a report of the columns in ``df`` that contain missing values.

    The report lists, for each column with at least one NaN/None, how many
    missing entries it has, followed by the shape of that listing (i.e. how
    many columns are affected). Nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.
    """
    # Per-column missing counts, keeping only the columns that have any.
    columns_with_nan = df.isna().sum().loc[lambda counts: counts > 0]

    print('Dados com NaN:', columns_with_nan, '', sep='\n')
    print('columns_with_nan.shape', columns_with_nan.shape, end='\n\n')

In [None]:
# Report which columns of the raw frame contain missing values, before any rows are dropped.
just_check_nan(dados)

In [None]:
# Rows missing any of these fields are discarded.
# NOTE(review): the target 'aging_aparelho' is not in this list — confirm it never has NaN.
required_cols = ['statedesc', 'subscribermainofferdesc', 'assignedchargeamount',
                 'fl_3g', 'fl_4g', 'fl_5g', 'dt_ano_stts']

# reset_index() renumbers the rows from 0; because drop=True is not passed, the
# previous row labels are kept in a new 'index' column.
dados_droped = dados.dropna(subset=required_cols).reset_index()

In [None]:
# Re-run the missing-value report on the filtered frame to see which columns still contain NaN.
just_check_nan(dados_droped)

***
## Exploração dos dados

### seleção de features

In [None]:
# Columns used as model features.
FEATURE_COLS = ['cat_aparelho', 'trocou_aparelho',
                'fabricante', 'modelo',
                'ranking',
                'customersubtypedesc', 'subscriberstatuskey',
                'offertechnology',
                'statedesc', 'subscribermainofferdesc',
                'assignedchargeamount', 'fl_3g', 'fl_4g', 'fl_5g',
                'dt_ano_stts']

# .copy() makes X an independent frame. Later cells add result columns to X; doing
# that on a plain column selection of dados_droped triggers pandas'
# SettingWithCopyWarning, which this notebook silences via warnings.filterwarnings.
X = dados_droped[FEATURE_COLS].copy()
X

#### Variáveis Categóricas

In [None]:
# Categorical features are taken to be the columns whose dtype is 'object'.
s = X.dtypes.eq('object')
object_cols = s.index[s.to_numpy()].tolist()
print("Categorical variables: ", len(object_cols))
print(object_cols)

#### Variáveis Numéricas

In [None]:
# Build the list of numeric features: columns stored as int64 or float64.
# NOTE(review): columns of any other dtype (e.g. bool, int32) land in neither this
# list nor object_cols — confirm none of the features fall through.
numerical_cols = [
    name
    for name, col_dtype in X.dtypes.items()
    if col_dtype in ['int64', 'float64']
]
print("Numerical variables: ", len(numerical_cols))
print(numerical_cols)

***
## Mostrando a correlação entre os dados numéricos

In [None]:
# Correlation is computed over the numeric features only: X also holds object-typed
# columns, and DataFrame.corr() raises on those in pandas >= 2.0 (numeric_only
# defaults to False there).
corr = X[numerical_cols].corr()

fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns, annot=True, ax=ax);

***
## Definição do Target

In [None]:
# Regression target: the 'aging_aparelho' column of the filtered frame, so it shares
# its row index with X.
y = dados_droped['aging_aparelho']
y

***
## Modelagem

In [None]:
# Count how many rows carry each distinct target (aging) value.
c = Counter(y)
c

In [None]:
# Split features and target: 70% of the rows for training, 30% held out for validation.
# The fixed random_state keeps the split reproducible across runs.
split_options = {'test_size': 0.30, 'random_state': 2048}
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

In [None]:
# Pre-processing for numeric data: a single standardisation step.
numerical_transformer = Pipeline(steps=[('scaler', StandardScaler())])

In [None]:
# Pre-processing for categorical data: one-hot encoding. handle_unknown='ignore'
# lets transform() accept categories that were not present when the encoder was fitted.
#
# `sparse=False` is intentionally not passed: that argument was renamed to
# `sparse_output` in scikit-learn 1.2 and removed in 1.4, so it raises a TypeError on
# current releases. Leaving the default (sparse output) works on every version, and
# RandomForestRegressor accepts sparse input.
categorical_transformer = Pipeline([
                                    ('onehot', OneHotEncoder(handle_unknown='ignore'))
                                   ])

In [None]:
# Bundle the numeric and categorical pre-processing: each transformer is applied
# only to its own list of columns.
column_steps = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, object_cols),
]
preprocessor = ColumnTransformer(transformers=column_steps, n_jobs=-1)

In [None]:
# Full model: pre-processing followed by a random-forest regressor with a fixed seed.
forest = RandomForestRegressor(random_state=2048, n_jobs=-1)
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', forest)])

In [None]:
# Fit the whole pipeline (pre-processing steps and the model) on the training split.
pipeline.fit(X_train, y_train)

In [None]:
# Run the validation split through the fitted pipeline and keep integer predictions.
# NOTE(review): the integer cast truncates the fractional part instead of rounding
# to the nearest value — confirm this is intended.
preds = pipeline.predict(X_val).astype('int')

In [None]:
# Display the integer predictions for the validation split.
preds

***
## Métricas

***
## Mesclando resultados com tabela

### Com a base de validação

In [None]:
# Attach the true target next to the validation features (X_val and y_val come from
# the same train_test_split call, so their row labels match).
X_val['a.aging_aparelho'] = y_val

In [None]:
# Add the model predictions; preds is a plain array, so it is assigned positionally,
# in the same row order that X_val had when it was passed to predict().
X_val['previsto'] = preds
X_val

In [None]:
# Bring the customer identifier back next to the predictions.
# The value is read from dados_droped['cpfidentifier']: the raw frame has no
# ' a.cpfidentifier  ' column after the rename (KeyError), and only dados_droped
# shares X_val's row labels, since its index was rebuilt by reset_index().
# The destination column name is kept unchanged.
X_val[' a.cpfidentifier  '] = dados_droped['cpfidentifier']
X_val

In [None]:
# Bring the start date back next to the predictions.
# The value is read from dados_droped['dt_inicio']: the raw frame has no
# ' a.dt_inicio  ' column after the rename (KeyError), and only dados_droped
# shares X_val's row labels, since its index was rebuilt by reset_index().
# The destination column name is kept unchanged because the predicted-date cell reads it.
X_val[' a.dt_inicio  '] = dados_droped['dt_inicio']
X_val

In [None]:
# Bring the actual end date back next to the predictions.
# The value is read from dados_droped['dt_fim']: the raw frame has no
# '  a.dt_fim   ' column after the rename (KeyError), and only dados_droped
# shares X_val's row labels, since its index was rebuilt by reset_index().
# The destination column name is kept unchanged.
X_val['  a.dt_fim   '] = dados_droped['dt_fim']
X_val

## Calculando data prevista

In [None]:
# Predicted end date: convert the start date to a monthly period, shift it by the
# predicted number of periods (months), then convert back to a timestamp.
# NOTE(review): to_datetime is left to infer the date format — confirm it matches the source data.
start_month_val = pd.to_datetime(X_val[' a.dt_inicio  ']).dt.to_period('M')
predicted_end_val = start_month_val + X_val['previsto']
X_val['a.dt_fim_previsto'] = predicted_end_val.dt.to_timestamp()
X_val

***
## Teste com a base toda

In [None]:
# Run the full feature frame (rows used for training as well as validation) through
# the fitted pipeline and keep integer predictions.
# NOTE(review): the integer cast truncates the fractional part instead of rounding
# to the nearest value — confirm this is intended.
preds_Test = pipeline.predict(X).astype('int')
preds_Test

In [None]:
# Attach the true target next to the features (X and y are both taken from
# dados_droped, so their row labels match).
X['a.aging_aparelho'] = y

In [None]:
# Add the full-base predictions; preds_Test is a plain array, so it is assigned
# positionally, in the row order X had when it was passed to predict().
X['previsto'] = preds_Test
X

In [None]:
# Bring the customer identifier back next to the predictions.
# The value is read from dados_droped['cpfidentifier']: the raw frame has no
# ' a.cpfidentifier  ' column after the rename (KeyError), and only dados_droped
# shares X's row labels, since its index was rebuilt by reset_index().
# The destination column name is kept unchanged.
X[' a.cpfidentifier  '] = dados_droped['cpfidentifier']
X

In [None]:
# Bring the start date back next to the predictions.
# The value is read from dados_droped['dt_inicio']: the raw frame has no
# ' a.dt_inicio  ' column after the rename (KeyError), and only dados_droped
# shares X's row labels, since its index was rebuilt by reset_index().
# The destination column name is kept unchanged because the predicted-date cell reads it.
X[' a.dt_inicio  '] = dados_droped['dt_inicio']
X

In [None]:
# Bring the actual end date back next to the predictions.
# The value is read from dados_droped['dt_fim']: the raw frame has no
# '  a.dt_fim   ' column after the rename (KeyError), and only dados_droped
# shares X's row labels, since its index was rebuilt by reset_index().
# The destination column name is kept unchanged.
X['  a.dt_fim   '] = dados_droped['dt_fim']
X

## Calculando data prevista

In [None]:
# Predicted end date: convert the start date to a monthly period, shift it by the
# predicted number of periods (months), then convert back to a timestamp.
# NOTE(review): to_datetime is left to infer the date format — confirm it matches the source data.
start_month_all = pd.to_datetime(X[' a.dt_inicio  ']).dt.to_period('M')
predicted_end_all = start_month_all + X['previsto']
X['a.dt_fim_previsto'] = predicted_end_all.dt.to_timestamp()
X