# Desenvolvendo um modelo para predizer a rotatividade

A operadora de comunicações Interconnect gostaria de ser capaz de predizer a rotatividade de seus clientes. Se for descoberto que um usuário está planejando trocar de operadora, a empresa oferecerá-lhe códigos promocionais e opções de plano especiais. A equipe de marketing da Interconnect coletou alguns dados pessoais da sua clientela, incluindo a informação sobre seus planos e contratos.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats as st
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Carregue o arquivo com os dados em um DataFrame
df_contract = pd.read_csv('/datasets/final_provider/contract.csv')
df_internet = pd.read_csv('/datasets/final_provider/internet.csv')
df_personal = pd.read_csv('/datasets/final_provider/personal.csv')
df_phone = pd.read_csv('/datasets/final_provider/phone.csv')

In [3]:
df_contract = df_contract.merge(df_internet, on='customerID', how='left')
df_contract = df_contract.merge(df_personal, on='customerID', how='left')
df = df_contract.merge(df_phone, on='customerID', how='left')

In [4]:
df['target'] = df['EndDate'].apply(lambda x: 1 if x == 'No' else 0)

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
df_train, df_test = train_test_split(df,stratify=df['target'], test_size=0.1, random_state=42)

In [7]:
df

Unnamed: 0,customerID,BeginDate,EndDate,Type,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,gender,SeniorCitizen,Partner,Dependents,MultipleLines,target
0,7590-VHVEG,2020-01-01,No,Month-to-month,Yes,Electronic check,29.85,29.85,DSL,No,...,No,No,No,No,Female,0,Yes,No,,1
1,5575-GNVDE,2017-04-01,No,One year,No,Mailed check,56.95,1889.5,DSL,Yes,...,Yes,No,No,No,Male,0,No,No,No,1
2,3668-QPYBK,2019-10-01,2019-12-01 00:00:00,Month-to-month,Yes,Mailed check,53.85,108.15,DSL,Yes,...,No,No,No,No,Male,0,No,No,No,0
3,7795-CFOCW,2016-05-01,No,One year,No,Bank transfer (automatic),42.30,1840.75,DSL,Yes,...,Yes,Yes,No,No,Male,0,No,No,,1
4,9237-HQITU,2019-09-01,2019-11-01 00:00:00,Month-to-month,Yes,Electronic check,70.70,151.65,Fiber optic,No,...,No,No,No,No,Female,0,No,No,No,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,2018-02-01,No,One year,Yes,Mailed check,84.80,1990.5,DSL,Yes,...,Yes,Yes,Yes,Yes,Male,0,Yes,Yes,Yes,1
7039,2234-XADUH,2014-02-01,No,One year,Yes,Credit card (automatic),103.20,7362.9,Fiber optic,No,...,Yes,No,Yes,Yes,Female,0,Yes,Yes,Yes,1
7040,4801-JZAZL,2019-03-01,No,Month-to-month,Yes,Electronic check,29.60,346.45,DSL,Yes,...,No,No,No,No,Female,0,Yes,Yes,,1
7041,8361-LTMKD,2019-07-01,2019-11-01 00:00:00,Month-to-month,Yes,Mailed check,74.40,306.6,Fiber optic,No,...,No,No,No,No,Male,1,Yes,No,Yes,0


In [8]:
df_train = df_train.drop(['customerID','BeginDate','EndDate','Type','PaperlessBilling','MonthlyCharges'], axis=1)

In [9]:
df_train['TotalCharges'] = df_train['TotalCharges'].replace(' ',np.nan).fillna(0).astype('float')

In [10]:
df_train.isna().sum()

PaymentMethod          0
TotalCharges           0
InternetService     1378
OnlineSecurity      1378
OnlineBackup        1378
DeviceProtection    1378
TechSupport         1378
StreamingTV         1378
StreamingMovies     1378
gender                 0
SeniorCitizen          0
Partner                0
Dependents             0
MultipleLines        612
target                 0
dtype: int64

In [11]:
df_train = df_train.fillna('No')

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [13]:
# Definir os dados de entrada e o alvo
X_train = df_train.drop(['target'], axis=1)
y_train = df_train['target']

In [14]:
# Definir as colunas categóricas e numéricas do conjunto de dados
categorical_cols = ['PaymentMethod','InternetService','OnlineSecurity','OnlineBackup','OnlineBackup','DeviceProtection','TechSupport','StreamingTV','StreamingMovies','gender','SeniorCitizen','Partner','Dependents','MultipleLines']  # Suas colunas categóricas
numeric_cols = ['TotalCharges']  # Suas colunas numéricas

In [15]:
# Criar os transformadores para as colunas categóricas e numéricas
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder())
])

numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Criar o pré-processador para aplicar os transformadores nas colunas corretas
preprocessor = ColumnTransformer([
    ('cat', categorical_transformer, categorical_cols),
    ('num', numeric_transformer, numeric_cols)
])

# Criar o pipeline completo com o pré-processador e o modelo de regressão logística
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression())
])

# Executar a validação cruzada usando o pipeline completo
scores = cross_val_score(pipeline, X_train, y_train, cv=5)

# Imprimir a média das pontuações da validação cruzada
mean_score = scores.mean()
print(f'Média das pontuações da validação cruzada: {mean_score}')



Média das pontuações da validação cruzada: 0.7970986383294452


In [16]:
pipeline.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehot',
                                                                   OneHotEncoder())]),
                                                  ['PaymentMethod',
                                                   'InternetService',
                                                   'OnlineSecurity',
                                                   'OnlineBackup',
                                                   'OnlineBackup',
                                                   'DeviceProtection',
                                                   'TechSupport', 'StreamingTV',
                                                   'StreamingMovie

In [17]:
from sklearn.metrics import roc_auc_score

# Executar a validação cruzada usando o pipeline completo
scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='roc_auc')

# Imprimir a média das pontuações da validação cruzada
mean_score = scores.mean()
print(f'Média das pontuações da validação cruzada (AUROC): {mean_score}')

Média das pontuações da validação cruzada (AUROC): 0.8282105846448969


# Aplicar solução no Teste

In [18]:
df_test = df_test.drop(['customerID','BeginDate','EndDate','Type','PaperlessBilling','MonthlyCharges'], axis=1)
df_test['TotalCharges'] = df_test['TotalCharges'].replace(' ',np.nan).fillna(0).astype('float')
df_test = df_test.fillna('No')

In [19]:
# Definir os dados de entrada e o alvo
X_test = df_test.drop(['target'], axis=1)
y_test = df_test['target']

In [20]:
y_pred = pipeline.predict_proba(X_test)[:, 1]  # Fazer previsões de probabilidade para a classe positiva

In [21]:
auroc_score = roc_auc_score(y_test, y_pred)
print(f'Pontuação AUROC no conjunto de dados de teste: {auroc_score}')

Pontuação AUROC no conjunto de dados de teste: 0.8205149381619972
