# IoT Planning 

**Input**: data from telco's customers with demographic, services and account information.

**Output**: flask application with form to be filled with customer's information. The application should return with the prediction result if the customer will or will not churn in that month.

**Tasks**:
- data description
- end-to-end solution
- metrics definition
- data cleaning
- check the unbalency
- feature engineering
- eda
- data preparation
- feature selection
- ml models
- hyperparameters fine tuning
- deploy

# 0.0 Imports

In [21]:
from IPython.core.display      import HTML
from pycaret.classification import *

import matplotlib.pyplot as plt
import pandas            as pd
import seaborn           as sns

import inflection

## 0.1 Helper Functions

In [3]:
def jupyter_settings():
    %matplotlib inline
    %pylab inline
    
    plt.style.use('bmh')
    plt.rcParams['figure.figsize'] = [25, 12]
    plt.rcParams['font.size'] = 24
    
    display(HTML('<style>.container{width:100% !important; }</style>'))
    pd.options.display.max_columns = None
    pd.options.display.max_rows = None
    pd.set_option('display.expand_frame_repr', False)
    
    sns.set

jupyter_settings()

Populating the interactive namespace from numpy and matplotlib


## 0.2 Load Data

In [4]:
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')

# 1.0 Data Description

In [5]:
df1 = df_train.copy()

## 1.1 Data Dimension

In [6]:
print('Number of rows: {}'.format(df1.shape[0]))
print('Number of columns: {}'.format(df1.shape[1]))

Number of rows: 5634
Number of columns: 21


## 1.2 Rename Columns

In [7]:
cols_old = ['id', 'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
            'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
            'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
            'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
            'MonthlyCharges', 'TotalCharges', 'Churn']
snakecase = lambda x: inflection.underscore(x)
new_cols = list(map(snakecase, cols_old))
df1.columns = new_cols

In [45]:
cols_old = ['id', 'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges']
snakecase = lambda x: inflection.underscore(x)
new_cols = list(map(snakecase, cols_old))
df_test.columns = new_cols

## 1.3 Data Types

In [8]:
df1.dtypes

id                     int64
gender                object
senior_citizen         int64
partner               object
dependents            object
tenure               float64
phone_service         object
multiple_lines        object
internet_service      object
online_security       object
online_backup         object
device_protection     object
tech_support          object
streaming_tv          object
streaming_movies      object
contract              object
paperless_billing     object
payment_method        object
monthly_charges      float64
total_charges         object
churn                  int64
dtype: object

## 1.4 Check N/A

In [9]:
df1.isna().sum()

id                     0
gender                 0
senior_citizen         0
partner                0
dependents           218
tenure               461
phone_service          0
multiple_lines         0
internet_service       0
online_security        0
online_backup          0
device_protection      0
tech_support           0
streaming_tv           0
streaming_movies       0
contract               0
paperless_billing      0
payment_method        99
monthly_charges        0
total_charges          0
churn                  0
dtype: int64

## 1.5 Drop N/A

In [27]:
df1 = df1.dropna()

## 1.6 Change Data Types

In [31]:
df1['total_charges'] = pd.to_numeric(df1['total_charges'], errors='coerce')

In [47]:
df_test['total_charges'] = pd.to_numeric(df_test['total_charges'], errors='coerce')

# Model

In [32]:
setup(data=df1, target='churn')

Unnamed: 0,Description,Value
0,session_id,3755
1,Target,churn
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(4884, 21)"
5,Missing Values,True
6,Numeric Features,4
7,Categorical Features,16
8,Ordinal Features,False
9,High Cardinality Features,False


(False,
 False,
 -1,
 'box-cox',
 1319    0
 4689    1
 3122    0
 5626    0
 955     0
        ..
 621     0
 3116    0
 126     0
 4743    1
 3972    0
 Name: churn, Length: 3418, dtype: int64,
 None,
 10,
 <MLUsecase.CLASSIFICATION: 1>,
 {'USI',
  'X',
  'X_test',
  'X_train',
  '_all_metrics',
  '_all_models',
  '_all_models_internal',
  '_available_plots',
  '_gpu_n_jobs_param',
  '_internal_pipeline',
  '_ml_usecase',
  'create_model_container',
  'data_before_preprocess',
  'display_container',
  'exp_name_log',
  'experiment__',
  'fix_imbalance_method_param',
  'fix_imbalance_param',
  'fold_generator',
  'fold_groups_param',
  'fold_groups_param_full',
  'fold_param',
  'fold_shuffle_param',
  'gpu_param',
  'html_param',
  'imputation_classifier',
  'imputation_regressor',
  'iterative_imputation_iters_param',
  'log_plots_param',
  'logging_param',
  'master_model_container',
  'n_jobs_param',
  'prep_pipe',
  'pycaret_globals',
  'seed',
  'stratify_param',
  'target_param

In [33]:
best_model = compare_models(n_select=5)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.8142,0.8516,0.5652,0.6713,0.6123,0.4916,0.4956,2.946
ridge,Ridge Classifier,0.8133,0.0,0.5393,0.6795,0.6,0.4806,0.4869,0.02
lda,Linear Discriminant Analysis,0.8075,0.8442,0.5685,0.6495,0.6054,0.479,0.4815,0.026
gbc,Gradient Boosting Classifier,0.8072,0.8512,0.5416,0.6592,0.5934,0.4689,0.4735,0.34
ada,Ada Boost Classifier,0.8031,0.8481,0.536,0.6488,0.5861,0.4586,0.4628,0.156
rf,Random Forest Classifier,0.7964,0.8332,0.5034,0.6388,0.5621,0.4321,0.4379,0.314
lightgbm,Light Gradient Boosting Machine,0.7864,0.832,0.5124,0.6078,0.5551,0.4161,0.4193,0.402
et,Extra Trees Classifier,0.7826,0.8155,0.4921,0.6015,0.5407,0.4003,0.4042,0.289
nb,Naive Bayes,0.7566,0.8388,0.7708,0.5228,0.6228,0.4529,0.4716,0.014
dummy,Dummy Classifier,0.7396,0.5,0.0,0.0,0.0,0.0,0.0,0.01


In [34]:
len(df1.loc[df1['churn']==0])/len(df1)

0.7391482391482391

In [39]:
t = tune_model(best_model[0])

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8158,0.8617,0.618,0.6548,0.6358,0.5127,0.5131
1,0.7982,0.8302,0.4831,0.6515,0.5548,0.4281,0.4361
2,0.7982,0.826,0.5056,0.6429,0.566,0.437,0.4424
3,0.8129,0.8628,0.5393,0.6761,0.6,0.4799,0.4851
4,0.8626,0.8783,0.6629,0.7763,0.7152,0.6253,0.6287
5,0.7953,0.8436,0.5281,0.6267,0.5732,0.4398,0.4426
6,0.8392,0.8816,0.6292,0.7179,0.6707,0.5649,0.567
7,0.7982,0.8324,0.6292,0.6087,0.6188,0.4817,0.4818
8,0.8299,0.8346,0.5506,0.7313,0.6282,0.5208,0.5296
9,0.827,0.8708,0.573,0.7083,0.6335,0.5219,0.527


In [40]:
final_model = finalize_model(t)

In [50]:
predict = predict_model(final_model, df_test)
predict

Unnamed: 0,id,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,internet_service,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,contract,paperless_billing,payment_method,monthly_charges,total_charges,Label,Score
0,5027,Male,0,Yes,Yes,23.0,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,One year,Yes,Credit card (automatic),20.00,445.30,0,0.9432
1,1733,Male,1,Yes,Yes,61.0,Yes,Yes,Fiber optic,No,Yes,No,No,Yes,Yes,One year,No,,99.00,5969.30,0,0.8168
2,5384,Male,0,No,No,36.0,Yes,Yes,Fiber optic,No,No,No,No,No,Yes,Month-to-month,Yes,Electronic check,84.75,3050.15,1,0.5262
3,6554,Female,0,Yes,Yes,61.0,No,No phone service,DSL,No,Yes,Yes,Yes,Yes,Yes,Two year,Yes,Bank transfer (automatic),61.45,3751.15,0,0.9834
4,364,Female,0,No,No,47.0,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,Yes,Mailed check,20.55,945.70,0,0.9906
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1404,4897,Male,0,No,,24.0,Yes,No,DSL,No,No,Yes,No,No,No,Month-to-month,Yes,Mailed check,49.70,1167.80,0,0.8923
1405,6940,Male,0,No,No,35.0,Yes,No,Fiber optic,Yes,No,No,Yes,No,Yes,One year,Yes,Electronic check,89.20,3251.30,0,0.8469
1406,804,Female,0,Yes,No,46.0,Yes,No,DSL,No,Yes,No,Yes,Yes,No,Two year,Yes,Credit card (automatic),64.20,3009.50,0,0.9613
1407,1143,Male,1,Yes,Yes,11.0,Yes,Yes,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Credit card (automatic),75.20,775.30,1,0.6687


In [53]:
len(predict.loc[predict['Label']==0])

1111

In [56]:
len(predict.loc[predict['Label']==1])

298