# IoT Planning 

**Input**: data from telco's customers with demographic, services and account information.

**Output**: flask application with form to be filled with customer's information. The application should return with the prediction result if the customer will or will not churn in that month.

**Tasks**:
- data description
- end-to-end solution
- metrics definition
- data cleaning
- check the unbalency
- feature engineering
- eda
- data preparation
- feature selection
- ml models
- hyperparameters fine tuning
- deploy

# 0.0 Imports

In [27]:
from IPython.core.display      import HTML
from pycaret.classification    import *
from scikitplot                import metrics         as mt
from sklearn                   import model_selection as ms
from sklearn                   import preprocessing   as pp
from sklearn                   import metrics         as m
from keras                     import models          as ml
from keras                     import layers          as l

import matplotlib.pyplot as plt
import pandas            as pd
import seaborn           as sns
import numpy             as np

import inflection

## 0.1 Helper Functions

In [4]:
def jupyter_settings():
    %matplotlib inline
    %pylab inline
    
    plt.style.use('bmh')
    plt.rcParams['figure.figsize'] = [25, 12]
    plt.rcParams['font.size'] = 24
    
    display(HTML('<style>.container{width:100% !important; }</style>'))
    pd.options.display.max_columns = None
    pd.options.display.max_rows = None
    pd.set_option('display.expand_frame_repr', False)
    
    sns.set

jupyter_settings()

Populating the interactive namespace from numpy and matplotlib


## 0.2 Load Data

In [5]:
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')

# 1.0 Data Description

In [6]:
df1 = df_train.copy()

## 1.1 Data Dimension

In [7]:
print('Number of rows: {}'.format(df1.shape[0]))
print('Number of columns: {}'.format(df1.shape[1]))

Number of rows: 5634
Number of columns: 21


## 1.2 Rename Columns

In [8]:
cols_old = ['id', 'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
            'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
            'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
            'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
            'MonthlyCharges', 'TotalCharges', 'Churn']
snakecase = lambda x: inflection.underscore(x)
new_cols = list(map(snakecase, cols_old))
df1.columns = new_cols

In [9]:
cols_old = ['id', 'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges']
snakecase = lambda x: inflection.underscore(x)
new_cols = list(map(snakecase, cols_old))
df_test.columns = new_cols

## 1.3 Data Types

In [10]:
df1.dtypes

id                     int64
gender                object
senior_citizen         int64
partner               object
dependents            object
tenure               float64
phone_service         object
multiple_lines        object
internet_service      object
online_security       object
online_backup         object
device_protection     object
tech_support          object
streaming_tv          object
streaming_movies      object
contract              object
paperless_billing     object
payment_method        object
monthly_charges      float64
total_charges         object
churn                  int64
dtype: object

## 1.4 Check N/A

In [11]:
df1.isna().sum()

id                     0
gender                 0
senior_citizen         0
partner                0
dependents           218
tenure               461
phone_service          0
multiple_lines         0
internet_service       0
online_security        0
online_backup          0
device_protection      0
tech_support           0
streaming_tv           0
streaming_movies       0
contract               0
paperless_billing      0
payment_method        99
monthly_charges        0
total_charges          0
churn                  0
dtype: int64

## 1.5 Check Balanced Data

In [12]:
df1['churn'].value_counts(normalize=True)

0    0.734647
1    0.265353
Name: churn, dtype: float64

## 1.5 Drop N/A

In [13]:
df1 = df1.dropna()

## 1.6 Change Data Types

In [14]:
df1['total_charges'] = pd.to_numeric(df1['total_charges'], errors='coerce')

In [15]:
df_test['total_charges'] = pd.to_numeric(df_test['total_charges'], errors='coerce')

# Model w/ Pycaret

In [16]:
setup(data=df1, target='churn')

Unnamed: 0,Description,Value
0,session_id,8295
1,Target,churn
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(4884, 21)"
5,Missing Values,True
6,Numeric Features,4
7,Categorical Features,16
8,Ordinal Features,False
9,High Cardinality Features,False


(False,
 2       0
 3       1
 4       1
 5       0
 6       0
        ..
 5628    1
 5629    0
 5631    0
 5632    0
 5633    0
 Name: churn, Length: 4884, dtype: int64,
         id  gender  senior_citizen partner dependents  tenure phone_service    multiple_lines internet_service      online_security  ...    device_protection         tech_support         streaming_tv     streaming_movies        contract paperless_billing             payment_method monthly_charges  total_charges  churn
 2     6479  Female               0     Yes         No    60.0           Yes               Yes      Fiber optic                   No  ...                  Yes                  Yes                  Yes                  Yes        Two year               Yes    Credit card (automatic)          110.80        6640.70      0
 3     6861  Female               0      No         No    37.0           Yes               Yes      Fiber optic                   No  ...                  Yes                   No        

In [17]:
best_model = compare_models(n_select=5)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.8098,0.8471,0.5422,0.6643,0.5963,0.4737,0.4784,0.859
ridge,Ridge Classifier,0.8048,0.0,0.5152,0.6592,0.5774,0.4533,0.4596,0.014
lda,Linear Discriminant Analysis,0.8031,0.8393,0.5569,0.6391,0.5945,0.4654,0.4678,0.026
gbc,Gradient Boosting Classifier,0.8016,0.8435,0.5366,0.642,0.5828,0.4544,0.4586,0.431
rf,Random Forest Classifier,0.797,0.8258,0.5129,0.6358,0.5662,0.4361,0.4412,0.295
ada,Ada Boost Classifier,0.7961,0.837,0.5275,0.6266,0.5717,0.4395,0.4428,0.15
lightgbm,Light Gradient Boosting Machine,0.7929,0.8282,0.5299,0.6186,0.5697,0.4346,0.4375,0.355
et,Extra Trees Classifier,0.7806,0.8076,0.4735,0.6001,0.5279,0.3878,0.3933,0.306
nb,Naive Bayes,0.7504,0.8324,0.7734,0.5138,0.6168,0.443,0.464,0.013
dummy,Dummy Classifier,0.7405,0.5,0.0,0.0,0.0,0.0,0.0,0.014


In [18]:
len(df1.loc[df1['churn']==0])/len(df1)

0.7391482391482391

In [19]:
t = tune_model(best_model[0])

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.7982,0.835,0.5795,0.6145,0.5965,0.4621,0.4625
1,0.8304,0.8835,0.6067,0.7013,0.6506,0.5394,0.5419
2,0.8012,0.833,0.5056,0.6522,0.5696,0.443,0.4491
3,0.8041,0.8388,0.5618,0.641,0.5988,0.47,0.4717
4,0.8216,0.8497,0.573,0.6892,0.6258,0.51,0.5137
5,0.8129,0.8538,0.5393,0.6761,0.6,0.4799,0.4851
6,0.8041,0.867,0.573,0.6375,0.6036,0.4739,0.4751
7,0.8304,0.8677,0.5955,0.7067,0.6463,0.5359,0.5393
8,0.8006,0.8128,0.5455,0.6316,0.5854,0.455,0.4571
9,0.8065,0.8335,0.4773,0.6774,0.56,0.4407,0.4518


In [20]:
final_model = finalize_model(t)

In [21]:
predict = predict_model(final_model, df_test)
predict

Unnamed: 0,id,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,internet_service,online_security,...,tech_support,streaming_tv,streaming_movies,contract,paperless_billing,payment_method,monthly_charges,total_charges,Label,Score
0,5027,Male,0,Yes,Yes,23.0,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,One year,Yes,Credit card (automatic),20.00,445.30,0,0.9400
1,1733,Male,1,Yes,Yes,61.0,Yes,Yes,Fiber optic,No,...,No,Yes,Yes,One year,No,,99.00,5969.30,0,0.8760
2,5384,Male,0,No,No,36.0,Yes,Yes,Fiber optic,No,...,No,No,Yes,Month-to-month,Yes,Electronic check,84.75,3050.15,1,0.5329
3,6554,Female,0,Yes,Yes,61.0,No,No phone service,DSL,No,...,Yes,Yes,Yes,Two year,Yes,Bank transfer (automatic),61.45,3751.15,0,0.9684
4,364,Female,0,No,No,47.0,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,Two year,Yes,Mailed check,20.55,945.70,0,0.9879
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1404,4897,Male,0,No,,24.0,Yes,No,DSL,No,...,No,No,No,Month-to-month,Yes,Mailed check,49.70,1167.80,0,0.8513
1405,6940,Male,0,No,No,35.0,Yes,No,Fiber optic,Yes,...,Yes,No,Yes,One year,Yes,Electronic check,89.20,3251.30,0,0.7871
1406,804,Female,0,Yes,No,46.0,Yes,No,DSL,No,...,Yes,Yes,No,Two year,Yes,Credit card (automatic),64.20,3009.50,0,0.9636
1407,1143,Male,1,Yes,Yes,11.0,Yes,Yes,Fiber optic,No,...,No,No,No,Month-to-month,Yes,Credit card (automatic),75.20,775.30,1,0.5715


In [22]:
len(predict.loc[predict['Label']==0])

1116

In [23]:
len(predict.loc[predict['Label']==1])

293

# Model w/ Neural Network MLP

In [37]:
df2 = df1.copy()
df2_dummy = pd.get_dummies(df2.drop(['id', 'churn'], axis=1))
df2 = pd.concat([df2[['id', 'churn']], df2_dummy], axis=1)

In [38]:
X = df2.drop(['id', 'churn'], axis=1)
y = df2['churn'].copy()

In [39]:
#training and test dataset
X_train, X_test, y_train, y_test = ms.train_test_split(X, y, test_size=0.2, random_state=32)

In [47]:
y_train_nn = y_train.values.reshape(-1, 1)

In [49]:
#model definition
model = ml.Sequential()
model.add(l.Dense(128, input_dim=X_train.shape[1], activation='relu'))
model.add(l.Dense(11, activation='softmax'))

#model compile
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

#train model
model.fit(X_train, y_train_nn, epochs=100)

Epoch 1/100


ValueError: in user code:

    File "C:\Users\Giovana\anaconda3\lib\site-packages\keras\engine\training.py", line 1021, in train_function  *
        return step_function(self, iterator)
    File "C:\Users\Giovana\anaconda3\lib\site-packages\keras\engine\training.py", line 1010, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\Giovana\anaconda3\lib\site-packages\keras\engine\training.py", line 1000, in run_step  **
        outputs = model.train_step(data)
    File "C:\Users\Giovana\anaconda3\lib\site-packages\keras\engine\training.py", line 860, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "C:\Users\Giovana\anaconda3\lib\site-packages\keras\engine\training.py", line 918, in compute_loss
        return self.compiled_loss(
    File "C:\Users\Giovana\anaconda3\lib\site-packages\keras\engine\compile_utils.py", line 201, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "C:\Users\Giovana\anaconda3\lib\site-packages\keras\losses.py", line 141, in __call__
        losses = call_fn(y_true, y_pred)
    File "C:\Users\Giovana\anaconda3\lib\site-packages\keras\losses.py", line 245, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "C:\Users\Giovana\anaconda3\lib\site-packages\keras\losses.py", line 1789, in categorical_crossentropy
        return backend.categorical_crossentropy(
    File "C:\Users\Giovana\anaconda3\lib\site-packages\keras\backend.py", line 5083, in categorical_crossentropy
        target.shape.assert_is_compatible_with(output.shape)

    ValueError: Shapes (None, 1) and (None, 11) are incompatible
