## Start and prep

In [1]:
# we will use Kaggle dataset from here:
# https://www.kaggle.com/datasets/blastchar/telco-customer-churn
# code and video from here:
# https://github.com/DataTalksClub/machine-learning-zoomcamp/blob/master/03-classification/01-churn-project.md
# notebook 
# https://github.com/DataTalksClub/machine-learning-zoomcamp/blob/master/03-classification/notebook.ipynb
# dataset for download
# https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-03-churn-prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv

# Data preparation

# Download the data, read it with pandas
# Look at the data
# Make column names and values look uniform
# Check if all the columns read correctly
# Check if the churn variable needs any preparation
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [2]:
# Raw CSV of the Telco customer-churn dataset (source URL used for the download).
data = (
    'https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/'
    'chapter-03-churn-prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv'
)


In [3]:
# run once to download the dataset
# !wget $data -O data-week-3.csv 

In [4]:
# Prefer the local copy saved by the one-off download cell. On a fresh checkout
# (file not downloaded yet) fall back to reading straight from the `data` URL,
# so the notebook still survives Restart Kernel -> Run All.
try:
    df = pd.read_csv('data-week-3.csv')
except FileNotFoundError:
    df = pd.read_csv(data)
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


## Data cleaning part

In [5]:
# Make column names uniform: lower-case, spaces replaced by underscores.
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Collect the text columns. They are `object` dtype on older pandas but use the
# dedicated string dtype when string inference is on (the pandas 3 default), where
# a bare `dtype == 'object'` check would select nothing and leave the values
# (e.g. 'Yes' / 'No') un-normalised. Accept both dtypes.
categorical_columns = [
    name
    for name, dtype in df.dtypes.items()
    if pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype)
]

# Apply the same normalisation to the values of every text column.
for c in categorical_columns:
    df[c] = df[c].str.lower().str.replace(' ', '_')

In [6]:
df.head().T # transpose the preview so the columns are listed as rows

Unnamed: 0,0,1,2,3,4
customerid,7590-vhveg,5575-gnvde,3668-qpybk,7795-cfocw,9237-hqitu
gender,female,male,male,male,female
seniorcitizen,0,0,0,0,0
partner,yes,no,no,no,no
dependents,no,no,no,no,no
tenure,1,34,2,45,2
phoneservice,no,yes,yes,no,yes
multiplelines,no_phone_service,no,no,no_phone_service,no
internetservice,dsl,dsl,dsl,dsl,fiber_optic
onlinesecurity,no,yes,yes,yes,no


In [7]:
# df.dtypes  # showed `totalcharges` as object although it should be a number
# Parse the column as numbers; errors='coerce' turns every entry that cannot be
# parsed into NaN (those are filled with zeroes afterwards).
df['totalcharges'] = pd.to_numeric(df['totalcharges'], errors='coerce')

In [8]:
# Replace the NaN entries of `totalcharges` with 0.
df['totalcharges'] = df['totalcharges'].fillna(0)

In [9]:
# Checks used while exploring (kept commented out):
# df.totalcharges.isnull().sum()  # np.int64(0) - no NaN left, only numbers
# df.dtypes                       # totalcharges is float64 now - all good
# df.churn                        # still object: holds yes/no text marks
# Encode the label as integers: 'yes' -> 1, anything else -> 0.
# NOTE: not idempotent - running this cell again compares the integers with 'yes'
# and therefore sets every label to 0.
df['churn'] = df['churn'].eq('yes').astype(int)

In [10]:
# df.churn  # looks good now - 'yes' was converted to 1 and 'no' to 0

## Setting up the validation framework using Scikit-Learn

In [11]:
# to split dataset automatically we will use scikit-learn
from sklearn.model_selection import train_test_split

In [12]:
# Two-step split with a fixed seed (random_state=1) so the result is reproducible:
#   1) hold out 20% of all rows as the test set;
#   2) take 25% of the remaining 80% (i.e. 20% of all rows) as the validation set.
# This gives a 60% / 20% / 20% train / validation / test partition.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

In [13]:
# Sizes of the train / validation / test parts; the recorded run gave (4225, 1409, 1409).
tuple(len(split) for split in (df_train, df_val, df_test))

(4225, 1409, 1409)

In [14]:
# Give every split a fresh 0..n-1 index; drop=True discards the index the rows
# carried over from the original frame.
df_train, df_val, df_test = (
    split.reset_index(drop=True) for split in (df_train, df_val, df_test)
)

In [15]:
# Take the target out of each feature frame so the model cannot accidentally
# look at it: pop() removes the `churn` column and hands it back, and .values
# keeps it as a NumPy array.
y_train = df_train.pop('churn').values
y_val = df_val.pop('churn').values
y_test = df_test.pop('churn').values

In [17]:
# Off-topic: forgot what the train_test_split function does? Use the built-in help:
train_test_split?

Signature:
train_test_split(
    *arrays,
    test_size=None,
    train_size=None,
    random_state=None,
    shuffle=True,
    stratify=None,
)
Docstring:
Split arrays or matrices into random train and test subsets.

Quick utility that wraps input validation,
``next(ShuffleSplit().split(X, y))``, and application to input data
into a single call for splitting (and optionally subsampling) data into a
one-liner.

Read more in the :ref:`User Guide <cross_validation>`.

Parameters
----------
*arrays : sequence of indexables with same length / shape[0]
    Allowed inputs are lists, numpy arrays, scipy-sparse
    matrices or pandas dataframes.

test_size : float or int, default=None
    If float, should be between 0.0 and 1.0 and represent the proportion
    of the dataset to include in the test split. If int, represents the
    absolute number of test samples. If None

## EDA

In [16]:
# Plan for the exploratory data analysis:
# -- Check missing values
# -- Look at the target variable (churn)
# -- Look at numerical and categorical variables

# Give the full training frame a fresh 0..n-1 index; drop=True discards the old one.
df_full_train = df_full_train.reset_index(drop=True)

In [22]:
# df_full_train.isnull().sum()  # zeroes everywhere - no missing values

# The label was not removed from df_full_train, so its class balance can be inspected:
df_full_train['churn'].value_counts()

# Recorded result:
# churn
# 0    4113 - customers who stayed
# 1    1521 - customers who churned


churn
0    4113
1    1521
Name: count, dtype: int64