## Start and prep

In [1]:
# we will use Kaggle dataset from here:
# https://www.kaggle.com/datasets/blastchar/telco-customer-churn
# code and video from here:
# https://github.com/DataTalksClub/machine-learning-zoomcamp/blob/master/03-classification/01-churn-project.md
# notebook 
# https://github.com/DataTalksClub/machine-learning-zoomcamp/blob/master/03-classification/notebook.ipynb
# dataset for download
# https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-03-churn-prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv

# Data preparation

# Download the data, read it with pandas
# Look at the data
# Make column names and values look uniform
# Check if all the columns read correctly
# Check if the churn variable needs any preparation
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [2]:
# URL of the Telco customer churn CSV used throughout this notebook.
data = (
    'https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/'
    'chapter-03-churn-prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv'
)


In [4]:
# run once to download the dataset
# !wget $data -O data-week-3.csv 

In [5]:
# Load the churn dataset. Prefer the local copy written by the optional
# download cell; when that file is absent (e.g. a fresh checkout where the
# download was never run) read the CSV straight from the `data` URL instead,
# so the notebook still works under Restart & Run All.
try:
    df = pd.read_csv('data-week-3.csv')
except FileNotFoundError:
    df = pd.read_csv(data)
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


## Data cleaning and EDA part

In [6]:
# Make column names uniform: lower-case, with spaces turned into underscores.
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Text columns get the same treatment for their values. Select on both
# 'object' and 'string' so the columns are still found when pandas stores
# text in a dedicated string dtype instead of object (the default from
# pandas 3.0); a plain `df.dtypes == 'object'` test would silently skip them
# and leave values such as 'Yes' un-normalised.
categorical_columns = list(df.select_dtypes(include=['object', 'string']).columns)

for c in categorical_columns:
    df[c] = df[c].str.lower().str.replace(' ', '_')

In [8]:
# Transposed preview: each column is shown as a row - a handy way to see
# every column at once.
df.head().transpose()

Unnamed: 0,0,1,2,3,4
customerid,7590-vhveg,5575-gnvde,3668-qpybk,7795-cfocw,9237-hqitu
gender,female,male,male,male,female
seniorcitizen,0,0,0,0,0
partner,yes,no,no,no,no
dependents,no,no,no,no,no
tenure,1,34,2,45,2
phoneservice,no,yes,yes,no,yes
multiplelines,no_phone_service,no,no,no_phone_service,no
internetservice,dsl,dsl,dsl,dsl,fiber_optic
onlinesecurity,no,yes,yes,yes,no


In [13]:
# df.dtypes reports totalcharges as object although it should be a number.
# Parse it as numeric; errors='coerce' turns every unparseable entry into NaN
# (the NaNs are filled with zeroes in the next step).
df['totalcharges'] = pd.to_numeric(df['totalcharges'], errors='coerce')

In [14]:
# Replace the NaN entries of totalcharges with 0.
df['totalcharges'] = df['totalcharges'].fillna(0)

In [19]:
# Sanity checks worth running by hand at this point:
#   df.totalcharges.isnull().sum()  -> 0, no NaN left
#   df.dtypes                       -> totalcharges is float64
# churn, however, is still an object column holding 'yes'/'no' text marks,
# so encode it as 1 for 'yes' and 0 for anything else.
# The dtype guard keeps this cell safe to re-run: once churn is already
# integer, comparing it with 'yes' again would turn every label into 0.
if not pd.api.types.is_integer_dtype(df.churn):
    df.churn = (df.churn == 'yes').astype(int)

In [21]:
# df.churn # converted yes to 1 and no to 0

## Setting up the validation framework using Scikit-Learn