# Data Pipeline

In [46]:
import os
import yaml
import joblib
import pandas as pd
from sklearn.model_selection import train_test_split

In [47]:
# Work from the project root so the relative paths used in this notebook resolve.
# Set the TELCO_CHURN_ROOT environment variable to run on another machine;
# the original location remains the default.
os.chdir(os.environ.get('TELCO_CHURN_ROOT', '/home/febbyngrni/telco_churn'))
os.getcwd()

'/home/febbyngrni/telco_churn'

In [48]:
# Path to the YAML configuration file, relative to the working directory set above.
params_dir = 'config/config.yaml'

In [49]:
def load_params(params_dir):
    """Load the pipeline configuration from a YAML file.

    Parameters
    ----------
    params_dir : str
        Path to the YAML configuration file.

    Returns
    -------
    object
        Whatever ``yaml.safe_load`` parses from the file (a dict for the
        configuration displayed in this notebook).
    """
    with open(params_dir, 'r') as config_file:
        return yaml.safe_load(config_file)

In [50]:
# Load the configuration once; later cells read file paths and validation
# settings from this dict. The bare `params` displays its content.
params = load_params(params_dir)
params

{'dataset_path': 'data/raw/telco_customer_churn.csv',
 'dataset_cleaned_path': 'data/processed/cleaned_data.pkl',
 'train_set_path': ['data/processed/X_train.pkl',
  'data/processed/y_train.pkl'],
 'valid_set_path': ['data/processed/X_valid.pkl',
  'data/processed/y_valid.pkl'],
 'test_set_path': ['data/processed/X_test.pkl', 'data/processed/y_test.pkl'],
 'data_rus_path': ['data/processed/X_rus.pkl', 'data/processed/y_rus.pkl'],
 'data_ros_path': ['data/processed/X_ros.pkl', 'data/processed/y_ros.pkl'],
 'data_sm_path': ['data/processed/X_sm.pkl', 'data/processed/y_sm.pkl'],
 'valid_feng_set_path': ['data/processed/X_valid_feng.pkl',
  'data/processed/y_valid_feng.pkl'],
 'test_feng_set_path': ['data/processed/X_test_feng.pkl',
  'data/processed/y_test_feng.pkl'],
 'ohe_gender_path': 'models/ohe_gender.pkl',
 'ohe_senior_citizen_path': 'models/ohe_senior_citizen.pkl',
 'ohe_partner_path': 'models/ohe_partner.pkl',
 'ohe_dependents_path': 'models/ohe_dependents.pkl',
 'ohe_phone_servic

## Data Collection

In [51]:
def load_data(dataset_dir):
    """Read the raw dataset from a CSV file.

    A short diagnostic message is printed when reading fails, and the
    exception is then re-raised so the failure is visible in the cell that
    loads the data instead of surfacing later as an error on ``None``.

    Parameters
    ----------
    dataset_dir : str
        Path to the CSV file.

    Returns
    -------
    pandas.DataFrame
        The content of the CSV file.

    Raises
    ------
    FileNotFoundError
        If ``dataset_dir`` does not exist.
    pandas.errors.EmptyDataError
        If the file contains no data.
    pandas.errors.ParserError
        If the file cannot be parsed as CSV.
    Exception
        Any other error raised by ``pandas.read_csv``.
    """
    try:
        return pd.read_csv(dataset_dir)

    except FileNotFoundError:
        print(f"File {dataset_dir} tidak ditemukan.")
        raise

    except pd.errors.EmptyDataError:
        print(f"File {dataset_dir} kosong.")
        raise

    except pd.errors.ParserError:
        print(f"Terdapat error saat parsing file {dataset_dir}.")
        raise

    except Exception as e:
        print(f"Terjadi error: {e}")
        raise

In [52]:
# Read the raw CSV whose path comes from the configuration.
# The bare `df` renders a truncated preview (first and last rows).
df = load_data(params['dataset_path'])
df

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


## Data Validation

### Data Shape

In [53]:
# check data shape
df.shape

(7043, 21)

### Data Types

In [54]:
# check data types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


### Check Missing Values

In [55]:
# check number of missing values
# Note: isnull() only counts NaN/None. Blank strings such as the ' ' entries
# found in TotalCharges further down are not reported here.
df.isnull().sum().sort_values(ascending=False)

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

### Check Duplicated Data

In [56]:
# check duplicated data
# customerID is still part of the frame at this point, so only rows that are
# identical in every column, including the ID, are counted.
# NOTE(review): consider repeating this check after customerID is dropped.
print(f'Number of duplicated data: {df.duplicated().sum()}')

Number of duplicated data: 0


### Drop Unnecessary Column

In [57]:
# drop unnecessary column
# customerID is removed before the remaining columns are validated and split.
# Re-running this cell raises KeyError because the column is already gone.
column_to_drop = 'customerID'
df = df.drop(columns = column_to_drop)

### Handling Senior Citizen

In [58]:
# Recode the 0/1 flag as 'No'/'Yes'.
# Series.replace leaves values that are not keys of the mapping untouched, so
# re-running this cell keeps 'No'/'Yes' instead of turning them into NaN
# (which Series.map would do).
mapping_citizen = {0 : 'No', 1 : 'Yes'}
df['SeniorCitizen'] = df['SeniorCitizen'].replace(mapping_citizen)

In [59]:
df['SeniorCitizen']

0        No
1        No
2        No
3        No
4        No
       ... 
7038     No
7039     No
7040     No
7041    Yes
7042     No
Name: SeniorCitizen, Length: 7043, dtype: object

### Handling Payment Method

In [60]:
df['PaymentMethod'].unique()

array(['Electronic check', 'Mailed check', 'Bank transfer (automatic)',
       'Credit card (automatic)'], dtype=object)

In [61]:
# Shorten the payment method labels.
# Series.replace leaves values that are not keys of the mapping untouched, so
# re-running this cell keeps the short labels instead of turning them into NaN
# (which Series.map would do).
mapping_payment = {
    'Electronic check' : 'E-Check',
    'Mailed check' : 'M-Check',
    'Bank transfer (automatic)' : 'Auto Bank',
    'Credit card (automatic)' : 'Auto Card'
}

df['PaymentMethod'] = df['PaymentMethod'].replace(mapping_payment)
df['PaymentMethod']

0         E-Check
1         M-Check
2         M-Check
3       Auto Bank
4         E-Check
          ...    
7038      M-Check
7039    Auto Card
7040      E-Check
7041      M-Check
7042    Auto Bank
Name: PaymentMethod, Length: 7043, dtype: object

### Handling Total Charges

In [62]:
# A plain cast fails while blank (' ') entries remain in TotalCharges.
# Catch and print the error so the notebook still runs top to bottom; the
# blanks are handled in the cells that follow.
try:
    df['TotalCharges'] = df['TotalCharges'].astype(float)
except ValueError as error:
    print(f'ValueError: {error}')

ValueError: could not convert string to float: ' '

In [63]:
# List the rows whose TotalCharges is a single space, the value that made the
# cast above fail.
df[df['TotalCharges'] == ' ']

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
488,Female,No,Yes,Yes,0,No,No phone service,DSL,Yes,No,Yes,Yes,Yes,No,Two year,Yes,Auto Bank,52.55,,No
753,Male,No,No,Yes,0,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,M-Check,20.25,,No
936,Female,No,Yes,Yes,0,Yes,No,DSL,Yes,Yes,Yes,No,Yes,Yes,Two year,No,M-Check,80.85,,No
1082,Male,No,Yes,Yes,0,Yes,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,M-Check,25.75,,No
1340,Female,No,Yes,Yes,0,No,No phone service,DSL,Yes,Yes,Yes,Yes,Yes,No,Two year,No,Auto Card,56.05,,No
3331,Male,No,Yes,Yes,0,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,M-Check,19.85,,No
3826,Male,No,Yes,Yes,0,Yes,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,M-Check,25.35,,No
4380,Female,No,Yes,Yes,0,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,M-Check,20.0,,No
5218,Male,No,Yes,Yes,0,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,One year,Yes,M-Check,19.7,,No
6670,Female,No,Yes,Yes,0,Yes,Yes,DSL,No,Yes,Yes,Yes,Yes,No,Two year,No,M-Check,73.35,,No


In [64]:
# Replace the blank entries with 0 and cast the column to float. The blank rows
# listed above all have tenure 0.
# Only the exact string ' ' is replaced; any other non-numeric text would still
# make the cast fail.
df['TotalCharges'] = df['TotalCharges'].replace(' ', 0).astype('float')
# Confirm the resulting dtype and non-null count.
df['TotalCharges'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 7043 entries, 0 to 7042
Series name: TotalCharges
Non-Null Count  Dtype  
--------------  -----  
7043 non-null   float64
dtypes: float64(1)
memory usage: 55.1 KB


### Range Data

In [65]:
df.describe()

Unnamed: 0,tenure,MonthlyCharges,TotalCharges
count,7043.0,7043.0,7043.0
mean,32.371149,64.761692,2279.734304
std,24.559481,30.090047,2266.79447
min,0.0,18.25,0.0
25%,9.0,35.5,398.55
50%,29.0,70.35,1394.55
75%,55.0,89.85,3786.6
max,72.0,118.75,8684.8


### Change Column Name

In [66]:
# Rename the columns to snake_case. `gender` is already lowercase and is
# therefore not part of the mapping; `tenure` becomes `tenure_months`.
mapping_column_name = {
    'SeniorCitizen' : 'senior_citizen',
    'Partner' : 'partner',
    'Dependents' : 'dependents',
    'tenure' : 'tenure_months',
    'PhoneService' : 'phone_service',
    'MultipleLines' : 'multiple_lines',
    'InternetService' : 'internet_service',
    'OnlineSecurity' : 'online_security',
    'OnlineBackup' : 'online_backup',
    'DeviceProtection' : 'device_protection',
    'TechSupport' : 'tech_support',
    'StreamingTV' : 'streaming_tv',
    'StreamingMovies' : 'streaming_movies',
    'Contract' : 'contract',
    'PaperlessBilling' : 'paperless_billing',
    'PaymentMethod' : 'payment_method',
    'MonthlyCharges' : 'monthly_charges',
    'TotalCharges' : 'total_charges',
    'Churn' : 'churn'
}

df = df.rename(columns = mapping_column_name)

In [67]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             7043 non-null   object 
 1   senior_citizen     7043 non-null   object 
 2   partner            7043 non-null   object 
 3   dependents         7043 non-null   object 
 4   tenure_months      7043 non-null   int64  
 5   phone_service      7043 non-null   object 
 6   multiple_lines     7043 non-null   object 
 7   internet_service   7043 non-null   object 
 8   online_security    7043 non-null   object 
 9   online_backup      7043 non-null   object 
 10  device_protection  7043 non-null   object 
 11  tech_support       7043 non-null   object 
 12  streaming_tv       7043 non-null   object 
 13  streaming_movies   7043 non-null   object 
 14  contract           7043 non-null   object 
 15  paperless_billing  7043 non-null   object 
 16  payment_method     7043 

## Data Defense

In [68]:
def check_data(input_data, params):
    """Validate column dtypes and value ranges of ``input_data`` against ``params``.

    Checks run in this order and stop at the first failure:

    1. The object, float and int column names of ``input_data`` (in frame
       order) must equal ``params['object_columns']``,
       ``params['float_columns']`` and ``params['int_columns']``.
    2. The first two float columns and the first int column must lie entirely
       inside the inclusive ``[low, high]`` bounds stored under
       ``range_monthly_charges``, ``range_total_charges`` and
       ``range_tenure_months``.
    3. The first 16 entries of ``params['object_columns']`` must only contain
       values listed under their matching ``range_*`` key.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Data to validate.
    params : dict
        Configuration holding the expected column lists and ranges.

    Raises
    ------
    AssertionError
        If any check fails; the message names the failing check.
    """
    n_rows = len(input_data)

    # check data types
    dtype_checks = (
        ('object', 'object_columns', 'an error occurs in object column(s)'),
        ('float', 'float_columns', 'an error occurs in float column(s)'),
        ('int', 'int_columns', 'an error occurs in integer column(s)'),
    )
    for dtype_name, columns_key, message in dtype_checks:
        found_columns = input_data.select_dtypes(dtype_name).columns.to_list()
        assert found_columns == params[columns_key], message

    # check range of numeric data: every row must fall inside the bounds
    numeric_checks = (
        ('float_columns', 0, 'range_monthly_charges', 'an error occurs in monthly charges range'),
        ('float_columns', 1, 'range_total_charges', 'an error occurs in total charges range'),
        ('int_columns', 0, 'range_tenure_months', 'an error occurs in tenure months range'),
    )
    for columns_key, position, range_key, message in numeric_checks:
        values = input_data[params[columns_key][position]]
        inside_bounds = values.between(params[range_key][0], params[range_key][1])
        assert inside_bounds.sum() == n_rows, message

    # check range of categorical data; the position in this table is the
    # position of the column inside params['object_columns']
    categorical_checks = (
        ('range_gender', 'an error occurs in gender range'),
        ('range_senior_citizen', 'an error occurs in senior citizen range'),
        ('range_partner', 'an error occurs in partner range'),
        ('range_dependents', 'an error occurs in dependents range'),
        ('range_phone_service', 'an error occurs in phone service range'),
        ('range_multiple_lines', 'an error occurs in multiple lines range'),
        ('range_internet_service', 'an error occurs in internet service range'),
        ('range_online_security', 'an error occurs in online security range'),
        ('range_online_backup', 'an error occurs in online backup range'),
        ('range_device_protection', 'an error occurs in device protection range'),
        ('range_tech_support', 'an error occurs in tech support range'),
        ('range_streaming_tv', 'an error occurs in streaming tv range'),
        ('range_streaming_movies', 'an error occurs in streaming movies range'),
        ('range_contract', 'an error occurs in contract range'),
        ('range_paperless_billing', 'an error occurs in paperless billing range'),
        ('range_payment_method', 'an error occurs in payment method range'),
    )
    for position, (range_key, message) in enumerate(categorical_checks):
        observed_values = set(input_data[params['object_columns'][position]])
        allowed_values = set(params[range_key])
        assert observed_values.issubset(allowed_values), message

In [69]:
check_data(input_data=df, params=params)

## Data Splitting

In [70]:
def split_input_output(data, target_column):
    """Separate a DataFrame into predictors and target.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the predictors and the target column.
    target_column : str
        Name of the target column.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is ``data`` without ``target_column`` and ``y``
        is the ``target_column`` of ``data``. ``data`` itself is not modified.
    """
    return data.drop(columns = target_column), data[target_column]

In [71]:
X, y = split_input_output(
    data = df,
    target_column = 'churn'
)

In [72]:
X.head()

Unnamed: 0,gender,senior_citizen,partner,dependents,tenure_months,phone_service,multiple_lines,internet_service,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,contract,paperless_billing,payment_method,monthly_charges,total_charges
0,Female,No,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,E-Check,29.85,29.85
1,Male,No,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,M-Check,56.95,1889.5
2,Male,No,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,M-Check,53.85,108.15
3,Male,No,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Auto Bank,42.3,1840.75
4,Female,No,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,E-Check,70.7,151.65


In [73]:
y.head()

0     No
1     No
2    Yes
3     No
4    Yes
Name: churn, dtype: object

In [74]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = params['test_size'],
    random_state = 42,
    stratify = y)

In [75]:
X_valid, X_test, y_valid, y_test = train_test_split(
    X_test,
    y_test,
    test_size = params['valid_size'],
    random_state = 42,
    stratify = y_test)

In [76]:
# Report the number of rows in the features and target of each split.
for split_name, split_features, split_target in (
    ('Train', X_train, y_train),
    ('Valid', X_valid, y_valid),
    ('Test', X_test, y_test),
):
    print(f'{split_name} set shape: {len(split_features)}, {len(split_target)}')

Train set shape: 4930, 4930
Valid set shape: 1056, 1056
Test set shape: 1057, 1057


In [77]:
# Create the output folders first so the dumps also work on a fresh checkout
# where the target directories do not exist yet.
output_paths = [
    params['dataset_cleaned_path'],
    *params['train_set_path'],
    *params['test_set_path'],
    *params['valid_set_path'],
]
for output_path in output_paths:
    os.makedirs(os.path.dirname(output_path) or '.', exist_ok=True)

# Persist the cleaned dataset and the three splits (features, then target).
joblib.dump(df, params['dataset_cleaned_path'])

joblib.dump(X_train, params['train_set_path'][0])
joblib.dump(y_train, params['train_set_path'][1])

joblib.dump(X_test, params['test_set_path'][0])
joblib.dump(y_test, params['test_set_path'][1])

joblib.dump(X_valid, params['valid_set_path'][0])
joblib.dump(y_valid, params['valid_set_path'][1])

['data/processed/y_valid.pkl']