In [51]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder,OrdinalEncoder
from sklearn.linear_model import LogisticRegression

In [52]:
df = pd.read_csv('telco_data .csv')

In [53]:
df.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [54]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            6293 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           6043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            4543 non-null   float64
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   6043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       5543 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [55]:
df.isna().sum()

customerID             0
gender               750
SeniorCitizen          0
Partner             1000
Dependents             0
tenure              2500
PhoneService           0
MultipleLines          0
InternetService     1000
OnlineSecurity         0
OnlineBackup           0
DeviceProtection       0
TechSupport            0
StreamingTV         1500
StreamingMovies        0
Contract               0
PaperlessBilling       0
PaymentMethod          0
MonthlyCharges      1500
TotalCharges           0
Churn                  0
dtype: int64

In [56]:
df.head() 

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1.0,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34.0,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2.0,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45.0,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2.0,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [57]:
df=df.drop('customerID',axis=1)

In [58]:
x=df.drop('Churn',axis=1)
y=df.Churn

In [59]:
xtrain,xtest,ytrain,ytest=train_test_split(x,y,train_size=0.8,random_state=42)  

In [60]:
num_cols = x.select_dtypes(include='number').columns
obj_cols = x.select_dtypes(exclude='number').columns

In [61]:
x

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
0,Female,0,Yes,No,1.0,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85
1,Male,0,No,No,34.0,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5
2,Male,0,No,No,2.0,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15
3,Male,0,No,No,45.0,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75
4,Female,0,No,No,2.0,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,Yes,Yes,24.0,Yes,Yes,DSL,Yes,No,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5
7039,Female,0,Yes,Yes,72.0,Yes,Yes,Fiber optic,No,Yes,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9
7040,Female,0,Yes,Yes,11.0,No,No phone service,DSL,Yes,No,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45
7041,Male,1,Yes,No,,Yes,Yes,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6


In [62]:
compose = ColumnTransformer(
    transformers=[
    ('num_scaler', StandardScaler(),num_cols ),  #columnnames ['tenure', 'MonthlyCharges', 'TotalCharges']
    ('cat_encoder', OneHotEncoder(), obj_cols ) #columnnames ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService',
                                                              #   'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                                                              #   'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod'])
]
)

In [63]:
num_pipeline=Pipeline(
    steps=[
        ("Imputer",SimpleImputer(strategy="mean")),
        ('StandardScaler',StandardScaler())
    ]
)
num_pipeline.fit_transform(xtrain[num_cols])    

array([[-0.4377492 ,  0.        ,  0.        ],
       [-0.4377492 ,  0.        ,  0.        ],
       [-0.4377492 ,  0.        ,  0.        ],
       ...,
       [-0.4377492 , -0.93381312, -1.46464784],
       [ 2.28441306, -0.93381312,  1.15806793],
       [-0.4377492 , -0.29753207, -1.50986708]], shape=(5634, 3))

In [64]:
obj_pipeline=Pipeline(
    steps=[
        ("Imputer",SimpleImputer(strategy="constant",fill_value="unknown")),
        ("OrdinalEncoder",OrdinalEncoder())
    ]
)
obj_pipeline.fit_transform(xtrain[obj_cols])

array([[0.000e+00, 0.000e+00, 1.000e+00, ..., 0.000e+00, 3.000e+00,
        4.920e+02],
       [0.000e+00, 0.000e+00, 0.000e+00, ..., 1.000e+00, 0.000e+00,
        3.511e+03],
       [1.000e+00, 1.000e+00, 0.000e+00, ..., 1.000e+00, 2.000e+00,
        1.566e+03],
       ...,
       [1.000e+00, 1.000e+00, 1.000e+00, ..., 1.000e+00, 2.000e+00,
        2.167e+03],
       [1.000e+00, 0.000e+00, 0.000e+00, ..., 1.000e+00, 2.000e+00,
        3.050e+02],
       [1.000e+00, 0.000e+00, 0.000e+00, ..., 0.000e+00, 1.000e+00,
        3.149e+03]], shape=(5634, 16))

In [65]:
pipeline=Pipeline(
    steps=[
        ('preprocessing',compose),
        ('model',LogisticRegression())
        ]
)
# pipeline.fit(xtrain,ytrain) 

In [66]:
# pipeline.fit(xtrain,ytrain) 
'''This step will run after cleaning and data handling of the null values in the dataset.'''

'This step will run after cleaning and data handling of the null values in the dataset.'