In [None]:
#importing all necessary libraries
import pickle

import numpy as np
import pandas as pd
#from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer,make_column_selector as selector
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import KNNImputer,SimpleImputer
from sklearn.metrics import accuracy_score,recall_score, precision_score, f1_score,roc_auc_score,classification_report
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder,LabelEncoder,FunctionTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.utils.validation import check_is_fitted
from xgboost import XGBClassifier

In [2]:
# Location of the raw churn dataset.
# NOTE(review): this is an absolute, machine-specific path, so the notebook only
# runs where this exact file exists — consider a path relative to the project.
DATA_PATH = r"C:\Users\GAYATHRI\OneDrive\Documents\NovaConnect\dataset.csv"

# Load the CSV and preview the first two rows.
data = pd.read_csv(DATA_PATH)
data.head(2)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No


In [3]:
# Inspect the column labels of the loaded frame
data.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [4]:
# Capitalise the three lower-case column labels so every column follows the
# same naming style. The result is reassigned instead of using inplace=True:
# in-place mutation brings no performance benefit and makes cell state harder
# to reason about when cells are re-run.
data = data.rename(
    columns={"customerID": "CustomerID", "gender": "Gender", "tenure": "Tenure"}
)
data.columns

Index(['CustomerID', 'Gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'Tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [5]:
# Inspect the dtype of each column (note that TotalCharges is read as object, not a number)
data.dtypes

CustomerID           object
Gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
Tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [6]:
# TotalCharges holds numbers but was parsed as strings. Convert it to a numeric
# dtype; errors='coerce' turns any entry that cannot be parsed into NaN instead
# of raising.
total_charges_numeric = pd.to_numeric(data['TotalCharges'], errors='coerce')
data['TotalCharges'] = total_charges_numeric
data.dtypes

CustomerID           object
Gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
Tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges        float64
Churn                object
dtype: object

In [7]:
# Count the missing values in every column
data.isna().sum()

CustomerID           0
Gender               0
SeniorCitizen        0
Partner              0
Dependents           0
Tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

In [8]:
# Keep only the features judged most useful for predicting churn, plus the
# target itself; every other column is dropped. (The selection rationale is
# not shown in this notebook.)
cols = [
    'Contract',
    'Tenure',
    'InternetService',
    'TotalCharges',
    'PaymentMethod',
    'MonthlyCharges',
    'TechSupport',
    'Churn',
]
# .copy() detaches the trimmed frame from the original one
data = data.loc[:, cols].copy()


In [9]:
# Feature matrix X: every remaining column except the target 'Churn'
X = data.drop(columns='Churn')

# Target vector y: the 'Churn' column only
y = data['Churn']

In [10]:
# Partition the feature names by dtype: numeric columns vs. everything else
# (the categorical columns). Displayed as a (numeric, categorical) tuple.
feat_num = X.select_dtypes(include=np.number).columns.tolist()
feat_cat = X.select_dtypes(exclude=np.number).columns.tolist()
feat_num, feat_cat

(['Tenure', 'TotalCharges', 'MonthlyCharges'],
 ['Contract', 'InternetService', 'PaymentMethod', 'TechSupport'])

In [11]:
#Creating custom encoders for columns that use frequency encoding
#Frequency Encoder
class FrequencyEncoder(BaseEstimator, TransformerMixin):
    """Replace each category of a single column with its relative frequency.

    ``fit`` records, for every distinct non-missing value, the fraction of
    training rows that hold it; ``transform`` substitutes that fraction for the
    value. Values that were not seen during ``fit`` (including missing values)
    are encoded as 0.

    The encoder has no hyper-parameters, so it deliberately defines no
    ``__init__``: ``freq_map_`` is learned state and must only exist after
    ``fit``. That lets ``check_is_fitted`` reject an unfitted encoder instead
    of silently encoding every row as 0.

    Attributes
    ----------
    freq_map_ : dict
        ``{category: relative frequency}`` learned from the training column.
    """

    @staticmethod
    def _to_series(X):
        """Return single-column input (ndarray, Series or DataFrame) as a flat Series.

        Raises
        ------
        ValueError
            If ``X`` holds more than one column. Flattening several columns
            into one would mix their categories and change the number of rows.
        """
        # np.asarray accepts DataFrames/Series as well as arrays, unlike X.ravel()
        values = np.asarray(X)
        if values.ndim > 2 or (values.ndim == 2 and values.shape[1] != 1):
            raise ValueError(
                "FrequencyEncoder expects a single column, got input of shape "
                f"{values.shape}; use one encoder per column."
            )
        return pd.Series(values.ravel())

    def fit(self, X, y=None):
        """Learn the relative frequency of every category in ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples,) or (n_samples, 1)
            The categorical column to learn from.
        y : ignored
            Present only for scikit-learn API compatibility.

        Returns
        -------
        self : FrequencyEncoder
            The fitted encoder.
        """
        column = self._to_series(X)
        # normalize=True yields shares instead of raw counts
        self.freq_map_ = column.value_counts(normalize=True).to_dict()
        return self

    def transform(self, X):
        """Encode ``X`` with the frequencies learned by ``fit``.

        Parameters
        ----------
        X : array-like of shape (n_samples,) or (n_samples, 1)
            The categorical column to encode.

        Returns
        -------
        pandas.DataFrame of shape (n_samples, 1)
            Relative frequency of each row's category; 0 for unseen categories.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called yet.
        """
        check_is_fitted(self, "freq_map_")
        column = self._to_series(X)
        # Categories absent from the mapping come back as NaN -> encode them as 0
        return column.map(self.freq_map_).fillna(0).to_frame()

In [12]:
# Encoding plan for the categorical features: each list names the columns
# handled by one encoding strategy.

# One-hot encoded columns
onehot_col = ["InternetService", "TechSupport"]

# Columns encoded by the relative frequency of each category
frequency_col = ["PaymentMethod"]

# Ordinal column, ranked in the order listed in ordinal_order
ordinal_col = ["Contract"]
ordinal_order = [["Month-to-month", "One year", "Two year"]]

In [13]:
# Hold out 30% of the rows for evaluation. stratify=y keeps the share of each
# Churn class the same in the train and test partitions; without it a random
# split can skew the class ratio, which matters because the target is treated
# as imbalanced (the modelling pipeline oversamples it with SMOTE).
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X, y, test_size=0.3, random_state=48, stratify=y
)

In [14]:
# Every categorical branch first fills missing entries with the column's most
# frequent value and then applies its own encoder.

# Ordered categories -> integer codes following the order in ordinal_order
ordinal_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(categories=ordinal_order)),
])

# Categories -> their relative frequency (custom FrequencyEncoder)
frequency_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('frequency', FrequencyEncoder()),
])

# Categories -> dense indicator columns; categories unseen at fit time are ignored
onehot_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Numeric branch: fill gaps with KNN imputation, then standardise
num_pipe = Pipeline([
    ('imputer', KNNImputer()),
    ('std_scaler', StandardScaler()),
])


In [None]:
# Route each group of columns to its dedicated preprocessing branch.
# Each entry is (branch name, transformer, columns it receives); any column not
# listed in a branch is passed through unchanged.
data_pipeline = ColumnTransformer(
    [
        ('ordinal', ordinal_pipe, ordinal_col),
        ('frequency', frequency_pipe, frequency_col),
        ('onehot', onehot_pipe, onehot_col),
        ('numeric', num_pipe, feat_num),
    ],
    remainder='passthrough',
)


In [16]:
# Display the assembled ColumnTransformer
data_pipeline

In [17]:
# End-to-end model: preprocess -> oversample the minority class -> classify.
# This is the imblearn Pipeline, which applies the SMOTE step while fitting
# only, so data passed to predict() is never resampled.
full_pipeline = Pipeline([
    ('pre-processor', data_pipeline),
    ("smote", SMOTE(sampling_strategy=0.9, random_state=42)),
    # Fixed seed: without it the forest differs on every run, so a
    # Restart & Run All would not reproduce the same model or predictions.
    ('model', RandomForestClassifier(random_state=42)),
])
full_pipeline

In [18]:
# Fit every step of the pipeline (preprocessing, SMOTE, classifier) on the training split
full_pipeline.fit(Xtrain,ytrain)



In [19]:
# Predict the Churn label for each row of the held-out test split
full_pipeline.predict(Xtest)

array(['Yes', 'Yes', 'Yes', ..., 'No', 'No', 'No'], dtype=object)

In [20]:
# Persist the fitted pipeline to disk. The context manager guarantees the file
# handle is flushed and closed even if pickling raises; a bare open() left it
# dangling.
# NOTE: pickle stores FrequencyEncoder by reference, so code that loads this
# file must define or import that class first. Only unpickle files you trust.
with open('my_ml_pipeline', 'wb') as model_file:
    pickle.dump(full_pipeline, model_file)