In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier 
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score,confusion_matrix,accuracy_score

In [2]:
#Import the Cleaned dataset Files
try:
    df = pd.read_csv("Schema/Cleaned_data_of_chunk.csv")
except FileNotFoundError as e:
    print(f"Error {e}.")

In [3]:
#Check the first 5 rows.
df.head()

Unnamed: 0.1,Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,0,female,no,yes,no,1,no,no,dsl,no,...,no,no,no,no,month,yes,electronic,29.85,29.85,no
1,1,male,no,no,no,34,yes,no,dsl,yes,...,yes,no,no,no,one year,no,mailed,56.95,1889.5,no
2,2,male,no,no,no,2,yes,no,dsl,yes,...,no,no,no,no,month,yes,mailed,53.85,108.15,yes
3,3,male,no,no,no,45,no,no,dsl,yes,...,yes,yes,no,no,one year,no,bank transfer,42.3,1840.75,no
4,4,female,no,no,no,2,yes,no,fiber optic,no,...,no,no,no,no,month,yes,electronic,70.7,151.65,yes


In [4]:
#Removed unwanted column.
df.drop(columns=["Unnamed: 0"],inplace=True)

In [5]:
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,female,no,yes,no,1,no,no,dsl,no,yes,no,no,no,no,month,yes,electronic,29.85,29.85,no
1,male,no,no,no,34,yes,no,dsl,yes,no,yes,no,no,no,one year,no,mailed,56.95,1889.5,no
2,male,no,no,no,2,yes,no,dsl,yes,yes,no,no,no,no,month,yes,mailed,53.85,108.15,yes
3,male,no,no,no,45,no,no,dsl,yes,no,yes,yes,no,no,one year,no,bank transfer,42.3,1840.75,no
4,female,no,no,no,2,yes,no,fiber optic,no,no,no,no,no,no,month,yes,electronic,70.7,151.65,yes


In [6]:
#Lets Saperate the Numaric and Objective columns.

temp1 = [] #Its Store Objective columns.
temp2 = [] #Its Store Numaric columns.

for i in df.columns:
    if df[i].dtype == "object":
        temp1.append(i)
    else:
        temp2.append(i)
print(temp1)
print(temp2)

['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'Churn']
['tenure', 'MonthlyCharges', 'TotalCharges']


In [7]:
#lets encode all the labeled columns.

encode = {}
for i in temp1:
    lb = LabelEncoder()
    df[i] = lb.fit_transform(df[i])
    encode[i] = lb
print("Encoding is now completed")

Encoding is now completed


In [8]:
#Lets Split the data using train test split model
x = df.iloc[:,:-1]
y = df["Churn"]

x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=42,train_size=0.2)

print("Train and test splited....")

Train and test splited....


In [9]:
#Lets scale down the data using the StandardScaler

std = StandardScaler()

x_train_scaled = std.fit_transform(x_train)
x_test_scaled = std.transform(x_test)

print("Data Scaled Successfully.....")

Data Scaled Successfully.....


In [10]:
#LogisticRegression Model

lr = LogisticRegression()

lr.fit(x_train_scaled, y_train)

lr_prediction = lr.predict(x_test_scaled)

lr_accuracy = round(accuracy_score(y_test,lr_prediction)*100,2)
print("Accuracy Score:-",lr_accuracy)
print("Mean Square Error (MSE) : ",round(mean_squared_error(y_test,lr_prediction),4))
print("Mean Absolute Error (MAE) : ",round(mean_absolute_error(y_test,lr_prediction),4))
print("Root Mean Square Error (MAE) : ",round(np.square(mean_squared_error(y_test,lr_prediction)),4))
print("R2 Score : ", round(r2_score(y_test,lr_prediction),4))  
print("Confusion Matrics : \n",confusion_matrix(y_test,lr_prediction)) 

Accuracy Score:- 80.32
Mean Square Error (MSE) :  0.1968
Mean Absolute Error (MAE) :  0.1968
Root Mean Square Error (MAE) :  0.0387
R2 Score :  -0.002
Confusion Matrics : 
 [[3690  432]
 [ 677  836]]


In [11]:
# #DecisionTreeClassifier Model

dtc = DecisionTreeClassifier()

dtc.fit(x_train_scaled, y_train)

dtc_prediction = dtc.predict(x_test_scaled)

dtc_accuracy = round(accuracy_score(y_test,dtc_prediction)*100,2)
print("Accuracy Score:-",dtc_accuracy)
print("Mean Square Error (MSE) : ",round(mean_squared_error(y_test,dtc_prediction),4))
print("Mean Absolute Error (MAE) : ",round(mean_absolute_error(y_test,dtc_prediction),4))
print("Root Mean Square Error (MAE) : ",round(np.square(mean_squared_error(y_test,dtc_prediction)),4))
print("R2 Score : ", round(r2_score(y_test,dtc_prediction),4)) 
print("Confusion Matrics : \n",confusion_matrix(y_test,dtc_prediction))  

Accuracy Score:- 71.68
Mean Square Error (MSE) :  0.2832
Mean Absolute Error (MAE) :  0.2832
Root Mean Square Error (MAE) :  0.0802
R2 Score :  -0.442
Confusion Matrics : 
 [[3293  829]
 [ 767  746]]


In [12]:
# xgboost Model

xgb = XGBClassifier(
    random_state=42,
    n_estimators=300,
    learning_rate=0.1,
    max_depth=9,
    eval_metric='logloss'
)
xgb.fit(x_train_scaled, y_train)

xgb_prediction = xgb.predict(x_test_scaled)

xgb_accuracy = round(accuracy_score(y_test,xgb_prediction)*100,2)
print("Accuracy Score:-",xgb_accuracy)
print("Mean Square Error (MSE) : ",round(mean_squared_error(y_test,xgb_prediction),4))
print("Mean Absolute Error (MAE) : ",round(mean_absolute_error(y_test,xgb_prediction),4))
print("Root Mean Square Error (MAE) : ",round(np.square(mean_squared_error(y_test,xgb_prediction)),4))
print("R2 Score : ", round(r2_score(y_test,xgb_prediction),4)) 
print("Confusion Matrics : \n",confusion_matrix(y_test,xgb_prediction)) 

Accuracy Score:- 77.05
Mean Square Error (MSE) :  0.2295
Mean Absolute Error (MAE) :  0.2295
Root Mean Square Error (MAE) :  0.0527
R2 Score :  -0.1683
Confusion Matrics : 
 [[3608  514]
 [ 779  734]]


In [13]:
#KNeighborsClassifier Model

knc = KNeighborsClassifier(n_neighbors=30)

knc.fit(x_train_scaled, y_train)

knc_predictoin = knc.predict(x_test_scaled)

knc_accuracy = round(accuracy_score(y_test,knc_predictoin)*100,2)
print("Accuracy Score:-",knc_accuracy)
print("Mean Square Error (MSE) : ",round(mean_squared_error(y_test,knc_predictoin),4))
print("Mean Absolute Error (MAE) : ",round(mean_absolute_error(y_test,knc_predictoin),4))
print("Root Mean Square Error (MAE) : ",round(np.square(mean_squared_error(y_test,knc_predictoin)),4))
print("R2 Score : ", round(r2_score(y_test,knc_predictoin),4)) 
print("Confusion Matrics : \n",confusion_matrix(y_test,knc_predictoin)) 

Accuracy Score:- 77.8
Mean Square Error (MSE) :  0.222
Mean Absolute Error (MAE) :  0.222
Root Mean Square Error (MAE) :  0.0493
R2 Score :  -0.1303
Confusion Matrics : 
 [[3701  421]
 [ 830  683]]


In [14]:
#Lets Find the best model for our project.
models = {
    "Logistic Regression": lr_accuracy,
    "Decision Tree": dtc_accuracy,
    "XGBoost": xgb_accuracy,
    "KNN": knc_accuracy
}

best_model = max(models,key=models.get)
best_model

'Logistic Regression'

In [15]:
#User Input 

