In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier 
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score,confusion_matrix

In [13]:
#Import the Cleaned dataset Files
try:
    df = pd.read_csv("Schema/Cleaned_data_of_chunk.csv")
except FileNotFoundError as e:
    print(f"Error {e}.")

In [14]:
#Check the first 5 rows.
df.head()

Unnamed: 0.1,Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,0,female,0,yes,no,1,no,no,dsl,no,...,no,no,no,no,month,yes,electronic,29.85,29.85,no
1,1,male,0,no,no,34,yes,no,dsl,yes,...,yes,no,no,no,one year,no,mailed,56.95,1889.5,no
2,2,male,0,no,no,2,yes,no,dsl,yes,...,no,no,no,no,month,yes,mailed,53.85,108.15,yes
3,3,male,0,no,no,45,no,no,dsl,yes,...,yes,yes,no,no,one year,no,bank transfer,42.3,1840.75,no
4,4,female,0,no,no,2,yes,no,fiber optic,no,...,no,no,no,no,month,yes,electronic,70.7,151.65,yes


In [15]:
#Removed unwanted column.
df.drop(columns=["Unnamed: 0"],inplace=True)

In [16]:
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,female,0,yes,no,1,no,no,dsl,no,yes,no,no,no,no,month,yes,electronic,29.85,29.85,no
1,male,0,no,no,34,yes,no,dsl,yes,no,yes,no,no,no,one year,no,mailed,56.95,1889.5,no
2,male,0,no,no,2,yes,no,dsl,yes,yes,no,no,no,no,month,yes,mailed,53.85,108.15,yes
3,male,0,no,no,45,no,no,dsl,yes,no,yes,yes,no,no,one year,no,bank transfer,42.3,1840.75,no
4,female,0,no,no,2,yes,no,fiber optic,no,no,no,no,no,no,month,yes,electronic,70.7,151.65,yes


In [17]:
#Lets Saperate the Numaric and Objective columns.

temp1 = [] #Its Store Objective columns.
temp2 = [] #Its Store Numaric columns.

for i in df.columns:
    if df[i].dtype == "object":
        temp1.append(i)
    else:
        temp2.append(i)
print(temp1)
print(temp2)

['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'Churn']
['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']


In [18]:
#lets encode all the labeled columns.

encode = {}
for i in temp1:
    lb = LabelEncoder()
    df[i] = lb.fit_transform(df[i])
    encode[i] = lb
print("Encoding is now completed")

Encoding is now completed


In [19]:
#Lets Split the data using train test split model
x = df.iloc[:,:-1]
y = df["Churn"]

x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=42,train_size=0.2)

print("Train and test splited....")

Train and test splited....


In [20]:
#Lets scale down the data using the StandardScaler

std = StandardScaler()

x_train_scaled = std.fit_transform(x_train)
x_test_scaled = std.transform(x_test)

print("Data Scaled Successfully.....")

Data Scaled Successfully.....


In [35]:
#LogisticRegression Model

lr = LogisticRegression()

lr.fit(x_train_scaled, y_train)

lr_prediction = lr.predict(x_test_scaled)

print("Mean Square Error (MSE) : ",round(mean_squared_error(y_test,lr_prediction),4))
print("Mean Absolute Error (MAE) : ",round(mean_absolute_error(y_test,lr_prediction),4))
print("Root Mean Square Error (MAE) : ",round(np.square(mean_squared_error(y_test,lr_prediction)),4))
print("R2 Score : ", round(r2_score(y_test,lr_prediction),4))  
print("Confusion Matrics : \n",confusion_matrix(y_test,lr_prediction)) 

Mean Square Error (MSE) :  0.1968
Mean Absolute Error (MAE) :  0.1968
Root Mean Square Error (MAE) :  0.0387
R2 Score :  -0.002
Confusion Matrics : 
 [[3690  432]
 [ 677  836]]


In [37]:
# #DecisionTreeClassifier Model

dtc = DecisionTreeClassifier()

dtc.fit(x_train_scaled, y_train)

dtc_prediction = dtc.predict(x_test_scaled)

print("Mean Square Error (MSE) : ",round(mean_squared_error(y_test,dtc_prediction),4))
print("Mean Absolute Error (MAE) : ",round(mean_absolute_error(y_test,dtc_prediction),4))
print("Root Mean Square Error (MAE) : ",round(np.square(mean_squared_error(y_test,dtc_prediction)),4))
print("R2 Score : ", round(r2_score(y_test,dtc_prediction),4)) 
print("Confusion Matrics : \n",confusion_matrix(y_test,dtc_prediction))  

Mean Square Error (MSE) :  0.2848
Mean Absolute Error (MAE) :  0.2848
Root Mean Square Error (MAE) :  0.0811
R2 Score :  -0.4502
Confusion Matrics : 
 [[3244  878]
 [ 727  786]]


In [8]:
# xgboost Model

xgb = XGBClassifier(
    random_state=42,
    n_estimators=300,
    learning_rate=0.1,
    max_depth=9,
    eval_metric='logloss'
)
xgb.fit(x_train_scaled, y_train)

xgb_prediction = xgb.predict(x_test_scaled)

print("Mean Square Error (MSE) : ",round(mean_squared_error(y_test,xgb_prediction),4))
print("Mean Absolute Error (MAE) : ",round(mean_absolute_error(y_test,xgb_prediction),4))
print("Root Mean Square Error (MAE) : ",round(np.square(mean_squared_error(y_test,xgb_prediction)),4))
print("R2 Score : ", round(r2_score(y_test,xgb_prediction),4)) 
print("Confusion Matrics : \n",confusion_matrix(y_test,xgb_prediction)) 

NameError: name 'x_train_scaled' is not defined