In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [3]:
churn_d= pd.read_csv('churn_data.csv')
churn_d.head()

Unnamed: 0,customerID,tenure,PhoneService,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,1,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,34,Yes,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,2,Yes,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,45,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,2,Yes,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
churn_d.dtypes

customerID           object
tenure                int64
PhoneService         object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [5]:
churn_d['PhoneService'].value_counts()

PhoneService
Yes    6361
No      682
Name: count, dtype: int64

In [6]:
churn_d['Contract'].value_counts()

Contract
Month-to-month    3875
Two year          1695
One year          1473
Name: count, dtype: int64

In [7]:
churn_d['PaperlessBilling'].value_counts()

PaperlessBilling
Yes    4171
No     2872
Name: count, dtype: int64

In [8]:
churn_d.shape

(7043, 9)

In [9]:
churn_d.isna().sum()  # shows null values and its count

customerID          0
tenure              0
PhoneService        0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [10]:
# NO NUULL VALUES

In [11]:
# WILL ALLOW ONLY OBJECT TYPE DATA
churn_d.select_dtypes(include=['object']).columns

Index(['customerID', 'PhoneService', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'TotalCharges', 'Churn'],
      dtype='object')

In [12]:
# ENCODING : converting categorial value into numeric value

label_encoders={}
for column in churn_d.select_dtypes(include=['object']).columns:
    if column != 'customerID':
        label_encoders[column] = LabelEncoder()
        churn_d[column] = label_encoders[column].fit_transform(churn_d[column])
    

In [13]:
churn_d.head()

Unnamed: 0,customerID,tenure,PhoneService,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,1,0,0,1,2,29.85,2505,0
1,5575-GNVDE,34,1,1,0,3,56.95,1466,0
2,3668-QPYBK,2,1,0,1,3,53.85,157,1
3,7795-CFOCW,45,0,1,0,0,42.3,1400,0
4,9237-HQITU,2,1,0,1,2,70.7,925,1


In [14]:
X = churn_d.drop(columns=['customerID','Churn'])
Y = churn_d['Churn']

X.head()
                

Unnamed: 0,tenure,PhoneService,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
0,1,0,0,1,2,29.85,2505
1,34,1,1,0,3,56.95,1466
2,2,1,0,1,3,53.85,157
3,45,0,1,0,0,42.3,1400
4,2,1,0,1,2,70.7,925


In [15]:
# SPLIT DATA INTO TRAIN AND TEST SETS
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state= 42)

In [16]:
X_test.shape

(1409, 7)

In [17]:
Y_test.shape

(1409,)

In [18]:
Y_train.shape

(5634,)

In [19]:
# STANDARDIZE THE FEATURES
scale = StandardScaler()

X_train = scale.fit_transform(X_train)
X_test = scale.transform(X_test)

In [20]:
X_train

array([[-4.65683364e-01,  3.29573443e-01,  3.72908354e-01, ...,
         1.33926673e+00, -4.73723375e-04, -1.40800618e+00],
       [ 8.85536787e-01,  3.29573443e-01,  1.57759050e+00, ...,
        -1.47094882e+00,  1.07475386e+00,  5.58360318e-01],
       [-1.28460467e+00, -3.03422506e+00, -8.31773795e-01, ...,
         4.02528212e-01, -1.37649913e+00, -7.02892284e-01],
       ...,
       [-8.34197950e-01,  3.29573443e-01, -8.31773795e-01, ...,
         4.02528212e-01, -1.45294499e+00, -3.21706651e-01],
       [-8.34197950e-01,  3.29573443e-01, -8.31773795e-01, ...,
         4.02528212e-01,  1.14953785e+00, -1.53524478e+00],
       [-2.60953038e-01,  3.29573443e-01,  3.72908354e-01, ...,
        -5.34210304e-01, -1.49781538e+00,  3.30921324e-01]])

In [21]:
X_test

array([[-1.28460467, -3.03422506, -0.83177379, ...,  0.40252821,
        -1.33162874, -0.64775556],
       [ 0.35323794,  0.32957344, -0.83177379, ..., -1.47094882,
        -1.31667194,  1.72630461],
       [ 0.80364466,  0.32957344,  1.5775905 , ...,  1.33926673,
        -1.51277218, -1.69588351],
       ...,
       [-0.62946762,  0.32957344,  0.37290835, ..., -1.47094882,
        -1.49449165, -0.21726497],
       [ 1.49972776, -3.03422506,  1.5775905 , ..., -0.5342103 ,
        -0.69513389, -0.37631322],
       [-1.28460467, -3.03422506, -0.83177379, ...,  1.33926673,
        -1.11392424, -0.30156054]])

In [22]:
from sklearn.linear_model import LogisticRegression

model= LogisticRegression()

model.fit(X_train, Y_train)

Y_pred= model.predict(X_test)

In [23]:
Y_pred

array([0, 0, 0, ..., 0, 0, 1])

In [24]:
# Accuracy
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score

accuracy =  accuracy_score(Y_test, Y_pred)
print('Test Accuracy:', accuracy)  

Test Accuracy: 0.8062455642299503


In [25]:
from sklearn.feature_selection import RFE

rfe_model = LogisticRegression()

rfe = RFE(estimator= rfe_model, n_features_to_select=5)

In [26]:
# Fit RFE
rfe.fit(X_train, Y_train)

selected_feature = rfe.support_

X_train_rfe = rfe.transform(X_train)
X_test_rfe = rfe.transform(X_test)

model.fit(X_train_rfe, Y_train)

Y_pred_rfe = model.predict(X_test_rfe)

In [27]:
Y_pred_rfe

array([0, 0, 0, ..., 0, 0, 1])

In [28]:
accuracy =  accuracy_score(Y_test, Y_pred_rfe)
print('Test Accuracy:', accuracy)  

Test Accuracy: 0.8090844570617459


In [29]:
# Confusion Matrix
print('Confusion Matrix (Test Set):\n',confusion_matrix(Y_test, Y_pred))

Confusion Matrix (Test Set):
 [[938  98]
 [175 198]]


In [31]:
# ROC Rate
# Import necessary libraries
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt 

# Get predicted probabilities for the positive class (ATTORNEY = 1)
Y_pred_proba = model.predict_proba(X_test)[:, 1]  # Probabilities for class 1 , : stands for all

# Calculate ROC curve metrics
fpr, tpr, thresholds = roc_curve(Y_test, Y_pred)

# Calculate AUC score
auc_score = roc_auc_score(Y_test, Y_pred)
print(f"AUC Score: {auc_score:.3f}")  # Prints AUC with 3 decimal precision, {} aka place holder

# Plot ROC Curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='blue', lw=2, 
         label=f'ROC Curve (AUC = {auc_score:.2f})')
plt.plot([0, 1], [0, 1], color='red', linestyle='--', label='Random Guessing')

# Customize plot
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')

plt.title('Receiver Operating Characteristic (ROC) Curve')

plt.legend(loc="lower right")

plt.grid(True)

# Show plot
plt.show()

ValueError: X has 7 features, but LogisticRegression is expecting 5 features as input.

In [None]:
#metrics beyond accuracy 
#calculate senstivity (recall) and specifically
from sklearn.metrics import recall_score

#senstivity (recall)
senstivity= recall_score(Y_test, Y_pred_rfe)
print("senstivity:", senstivity)

#specificity
tn,fp,fn, tp= confusion_matrix(Y_test,Y_pred_rfe).ravel()
specificity= tn/ (tn+fp)
print("specificity:", specificity)