In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score
from sklearn.preprocessing import LabelEncoder,StandardScaler


In [2]:
from google.colab import drive
drive.mount('/content/drive')
folder_path = '/content/drive/MyDrive/'

file_path = f'{folder_path}/vw_churndata.csv'
df = pd.read_csv(file_path)

df.head()


Mounted at /content/drive


Unnamed: 0,Customer_ID,Gender,Age,Married,State,Number_of_Referrals,Tenure_in_Months,Value_Deal,Phone_Service,Multiple_Lines,...,Payment_Method,Monthly_Charge,Total_Charges,Total_Refunds,Total_Extra_Data_Charges,Total_Long_Distance_Charges,Total_Revenue,Customer_Status,Churn_Category,Churn_Reason
0,19877-DEL,Male,35,No,Delhi,7,27,,Yes,No,...,Credit Card,65.6,593.3,0.0,0,381.51,974.81,Stayed,Others,Others
1,58353-MAH,Female,45,Yes,Maharashtra,14,13,,Yes,Yes,...,Credit Card,-4.0,542.4,38.33,10,96.21,610.28,Stayed,Others,Others
2,25063-WES,Male,51,No,West Bengal,4,35,Deal 5,Yes,No,...,Bank Withdrawal,73.9,280.85,0.0,0,134.6,415.45,Churned,Competitor,Competitor had better devices
3,59787-KAR,Male,79,No,Karnataka,3,21,Deal 4,Yes,No,...,Bank Withdrawal,98.0,1237.85,0.0,0,361.66,1599.51,Churned,Dissatisfaction,Product dissatisfaction
4,28544-TAM,Female,80,No,Tamil Nadu,3,8,,Yes,No,...,Credit Card,83.9,267.4,0.0,0,22.14,289.54,Churned,Dissatisfaction,Network reliability


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6007 entries, 0 to 6006
Data columns (total 32 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Customer_ID                  6007 non-null   object 
 1   Gender                       6007 non-null   object 
 2   Age                          6007 non-null   int64  
 3   Married                      6007 non-null   object 
 4   State                        6007 non-null   object 
 5   Number_of_Referrals          6007 non-null   int64  
 6   Tenure_in_Months             6007 non-null   int64  
 7   Value_Deal                   2710 non-null   object 
 8   Phone_Service                6007 non-null   object 
 9   Multiple_Lines               6007 non-null   object 
 10  Internet_Service             6007 non-null   object 
 11  Internet_Type                4784 non-null   object 
 12  Online_Security              6007 non-null   object 
 13  Online_Backup     

In [8]:
df.isnull().sum()

Unnamed: 0,0
Customer_ID,0
Gender,0
Age,0
Married,0
State,0
Number_of_Referrals,0
Tenure_in_Months,0
Value_Deal,3297
Phone_Service,0
Multiple_Lines,0


In [12]:
print(df['Value_Deal'].unique())
print(df['Internet_Type'].unique())

['none' 'Deal 5' 'Deal 4' 'Deal 1' 'Deal 2' 'Deal 3']
['Cable' 'Fiber Optic' 'DSL' nan]


In [11]:
df['Value_Deal'] = df['Value_Deal'].fillna('none')


In [13]:
print(df['Internet_Type'].value_counts(dropna=False))


Internet_Type
Fiber Optic    2675
DSL            1398
NaN            1223
Cable           711
Name: count, dtype: int64


In [14]:
df['Internet_Type'] = df['Internet_Type'].fillna('Unknown')


In [15]:
df.columns

Index(['Customer_ID', 'Gender', 'Age', 'Married', 'State',
       'Number_of_Referrals', 'Tenure_in_Months', 'Value_Deal',
       'Phone_Service', 'Multiple_Lines', 'Internet_Service', 'Internet_Type',
       'Online_Security', 'Online_Backup', 'Device_Protection_Plan',
       'Premium_Support', 'Streaming_TV', 'Streaming_Movies',
       'Streaming_Music', 'Unlimited_Data', 'Contract', 'Paperless_Billing',
       'Payment_Method', 'Monthly_Charge', 'Total_Charges', 'Total_Refunds',
       'Total_Extra_Data_Charges', 'Total_Long_Distance_Charges',
       'Total_Revenue', 'Customer_Status', 'Churn_Category', 'Churn_Reason'],
      dtype='object')

In [21]:
binary_cols = [
    'Married','Phone_Service','Multiple_Lines','Online_Security',
    'Online_Backup','Device_Protection_Plan','Premium_Support',
    'Streaming_TV','Streaming_Movies','Streaming_Music',
    'Unlimited_Data','Paperless_Billing'
]

df[binary_cols].head()


Unnamed: 0,Married,Phone_Service,Multiple_Lines,Online_Security,Online_Backup,Device_Protection_Plan,Premium_Support,Streaming_TV,Streaming_Movies,Streaming_Music,Unlimited_Data,Paperless_Billing
0,No,Yes,No,No,Yes,No,Yes,Yes,No,No,Yes,Yes
1,Yes,Yes,Yes,No,No,No,No,No,Yes,Yes,No,No
2,No,Yes,No,No,No,Yes,No,No,No,No,Yes,Yes
3,No,Yes,No,No,Yes,Yes,No,Yes,Yes,No,Yes,Yes
4,No,Yes,No,No,No,No,Yes,Yes,No,No,Yes,Yes


In [24]:
df[binary_cols] = df[binary_cols].replace({'Yes':1,'No':0})
df[binary_cols].head()


Unnamed: 0,Married,Phone_Service,Multiple_Lines,Online_Security,Online_Backup,Device_Protection_Plan,Premium_Support,Streaming_TV,Streaming_Movies,Streaming_Music,Unlimited_Data,Paperless_Billing
0,0,1,0,0,1,0,1,1,0,0,1,1
1,1,1,1,0,0,0,0,0,1,1,0,0
2,0,1,0,0,0,1,0,0,0,0,1,1
3,0,1,0,0,1,1,0,1,1,0,1,1
4,0,1,0,0,0,0,1,1,0,0,1,1


In [25]:
df['State'].unique()

array(['Delhi', 'Maharashtra', 'West Bengal', 'Karnataka', 'Tamil Nadu',
       'Telangana', 'Gujarat', 'Uttar Pradesh', 'Rajasthan', 'Bihar',
       'Andhra Pradesh', 'Madhya Pradesh', 'Chhattisgarh', 'Punjab',
       'Jharkhand', 'Haryana', 'Jammu & Kashmir', 'Assam', 'Uttarakhand',
       'Odisha', 'Kerala', 'Puducherry'], dtype=object)

In [26]:
nominal_cols = [
    'Gender','State','Value_Deal','Internet_Service',
    'Internet_Type','Contract','Payment_Method'
]

df = pd.get_dummies(df, columns=nominal_cols, drop_first=True)


In [27]:
df.columns

Index(['Customer_ID', 'Age', 'Married', 'Number_of_Referrals',
       'Tenure_in_Months', 'Phone_Service', 'Multiple_Lines',
       'Online_Security', 'Online_Backup', 'Device_Protection_Plan',
       'Premium_Support', 'Streaming_TV', 'Streaming_Movies',
       'Streaming_Music', 'Unlimited_Data', 'Paperless_Billing',
       'Monthly_Charge', 'Total_Charges', 'Total_Refunds',
       'Total_Extra_Data_Charges', 'Total_Long_Distance_Charges',
       'Total_Revenue', 'Customer_Status', 'Churn_Category', 'Churn_Reason',
       'Gender_Male', 'State_Assam', 'State_Bihar', 'State_Chhattisgarh',
       'State_Delhi', 'State_Gujarat', 'State_Haryana',
       'State_Jammu & Kashmir', 'State_Jharkhand', 'State_Karnataka',
       'State_Kerala', 'State_Madhya Pradesh', 'State_Maharashtra',
       'State_Odisha', 'State_Puducherry', 'State_Punjab', 'State_Rajasthan',
       'State_Tamil Nadu', 'State_Telangana', 'State_Uttar Pradesh',
       'State_Uttarakhand', 'State_West Bengal', 'Value_Deal_D

In [28]:
df['Customer_Status'].unique()

array(['Stayed', 'Churned'], dtype=object)

In [30]:
df['Customer_Status'] = df['Customer_Status'].replace({'Stayed':0,'Churned':1}).astype(int)

In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6007 entries, 0 to 6006
Data columns (total 60 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Customer_ID                  6007 non-null   object 
 1   Age                          6007 non-null   int64  
 2   Married                      6007 non-null   int64  
 3   Number_of_Referrals          6007 non-null   int64  
 4   Tenure_in_Months             6007 non-null   int64  
 5   Phone_Service                6007 non-null   int64  
 6   Multiple_Lines               6007 non-null   int64  
 7   Online_Security              6007 non-null   int64  
 8   Online_Backup                6007 non-null   int64  
 9   Device_Protection_Plan       6007 non-null   int64  
 10  Premium_Support              6007 non-null   int64  
 11  Streaming_TV                 6007 non-null   int64  
 12  Streaming_Movies             6007 non-null   int64  
 13  Streaming_Music   

In [66]:
X = df.drop(columns=['Customer_ID', 'Customer_Status', 'Churn_Reason','Churn_Category'])
y = df['Customer_Status']

In [50]:
X_train,X_test,y_train,y_test =train_test_split(X,y,test_size=0.2,random_state=44)

In [51]:
rf = RandomForestClassifier(
    n_estimators=300,
    random_state=44
)

rf.fit(X_train, y_train)

In [61]:
y_pred = rf.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))




[[835  50]
 [116 201]]
              precision    recall  f1-score   support

           0       0.88      0.94      0.91       885
           1       0.80      0.63      0.71       317

    accuracy                           0.86      1202
   macro avg       0.84      0.79      0.81      1202
weighted avg       0.86      0.86      0.86      1202

0.8618968386023295


In [64]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(X_train, y_train)


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [65]:
y_predlr = lr.predict(X_test)
print(confusion_matrix(y_test, y_predlr))
print(classification_report(y_test, y_predlr))
print(accuracy_score(y_test, y_predlr))




[[776 109]
 [142 175]]
              precision    recall  f1-score   support

           0       0.85      0.88      0.86       885
           1       0.62      0.55      0.58       317

    accuracy                           0.79      1202
   macro avg       0.73      0.71      0.72      1202
weighted avg       0.78      0.79      0.79      1202

0.7911813643926788


In [68]:
from sklearn.preprocessing import StandardScaler

num_cols = [
    'Age','Number_of_Referrals','Tenure_in_Months',
    'Monthly_Charge','Total_Charges','Total_Refunds',
    'Total_Extra_Data_Charges','Total_Long_Distance_Charges',
    'Total_Revenue'
]

scaler = StandardScaler()
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])


In [69]:
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier


In [70]:
svm_model = SVC(kernel='rbf', probability=True, random_state=44)
svm_model.fit(X_train, y_train)


In [73]:
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)


In [72]:
gb_model = GradientBoostingClassifier(
    n_estimators=300,
    learning_rate=0.1,
    max_depth=3,
    random_state=44
)
gb_model.fit(X_train, y_train)


In [74]:
models = {'SVM': svm_model, 'KNN': knn_model, 'Gradient Boosting': gb_model}

for name, model in models.items():
    y_pred = model.predict(X_test)

    print(f"\n{name} Model:")
    print("Accuracy:", round(accuracy_score(y_test, y_pred), 4))
    print("Classification Report:\n", classification_report(y_test, y_pred))



SVM Model:
Accuracy: 0.8386
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.91      0.89       885
           1       0.72      0.63      0.67       317

    accuracy                           0.84      1202
   macro avg       0.80      0.77      0.78      1202
weighted avg       0.83      0.84      0.83      1202


KNN Model:
Accuracy: 0.7837
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.84      0.85       885
           1       0.58      0.64      0.61       317

    accuracy                           0.78      1202
   macro avg       0.72      0.74      0.73      1202
weighted avg       0.79      0.78      0.79      1202


Gradient Boosting Model:
Accuracy: 0.8561
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.92      0.90       885
           1       0.76      0.67      0.71       317

    accur

In [76]:
feat_imp = pd.Series(rf.feature_importances_, index=X_train.columns).sort_values(ascending=False)
print(feat_imp.head(15))


Total_Revenue                  0.133862
Total_Charges                  0.125890
Total_Long_Distance_Charges    0.079464
Monthly_Charge                 0.076490
Contract_Two Year              0.060212
Age                            0.056193
Tenure_in_Months               0.042277
Number_of_Referrals            0.035690
Contract_One Year              0.032469
Internet_Type_Fiber Optic      0.030647
Value_Deal_Deal 5              0.020279
Payment_Method_Credit Card     0.017413
Online_Security                0.016083
Premium_Support                0.015631
Internet_Service_Yes           0.012800
dtype: float64


In [79]:
folder_path = '/content/drive/MyDrive/'

file_path = f'{folder_path}/vw_JoinData.csv'
JoinData = pd.read_csv(file_path)
JoinData.head()

Unnamed: 0,Customer_ID,Gender,Age,Married,State,Number_of_Referrals,Tenure_in_Months,Value_Deal,Phone_Service,Multiple_Lines,...,Payment_Method,Monthly_Charge,Total_Charges,Total_Refunds,Total_Extra_Data_Charges,Total_Long_Distance_Charges,Total_Revenue,Customer_Status,Churn_Category,Churn_Reason
0,93520-GUJ,Female,67,No,Gujarat,13,19,Deal 5,Yes,Yes,...,Bank Withdrawal,72.1,72.1,0.0,0,7.77,79.87,Joined,Others,Others
1,57256-BIH,Female,18,No,Bihar,9,7,,Yes,No,...,Credit Card,19.85,57.2,0.0,0,9.36,66.56,Joined,Others,Others
2,72357-MAD,Female,53,No,Madhya Pradesh,14,12,Deal 5,Yes,No,...,Credit Card,44.3,44.3,0.0,0,42.95,87.25,Joined,Others,Others
3,66612-KAR,Female,58,Yes,Karnataka,11,18,,Yes,No,...,Credit Card,19.95,58.0,0.0,0,8.07,66.07,Joined,Others,Others
4,22119-WES,Male,31,Yes,West Bengal,5,5,,Yes,No,...,Credit Card,20.05,33.7,0.0,0,3.62,37.32,Joined,Others,Others


In [80]:
og_data = JoinData.copy()

In [81]:
Customer_ID = JoinData['Customer_ID']

In [82]:
JoinData = JoinData.drop(['Customer_ID', 'Customer_Status', 'Churn_Category', 'Churn_Reason'], axis=1)

In [85]:
label_encoders = {}

for column in JoinData.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    JoinData[column] = le.fit_transform(JoinData[column])
    label_encoders[column] = le

In [87]:
JoinData = JoinData.reindex(columns=X_train.columns, fill_value=0)
new_predictions = rf.predict(JoinData)


In [89]:
og_data['Customer_Status_Predicted'] = new_predictions


In [90]:
og_data.head()

Unnamed: 0,Customer_ID,Gender,Age,Married,State,Number_of_Referrals,Tenure_in_Months,Value_Deal,Phone_Service,Multiple_Lines,...,Monthly_Charge,Total_Charges,Total_Refunds,Total_Extra_Data_Charges,Total_Long_Distance_Charges,Total_Revenue,Customer_Status,Churn_Category,Churn_Reason,Customer_Status_Predicted
0,93520-GUJ,Female,67,No,Gujarat,13,19,Deal 5,Yes,Yes,...,72.1,72.1,0.0,0,7.77,79.87,Joined,Others,Others,1
1,57256-BIH,Female,18,No,Bihar,9,7,,Yes,No,...,19.85,57.2,0.0,0,9.36,66.56,Joined,Others,Others,1
2,72357-MAD,Female,53,No,Madhya Pradesh,14,12,Deal 5,Yes,No,...,44.3,44.3,0.0,0,42.95,87.25,Joined,Others,Others,1
3,66612-KAR,Female,58,Yes,Karnataka,11,18,,Yes,No,...,19.95,58.0,0.0,0,8.07,66.07,Joined,Others,Others,1
4,22119-WES,Male,31,Yes,West Bengal,5,5,,Yes,No,...,20.05,33.7,0.0,0,3.62,37.32,Joined,Others,Others,1


In [91]:
og_data['Customer_Status_Predicted'].value_counts()

Unnamed: 0_level_0,count
Customer_Status_Predicted,Unnamed: 1_level_1
1,391
0,20


In [93]:
original_data = og_data[og_data['Customer_Status_Predicted'] == 1]
file_path = '/content/drive/MyDrive/Customer_Churn_Predicted.csv'
original_data.to_csv(file_path, index=False)
print(f"File saved to {file_path}")


File saved to /content/drive/MyDrive/Customer_Churn_Predicted.csv
