In [1]:
import pandas as pd
import matplotlib.pyplot as plt

# Step 1: Load the Data
data = pd.read_excel("C:/Users/Divine/Downloads/customer_churn_large_dataset.xlsx")
# Step 2: Check for Data Types and Missing Values
data.info()
print(data.head())



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   CustomerID                  100000 non-null  float64
 1   Name                        100000 non-null  object 
 2   Age                         100000 non-null  float64
 3   Gender                      100000 non-null  object 
 4   Location                    100000 non-null  object 
 5   Subscription_Length_Months  100000 non-null  float64
 6   Monthly_Bill                100000 non-null  float64
 7   Total_Usage_GB              100000 non-null  float64
 8   Churn                       100000 non-null  float64
dtypes: float64(6), object(3)
memory usage: 6.9+ MB
   CustomerID        Name   Age  Gender     Location  \
0         1.0  Customer_1  63.0    Male  Los Angeles   
1         2.0  Customer_2  62.0  Female     New York   
2         3.0  Customer_3  24.0  

In [2]:
missing_data = data.isnull().sum()
print(missing_data)

CustomerID                    0
Name                          0
Age                           0
Gender                        0
Location                      0
Subscription_Length_Months    0
Monthly_Bill                  0
Total_Usage_GB                0
Churn                         0
dtype: int64


In [3]:
import numpy as np
from scipy import stats
columns_to_check = ['Monthly_Bill','Total_Usage_GB', 'Age', 'Subscription_Length_Months']
z_scores = np.abs(stats.zscore(data['Monthly_Bill']))
threshold = 2
outlier_counts = {}
for column_name in columns_to_check:
    z_scores = np.abs(stats.zscore(data[column_name]))
    outlier_count = len(z_scores[z_scores>threshold])
    outlier_counts[column_name] = outlier_count
for column_name, count in outlier_counts.items():
    print(f"Number of Outliers in '{column_name}':{count}")

Number of Outliers in 'Monthly_Bill':0
Number of Outliers in 'Total_Usage_GB':0
Number of Outliers in 'Age':0
Number of Outliers in 'Subscription_Length_Months':0


In [4]:
import pandas as pd
data = pd.get_dummies(data, columns=['Gender', 'Location'], drop_first=True)
data.head()

Unnamed: 0,CustomerID,Name,Age,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Churn,Gender_Male,Location_Houston,Location_Los Angeles,Location_Miami,Location_New York
0,1.0,Customer_1,63.0,17.0,73.36,236.0,0.0,1,0,1,0,0
1,2.0,Customer_2,62.0,1.0,48.76,172.0,0.0,0,0,0,0,1
2,3.0,Customer_3,24.0,5.0,85.47,460.0,0.0,0,0,1,0,0
3,4.0,Customer_4,36.0,3.0,97.94,297.0,1.0,0,0,0,1,0
4,5.0,Customer_5,46.0,19.0,58.14,266.0,0.0,0,0,0,1,0


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
features =  ['Age', 'Subscription_Length_Months', 'Monthly_Bill', 'Gender_Male','Total_Usage_GB','Location_Houston','Location_Los Angeles','Location_Miami','Location_New York']
X = data[features]
y = data['Churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = MinMaxScaler()
columns_to_normalize = features
X_train[columns_to_normalize] = scaler.fit_transform(X_train[columns_to_normalize])
X_test[columns_to_normalize] = scaler.transform(X_test[columns_to_normalize])


In [12]:
X_train.head()

Unnamed: 0,Age,Subscription_Length_Months,Monthly_Bill,Gender_Male,Total_Usage_GB,Location_Houston,Location_Los Angeles,Location_Miami,Location_New York
75220,0.692308,0.173913,0.778571,0.0,0.344444,0.0,0.0,0.0,1.0
48955,0.192308,1.0,0.743714,1.0,0.42,0.0,0.0,0.0,1.0
44966,0.75,0.478261,0.318429,1.0,0.026667,0.0,0.0,0.0,0.0
13568,0.019231,0.782609,0.036714,1.0,0.273333,1.0,0.0,0.0,0.0
92727,0.730769,0.304348,0.050286,0.0,0.586667,0.0,0.0,1.0,0.0


In [13]:

classifiers = {
    "Random Forest" : RandomForestClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Support Vector Classifier": SVC(random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42)
}
results = {}
for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)
    
    results[name] = {
        "Accuracy": accuracy,
        "Classification Report": class_report
    }
for name, metrics in results.items():
    print(f"Classifier: {name}")
    print(f"Accuracy: {metrics['Accuracy']:.2f}")
    print("Classification Report:")
    print(metrics['Classification Report'])
    print("="*50)


Classifier: Random Forest
Accuracy: 0.50
Classification Report:
              precision    recall  f1-score   support

         0.0       0.50      0.53      0.52     10079
         1.0       0.50      0.47      0.49      9921

    accuracy                           0.50     20000
   macro avg       0.50      0.50      0.50     20000
weighted avg       0.50      0.50      0.50     20000

Classifier: Logistic Regression
Accuracy: 0.50
Classification Report:
              precision    recall  f1-score   support

         0.0       0.50      0.70      0.59     10079
         1.0       0.49      0.29      0.37      9921

    accuracy                           0.50     20000
   macro avg       0.50      0.50      0.48     20000
weighted avg       0.50      0.50      0.48     20000

Classifier: K-Nearest Neighbors
Accuracy: 0.50
Classification Report:
              precision    recall  f1-score   support

         0.0       0.50      0.50      0.50     10079
         1.0       0.49      0.49

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.ensemble import AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.gaussian_process import GaussianProcessRegressor

In [14]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import explained_variance_score
from sklearn.metrics import max_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import median_absolute_error

In [15]:
lm = LinearRegression()
lm.fit(X_train,y_train)

LinearRegression()

In [16]:
lm_pred_train = lm.predict(X_train)

In [17]:
lm_pred_test = lm.predict(X_test)
print(y_test, lm_pred_test)

75721    0.0
80184    0.0
19864    0.0
76699    1.0
92991    0.0
        ... 
32595    0.0
29313    1.0
37862    1.0
53421    0.0
42410    1.0
Name: Churn, Length: 20000, dtype: float64 [0.48854187 0.50685092 0.49282112 ... 0.51075841 0.49962083 0.50129521]


In [18]:
# Compute and print the error metrics on training data
print("Training R2", r2_score (y_train, lm_pred_train))
print("Training EVS", explained_variance_score (y_train, lm_pred_train))
print("Training MSE", mean_squared_error(y_train, lm_pred_train))
print("Training RMSE", np.sqrt(mean_squared_error(y_train, lm_pred_train)))
print("Training MAE", mean_absolute_error (y_train, lm_pred_train)) 
print("Training Max. Error", max_error (y_train, lm_pred_train))
print("Training MSLE", mean_squared_log_error (y_train, lm_pred_train)) 
print("Training MedAE", median_absolute_error (y_train, lm_pred_train))

Training R2 0.00013603125597538845
Training EVS 0.00013603125597527743
Training MSE 0.2499628419895897
Training RMSE 0.49996284060876933
Training MAE 0.4999256839791793
Training Max. Error 0.5161445924436805
Training MSLE 0.12356818606045661
Training MedAE 0.4999326064622185


In [19]:
# Compute and print the error metrics on testing data
print("Testing R2", r2_score (y_test, lm_pred_test))
print("Testing EVS", explained_variance_score (y_test, lm_pred_test))
print("Testing MSE", mean_squared_error(y_test, lm_pred_test))
print("Testing RMSE", np.sqrt(mean_squared_error(y_test, lm_pred_test)))
print("Testing MAE", mean_absolute_error (y_test, lm_pred_test)) 
print("Testing Max. Error", max_error (y_test, lm_pred_test))
print("Testing MSLE", mean_squared_log_error (y_test, lm_pred_test))
print("Testing MedAE", median_absolute_error (y_test, lm_pred_test))

Testing R2 -0.00010406263040452224
Testing EVS -8.589611530540608e-05
Testing MSE 0.2500104115339639
Testing RMSE 0.5000104114255661
Testing MAE 0.4999735410228508
Testing Max. Error 0.51622706498634
Testing MSLE 0.12376433429673471
Testing MedAE 0.4999231945629993


In [30]:
rf = RandomForestRegressor(n_estimators=150,random_state=99)
rf.fit(X_train,y_train.values.ravel())

RandomForestRegressor(n_estimators=150, random_state=99)

In [31]:
rf_pred_train = rf.predict(X_train)
rf_pred_test = rf.predict(X_test)

In [32]:
# Compute and print the error metrics on training data
print("Training R2", r2_score (y_train, rf_pred_train))
print("Training EVS", explained_variance_score (y_train, rf_pred_train))
print("Training MSE", mean_squared_error(y_train, rf_pred_train))
print("Training RMSE", np.sqrt(mean_squared_error(y_train, rf_pred_train)))
print("Training MAE", mean_absolute_error (y_train, rf_pred_train)) 
print("Training Max. Error", max_error (y_train, rf_pred_train))
print("Training MSLE", mean_squared_log_error (y_train, rf_pred_train)) 
print("Training MedAE", median_absolute_error (y_train, rf_pred_train))

Training R2 0.8555291348614777
Training EVS 0.855529195706689
Training MSE 0.03611726111111111
Training RMSE 0.19004541854807
Training MAE 0.1836431666666667
Training Max. Error 0.4
Training MSLE 0.01992554402998648
Training MedAE 0.18000000000000005


In [33]:
# Compute and print the error metrics on testing data
print("Testing R2", r2_score (y_test, rf_pred_test))
print("Testing EVS", explained_variance_score (y_test, rf_pred_test))
print("Testing MSE", mean_squared_error(y_test, rf_pred_test))
print("Testing RMSE", np.sqrt(mean_squared_error(y_test, rf_pred_test)))
print("Testing MAE", mean_absolute_error (y_test, rf_pred_test)) 
print("Testing Max. Error", max_error (y_test, rf_pred_test))
print("Testing MSLE", mean_squared_log_error (y_test, rf_pred_test))
print("Testing MedAE", median_absolute_error (y_test, rf_pred_test))

Testing R2 -0.049626173836175846
Testing EVS -0.04961550641442214
Testing MSE 0.26239016666666665
Testing RMSE 0.5122403407255881
Testing MAE 0.5004876666666667
Testing Max. Error 0.86
Testing MSLE 0.12898939692442135
Testing MedAE 0.5


In [34]:
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [0.1,1, 10, 100, 1000], 'gamma': [1,0.1,0.01,0.001,0.0001], 'kernel': ['rbf']} 
grid_SVM = GridSearchCV(SVR(),param_grid,refit=True,verbose=3)
grid_SVM.fit(X_train,y_train.values.ravel())

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END .......C=0.1, gamma=1, kernel=rbf;, score=-0.261 total time= 4.7min
[CV 2/5] END .......C=0.1, gamma=1, kernel=rbf;, score=-0.288 total time= 9.4min
[CV 3/5] END .......C=0.1, gamma=1, kernel=rbf;, score=-0.265 total time= 4.7min
[CV 4/5] END .......C=0.1, gamma=1, kernel=rbf;, score=-0.256 total time= 4.7min
[CV 5/5] END .......C=0.1, gamma=1, kernel=rbf;, score=-0.278 total time= 9.3min
[CV 1/5] END .....C=0.1, gamma=0.1, kernel=rbf;, score=-0.312 total time= 4.7min



KeyboardInterrupt



In [35]:
adb = AdaBoostRegressor(n_estimators=10,random_state=99)
adb.fit(X_train,y_train.values.ravel())

AdaBoostRegressor(n_estimators=10, random_state=99)

In [36]:
adb_pred_train = adb.predict(X_train)
adb_pred_test = adb.predict(X_test)

In [37]:
# Compute and print the error metrics on training data
print("Training R2", r2_score (y_train, adb_pred_train))
print("Training EVS", explained_variance_score (y_train, adb_pred_train))
print("Training MSE", mean_squared_error(y_train, adb_pred_train))
print("Training RMSE", np.sqrt(mean_squared_error(y_train, adb_pred_train)))
print("Training MAE", mean_absolute_error (y_train, adb_pred_train)) 
print("Training Max. Error", max_error (y_train, adb_pred_train))
print("Training MSLE", mean_squared_log_error (y_train, adb_pred_train)) 
print("Training MedAE", median_absolute_error (y_train, adb_pred_train))

Training R2 -6.77426538124859e-05
Training EVS -6.288449391655959e-05
Training MSE 0.2500137848250215
Training RMSE 0.5000137846350053
Training MAE 0.49974467105981896
Training Max. Error 0.8148148148148148
Training MSLE 0.12349361466872287
Training MedAE 0.49388203304747963


In [38]:
# Compute and print the error metrics on testing data
print("Testing R2", r2_score (y_test, adb_pred_test))
print("Testing EVS", explained_variance_score (y_test, adb_pred_test))
print("Testing MSE", mean_squared_error(y_test, adb_pred_test))
print("Testing RMSE", np.sqrt(mean_squared_error(y_test, adb_pred_test)))
print("Testing MAE", mean_absolute_error (y_test, adb_pred_test)) 
print("Testing Max. Error", max_error (y_test, adb_pred_test))
print("Testing MSLE", mean_squared_log_error (y_test, adb_pred_test))
print("Testing MedAE", median_absolute_error (y_test, adb_pred_test))

Testing R2 -0.001051968670044534
Testing EVS -0.0010480923014097865
Testing MSE 0.25024737325416996
Testing RMSE 0.5002473120908997
Testing MAE 0.5000032744871977
Testing Max. Error 0.8148148148148148
Testing MSLE 0.12377570499533022
Testing MedAE 0.49388203304747963


In [39]:
gpr = GaussianProcessRegressor(random_state=99)

In [40]:
gpr.fit(X_train,y_train)

MemoryError: Unable to allocate 47.7 GiB for an array with shape (80000, 80000) and data type float64