In [3]:
import pandas as pd

In [4]:
# Read and prepare the dataset
df = pd.read_csv(
    "datasets/Customer-Churn-Records.csv",
    header=0,
    na_values="NA",
    comment="\t",
    sep=",",
    skipinitialspace=True,
    encoding='utf-8'    # Added encoding to handle special characters
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   RowNumber           10000 non-null  int64  
 1   CustomerId          10000 non-null  int64  
 2   Surname             10000 non-null  object 
 3   CreditScore         10000 non-null  int64  
 4   Geography           10000 non-null  object 
 5   Gender              10000 non-null  object 
 6   Age                 10000 non-null  int64  
 7   Tenure              10000 non-null  int64  
 8   Balance             10000 non-null  float64
 9   NumOfProducts       10000 non-null  int64  
 10  HasCrCard           10000 non-null  int64  
 11  IsActiveMember      10000 non-null  int64  
 12  EstimatedSalary     10000 non-null  float64
 13  Exited              10000 non-null  int64  
 14  Complain            10000 non-null  int64  
 15  Satisfaction Score  10000 non-null  int64  
 16  Card 

In [5]:
# Drop unnecessary columns to avoid noise in the model
df.drop(
    columns=[
        "RowNumber",
        "CustomerId",
        "Surname",
        "Complain",
        "Satisfaction Score",
        "Card Type",
        "Point Earned",
    ],
    inplace=True,   # Added inplace=True to modify df directly
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CreditScore      10000 non-null  int64  
 1   Geography        10000 non-null  object 
 2   Gender           10000 non-null  object 
 3   Age              10000 non-null  int64  
 4   Tenure           10000 non-null  int64  
 5   Balance          10000 non-null  float64
 6   NumOfProducts    10000 non-null  int64  
 7   HasCrCard        10000 non-null  int64  
 8   IsActiveMember   10000 non-null  int64  
 9   EstimatedSalary  10000 non-null  float64
 10  Exited           10000 non-null  int64  
dtypes: float64(2), int64(7), object(2)
memory usage: 859.5+ KB


In [6]:
X = df.drop(columns=["Exited"])
y = df["Exited"]

In [7]:
# dict = {'France': 1, 'Germany': 2, 'Spain': 3, "Female": 0, "Male": 1}
# X = X.replace(dict, inplace=True)
# The using above approach is commented out. Instead, we use the mapping below.

# Encode categorical variables
X['Geography'] = X['Geography'].map({'France': 1, 'Germany': 2, 'Spain': 3})
X['Gender'] = X['Gender'].map({"Female": 0, "Male": 1})

In [8]:
X

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,541,2,0,45,0,187598.22,3,0,0,151621.13
1,434,1,0,38,4,218123.13,4,1,0,161954.90
2,794,2,0,76,8,174120.73,3,0,1,190933.43
3,696,3,1,34,1,15525.54,2,1,0,185755.19
4,814,3,1,62,0,190596.29,4,0,1,4418.43
...,...,...,...,...,...,...,...,...,...,...
9995,684,2,0,87,0,189987.68,1,0,1,185780.03
9996,651,3,0,94,10,190254.87,3,0,1,177989.96
9997,628,3,1,41,2,218759.09,3,0,0,103231.27
9998,800,1,0,52,5,62207.77,3,0,0,82367.71


In [9]:
# Split the dataset into training and testing sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

We have 2 ways:
- 1. Using cross_val_score
- 2. Using cross_validate

In [30]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

clf = DecisionTreeClassifier(min_samples_leaf=10)
scores = cross_val_score(clf, X_train, y_train, cv=10)
scores

array([0.5375 , 0.5025 , 0.485  , 0.55125, 0.49   , 0.52   , 0.505  ,
       0.4925 , 0.4925 , 0.50625])

In [31]:
print("Average score:", scores.mean())

Average score: 0.50825


In [32]:
from sklearn.model_selection import cross_validate

clf = DecisionTreeClassifier(min_samples_leaf=10)
cv_results = cross_validate(clf, X_train, y_train, cv=10, return_estimator=True)

In [34]:
type(cv_results)

dict

In [36]:
cv_results.keys()

dict_keys(['fit_time', 'score_time', 'estimator', 'test_score'])

In [37]:
cv_results['test_score']

array([0.535  , 0.5    , 0.48   , 0.54875, 0.48875, 0.51875, 0.50125,
       0.49125, 0.4925 , 0.50125])

In [38]:
print('Mean score:', cv_results['test_score'].mean())

Mean score: 0.50575


In [39]:
cv_results['estimator'][0]

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,10
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [41]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

scoring = ['roc_auc', 'accuracy', 'f1']
scores = cross_validate(clf, X_train, y_train, cv=10, scoring=scoring)

In [42]:
scores.keys()

dict_keys(['fit_time', 'score_time', 'test_roc_auc', 'test_accuracy', 'test_f1'])

In [43]:
scores['test_roc_auc'].mean()

np.float64(0.5109929051326928)

In [44]:
scores['test_accuracy'].mean()

np.float64(0.507125)

In [45]:
scores['test_f1'].mean()

np.float64(0.5012252469782839)

In [46]:
testscores = []
for i in range(len(cv_results['estimator'])):
    testscores.append(cv_results['estimator'][i].score(X_test, y_test))

In [47]:
testscores

[0.497, 0.4905, 0.479, 0.497, 0.489, 0.4885, 0.5145, 0.5165, 0.4825, 0.497]

In [48]:
y_pred = cv_results['estimator'][0].predict(X_test)

In [49]:
probabilities = cv_results['estimator'][0].predict_proba(X_test)
print(probabilities)

[[0.6875     0.3125    ]
 [0.27272727 0.72727273]
 [0.26666667 0.73333333]
 ...
 [0.66666667 0.33333333]
 [0.63157895 0.36842105]
 [0.47368421 0.52631579]]
