# Predicting Loyal/Churn customers using `scikit-learn` Decision Tree Classifier

## [Dataset Reference](https://www.youtube.com/redirect?event=video_description&redir_token=QUFFLUhqbXhyQnBzb1JxdXR0NnhudkpISE9jb1kxSnNzZ3xBQ3Jtc0tuR1g3SDJHaTREMTFMR3FhVllRclUyMzNiNWFyU3BfTUFMd21EdHJJMVlvbENzbi1nNGZWc2l5aUVRR1RFU0k0Y0F4U0p1TnFRY3lSNGdlTURkRkFhdE4zNmRkRGJuc1JQSXBJRzRHem1iNnJHYkRnOA&q=http%3A%2F%2Fstatic.rapidminer.com%2Feducation%2Fgetting_started%2FFollow-along-Files.zip&v=DS-tYhgA5lA)

# 1. Import Dataset

In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt

# 2. Loading the Dataset

In [2]:
df = pd.read_excel("raw-customer-churn-data.xlsx")

  warn(msg)


# 3. Exploratory Data Analysis

In [3]:
df.head()

Unnamed: 0,Name,Gender,Age,Payment Method,Churn,LastTransaction
0,Nicolas Garrett,male,64.0,credit card,loyal,98
1,Isaac Reyes,male,35.0,cheque,churn,118
2,Jaime Sullivan,female,25.0,credit card,loyal,107
3,Geraldine Miller,female,39.0,credit card,,177
4,Curtis Frazier,m,39.0,credit card,loyal,90


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Name             999 non-null    object 
 1   Gender           998 non-null    object 
 2   Age              998 non-null    float64
 3   Payment Method   999 non-null    object 
 4   Churn            903 non-null    object 
 5   LastTransaction  999 non-null    int64  
dtypes: float64(1), int64(1), object(4)
memory usage: 47.0+ KB


### There are 999 rows and 6 columns

### **Churn** data has 96 missing values

In [5]:
df.describe()

Unnamed: 0,Age,LastTransaction
count,998.0,999.0
mean,45.593186,110.945946
std,18.76532,44.977049
min,17.0,1.0
25%,30.0,77.0
50%,44.0,110.0
75%,58.0,144.5
max,91.0,223.0


In [6]:
df.drop(columns=["Name"], inplace=True)

In [7]:
df_dropped = df.dropna()

In [8]:
df_dropped.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 901 entries, 0 to 998
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Gender           901 non-null    object 
 1   Age              901 non-null    float64
 2   Payment Method   901 non-null    object 
 3   Churn            901 non-null    object 
 4   LastTransaction  901 non-null    int64  
dtypes: float64(1), int64(1), object(3)
memory usage: 42.2+ KB


In [9]:
df_missing_churn = df[df["Churn"].isnull()]

In [10]:
df_missing_churn.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 96 entries, 3 to 996
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Gender           96 non-null     object 
 1   Age              96 non-null     float64
 2   Payment Method   96 non-null     object 
 3   Churn            0 non-null      object 
 4   LastTransaction  96 non-null     int64  
dtypes: float64(1), int64(1), object(3)
memory usage: 4.5+ KB


In [11]:
numerical_cols = df._get_numeric_data().columns
numerical_cols

Index(['Age', 'LastTransaction'], dtype='object')

In [12]:
categorical_cols = df.drop(columns=numerical_cols).columns

In [13]:
categorical_cols

Index(['Gender', 'Payment Method', 'Churn'], dtype='object')

In [14]:
X = df_dropped.drop(columns=["Churn"])

In [15]:
X.head()

Unnamed: 0,Gender,Age,Payment Method,LastTransaction
0,male,64.0,credit card,98
1,male,35.0,cheque,118
2,female,25.0,credit card,107
4,m,39.0,credit card,90
5,female,28.0,cheque,189


In [16]:
y = df_dropped["Churn"]

In [17]:
y

0      loyal
1      churn
2      loyal
4      loyal
5      churn
       ...  
993    loyal
994    loyal
995    loyal
997    loyal
998    loyal
Name: Churn, Length: 901, dtype: object

In [18]:
from sklearn.preprocessing import LabelEncoder

In [19]:
le_gender = LabelEncoder()
le_payment = LabelEncoder()

In [21]:
X["Gender_le"] = le_gender.fit_transform(X["Gender"])
X["Payment_le"] = le_payment.fit_transform(X["Payment Method"])

In [22]:
X.head()

Unnamed: 0,Gender,Age,Payment Method,LastTransaction,Gender_le,Payment_le
0,male,64.0,credit card,98,2,2
1,male,35.0,cheque,118,2,1
2,female,25.0,credit card,107,0,2
4,m,39.0,credit card,90,1,2
5,female,28.0,cheque,189,0,1


In [25]:
X = X.drop(columns=["Gender", "Payment Method"])

In [26]:
X

Unnamed: 0,Age,LastTransaction,Gender_le,Payment_le
0,64.0,98,2,2
1,35.0,118,2,1
2,25.0,107,0,2
4,39.0,90,1,2
5,28.0,189,0,1
...,...,...,...,...
993,19.0,92,2,2
994,29.0,33,0,2
995,84.0,124,2,2
997,19.0,49,2,2


In [27]:
from sklearn.model_selection import train_test_split

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [29]:
model = DecisionTreeClassifier()

In [30]:
model.fit(X_train, y_train)

DecisionTreeClassifier()

In [31]:
model.score(X_test, y_test)

0.7182320441988951

In [32]:
model.score(X_train, y_train)

0.9972222222222222