<a href="https://colab.research.google.com/github/chandans16/chandans16/blob/main/Churn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Import the required libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression  
from sklearn.neighbors import KNeighborsClassifier  
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score, GridSearchCV

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 
warnings.filterwarnings("ignore", category=FutureWarning) 
warnings.filterwarnings("ignore", category=UserWarning) 

# Ask the inline matplotlib backend to render figures in 'retina' format.
%config InlineBackend.figure_format = 'retina'

# Lift pandas' display limits so that every column and every row is rendered.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [8]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric
# columns, transposed so that each feature is one row.
# NOTE(review): `df` is created by the read_csv cell further down (this cell
# was executed as In [8], after that one); on Restart & Run All it raises a
# NameError here -- move this cell below the data-loading cell.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
customerid,10000.0,15690940.0,71936.186123,15565701.0,15628528.25,15690740.0,15753230.0,15815690.0
creditscore,10000.0,650.5288,96.653299,350.0,584.0,652.0,718.0,850.0
age,10000.0,38.9218,10.487806,18.0,32.0,37.0,44.0,92.0
tenure,10000.0,5.0128,2.892174,0.0,3.0,5.0,7.0,10.0
balance,10000.0,76485.89,62397.405202,0.0,0.0,97198.54,127644.2,250898.09
numofproducts,10000.0,1.5302,0.581654,1.0,1.0,1.0,2.0,4.0
hascrcard,10000.0,0.7055,0.45584,0.0,0.0,1.0,1.0,1.0
isactivemember,10000.0,0.5151,0.499797,0.0,0.0,1.0,1.0,1.0
estimatedsalary,10000.0,100090.2,57510.492818,11.58,51002.11,100193.9,149388.2,199992.48
exited,10000.0,0.2037,0.402769,0.0,0.0,0.0,0.0,1.0


In [4]:
# Load the churn dataset, using the first CSV column as the row index, and
# lower-case every column name so later cells can refer to them uniformly.
df = pd.read_csv("churn.csv", index_col=0).rename(columns=str.lower)

In [5]:
# Preview the first 5 rows of the dataset.
df.head()

Unnamed: 0_level_0,customerid,surname,creditscore,geography,gender,age,tenure,balance,numofproducts,hascrcard,isactivemember,estimatedsalary,exited
RowNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [6]:
# Dimensions of the dataset as (rows, columns): 10000 observations and 13 variables.
df.shape

(10000, 13)

In [7]:
# Column names, non-null counts and dtypes, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 1 to 10000
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   customerid       10000 non-null  int64  
 1   surname          10000 non-null  object 
 2   creditscore      10000 non-null  int64  
 3   geography        10000 non-null  object 
 4   gender           10000 non-null  object 
 5   age              10000 non-null  int64  
 6   tenure           10000 non-null  int64  
 7   balance          10000 non-null  float64
 8   numofproducts    10000 non-null  int64  
 9   hascrcard        10000 non-null  int64  
 10  isactivemember   10000 non-null  int64  
 11  estimatedsalary  10000 non-null  float64
 12  exited           10000 non-null  int64  
dtypes: float64(2), int64(8), object(3)
memory usage: 1.1+ MB


In [9]:
# Mean of every numeric feature, split by the target `exited`
# (1 = customers who left, 0 = customers who stayed).
# numeric_only=True skips the text columns (surname, geography, gender):
# pandas >= 2.0 raises a TypeError when asked to average them, whereas older
# versions dropped them silently.
df.groupby("exited").mean(numeric_only=True)

Unnamed: 0_level_0,customerid,creditscore,age,tenure,balance,numofproducts,hascrcard,isactivemember,estimatedsalary
exited,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,15691170.0,651.853196,37.408389,5.033279,72745.296779,1.544267,0.707146,0.554565,99738.391772
1,15690050.0,645.351497,44.837997,4.932744,91108.539337,1.475209,0.699067,0.360825,101465.677531


In [10]:
# Average customer age for each gender.
df.groupby("gender")[["age"]].mean()

Unnamed: 0_level_0,age
gender,Unnamed: 1_level_1
Female,39.238389
Male,38.658237


In [11]:
# Churn rate per gender: `exited` is 0/1, so its mean is the share of
# customers in each group who left.
df.groupby("gender")[["exited"]].mean()

Unnamed: 0_level_0,exited
gender,Unnamed: 1_level_1
Female,0.250715
Male,0.164559


In [12]:
# Churn rate per country: `exited` is 0/1, so its mean is the share of
# customers in each geography who left.
df.groupby("geography")[["exited"]].mean()

Unnamed: 0_level_0,exited
geography,Unnamed: 1_level_1
France,0.161548
Germany,0.324432
Spain,0.166734


In [13]:
# Class distribution of the target variable: number of customers per `exited` value.
df["exited"].value_counts()

0    7963
1    2037
Name: exited, dtype: int64

In [14]:
# Subset of customers who left the bank (exited == 1).
churn = df.loc[df["exited"].eq(1)]

In [15]:
# Preview the first 5 rows of the churned-customer subset.
churn.head()

Unnamed: 0_level_0,customerid,surname,creditscore,geography,gender,age,tenure,balance,numofproducts,hascrcard,isactivemember,estimatedsalary,exited
RowNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
6,15574012,Chu,645,Spain,Male,44,8,113755.78,2,1,0,149756.71,1
8,15656148,Obinna,376,Germany,Female,29,4,115046.74,4,1,0,119346.88,1
17,15737452,Romeo,653,Germany,Male,58,1,132602.88,1,1,0,5097.67,1


In [16]:
# Dimensions of the churned-customer subset as (rows, columns).
churn.shape

(2037, 13)

In [17]:
# Number of churned customers per gender -- which gender left the most?
churn.groupby("gender")[["exited"]].count()

Unnamed: 0_level_0,exited
gender,Unnamed: 1_level_1
Female,1139
Male,898


In [18]:
# Number of churned customers per country -- which geography lost the most customers?
churn.groupby("geography")[["exited"]].count()

Unnamed: 0_level_0,exited
geography,Unnamed: 1_level_1
France,810
Germany,814
Spain,413


In [19]:
# Subset of customers who stayed with the bank (exited == 0).
non_churn = df.loc[df["exited"].eq(0)]

In [20]:
# Preview the first 5 rows of the retained-customer subset.
non_churn.head()

Unnamed: 0_level_0,customerid,surname,creditscore,geography,gender,age,tenure,balance,numofproducts,hascrcard,isactivemember,estimatedsalary,exited
RowNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0
7,15592531,Bartlett,822,France,Male,50,7,0.0,2,1,1,10062.8,0
9,15792365,He,501,France,Male,44,4,142051.07,2,0,1,74940.5,0
