### From the book "Python Machine Learning for Beginners" by AI Publishing

7.1 Preparing Data for Classification Problems

In [1]:
# Like regression, you have to first convert data into a specific format before 
# it can be used to train classification algorithms.
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
# The following script uses the read_csv() method from the 
# Pandas library to read the customer_churn.csv file, which 
# contains records of customers who left the bank six months 
# after various information about them is recorded.
# The head() method prints the first five rows of the dataset.
url = "https://raw.githubusercontent.com/duochen/Python-MachineLearning/master/Reference/PythonMachineLearningforBeginners/Data/customer_churn.csv"
churn_df = pd.read_csv(url)
churn_df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [3]:
# The output shows that the dataset contains information such as surname, customer id, 
# geography, gender, age, etc., as shown above. The Exited column contains information 
# regarding whether or not the customer exited the bank after six months.

In [4]:
# We do not need RowNumber, CustomerId, and Surname columns in our dataset 
# since they do not help in predicting if a customer will churn or not.
# To remove these columns, you can use the drop() method.
churn_df = churn_df.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1)

7.1.1 Dividing Data into Features and Labels

In [5]:
# The next step in classification is to divide the data into the features and labels. 
# The features set, i.e., X in the following script contains all the columns 
# except the Exited column. On the other hand, the labels set, i.e., y, contains values 
# from the Exited column only.
X = churn_df.drop(['Exited'], axis=1)
y = churn_df['Exited']

In [6]:
# Prints the first five rows of the feature set
X.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,619,France,Female,42,2,0.0,1,1,1,101348.88
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58
2,502,France,Female,42,8,159660.8,3,1,0,113931.57
3,699,France,Female,39,1,0.0,2,0,0,93826.63
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1


In [7]:
# Print the first five rows of the label set
y.head()

0    1
1    0
2    1
3    0
4    0
Name: Exited, dtype: int64

7.1.2 Converting Categorical Data to Numbers

In [8]:
# For the classification problem, too, we need to convert the categorical column to numerical ones.
# The first step then is to create a dataframe containing only numeric values. You can do so 
# by dropping the categorical column and creating a new dataframe.
numerical = X.drop(['Geography', 'Gender'], axis=1)

In [9]:
# Prints the dataframe that contains numeric columns only.
numerical.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,619,42,2,0.0,1,1,1,101348.88
1,608,41,1,83807.86,1,0,1,112542.58
2,502,42,8,159660.8,3,1,0,113931.57
3,699,39,1,0.0,2,0,0,93826.63
4,850,43,2,125510.82,1,1,1,79084.1


In [10]:
# Create a dataframe that contains categorical values only. 
# You can do so by using filter() function
categorical = X.filter(['Geography', 'Gender'])
categorical.head()

Unnamed: 0,Geography,Gender
0,France,Female
1,Spain,Female
2,France,Female
3,France,Female
4,Spain,Female


In [12]:
# In order to convert categorical features to numeric ones, 
# we will use the one-hot encoding approach.
# The following script converts categorical columns into one-hot encoded columns using 
# the pd.get_dummies() method.
import pandas as pd

cat_numerical = pd.get_dummies(categorical, drop_first=True)
cat_numerical.head()

Unnamed: 0,Geography_Germany,Geography_Spain,Gender_Male
0,0,0,0
1,0,1,0
2,0,0,0
3,0,0,0
4,0,1,0


In [13]:
# Join or concatenate the numeric columns and one-hot encoded categorical columns. 
# To do so, you can use the concat function from the Pandas library
X = pd.concat([numerical, cat_numerical], axis=1)
X.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_Germany,Geography_Spain,Gender_Male
0,619,42,2,0.0,1,1,1,101348.88,0,0,0
1,608,41,1,83807.86,1,0,1,112542.58,0,1,0
2,502,42,8,159660.8,3,1,0,113931.57,0,0,0
3,699,39,1,0.0,2,0,0,93826.63,0,0,0
4,850,43,2,125510.82,1,1,1,79084.1,0,1,0


7.1.3 Divide Data into Training and Test Sets

In [14]:
# After you train a machine learning algorithm, you need to evaluate it to see how well it performs 
# on unseen data. Like regression, in classification problems, too, we divide the dataset into two 
# sets, i.e., the training set and test set. The dataset is trained via the training set and 
# evaluated on the test set. 
# To split the data into training and test sets, you can use the train_test_split() function from 
# the Sklearn library, as shown below. The following script divides the data into an 80 percent 
# training set and a 20 percent test set.
from sklearn.model_selection import train_test_split

# Test size is the fraction of test size
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

7.1.4 Data Scaling/Normalization

In [15]:
# You can see that some columns of the dataset contain small values, while the other columns 
# contain very large values. It is better to convert all values to a uniform scale. 
# To do so, you can use the StandardScaler() function from the sklearn.preprocessing module.
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

7.2 Logistic Regression

In [16]:
# Logistic regression is a linear model which makes classification by passing the output of 
# logistric regression through a sigmoid function.
from sklearn.linear_model import LogisticRegression

log_clf = LogisticRegression()
classifier = log_clf.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

In [20]:
# Once you have trained a model and have made predictions on the test set, the next step is to 
# know how well your model has performed for making predictions on the unknown test set. 
# There are various metrics to evaluate a classification method. Some of the most commonly used 
# classification metrics are F1, recall, precision, accuracy, and confusion metric.

# The choice of using a metric for a classification problem depends totally upon you. 
# However, as a rule of thumb, in case of balanced datasets, i.e.,where the number of labels for 
# each class is balanced, accuracy can be used as an evaluation metric. 
# For imbalanced datasets, you can use F1 the measure as the classification metric.
# The methods used to find the value for these metrics are available in the sklearn.metrics class. 
# The predicted and actual values have to be passed to these methods.
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

[[1526   69]
 [ 309   96]]
              precision    recall  f1-score   support

           0       0.83      0.96      0.89      1595
           1       0.58      0.24      0.34       405

    accuracy                           0.81      2000
   macro avg       0.71      0.60      0.61      2000
weighted avg       0.78      0.81      0.78      2000

0.811


In [21]:
# The output shows that for 81 percent of the records in the test set, 
# logisticregression correctly predicted whether or not a customer will leave the bank.

7.3 KNN Classifier

In [23]:
#  KNN algorithm can be used both for classification and regression. With Sklearn, it is extremely 
# easy to implement KNN classification. To do so, you can use the KNeighborsClassifiers class.
from sklearn.neighbors import KNeighborsClassifier

knn_clf = KNeighborsClassifier(n_neighbors=5)
classifier = knn_clf.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

In [24]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

[[1486  109]
 [ 237  168]]
              precision    recall  f1-score   support

           0       0.86      0.93      0.90      1595
           1       0.61      0.41      0.49       405

    accuracy                           0.83      2000
   macro avg       0.73      0.67      0.69      2000
weighted avg       0.81      0.83      0.81      2000

0.827


7.4 Random Forest Classifier

In [27]:
# Like the random forest regressor, the random forest classifier is a tree-based algorithm 
# that converts features into tree nodes, and then uses entropy loss to make classification predictions.
# RandomForestClassifier class from the Sklearn.ensemble module can be used to implement 
# the random forest regressor algorithm in Python.
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(random_state=42, n_estimators=500)
classifier = rf_clf.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

In [28]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

[[1521   74]
 [ 196  209]]
              precision    recall  f1-score   support

           0       0.89      0.95      0.92      1595
           1       0.74      0.52      0.61       405

    accuracy                           0.86      2000
   macro avg       0.81      0.73      0.76      2000
weighted avg       0.86      0.86      0.86      2000

0.865


7.5 Support Vector Classification

In [30]:
# The support vector machine is classification as well as regression algorithms, which minimizes 
# the error between the actual predictions and predicted predictions by maximizing the distance 
# between hyperplanes that contain data for various records.
# With the Sklearn library, you can use the SVM module to implement the support vector classification 
# algorithm, as shown below. The SVC classfrom the SVM module is used to implement 
# the support vector classification.
from sklearn import svm

svm_clf = svm.SVC()
classifier = svm_clf.fit(X_train, y_train)
# Making predictions on test set
y_pred = classifier.predict(X_test)

In [31]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

[[1547   48]
 [ 225  180]]
              precision    recall  f1-score   support

           0       0.87      0.97      0.92      1595
           1       0.79      0.44      0.57       405

    accuracy                           0.86      2000
   macro avg       0.83      0.71      0.74      2000
weighted avg       0.86      0.86      0.85      2000

0.8635


7.6 K-Fold Cross-Validation

In [32]:
# You can also perform K-fold cross-validation for classification models, just like regression models.
# You can use cross_val_score() function from the sklearn.model_selection module to perform cross-validation.
from sklearn.model_selection import cross_val_score
print(cross_val_score(classifier, X, y, cv=5, scoring='accuracy'))

[0.796  0.796  0.7965 0.7965 0.7965]


7.7 Predicting a Single Value

In [33]:
churn_df.loc[100]

CreditScore              665
Geography             France
Gender                Female
Age                       40
Tenure                     6
Balance                  0.0
NumOfProducts              1
HasCrCard                  1
IsActiveMember             1
EstimatedSalary    161848.03
Exited                     0
Name: 100, dtype: object

In [34]:
# The output above shows that the customer did not exit the bank after six months since the value 
# for the Exited attribute is 0. Let’s see what our classification model predicts.
from sklearn.ensemble import RandomForestClassifier

# Training the random forest algorithm
rf_clf = RandomForestClassifier(random_state=42, n_estimators=500)
classifier = rf_clf.fit(X_train, y_train)

# Scaling single record
single_record = sc.transform(X.values[100].reshape(1, -1))

# Making predictions on the single record
predicted_churn = classifier.predict(single_record)
print(predicted_churn)

[0]




In [35]:
# The output is 0, which shows that our model correctly predicted that the customer will not churn 
# after six months.