# Classification Techniques

Developed by: David

---

An in-depth exploration of how several classification methods can be used to assign data to predefined classes.

---

# Importing the libraries

In [None]:
import numpy as np
import pandas as pd

# Data Pre-Processing

## Importing the dataset

In [None]:
# Load the tab-separated client data. The first line of the file is skipped,
# so the line after it is the one pandas reads as the header row.
dataset = pd.read_csv(
    'data/default_credit_card_clients.txt',
    sep='\t',
    skiprows=1,
)

# Feature matrix: every column except the first and the last.
X = dataset.iloc[:, 1:-1].to_numpy()
# Target vector: the last column only.
y = dataset.iloc[:, -1].to_numpy()

In [None]:
# Display the feature matrix X.
print(X)

In [None]:
# Display the target vector y.
print(y)

## Encoding the categorical variables

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the three categorical columns of X. The indices refer to
# positions inside X (not inside the original dataset); every other column is
# passed through unchanged.
ct = ColumnTransformer(
    transformers=[
        ('gender_encoder', OneHotEncoder(), [1]),  # Encode the Gender variable
        ('education_encoder', OneHotEncoder(), [2]),  # Encode the Education variable
        ('marital_encoder', OneHotEncoder(), [3]),  # Encode the Marital Status variable
    ],
    remainder='passthrough',  # Keep the rest of the columns as they are
    # Always return a dense array. OneHotEncoder produces sparse output, and
    # with the default threshold (0.3) ColumnTransformer may return a SciPy
    # sparse matrix when the stacked result is mostly zeros. np.array() would
    # then wrap that matrix in a 0-d object array instead of converting it,
    # which breaks the splitting and scaling steps that follow.
    sparse_threshold=0,
)

# Apply the ColumnTransformer to X
X = np.array(ct.fit_transform(X))

In [None]:
# Display the feature matrix X.
print(X)

## Splitting the dataset into the Training set and Test set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20 % of the samples as the test set; the fixed seed makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

# Logistic Regression

## Training the Logistic Regression model on the Training set

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression classifier with a fixed seed, fitted on the training split.
classifier = LogisticRegression(random_state=0)
classifier.fit(X=X_train, y=y_train)

## Making the Confusion Matrix

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Predict the test split with the current classifier and score the result.
y_pred = classifier.predict(X_test)
cm_lr_score = accuracy_score(y_true=y_test, y_pred=y_pred)

# Confusion matrix of true labels versus predicted labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

## Applying k-Fold Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the current classifier on the training split;
# `accuracies` holds one score per fold.
accuracies = cross_val_score(classifier, X_train, y_train, cv=10)

# Store the mean and the standard deviation of the fold scores as
# percentage strings.
kf_lr_accuracy, kf_lr_sd = (
    "{:.2f} %".format(fold_stat * 100)
    for fold_stat in (accuracies.mean(), accuracies.std())
)

# K-Nearest Neighbours (K-NN)

## Training the K-NN model on the Training set

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# 5-nearest-neighbours classifier using the Minkowski metric with p=2,
# fitted on the training split.
classifier = KNeighborsClassifier(
    n_neighbors=5,
    metric='minkowski',
    p=2,
)
classifier.fit(X=X_train, y=y_train)

## Making the Confusion Matrix

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Predict the test split with the current classifier and score the result.
y_pred = classifier.predict(X_test)
cm_knn_score = accuracy_score(y_true=y_test, y_pred=y_pred)

# Confusion matrix of true labels versus predicted labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

## Applying k-Fold Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the current classifier on the training split;
# `accuracies` holds one score per fold.
accuracies = cross_val_score(classifier, X_train, y_train, cv=10)

# Store the mean and the standard deviation of the fold scores as
# percentage strings.
kf_knn_accuracy, kf_knn_sd = (
    "{:.2f} %".format(fold_stat * 100)
    for fold_stat in (accuracies.mean(), accuracies.std())
)

# Support Vector Machine

## Training the SVM model on the Training set

In [None]:
from sklearn.svm import SVC

# Support vector classifier with a linear kernel and a fixed seed, fitted on
# the training split.
classifier = SVC(kernel='linear', random_state=0)
classifier.fit(X=X_train, y=y_train)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Predict the test split with the current classifier and score the result.
y_pred = classifier.predict(X_test)
cm_svm_score = accuracy_score(y_true=y_test, y_pred=y_pred)

# Confusion matrix of true labels versus predicted labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

## Applying k-Fold Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the current classifier on the training split;
# `accuracies` holds one score per fold.
accuracies = cross_val_score(classifier, X_train, y_train, cv=10)

# Store the mean and the standard deviation of the fold scores as
# percentage strings.
kf_svm_accuracy, kf_svm_sd = (
    "{:.2f} %".format(fold_stat * 100)
    for fold_stat in (accuracies.mean(), accuracies.std())
)

# Kernel SVM

## Training the Kernel SVM model on the Training set

In [None]:
from sklearn.svm import SVC

# Support vector classifier with an RBF kernel and a fixed seed, fitted on
# the training split.
classifier = SVC(kernel='rbf', random_state=0)
classifier.fit(X=X_train, y=y_train)

## Making the Confusion Matrix

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Predict the test split with the current classifier and score the result.
y_pred = classifier.predict(X_test)
cm_kernel_svm_score = accuracy_score(y_true=y_test, y_pred=y_pred)

# Confusion matrix of true labels versus predicted labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

## Applying k-Fold Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the current classifier on the training split;
# `accuracies` holds one score per fold.
accuracies = cross_val_score(classifier, X_train, y_train, cv=10)

# Store the mean and the standard deviation of the fold scores as
# percentage strings.
kf_kernel_svm_accuracy, kf_kernel_svm_sd = (
    "{:.2f} %".format(fold_stat * 100)
    for fold_stat in (accuracies.mean(), accuracies.std())
)

# Naive Bayes

## Training the Naive Bayes model on the Training set

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes classifier with default settings, fitted on the
# training split.
classifier = GaussianNB()
classifier.fit(X=X_train, y=y_train)

## Making the Confusion Matrix

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Predict the test split with the current classifier and score the result.
y_pred = classifier.predict(X_test)
cm_nb_score = accuracy_score(y_true=y_test, y_pred=y_pred)

# Confusion matrix of true labels versus predicted labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

## Applying k-Fold Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the current classifier on the training split;
# `accuracies` holds one score per fold.
accuracies = cross_val_score(classifier, X_train, y_train, cv=10)

# Store the mean and the standard deviation of the fold scores as
# percentage strings.
kf_nb_accuracy, kf_nb_sd = (
    "{:.2f} %".format(fold_stat * 100)
    for fold_stat in (accuracies.mean(), accuracies.std())
)

# Decision Tree Classification

## Training the Decision Tree Classification model on the Training set

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree using the entropy split criterion and a fixed seed, fitted on
# the training split.
classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)
classifier.fit(X=X_train, y=y_train)

## Making the Confusion Matrix

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Predict the test split with the current classifier and score the result.
y_pred = classifier.predict(X_test)
cm_dtc_score = accuracy_score(y_true=y_test, y_pred=y_pred)

# Confusion matrix of true labels versus predicted labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

## Applying k-Fold Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the current classifier on the training split;
# `accuracies` holds one score per fold.
accuracies = cross_val_score(classifier, X_train, y_train, cv=10)

# Store the mean and the standard deviation of the fold scores as
# percentage strings.
kf_dtc_accuracy, kf_dtc_sd = (
    "{:.2f} %".format(fold_stat * 100)
    for fold_stat in (accuracies.mean(), accuracies.std())
)

# Random Forest Classification

## Training the Random Forest Classification model on the Training set

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 10 trees using the entropy split criterion and a fixed
# seed, fitted on the training split.
classifier = RandomForestClassifier(
    n_estimators=10,
    criterion='entropy',
    random_state=0,
)
classifier.fit(X=X_train, y=y_train)

## Making the Confusion Matrix

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Predict the test split with the current classifier and score the result.
y_pred = classifier.predict(X_test)
cm_rfc_score = accuracy_score(y_true=y_test, y_pred=y_pred)

# Confusion matrix of true labels versus predicted labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

## Applying k-Fold Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the current classifier on the training split;
# `accuracies` holds one score per fold.
accuracies = cross_val_score(classifier, X_train, y_train, cv=10)

# Store the mean and the standard deviation of the fold scores as
# percentage strings.
kf_rfc_accuracy, kf_rfc_sd = (
    "{:.2f} %".format(fold_stat * 100)
    for fold_stat in (accuracies.mean(), accuracies.std())
)

# Determining the best model

In [None]:
# One row per model: (name, test-set accuracy, k-fold mean, k-fold SD).
# Keeping each model's numbers on a single line avoids misaligning the columns.
summary_rows = [
    ('Logistic Regression', cm_lr_score, kf_lr_accuracy, kf_lr_sd),
    ('K-Nearest Neighbours (K-NN)', cm_knn_score, kf_knn_accuracy, kf_knn_sd),
    ('Support Vector Machine (SVM)', cm_svm_score, kf_svm_accuracy, kf_svm_sd),
    ('Kernel SVM', cm_kernel_svm_score, kf_kernel_svm_accuracy, kf_kernel_svm_sd),
    ('Naive Bayes', cm_nb_score, kf_nb_accuracy, kf_nb_sd),
    ('Decision Tree Class.', cm_dtc_score, kf_dtc_accuracy, kf_dtc_sd),
    ('Random Forest Class.', cm_rfc_score, kf_rfc_accuracy, kf_rfc_sd),
]
column_names = ['Model', 'CM Score', 'k-Fold Accuracy', 'k-Fold SD']

# Transpose the rows into the column-oriented mapping used to build the table.
data = {
    column: list(values)
    for column, values in zip(column_names, zip(*summary_rows))
}

df = pd.DataFrame(data)

print(df)