In [360]:
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report

import warnings
# Silence every warning category for the rest of the session (library
# warnings are hidden as well).
warnings.simplefilter('ignore')
# Show floats in pandas output with thousands separators and two decimals.
pd.set_option('display.float_format', "{:,.2f}".format)

In [372]:
# Load the kyphosis dataset from the CSV file in the working directory
DATA_FILE = 'kyphosis.csv'
df = pd.read_csv(DATA_FILE)

In [362]:
# Preview the first five rows to sanity-check the load
df.head()

Unnamed: 0,Kyphosis,Age,Number,Start
0,absent,71,3,5
1,absent,158,3,14
2,present,128,4,5
3,absent,2,5,1
4,absent,1,4,15


In [363]:
# Column names as a plain Python list
df.columns.tolist()

['Kyphosis', 'Age', 'Number', 'Start']

In [364]:
# Summarise column dtypes, non-null counts and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81 entries, 0 to 80
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Kyphosis  81 non-null     object
 1   Age       81 non-null     int64 
 2   Number    81 non-null     int64 
 3   Start     81 non-null     int64 
dtypes: int64(3), object(1)
memory usage: 2.7+ KB


# Convert categorical data to numerical

In [365]:
def label_encode_columns(df):
    """Replace every object-dtype column of ``df`` with integer label codes.

    The frame is modified in place and the same object is returned, so the
    call can be used either for its side effect or for its return value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose object-dtype columns should be encoded.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with its object columns overwritten.
    """
    encoder = LabelEncoder()

    # Column names are collected up front, before any column is overwritten.
    categorical = df.select_dtypes(include='object').columns

    # The single encoder is re-fitted on each column in turn.
    for name in categorical:
        codes = encoder.fit_transform(df[name])
        df[name] = codes

    return df

# Encodes df in place; the returned frame is rendered as the cell output
label_encode_columns(df)

Unnamed: 0,Kyphosis,Age,Number,Start
0,0,71,3,5
1,0,158,3,14
2,1,128,4,5
3,0,2,5,1
4,0,1,4,15
...,...,...,...,...
76,1,157,3,13
77,0,26,7,13
78,0,120,2,13
79,1,42,7,6


In [366]:
# Column dtypes after label encoding (Kyphosis should now be an integer column)
df.dtypes

Kyphosis    int32
Age         int64
Number      int64
Start       int64
dtype: object

In [367]:
# Distinct encoded values of the target column
df['Kyphosis'].unique()

array([0, 1])

In [368]:
# Relative frequency of each target class
df['Kyphosis'].value_counts(normalize=True)

Kyphosis
0   0.79
1   0.21
Name: proportion, dtype: float64

# Split data into Train and Test

In [369]:
# Hold out 30% of the rows for testing; the remaining 70% are used for training.
from sklearn.model_selection import train_test_split

TARGET = 'Kyphosis'

# Features: every column except the target. Selecting by name (rather than by
# position) keeps the target out of X even if the column order changes.
X = df.drop(columns=TARGET)
# Target (dependent variable): the Kyphosis column.
y = df[TARGET].copy()

# A fixed random_state keeps the partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize data

In [370]:
# Standardize the features using statistics learned from the training split
# only, then apply the same transformation to the test split.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Train and evaluate ML models

In [371]:
# Seed the stochastic estimators so that a fresh Restart & Run All reproduces
# the same reports; without it the tree-based models change from run to run.
SEED = 42

classifiers = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(random_state=SEED),
    'Random Forest': RandomForestClassifier(random_state=SEED),
    'Gradient Boosting': GradientBoostingClassifier(random_state=SEED),
    'SVM': SVC(),
}

# Fit each model on the training split and print its per-class
# precision / recall / F1 report on the held-out test split.
for model_name, model in classifiers.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(model_name)
    print(classification_report(y_test, y_pred))
    print('--------------------------------------------------------')

Logistic Regression
              precision    recall  f1-score   support

           0       0.79      1.00      0.88        19
           1       1.00      0.17      0.29         6

    accuracy                           0.80        25
   macro avg       0.90      0.58      0.58        25
weighted avg       0.84      0.80      0.74        25

--------------------------------------------------------
Decision Tree
              precision    recall  f1-score   support

           0       0.80      0.84      0.82        19
           1       0.40      0.33      0.36         6

    accuracy                           0.72        25
   macro avg       0.60      0.59      0.59        25
weighted avg       0.70      0.72      0.71        25

--------------------------------------------------------
Random Forest
              precision    recall  f1-score   support

           0       0.79      1.00      0.88        19
           1       1.00      0.17      0.29         6

    accuracy        

# Conclusion

Based on the results presented, the SVM model is the most suitable for classifying patients with or without kyphosis. It shows the best performance across all evaluation metrics, including accuracy, precision, recall, and F1-score.

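Accuracy is an important performance metric, but it is not the only one. Because only about 21% of the records in this dataset are positive, a model that always predicts "absent" would already reach roughly 79% accuracy, so precision, recall, and F1-score on the positive class deserve particular attention. In the specific case of a dataset of patients classified as having kyphosis or not, the SVM is a good choice because it is able to separate the two groups of patients with high precision. This is important because the goal of a classification model is to minimize classification error.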

It is important to note that the choice of the ideal model should take into account the specific objectives of the problem. In the case of classifying patients with kyphosis, the SVM is a good choice because it shows consistent performance across all evaluation metrics.