In [16]:
# Import required libraries
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler


In [17]:
# Read ortho data
file_path = "https://static.bc-edx.com/ai/ail-v-1-0/m13/lesson_3/datasets/vertebral-column.csv"
df_ortho = pd.read_csv(file_path)
df_ortho.head()

Unnamed: 0,pelvic_incidence,pelvic_tilt,lumbar_lordosis_angle,sacral_slope,pelvic_radius,degree_spondylolisthesis,class
0,63.027817,22.552586,39.609117,40.475232,98.672917,-0.2544,Hernia
1,39.056951,10.060991,25.015378,28.99596,114.405425,4.564259,Hernia
2,68.832021,22.218482,50.092194,46.613539,105.985135,-3.530317,Hernia
3,69.297008,24.652878,44.311238,44.64413,101.868495,11.211523,Hernia
4,49.712859,9.652075,28.317406,40.060784,108.168725,7.918501,Hernia


In [18]:
df_ortho.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 310 entries, 0 to 309
Data columns (total 7 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   pelvic_incidence          310 non-null    float64
 1   pelvic_tilt               310 non-null    float64
 2   lumbar_lordosis_angle     310 non-null    float64
 3   sacral_slope              310 non-null    float64
 4   pelvic_radius             310 non-null    float64
 5   degree_spondylolisthesis  310 non-null    float64
 6   class                     310 non-null    object 
dtypes: float64(6), object(1)
memory usage: 17.1+ KB


Setting 'class' as the target.

In [19]:
unique_classes = df_ortho['class'].unique()

# Print unique class labels
print("Unique class labels:", unique_classes)


Unique class labels: ['Hernia' 'Spondylolisthesis' 'Normal']


Balance check

In [20]:
# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Encode the target variable ('class')
df_ortho['class'] = label_encoder.fit_transform(df_ortho['class'])

# Print encoded class mapping
class_mapping = {label: idx for idx, label in enumerate(label_encoder.classes_)}
print("Class Mapping:", class_mapping)

Class Mapping: {'Hernia': 0, 'Normal': 1, 'Spondylolisthesis': 2}


In [21]:
# Split into features (X) and target (y)
X = df_ortho.drop(columns=['class'])  # Features
y = df_ortho['class']  # Target

In [22]:
# Check the balance of the labels variable (`y`) by using the `value_counts` function.
label_distribution = y.value_counts()
print(label_distribution)

class
2    150
1    100
0     60
Name: count, dtype: int64


Train test split


In [23]:
# Perform train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
X.sample(5)
# Split the data into X_train, X_test, y_train, y_test
# Using a 20% test size as that seems to be a more standard split, 
# also stratifying the data to ensure that the distribution of the 
# labels is the same in both the training and testing sets.

Unnamed: 0,pelvic_incidence,pelvic_tilt,lumbar_lordosis_angle,sacral_slope,pelvic_radius,degree_spondylolisthesis
143,60.626217,20.595958,64.535262,40.030259,117.225554,104.859247
247,49.828135,16.736435,28.0,33.0917,121.435558,1.913307
123,65.665347,10.540675,56.489135,55.124672,109.162777,53.93202
248,47.319648,8.57368,35.560252,38.745967,120.576972,1.630664
218,51.624672,15.969344,35.0,35.655328,129.385308,1.009228


Scaling

In [24]:
# Feature scaling (StandardScaler)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

In [25]:
y_train.value_counts()

class
2    119
1     80
0     49
Name: count, dtype: int64

In [26]:
y_test.value_counts()

class
2    31
1    20
0    11
Name: count, dtype: int64

In [27]:
# Scale the training data
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X.columns)
X_train_scaled_df.head()

Unnamed: 0,pelvic_incidence,pelvic_tilt,lumbar_lordosis_angle,sacral_slope,pelvic_radius,degree_spondylolisthesis
0,0.066767,0.550966,-0.315815,-0.315612,0.592944,-0.748397
1,0.159351,0.536519,-0.669224,-0.188798,-1.580149,-0.68669
2,0.545704,0.391437,0.03948,0.401885,-0.120824,0.698388
3,1.602732,0.314612,2.250881,1.785829,0.21255,1.249534
4,-0.923789,-0.794671,-0.001811,-0.584589,1.320672,0.051758


In [28]:
# Create and fit a Logistic Regression model
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression(max_iter=1000, random_state=1)  # Increase max_iter to ensure convergence
log_reg.fit(X_train_scaled, y_train)

In [None]:
#log_reg_predictions_df

Unnamed: 0,Actual,Predicted
78,2,0
244,1,0
185,2,0
70,2,0
120,2,0
...,...,...
146,2,0
9,0,0
127,2,0
16,0,0


In [30]:
# Make and save testing predictions with the trained Logistic Regression model using the test data
y_pred_log_reg = log_reg.predict(X_test_scaled)
log_reg_predictions_df = pd.DataFrame({"Actual": y_test, "Predicted": y_pred_log_reg})

# Review the predictions
y_pred_log_reg

array([2, 0, 2, 2, 2, 2, 1, 2, 2, 1, 1, 1, 0, 1, 1, 2, 0, 1, 1, 2, 1, 1,
       0, 2, 0, 2, 2, 2, 1, 2, 2, 1, 1, 0, 0, 2, 1, 0, 2, 2, 1, 1, 2, 1,
       2, 2, 1, 0, 2, 1, 1, 1, 1, 2, 2, 0, 2, 2, 0, 2, 1, 2])

In [31]:
# Make and save testing predictions with the trained Logistic Regression model using the test data
log_reg_predictions = log_reg.predict(X_test_scaled)
#log_reg_predictions_df = pd.DataFrame({"Actual": y_test, "Predicted": y_pred_log_reg})

# Review the predictions
log_reg_predictions

array([2, 0, 2, 2, 2, 2, 1, 2, 2, 1, 1, 1, 0, 1, 1, 2, 0, 1, 1, 2, 1, 1,
       0, 2, 0, 2, 2, 2, 1, 2, 2, 1, 1, 0, 0, 2, 1, 0, 2, 2, 1, 1, 2, 1,
       2, 2, 1, 0, 2, 1, 1, 1, 1, 2, 2, 0, 2, 2, 0, 2, 1, 2])

In [32]:
print(df_ortho['class'].unique())


[0 2 1]


In [33]:
from sklearn.metrics import accuracy_score

In [34]:
# Calculate the accuracy score by evaluating `y_test` vs. `testing_predictions`.
accuracy = accuracy_score(y_test, y_pred_log_reg)
print(f"Accuracy Score: {accuracy:.10f}")

Accuracy Score: 0.8225806452


Random Forest Model

In [35]:
# Create and fit a Random Forest Classifier model
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(n_estimators=100, random_state=1)
rf_clf.fit(X_train_scaled, y_train)

In [36]:
# Make and save testing predictions with the trained Random Forest Classifier model using the test data
y_pred_rf = rf_clf.predict(X_test_scaled)
rf_predictions_df = pd.DataFrame({"Actual": y_test, "Predicted": y_pred_rf})

# Review the predictions
y_pred_rf

array([2, 0, 2, 2, 2, 2, 1, 2, 2, 1, 1, 1, 0, 1, 1, 2, 0, 1, 1, 2, 2, 1,
       0, 1, 0, 2, 2, 2, 2, 2, 2, 1, 1, 0, 0, 2, 1, 0, 2, 2, 1, 1, 2, 1,
       2, 2, 1, 0, 2, 1, 1, 1, 0, 2, 2, 0, 2, 2, 0, 2, 1, 2])

In [37]:
# Calculate the accuracy score by evaluating `y_test` vs. `testing_predictions`.
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Accuracy Score: {accuracy_rf:.10f}")

Accuracy Score: 0.8548387097


KNN

In [None]:
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler