In [2]:
# Import required libraries
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler



In [3]:
# Read ortho data
file_path = "https://static.bc-edx.com/ai/ail-v-1-0/m13/lesson_3/datasets/vertebral-column.csv"
df_ortho = pd.read_csv(file_path)
df_ortho.head()


Unnamed: 0,pelvic_incidence,pelvic_tilt,lumbar_lordosis_angle,sacral_slope,pelvic_radius,degree_spondylolisthesis,class
0,63.027817,22.552586,39.609117,40.475232,98.672917,-0.2544,Hernia
1,39.056951,10.060991,25.015378,28.99596,114.405425,4.564259,Hernia
2,68.832021,22.218482,50.092194,46.613539,105.985135,-3.530317,Hernia
3,69.297008,24.652878,44.311238,44.64413,101.868495,11.211523,Hernia
4,49.712859,9.652075,28.317406,40.060784,108.168725,7.918501,Hernia


In [7]:
df_ortho.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 310 entries, 0 to 309
Data columns (total 7 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   pelvic_incidence          310 non-null    float64
 1   pelvic_tilt               310 non-null    float64
 2   lumbar_lordosis_angle     310 non-null    float64
 3   sacral_slope              310 non-null    float64
 4   pelvic_radius             310 non-null    float64
 5   degree_spondylolisthesis  310 non-null    float64
 6   class                     310 non-null    object 
dtypes: float64(6), object(1)
memory usage: 17.1+ KB


In [6]:
unique_classes = df_ortho['class'].unique()

# Print unique class labels
print("Unique class labels:", unique_classes)

Unique class labels: ['Hernia' 'Spondylolisthesis' 'Normal']


In [8]:
# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Encode the target variable ('class')
df_ortho['class'] = label_encoder.fit_transform(df_ortho['class'])

# Print encoded class mapping
class_mapping = {label: idx for idx, label in enumerate(label_encoder.classes_)}
print("Class Mapping:", class_mapping)

Class Mapping: {'Hernia': 0, 'Normal': 1, 'Spondylolisthesis': 2}


In [9]:
# Split into features (X) and target (y)
X = df_ortho.drop(columns=['class'])  # Features
y = df_ortho['class']  # Target

In [10]:
# Check the balance of the labels variable (`y`) by using the `value_counts` function.
label_distribution = y.value_counts()
print(label_distribution)

class
2    150
1    100
0     60
Name: count, dtype: int64


In [11]:
# Perform train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
X.sample(5)
# Split the data into X_train, X_test, y_train, y_test
# Using a 20% test size as that seems to be a more standard split, 
# also stratifying the data to ensure that the distribution of the 
# labels is the same in both the training and testing sets.

Unnamed: 0,pelvic_incidence,pelvic_tilt,lumbar_lordosis_angle,sacral_slope,pelvic_radius,degree_spondylolisthesis
18,38.697912,13.444749,31.0,25.253163,123.159251,1.429186
306,53.936748,20.721496,29.220534,33.215251,114.365845,-0.42101
210,38.505273,16.964297,35.112814,21.540976,127.632875,7.986683
60,74.377678,32.053104,78.772013,42.324573,143.56069,56.125906
255,67.538182,14.655042,58.001429,52.883139,123.63226,25.970206


In [12]:
# Feature scaling (StandardScaler)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)
y_train.value_counts()

class
2    119
1     80
0     49
Name: count, dtype: int64

In [13]:
y_test.value_counts()

class
2    31
1    20
0    11
Name: count, dtype: int64

In [14]:
# Scale the training data
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X.columns)
X_train_scaled_df.head()

Unnamed: 0,pelvic_incidence,pelvic_tilt,lumbar_lordosis_angle,sacral_slope,pelvic_radius,degree_spondylolisthesis
0,0.066767,0.550966,-0.315815,-0.315612,0.592944,-0.748397
1,0.159351,0.536519,-0.669224,-0.188798,-1.580149,-0.68669
2,0.545704,0.391437,0.03948,0.401885,-0.120824,0.698388
3,1.602732,0.314612,2.250881,1.785829,0.21255,1.249534
4,-0.923789,-0.794671,-0.001811,-0.584589,1.320672,0.051758


In [15]:
# Create and fit a Logistic Regression model
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression(max_iter=1000, random_state=1)  # Increase max_iter to ensure convergence
log_reg.fit(X_train_scaled, y_train)
LogisticRegression(max_iter=1000, random_state=1)

#log_reg_predictions_df

In [16]:
# Make and save testing predictions with the trained Logistic Regression model using the test data
y_pred_log_reg = log_reg.predict(X_test_scaled)
log_reg_predictions_df = pd.DataFrame({"Actual": y_test, "Predicted": y_pred_log_reg})

In [17]:
# Review the predictions
y_pred_log_reg

array([2, 0, 2, 2, 2, 2, 1, 2, 2, 1, 1, 1, 0, 1, 1, 2, 0, 1, 1, 2, 1, 1,
       0, 2, 0, 2, 2, 2, 1, 2, 2, 1, 1, 0, 0, 2, 1, 0, 2, 2, 1, 1, 2, 1,
       2, 2, 1, 0, 2, 1, 1, 1, 1, 2, 2, 0, 2, 2, 0, 2, 1, 2])

In [18]:
# Make and save testing predictions with the trained Logistic Regression model using the test data
log_reg_predictions = log_reg.predict(X_test_scaled)
#log_reg_predictions_df = pd.DataFrame({"Actual": y_test, "Predicted": y_pred_log_reg})

# Review the predictions
log_reg_predictions

array([2, 0, 2, 2, 2, 2, 1, 2, 2, 1, 1, 1, 0, 1, 1, 2, 0, 1, 1, 2, 1, 1,
       0, 2, 0, 2, 2, 2, 1, 2, 2, 1, 1, 0, 0, 2, 1, 0, 2, 2, 1, 1, 2, 1,
       2, 2, 1, 0, 2, 1, 1, 1, 1, 2, 2, 0, 2, 2, 0, 2, 1, 2])

In [19]:
print(df_ortho['class'].unique())

[0 2 1]


In [20]:
from sklearn.metrics import accuracy_score
# Calculate the accuracy score by evaluating `y_test` vs. `testing_predictions`.
accuracy = accuracy_score(y_test, y_pred_log_reg)
print(f"Accuracy Score: {accuracy:.10f}")

Accuracy Score: 0.8225806452


Random Forest Model

In [21]:
# Create and fit a Random Forest Classifier model
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(n_estimators=100, random_state=1)
rf_clf.fit(X_train_scaled, y_train)
RandomForestClassifier(random_state=1)



In [26]:
# Make and save testing predictions with the trained Random Forest Classifier model using the test data
y_pred_rf = rf_clf.predict(X_test_scaled)
rf_predictions_df = pd.DataFrame({"Actual": y_test, "Predicted": y_pred_rf})


In [27]:
# Review the predictions
y_pred_rf

array([2, 0, 2, 2, 2, 2, 1, 2, 2, 1, 1, 1, 0, 1, 1, 2, 0, 1, 1, 2, 2, 1,
       0, 1, 0, 2, 2, 2, 2, 2, 2, 1, 1, 0, 0, 2, 1, 0, 2, 2, 1, 1, 2, 1,
       2, 2, 1, 0, 2, 1, 1, 1, 0, 2, 2, 0, 2, 2, 0, 2, 1, 2])

In [28]:
# Calculate the accuracy score by evaluating `y_test` vs. `testing_predictions`.
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Accuracy Score: {accuracy_rf:.10f}")

Accuracy Score: 0.8548387097


In [29]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

In [30]:
# Create and train the SVM classifier with an RBF kernel
svm_clf = SVC(kernel='rbf', C=1.0, gamma='scale', random_state=1)
svm_clf.fit(X_train_scaled, y_train)

In [31]:
# Make predictions on the test set
y_pred_svm = svm_clf.predict(X_test_scaled)

In [32]:
# Evaluate model performance
svm_accuracy = accuracy_score(y_test, y_pred_svm)
svm_report = classification_report(y_test, y_pred_svm)

In [33]:
print(f"SVM Accuracy: {svm_accuracy:.4f}")
print("\nClassification Report:\n", svm_report)

SVM Accuracy: 0.8065

Classification Report:
               precision    recall  f1-score   support

           0       0.58      0.64      0.61        11
           1       0.70      0.80      0.74        20
           2       1.00      0.87      0.93        31

    accuracy                           0.81        62
   macro avg       0.76      0.77      0.76        62
weighted avg       0.83      0.81      0.81        62



In [34]:
# Overfitting & Underfitting Discussion
train_accuracy = accuracy_score(y_train, svm_clf.predict(X_train_scaled))
test_accuracy = accuracy_score(y_test, y_pred_svm)

In [35]:
diff = train_accuracy - test_accuracy

print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Testing Accuracy: {test_accuracy:.4f}")
print(f"Accuracy Difference (Train - Test): {diff:.4f}")

Training Accuracy: 0.8468
Testing Accuracy: 0.8065
Accuracy Difference (Train - Test): 0.0403


In [36]:
if diff > 0.1:
    print("Potential Overfitting: Model performs significantly better on training data than on test data.")
elif test_accuracy < 0.7:
    print("Potential Underfitting: Model may not be capturing enough patterns from the data.")
else:
    print("Model seems well-balanced with training and test performance.")

Model seems well-balanced with training and test performance.


In [37]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf = RandomForestClassifier(random_state=1)
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

best_rf = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

train_acc = best_rf.score(X_train_scaled, y_train)
test_acc = best_rf.score(X_test_scaled, y_test)
print(f"Training Accuracy: {train_acc:.2%}, Testing Accuracy: {test_acc:.2%}")


Best Parameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 100}
Training Accuracy: 95.56%, Testing Accuracy: 85.48%


In [38]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=1000, random_state=1)
mlp.fit(X_train_scaled, y_train)

print("Training Accuracy:", mlp.score(X_train_scaled, y_train))
print("Testing Accuracy:", mlp.score(X_test_scaled, y_test))


Training Accuracy: 0.9919354838709677
Testing Accuracy: 0.7903225806451613
