In [56]:
import pandas as pd
import numpy as np
import pickle
import warnings
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC
from scipy.stats import uniform, randint
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [57]:
warnings.filterwarnings('ignore')

In [58]:
df=pd.read_csv('/kaggle/input/osteoporosis/osteoporosis.csv')

In [59]:
df.head()

Unnamed: 0,Id,Age,Gender,Hormonal Changes,Family History,Race/Ethnicity,Body Weight,Calcium Intake,Vitamin D Intake,Physical Activity,Smoking,Alcohol Consumption,Medical Conditions,Medications,Prior Fractures,Osteoporosis
0,1734616,69,Female,Normal,Yes,Asian,Underweight,Low,Sufficient,Sedentary,Yes,Moderate,Rheumatoid Arthritis,Corticosteroids,Yes,1
1,1419098,32,Female,Normal,Yes,Asian,Underweight,Low,Sufficient,Sedentary,No,,,,Yes,1
2,1797916,89,Female,Postmenopausal,No,Caucasian,Normal,Adequate,Sufficient,Active,No,Moderate,Hyperthyroidism,Corticosteroids,No,1
3,1805337,78,Female,Normal,No,Caucasian,Underweight,Adequate,Insufficient,Sedentary,Yes,,Rheumatoid Arthritis,Corticosteroids,No,1
4,1351334,38,Male,Postmenopausal,Yes,African American,Normal,Low,Sufficient,Active,Yes,,Rheumatoid Arthritis,,Yes,1


In [60]:
df.shape

(1958, 16)

In [61]:
df.isnull().sum()

Id                       0
Age                      0
Gender                   0
Hormonal Changes         0
Family History           0
Race/Ethnicity           0
Body Weight              0
Calcium Intake           0
Vitamin D Intake         0
Physical Activity        0
Smoking                  0
Alcohol Consumption    988
Medical Conditions     647
Medications            985
Prior Fractures          0
Osteoporosis             0
dtype: int64

In [62]:
AlcoholConsumptionMode= df['Alcohol Consumption'].mode()[0]
df['Alcohol Consumption'].fillna(AlcoholConsumptionMode, inplace=True)

MedicalConditionsMode= df['Medical Conditions'].mode()[0]
df['Medical Conditions'].fillna(MedicalConditionsMode, inplace=True)

MedicationsMode= df['Medications'].mode()[0]
df['Medications'].fillna(MedicationsMode, inplace=True)

In [63]:
df.isnull().sum()

Id                     0
Age                    0
Gender                 0
Hormonal Changes       0
Family History         0
Race/Ethnicity         0
Body Weight            0
Calcium Intake         0
Vitamin D Intake       0
Physical Activity      0
Smoking                0
Alcohol Consumption    0
Medical Conditions     0
Medications            0
Prior Fractures        0
Osteoporosis           0
dtype: int64

In [64]:
X = df.drop('Osteoporosis', axis=1)
y = df['Osteoporosis']

In [65]:
X.shape, y.shape

((1958, 15), (1958,))

In [66]:
categorical_features = ['Gender','Hormonal Changes', 'Family History', 'Race/Ethnicity', 'Body Weight', 'Calcium Intake', 'Vitamin D Intake', 'Physical Activity', 'Smoking', 'Alcohol Consumption', 'Medical Conditions','Medications', 'Prior Fractures']
numerical_features = ['Id', 'Age']

In [67]:
X_encoded = pd.get_dummies(X, columns=categorical_features, drop_first=True)
scaler = StandardScaler()
X_encoded[numerical_features] = scaler.fit_transform(X_encoded[numerical_features])

In [68]:
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=1)

In [69]:
X_encoded

Unnamed: 0,Id,Age,Gender_Male,Hormonal Changes_Postmenopausal,Family History_Yes,Race/Ethnicity_Asian,Race/Ethnicity_Caucasian,Body Weight_Underweight,Calcium Intake_Low,Vitamin D Intake_Sufficient,Physical Activity_Sedentary,Smoking_Yes,Medical Conditions_Rheumatoid Arthritis,Prior Fractures_Yes
0,0.706645,1.400418,False,False,True,True,False,True,True,True,True,True,True,True
1,-0.500580,-0.332606,False,False,True,True,False,True,True,True,True,False,False,True
2,0.948841,2.337187,False,True,False,False,True,False,False,True,False,False,False,False
3,0.977235,1.821964,False,False,False,False,True,True,False,False,True,True,True,False
4,-0.759856,-0.051575,True,True,True,False,False,False,True,True,False,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1953,1.690042,-0.941506,False,False,True,False,False,False,False,True,True,True,True,True
1954,-0.081603,-0.754152,False,True,True,False,True,True,True,False,False,False,False,False
1955,1.687448,-0.238929,False,True,False,False,False,True,True,True,True,False,False,False
1956,0.879710,-0.660475,True,True,False,False,False,False,True,False,True,True,True,True


In [70]:
X_encoded.iloc[:, 2:] = X_encoded.iloc[:, 2:].astype(int)

In [71]:
X_encoded

Unnamed: 0,Id,Age,Gender_Male,Hormonal Changes_Postmenopausal,Family History_Yes,Race/Ethnicity_Asian,Race/Ethnicity_Caucasian,Body Weight_Underweight,Calcium Intake_Low,Vitamin D Intake_Sufficient,Physical Activity_Sedentary,Smoking_Yes,Medical Conditions_Rheumatoid Arthritis,Prior Fractures_Yes
0,0.706645,1.400418,0,0,1,1,0,1,1,1,1,1,1,1
1,-0.500580,-0.332606,0,0,1,1,0,1,1,1,1,0,0,1
2,0.948841,2.337187,0,1,0,0,1,0,0,1,0,0,0,0
3,0.977235,1.821964,0,0,0,0,1,1,0,0,1,1,1,0
4,-0.759856,-0.051575,1,1,1,0,0,0,1,1,0,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1953,1.690042,-0.941506,0,0,1,0,0,0,0,1,1,1,1,1
1954,-0.081603,-0.754152,0,1,1,0,1,1,1,0,0,0,0,0
1955,1.687448,-0.238929,0,1,0,0,0,1,1,1,1,0,0,0
1956,0.879710,-0.660475,1,1,0,0,0,0,1,0,1,1,1,1


In [72]:
y

0       1
1       1
2       1
3       1
4       1
       ..
1953    0
1954    0
1955    0
1956    0
1957    0
Name: Osteoporosis, Length: 1958, dtype: int64

In [73]:
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=1)

models = [
    {
        'model': Pipeline([
            ('scaler', StandardScaler()),
            ('classifier', LogisticRegression(max_iter=1000))
        ]),
        'params': {
            'classifier__C': uniform(0.01, 100),
            'classifier__solver': ['lbfgs', 'liblinear']
        }
    },
    {
        'model': Pipeline([
            ('classifier', RandomForestClassifier())
        ]),
        'params': {
            'classifier__n_estimators': randint(50, 200),
            'classifier__max_depth': [None, 10, 20, 30],
            'classifier__min_samples_split': randint(2, 10),
            'classifier__min_samples_leaf': randint(1, 4)
        }
    },
    {
        'model': Pipeline([
            ('scaler', StandardScaler()),
            ('classifier', SVC())
        ]),
        'params': {
            'classifier__C': uniform(0.01, 100),
            'classifier__kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
            'classifier__gamma': ['scale', 'auto']
        }
    },
    {
        'model': Pipeline([
            ('scaler', StandardScaler()),
            ('classifier', GradientBoostingClassifier())
        ]),
        'params': {
            'classifier__n_estimators': randint(50, 200),
            'classifier__learning_rate': uniform(0.01, 0.1),
            'classifier__max_depth': randint(3, 10),
            'classifier__min_samples_split': randint(2, 10),
            'classifier__min_samples_leaf': randint(1, 4)
        }
    }
]

best_models = []
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
for item in models:
    random_search = RandomizedSearchCV(item['model'], item['params'], cv=cv, scoring='accuracy', n_jobs=-1, n_iter=50, random_state=1)
    random_search.fit(X_train, y_train)
    best_models.append((random_search.best_estimator_, random_search.best_params_, random_search.best_score_))

best_model, best_params, best_score = max(best_models, key=lambda x: x[2])

print("Best Model:", best_model)
print("Best Parameters:", best_params)
print("Best Cross-validation Score:", best_score)

Best Model: Pipeline(steps=[('scaler', StandardScaler()),
                ('classifier',
                 GradientBoostingClassifier(learning_rate=0.07788355329398909,
                                            min_samples_split=4,
                                            n_estimators=60))])
Best Parameters: {'classifier__learning_rate': 0.07788355329398909, 'classifier__max_depth': 3, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 4, 'classifier__n_estimators': 60}
Best Cross-validation Score: 0.9086852119411489


In [75]:
best_model.fit(X_train, y_train)

In [76]:
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)

In [77]:
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

In [78]:
train_accuracy, test_accuracy

(0.9144316730523627, 0.9132653061224489)

In [79]:
with open('Osteoporosis.pkl', 'wb') as file:
    pickle.dump(best_model, file)