In [2]:
import pandas as pd

df = pd.read_excel('Healthcare_dataset.xlsx', sheet_name=1)


In [3]:
# Display basic information about the dataset
print(df.info())

# Display the first few rows of the dataset to understand its structure
print(df.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3424 entries, 0 to 3423
Data columns (total 69 columns):
 #   Column                                                              Non-Null Count  Dtype 
---  ------                                                              --------------  ----- 
 0   Ptid                                                                3424 non-null   object
 1   Persistency_Flag                                                    3424 non-null   object
 2   Gender                                                              3424 non-null   object
 3   Race                                                                3424 non-null   object
 4   Ethnicity                                                           3424 non-null   object
 5   Region                                                              3424 non-null   object
 6   Age_Bucket                                                          3424 non-null   object
 7   Ntm_Speciality          

In [4]:
# Check for missing values
missing_values = df.isnull().sum()
print(missing_values)


Ptid                              0
Persistency_Flag                  0
Gender                            0
Race                              0
Ethnicity                         0
                                 ..
Risk_Hysterectomy_Oophorectomy    0
Risk_Estrogen_Deficiency          0
Risk_Immobilization               0
Risk_Recurring_Falls              0
Count_Of_Risks                    0
Length: 69, dtype: int64


In [8]:
from sklearn.preprocessing import OneHotEncoder

# Define the features (X) and target variable (y)
X = df.drop(columns=['Persistency_Flag', 'Ptid'])  # Exclude target variable and patient ID
y = df['Persistency_Flag']

# Identify categorical columns (excluding the target variable)
categorical_columns = X.select_dtypes(include=['object']).columns.tolist()

# Apply one-hot encoding
encoder = OneHotEncoder(sparse_output=False)
encoded_data = encoder.fit_transform(X[categorical_columns])
encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(categorical_columns))

# Concatenate with the original dataframe (excluding the original categorical columns)
X_encoded = pd.concat([X.drop(categorical_columns, axis=1), encoded_df], axis=1)



In [9]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets (70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.3, random_state=42)


In [10]:
from sklearn.ensemble import RandomForestClassifier

# Initialize the RandomForestClassifier
clf = RandomForestClassifier(random_state=42)

# Train the model on the training data
clf.fit(X_train, y_train)


In [11]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label='Persistent')  # Adjust as needed
recall = recall_score(y_test, y_pred, pos_label='Persistent')        # Adjust as needed
roc_auc = roc_auc_score(pd.get_dummies(y_test), pd.get_dummies(y_pred))

# Print the metrics
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'ROC-AUC: {roc_auc}')


Accuracy: 0.8044747081712063
Precision: 0.7566765578635015
Recall: 0.6818181818181818
ROC-AUC: 0.77821795941062


In [12]:
from sklearn.model_selection import GridSearchCV

# Define a set of parameters to test
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees in the forest
    'max_depth': [10, 20, 30],        # Maximum depth of the tree
    # Add more parameters here as needed
}

# Initialize the GridSearchCV
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=3, scoring='accuracy')

# Perform the grid search on the training data
grid_search.fit(X_train, y_train)

# Print the best parameters
print(f"Best parameters: {grid_search.best_params_}")


Best parameters: {'max_depth': 20, 'n_estimators': 200}
