In [33]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

# Purpose of this cell: fit a RandomForest on a synthetic dataset, predict a
# risk category for every hospital in the CSV, save the hospitals predicted
# "Low", and display the one with the lowest death rate among them.

# NOTE(review): absolute, machine-specific paths; consider a configurable data directory.
file_path = r'C:\Users\gyala\allmynotes\cleaned_file2.csv'
output_file = r'C:\Users\gyala\allmynotes\best_hospitals_low.csv'

# Upper bound of the uniform range used to synthesize each feature.
# The keys double as the feature column names expected in the CSV, so the
# column list is defined exactly once instead of being copy-pasted.
FEATURE_UPPER_BOUNDS = {
    'Rate - Serious Complications': 2,
    'Rate - Accidental cuts and tears from medical treatment': 5,
    'Rate - Death from serious treatable complications after surgery': 10,
    'Rate - Collapsed lung due to medical treatment': 5,
    'Rate - Serious blood clots after surgery': 3,
    'Rate - A wound that splits open after surgery': 4,
}
FEATURE_COLUMNS = list(FEATURE_UPPER_BOUNDS)
DEATH_RATE_COLUMN = 'Rate - Death from serious treatable complications after surgery'

# Load the test data from the specified file path
test_data = pd.read_csv(file_path)

# Display the first few rows of the test data
print("Test Data Preview:")
print(test_data.head())

# Generate an expanded synthetic dataset with diverse feature ranges.
# The draws happen in the same order as the dict above, so the seeded random
# stream (and therefore the fitted model) is unchanged.
# NOTE(review): Target_Category is drawn at random, independently of the
# features, so the model cannot learn a real relationship; the "Low"
# predictions below are effectively arbitrary until real labels are used.
np.random.seed(42)
synthetic_columns = {
    column: np.random.uniform(0, upper, size=300)
    for column, upper in FEATURE_UPPER_BOUNDS.items()
}
synthetic_columns['Target_Category'] = np.random.choice(['Low', 'Medium', 'High'], size=300)
expanded_data = pd.DataFrame(synthetic_columns)

# Extract features and target from the expanded synthetic dataset
X_enhanced = expanded_data[FEATURE_COLUMNS]
y_enhanced = expanded_data['Target_Category']

# Train the classifier using the enhanced dataset
clf_model = RandomForestClassifier(random_state=42)
clf_model.fit(X_enhanced, y_enhanced)

# Prepare test data for prediction: coerce the feature columns to numbers and
# drop every row where any of them could not be parsed.
test_features = test_data[FEATURE_COLUMNS]
test_features_cleaned = test_features.apply(pd.to_numeric, errors='coerce').dropna()

# Fail with a clear message instead of an opaque estimator error.
if test_features_cleaned.empty:
    raise ValueError(
        "No rows in the test data have numeric values for all feature columns."
    )

# Predict categories for the test data
predicted_categories = clf_model.predict(test_features_cleaned)

# Add predictions to the test data (only rows that survived cleaning)
test_data_cleaned = test_data.loc[test_features_cleaned.index].copy()
# Carry the numeric values over as well. Without this the feature columns keep
# whatever dtype the CSV produced; if that is text, the ranking below would
# sort lexicographically (e.g. '10.2' before '9.1') and pick the wrong row.
test_data_cleaned[FEATURE_COLUMNS] = test_features_cleaned
test_data_cleaned['Predicted Category'] = predicted_categories

# Filter hospitals predicted to have "Low" serious complications
best_hospitals = test_data_cleaned[test_data_cleaned['Predicted Category'] == 'Low']

# Save the results to a CSV file for further review
best_hospitals.to_csv(output_file, index=False)
print(f"\nBest hospitals data has been saved to {output_file}")

# Rank hospitals within the "Low" category by death rate, lowest first
low_risk_hospitals_sorted = best_hospitals.sort_values(
    by=DEATH_RATE_COLUMN, ascending=True
)

# Select and display the best hospital. Guard the empty case: `.iloc[0]` on an
# empty frame raises IndexError when no hospital was predicted "Low".
if low_risk_hospitals_sorted.empty:
    best_hospital = None
    print("No hospitals were predicted as 'Low'; no best hospital to display.")
else:
    best_hospital = low_risk_hospitals_sorted.iloc[0]
    print("Best Hospital Based on Predictions:")
    print(best_hospital[['Hospital Name', 'City', 'State'] + FEATURE_COLUMNS])


Test Data Preview:
   Provider Number                     Hospital Name  \
0            10001  SOUTHEAST ALABAMA MEDICAL CENTER   
1            10005     MARSHALL MEDICAL CENTER SOUTH   
2            10006    ELIZA COFFEE MEMORIAL HOSPITAL   
3            10007          MIZELL MEMORIAL HOSPITAL   
4            10008       CRENSHAW COMMUNITY HOSPITAL   

                    Address 1  Address 2  Address 3      City State  ZIP Code  \
0      1108 ROSS CLARK CIRCLE        NaN        NaN    DOTHAN    AL     36301   
1  2505 U S HIGHWAY 431 NORTH        NaN        NaN      BOAZ    AL     35957   
2          205 MARENGO STREET        NaN        NaN  FLORENCE    AL     35631   
3               702 N MAIN ST        NaN        NaN       OPP    AL     36467   
4         101 HOSPITAL CIRCLE        NaN        NaN   LUVERNE    AL     36049   

  County Name  Phone Number  ...  \
0     HOUSTON    3347938701  ...   
1    MARSHALL    2565938310  ...   
2  LAUDERDALE    2567688400  ...   
3   COVINGTON

In [37]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# Hold-out evaluation of the RandomForest on the synthetic dataset:
# build the data, split 70/30, fit on the training part, score on the rest.

# Feature name -> upper bound of the uniform range it is sampled from.
FEATURE_UPPER_BOUNDS = {
    'Rate - Serious Complications': 2,
    'Rate - Accidental cuts and tears from medical treatment': 5,
    'Rate - Death from serious treatable complications after surgery': 10,
    'Rate - Collapsed lung due to medical treatment': 5,
    'Rate - Serious blood clots after surgery': 3,
    'Rate - A wound that splits open after surgery': 4,
}

# Seed first, then draw the feature columns in dict order followed by the
# labels, so the random stream is consumed in a fixed sequence.
np.random.seed(42)
synthetic_columns = {
    column: np.random.uniform(0, upper, size=300)
    for column, upper in FEATURE_UPPER_BOUNDS.items()
}
synthetic_columns['Target_Category'] = np.random.choice(['Low', 'Medium', 'High'], size=300)
expanded_data = pd.DataFrame(synthetic_columns)

# Feature matrix and label vector
X = expanded_data[list(FEATURE_UPPER_BOUNDS)]
y = expanded_data['Target_Category']

# 70% of the rows for fitting, 30% held out for scoring
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Fit on the training rows only, then predict the held-out rows
clf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred = clf_model.predict(X_test)

# Score the held-out predictions; the three per-class metrics are
# support-weighted averages across the classes.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Report
print("Model Evaluation Metrics:")
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision (Weighted): {precision:.2f}")
print(f"Recall (Weighted): {recall:.2f}")
print(f"F1 Score (Weighted): {f1:.2f}")
print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)


Model Evaluation Metrics:
Accuracy: 0.38
Precision (Weighted): 0.37
Recall (Weighted): 0.38
F1 Score (Weighted): 0.37

Confusion Matrix:
[[ 7 12 12]
 [ 8 10 11]
 [ 6  7 17]]

Classification Report:
              precision    recall  f1-score   support

        High       0.33      0.23      0.27        31
         Low       0.34      0.34      0.34        29
      Medium       0.42      0.57      0.49        30

    accuracy                           0.38        90
   macro avg       0.37      0.38      0.37        90
weighted avg       0.37      0.38      0.37        90



In [35]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score, KFold

from sklearn.ensemble import RandomForestClassifier

# Purpose of this cell: report how well a RandomForest reproduces the labels it
# was fitted on (resubstitution metrics) and contrast that with 5-fold
# cross-validated accuracy on the same data.
#
# Requires `X_enhanced` and `y_enhanced` from the cell that builds the enhanced
# synthetic dataset.

# Fit a dedicated model here instead of scoring the notebook-global `clf_model`.
# The train/test-split evaluation cell rebinds `clf_model` to a model fitted on
# only the 70% training split, so which model this cell scored used to depend
# on the order the cells were executed in.
full_fit_model = RandomForestClassifier(random_state=42)
full_fit_model.fit(X_enhanced, y_enhanced)

# Evaluate the model on the enhanced dataset.
# These are training-set predictions: the model has already seen every row, so
# the metrics below measure memorization, not generalization.
y_pred_train = full_fit_model.predict(X_enhanced)

# Classification metrics
accuracy = accuracy_score(y_enhanced, y_pred_train)
precision = precision_score(y_enhanced, y_pred_train, average='weighted')
recall = recall_score(y_enhanced, y_pred_train, average='weighted')
f1 = f1_score(y_enhanced, y_pred_train, average='weighted')

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")

# Confusion matrix and classification report
conf_matrix = confusion_matrix(y_enhanced, y_pred_train)
class_report = classification_report(y_enhanced, y_pred_train)

print("\nConfusion Matrix:")
print(conf_matrix)

print("\nClassification Report:")
print(class_report)

# Cross-validation: each fold is scored on rows the model was not fitted on, so
# this is the figure to compare against the training-set metrics above.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(full_fit_model, X_enhanced, y_enhanced, cv=kf, scoring='accuracy')
print(f"\nCross-Validation Accuracy: {cv_scores.mean():.2f} ± {cv_scores.std():.2f}")


Accuracy: 1.00
Precision: 1.00
Recall: 1.00
F1-Score: 1.00

Confusion Matrix:
[[ 96   0   0]
 [  0  94   0]
 [  0   0 110]]

Classification Report:
              precision    recall  f1-score   support

        High       1.00      1.00      1.00        96
         Low       1.00      1.00      1.00        94
      Medium       1.00      1.00      1.00       110

    accuracy                           1.00       300
   macro avg       1.00      1.00      1.00       300
weighted avg       1.00      1.00      1.00       300


Cross-Validation Accuracy: 0.32 ± 0.04
