# In [19]:
import pandas as pd
import joblib

# Columns that are one-hot encoded with pd.get_dummies before prediction.
categorical_columns = ['gender', 'dialysisrenalendstage', 'asthma', 'irondef', 'pneum',
                       'substancedependence', 'psychologicaldisordermajor', 'depress', 'psychother',
                       'fibrosisandother', 'malnutrition']

# Columns that are passed through the previously fitted scaler.
numerical_columns = ['rcount', 'hemo', 'hematocrit', 'neutrophils', 'sodium',
                     'glucose', 'bloodureanitro', 'creatinine', 'bmi', 'pulse', 'respiration']


def normalize_gender(gender):
    """Return *gender* with the string codes '0.0' and '1.0' mapped to 'F' and 'M'.

    Any other value (including missing values) is left unchanged.

    Args:
        gender: pandas Series holding the raw gender column read as strings.

    Returns:
        A new Series; the input is not modified.
    """
    return gender.replace({'0.0': 'F', '1.0': 'M'})


def align_features(frame, feature_names, encoded_columns, id_column='id'):
    """Return *frame* restricted to ``[id_column] + feature_names``, in that order.

    One-hot columns can be absent from the test data when a category level
    never occurs in it. Such columns (recognised by an ``"<column>_"`` prefix
    for a column in *encoded_columns*) are added and filled with 0. Any other
    missing feature cannot be reconstructed and is reported as an error.
    Columns in *frame* that the model was not trained on are dropped.

    Args:
        frame: DataFrame after one-hot encoding and scaling.
        feature_names: Feature names in the order the model expects them.
        encoded_columns: Names of the columns that were one-hot encoded.
        id_column: Name of the identifier column carried along unchanged.

    Returns:
        A new DataFrame with columns ``[id_column] + list(feature_names)``.

    Raises:
        KeyError: If *id_column* or a non one-hot feature is missing.
    """
    feature_names = list(feature_names)
    if id_column not in frame.columns:
        raise KeyError(f"Test data has no '{id_column}' column")

    # Exclude the id column on both sides so it is never reported as extra.
    present = set(frame.columns) - {id_column}
    missing = [name for name in feature_names if name not in present]
    extra = sorted(present - set(feature_names))
    if missing or extra:
        print("Mismatch in feature columns:")
        print("Missing in test data:", missing)
        print("Additional in test data:", extra)

    dummy_prefixes = tuple(f"{column}_" for column in encoded_columns)
    unrecoverable = [name for name in missing if not name.startswith(dummy_prefixes)]
    if unrecoverable:
        raise KeyError(
            f"Test data is missing required feature columns: {unrecoverable}"
        )

    # reindex drops the extra columns, orders the rest and creates the
    # missing one-hot columns filled with 0 (category level not present).
    return frame.reindex(columns=[id_column] + feature_names, fill_value=0)


if __name__ == "__main__":
    print("Loading model and scaler...")
    # NOTE: joblib.load unpickles the files, which can execute arbitrary
    # code. Only load model/scaler files that come from a trusted source.
    model = joblib.load('random_forest_model.pkl')
    scaler = joblib.load('scaler.pkl')

    print("Loading test data...")
    # Read gender as text so the codes '0.0'/'1.0' can be mapped to 'F'/'M'.
    test_data = pd.read_csv('test_data.csv', dtype={'gender': str})

    # Preprocess test data: forward-fill missing values.
    # (fillna(method='ffill') is deprecated in pandas 2.x; ffill() is the
    # supported equivalent.)
    test_data = test_data.ffill()
    test_data['gender'] = normalize_gender(test_data['gender'])

    # Drop the 'facid' column if present in test_data
    if 'facid' in test_data.columns:
        test_data = test_data.drop(columns='facid')

    # One-hot encode categorical columns
    test_data = pd.get_dummies(test_data, columns=categorical_columns)

    # Scale the numerical columns
    test_data[numerical_columns] = scaler.transform(test_data[numerical_columns])

    # Save the original 'id' column
    original_id = test_data['id'].copy()

    # Load the model's feature names; without them the columns cannot be
    # aligned, so fail here instead of with a NameError further down.
    try:
        trained_feature_names = model.feature_names_in_
    except AttributeError as err:
        raise AttributeError(
            "The model object doesn't have feature names attribute. "
            "Please check if you saved the feature names during training."
        ) from err
    print("Feature Columns used in Training:")
    print(trained_feature_names)

    # Compare feature columns and reorder them to match the training order
    test_data = align_features(test_data, trained_feature_names, categorical_columns)

    # Make predictions using only the feature columns
    length_of_stay_predictions = model.predict(test_data[list(trained_feature_names)])

    # Pair each prediction with the id of the row it was computed from
    result_df = pd.DataFrame({
        'id': original_id,
        'lengthofstay': length_of_stay_predictions
    })

    # Save the results to a CSV file
    result_df.to_csv('length_of_stay_predictions3.csv', index=False)

    print("Finished")

# Output:
# Loading model and scaler...
# Loading test data...
# Feature Columns used in Training:
# ['asthma_False' 'asthma_True' 'bloodureanitro' 'bmi' 'creatinine'
#  'depress_False' 'depress_True' 'dialysisrenalendstage_False'
#  'dialysisrenalendstage_True' 'fibrosisandother_False'
#  'fibrosisandother_True' 'gender_F' 'gender_M' 'glucose' 'hematocrit'
#  'hemo' 'irondef_False' 'irondef_True' 'malnutrition_False'
#  'malnutrition_True' 'neutrophils' 'pneum_False' 'pneum_True'
#  'psychologicaldisordermajor_False' 'psychologicaldisordermajor_True'
#  'psychother_False' 'psychother_True' 'pulse' 'rcount' 'respiration'
#  'secondarydiagnosisnonicd9' 'sodium' 'substancedependence_False'
#  'substancedependence_True']
# Finished
