In [1]:
import pandas as pd
import joblib

# Load the fitted model and scaler from disk. Both files must already exist;
# joblib.load raises FileNotFoundError otherwise.
model = joblib.load('random_forest_model.pkl')
scaler = joblib.load('scaler.pkl')

# Column definitions
categorical_columns = ['gender', 'dialysisrenalendstage', 'asthma', 'irondef', 'pneum', 
                       'substancedependence', 'psychologicaldisordermajor', 'depress', 'psychother',
                       'fibrosisandother', 'malnutrition']

numerical_columns = ['rcount', 'hemo', 'hematocrit', 'neutrophils', 'sodium',
                     'glucose', 'bloodureanitro', 'creatinine', 'bmi', 'pulse', 'respiration']

# Read 'gender' as text so numeric codes arrive as the strings '0.0' / '1.0'
test_data = pd.read_csv('test_data.csv', dtype={'gender': str})

# Forward-fill missing values. DataFrame.ffill() replaces the deprecated
# fillna(method='ffill') spelling and avoids in-place mutation.
test_data = test_data.ffill()

# The test data used a double as the 'gender' value, so it is mapped to the
# letters used by the model. Codes that are not present are simply left alone,
# so no membership check is needed before replacing.
test_data['gender'] = test_data['gender'].replace({'0.0': 'F', '1.0': 'M'})

# The model was built without 'facid', so drop it when the test data has it
test_data = test_data.drop(columns='facid', errors='ignore')

# One-hot encode categorical columns
test_data = pd.get_dummies(test_data, columns=categorical_columns)

# Scaling with the already-fitted scaler
test_data[numerical_columns] = scaler.transform(test_data[numerical_columns])

# Save the original 'id' column
original_id = test_data['id'].copy()

# Load the model's feature names. Fail loudly with a useful message instead of
# printing an empty string and crashing later with an unrelated NameError.
try:
    trained_feature_names = list(model.feature_names_in_)
except AttributeError as exc:
    raise RuntimeError(
        "The loaded model does not expose 'feature_names_in_'; "
        "cannot align the test data columns with the training columns."
    ) from exc

# Reorder the columns to match the model. One-hot columns for categories that
# never occur in the test data are missing at this point; reindex adds them
# filled with 0 instead of raising KeyError.
missing_columns = [name for name in trained_feature_names if name not in test_data.columns]
if missing_columns:
    print("Columns missing from test data, filled with 0:", missing_columns)
test_data = test_data.reindex(columns=['id'] + trained_feature_names, fill_value=0)

# Only use the features known to the model
length_of_stay_predictions = model.predict(test_data[trained_feature_names])

result_df = pd.DataFrame({
    'id': original_id,
    'lengthofstay': length_of_stay_predictions
})

# Save the predicted result
result_df.to_csv('length_of_stay_predictions3.csv', index=False)

print("Finished")

FileNotFoundError: [Errno 2] No such file or directory: 'random_forest_model.pkl'

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
import joblib
from scipy.stats import randint


print("Laster data")

train_data = pd.read_csv('training_data.csv', parse_dates=['vdate'])
metadata = pd.read_csv('metadata.csv')



print("Forhåndsprosesserer")

# Forward-fill missing values. DataFrame.ffill() replaces the deprecated
# fillna(method='ffill') spelling and avoids in-place mutation.
train_data = train_data.ffill()

# Column definitions
categorical_columns = ['gender', 'dialysisrenalendstage', 'asthma', 'irondef', 'pneum', 
                       'substancedependence', 'psychologicaldisordermajor', 'depress', 'psychother',
                       'fibrosisandother', 'malnutrition']
numerical_columns = ['rcount', 'hemo', 'hematocrit', 'neutrophils', 'sodium',
                     'glucose', 'bloodureanitro', 'creatinine', 'bmi', 'pulse', 'respiration']


# One-hot encode the categorical columns
train_data = pd.get_dummies(train_data, columns=categorical_columns)


# NOTE(review): the scaler is fit on all rows before the train/validation
# split below, so validation rows contribute to the scaling statistics.
scaler = StandardScaler()
train_data[numerical_columns] = scaler.fit_transform(train_data[numerical_columns])


# Everything except identifiers, dates and the target is used as a feature
selected_features = train_data.columns.difference(['id', 'vdate', 'discharged', 'facid', 'lengthofstay'])
X = train_data[selected_features]
y = train_data['lengthofstay']

print("splitter datasett")

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

print("Trener modell")
# Initialize and train a baseline model
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Evaluate the baseline model on the validation split
y_pred = model.predict(X_val)
print("R2 Score:", r2_score(y_val, y_pred))

# Define parameter distributions. The candidate depths are drawn with a fixed
# seed so the search space is the same on every run.
candidate_depths = [int(depth) for depth in randint(10, 30).rvs(size=3, random_state=42)]
param_distributions = {
    'n_estimators': randint(50, 200),
    'max_depth': [None] + candidate_depths,
    'max_features': ['sqrt', 'log2'],
}

print("Randomized search starter")

# random_state makes the sampled parameter combinations reproducible, in line
# with the seeded split and estimator above.
random_search = RandomizedSearchCV(model, param_distributions, n_iter=50, cv=3, scoring='r2',
                                   n_jobs=-1, verbose=2, random_state=42)
random_search.fit(X_train, y_train)

print("Modellagring")
# Best model found by the search
best_model = random_search.best_estimator_

# Save the best model together with the scaler needed to preprocess new data
joblib.dump(best_model, 'random_forest_model.pkl')
joblib.dump(scaler, 'scaler.pkl')




print("Finished")


In [None]:
import pandas as pd
import gradio as gr
import joblib

# Load the model and the scaler
print("Loading model and scaler...")
model = joblib.load('random_forest_model.pkl')
scaler = joblib.load('scaler.pkl')

# Categorical columns (one-hot encoded before prediction)
cat_cols = [
    'gender',
    'dialysisrenalendstage',
    'asthma',
    'irondef',
    'pneum',
    'substancedependence',
    'psychologicaldisordermajor',
    'depress',
    'psychother',
    'fibrosisandother',
    'malnutrition',
]

# Numerical columns (passed through the scaler before prediction)
num_cols = [
    'rcount',
    'hemo',
    'hematocrit',
    'neutrophils',
    'sodium',
    'glucose',
    'bloodureanitro',
    'creatinine',
    'bmi',
    'pulse',
    'respiration',
]

# Prediction
def predict_stay(*inputs):
    """Predict the length of stay for a single set of form inputs.

    Args:
        *inputs: One value per column, ordered as ``num_cols + cat_cols``
            (the numerical values first, then 'gender', then the remaining
            categorical flags).

    Returns:
        float: The model's prediction for the single input row.

    Raises:
        ValueError: If one-hot encoding produces a column the model was not
            trained on, meaning an input value does not match any category
            seen during training.
    """
    input_data = pd.DataFrame([inputs], columns=num_cols + cat_cols)

    # Checkboxes deliver True/False, which would one-hot encode to
    # '<col>_True' / '<col>_False'. Cast to 0/1 first.
    # NOTE(review): assumes the training data stores these flags as integer
    # 0/1 (dummy columns named '<col>_0' / '<col>_1') - confirm against the
    # training CSV. A mismatch is reported by the check below.
    flag_cols = [col for col in cat_cols if col != 'gender']
    input_data[flag_cols] = input_data[flag_cols].astype(int)

    input_data = pd.get_dummies(input_data, columns=cat_cols)
    input_data[num_cols] = scaler.transform(input_data[num_cols])

    # A single row only yields one dummy column per categorical variable, so
    # the frame must be aligned with the columns the model was fitted on.
    expected_columns = list(model.feature_names_in_)
    unknown_columns = [col for col in input_data.columns if col not in expected_columns]
    if unknown_columns:
        raise ValueError(
            f"Input produced columns unknown to the model: {unknown_columns}"
        )
    input_data = input_data.reindex(columns=expected_columns, fill_value=0)

    stay_length = model.predict(input_data)
    return float(stay_length[0])

# Define the input components: one number field per numerical column,
# a radio button for gender and a checkbox for every other categorical column
num_input_components = [gr.Number(label=name) for name in num_cols]
gender_input_component = [gr.Radio(['F', 'M'], label='gender')]
cat_input_components = [
    gr.Checkbox(label=name)
    for name in cat_cols
    if name != 'gender'
]

# Combine the input components in the order predict_stay expects them
all_input_components = [
    *num_input_components,
    *gender_input_component,
    *cat_input_components,
]

# Create the Gradio interface
iface = gr.Interface(
    fn=predict_stay,
    inputs=all_input_components,
    outputs=gr.Number(label="Estimated Length of Stay"),
)

# Start the Gradio interface
iface.launch()
