In [21]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error

test_df = pd.read_csv("test.csv", encoding='latin1')
train_df = pd.read_csv("train.csv", encoding='latin1')

##Data Cleaning for Sex
# checking the variables of sex
# Define a list of allowed values for the 'Sex' column

allowed_values = ['FEMALE', 'MALE']

# Check if the 'Sex' column only has allowed values
unique_values = test_df['Sex'].unique()
if set(unique_values) != set(allowed_values):
    pass
    

def replace_with_mapping(df, col_name, mapping):
    df[col_name] = df[col_name].replace(mapping)
    return df

mapping = {'FEMALE': 1, 'MALE': 0}
test_df = replace_with_mapping(test_df, 'Sex', mapping)
train_df = replace_with_mapping(train_df, 'Sex', mapping)

##Data Cleaning for AnesthesiaType

def replace_null(df, column):
    df[column].fillna(0, inplace=True)
    return df

def remove_object(df, column, object_to_remove):
    df = df.drop(df[df[column] == object_to_remove].index)
    return df

def calculate_mean(df, groupby_column, value_column):
    grouped = df.groupby(groupby_column)[value_column].mean()
    return grouped

def create_mean_column(df, groupby_column, value_column, new_column_name):
    grouped = calculate_mean(df, groupby_column, value_column)
    df[new_column_name] = df[groupby_column].map(grouped)
    return df

def replace_objects(df, column, old_names, new_name):
    df[column].replace(old_names, new_name, inplace=True)
    return df

def normalize_column(df, column):
    mean = df[column].mean()
    std = df[column].std()
    df[column + '_normalized'] = (df[column] - mean) / std
    return df

def add_anesthesia_type_mean_column(train_df, test_df):
    grouped = train_df.groupby('AnesthesiaType')['ElapsedTime(second)'].mean()
    test_df['AnesthesiaType_mean'] = test_df['AnesthesiaType'].map(grouped)
    test_df['AnesthesiaType_mean'].fillna(0, inplace=True)
    test_df['AnesthesiaType_mean'] = test_df['AnesthesiaType_mean'].replace(0, 3817)
    test_df['AnesthesiaType_mean'] = test_df['AnesthesiaType_mean'].astype('int64')
    return test_df



train_df = replace_null(train_df, 'AnesthesiaType')
train_df = remove_object(train_df, 'AnesthesiaType', 'Seçilmemi?')
train_df = create_mean_column(train_df, 'AnesthesiaType', 'ElapsedTime(second)', 'AnesthesiaType_mean')
train_df = replace_objects(train_df, 'AnesthesiaType', ['Rejyonel Sinir Blo?u', 'Periferik Bloklar', 'Epidural Anestezi', 'Kombine Spinal Epidural Anestezi'], 'Rejyonel Anestezi')
train_df = replace_objects(train_df, 'AnesthesiaType', ['Epidural ve Genel Anestezi', 'Spinal ve Genel Anestezi', 'Periferik Bloklar ve Genel Anestezi'], 'Genel Anestezi')
train_df = normalize_column(train_df, 'AnesthesiaType_mean')


test_df = replace_null(test_df, 'AnesthesiaType')
test_df = remove_object(test_df, 'AnesthesiaType', 'Seçilmemi?')
test_df = add_anesthesia_type_mean_column(train_df, test_df)
test_df = replace_objects(test_df, 'AnesthesiaType', ['Rejyonel Sinir Blo?u', 'Periferik Bloklar', 'Epidural Anestezi', 'Kombine Spinal Epidural Anestezi'], 'Rejyonel Anestezi')
test_df = replace_objects(test_df, 'AnesthesiaType', ['Epidural ve Genel Anestezi', 'Spinal ve Genel Anestezi', 'Periferik Bloklar ve Genel Anestezi'], 'Genel Anestezi')
test_df = normalize_column(test_df, 'AnesthesiaType_mean')

##Data cleaning for SurgeryGroup

# Define a dictionary with key-value pairs for assigning numbers to letters
dict_letter_number = {'A1': 7, 'A2': 6, 'A3': 5, 'B': 4, 'C': 3, 'D': 2, 'E': 1}


def replace_letters_with_numbers(lst):
    letters = []
    for x in lst:
        if isinstance(x, str):
            x = x.strip("'")
            if x in dict_letter_number:
                letters.append(x)
    if not letters:
        return 0
    # Calculate the mean of the numbers
    return sum(dict_letter_number[x] for x in letters) / len(letters)

# Apply the function to the column
train_df['SurgeryGroup'] = train_df['SurgeryGroup'].apply(replace_letters_with_numbers)
test_df['SurgeryGroup'] = test_df['SurgeryGroup'].apply(replace_letters_with_numbers)
train_df['SurgeryGroup'] = train_df['SurgeryGroup'].astype(int)
test_df['SurgeryGroup'] = test_df['SurgeryGroup'].astype(int)


mean = train_df['SurgeryGroup'].mean()
std = train_df['SurgeryGroup'].std()

mean = test_df['SurgeryGroup'].mean()
std = test_df['SurgeryGroup'].std()

# Normalize the column using Z-score normalization
test_df['SurgeryGroup_normalized'] = (test_df['SurgeryGroup'] - mean) / std
train_df['SurgeryGroup_normalized'] = (train_df['SurgeryGroup'] - mean) / std


test_df.to_csv('updated_dataset.csv', index=False)
train_df.to_csv('updated_dataset.csv', index=False)

# regression
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

# Select specific columns as features
num_features = ['Age', 'AnesthesiaType_mean_normalized', 'SurgeryGroup', 'Sex']
cat_features = ['DoctorID', 'AnaesthetistID']
X_train_num = train_df[num_features]
X_train_cat = train_df[cat_features]

# Split train data into training and validation sets
X_train_num, X_val_num, X_train_cat, X_val_cat, y_train, y_val = train_test_split(X_train_num, X_train_cat, train_df['ElapsedTime(second)'], test_size=0.2, random_state=42)

# create one-hot encoder
enc = OneHotEncoder(handle_unknown='ignore')

# fit encoder on training data
train_cat = enc.fit_transform(X_train_cat)

# transform validation and test data using fitted encoder
val_cat = enc.transform(X_val_cat)
test_cat = enc.transform(test_df[cat_features])

# Combine one-hot encoded features with numerical features
X_train = np.hstack([X_train_num, train_cat.toarray()])
X_val = np.hstack([X_val_num, val_cat.toarray()])
X_test = np.hstack([test_df[num_features], test_cat.toarray()])

# Define the XGBoostRegressor model
model = XGBRegressor(random_state=42)

# Define the hyperparameter grid to search
param_grid = {
    'learning_rate': [0.1, 0.3, 0.5],
    'max_depth': [3, 5, 7],
    'n_estimators': [50, 100, 200]
}

# Perform grid search to find the best set of hyperparameters
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_root_mean_squared_error')
grid_search.fit(X_train, y_train)

# Print the best set of hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)

# Fit the model on the training data with the best set of hyperparameters
model = XGBRegressor(**grid_search.best_params_, random_state=42)
model.fit(X_train, y_train)

# Evaluate model performance on validation data
y_pred = model.predict(X_val)
mse = np.mean((y_pred - y_val)**2)
r2 = model.score(X_val, y_val)
rmse = np.sqrt(mse)
print('RMSE:', rmse)

print('MSE:', mse)
print('R2:', r2)

# Make predictions on test data
test_preds = model.predict(X_test)

# Save predictions in CSV file
submission_df = pd.DataFrame({'ID': test_df['ID'], 'ElapsedTime(second)': test_preds})
submission_df.to_csv('submission.csv', index=False)



Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200}
RMSE: 1761.8897886378884
MSE: 3104255.627306463
R2: 0.6703330920418991
