In [88]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from IPython.display import display
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import mutual_info_regression, mutual_info_classif
from sklearn.preprocessing import LabelEncoder
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer, make_column_selector
from tensorflow import keras
from keras.callbacks import EarlyStopping, Callback
from keras import layers


In [89]:
X = pd.read_csv('data/train.csv', index_col='id')
y = X.pop('outcome')
X_test = pd.read_csv('data/test.csv', index_col='id')
X.shape, y.shape, X_test.shape

((1235, 27), (1235,), (824, 27))

In [95]:
preprocessor = make_column_transformer(
    (StandardScaler(),
     make_column_selector(dtype_include=np.number)),
    (OneHotEncoder(sparse=False),
     make_column_selector(dtype_include=object)),
)
X_transformed = pd.DataFrame(preprocessor.fit_transform(X), index=X.index)
X_transformed = pd.DataFrame(X_transformed, index=X.index)
# Assuming 'y' is your original labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
onehot_encoder = OneHotEncoder(sparse=False)
y_encoded = onehot_encoder.fit_transform(y_encoded.reshape(-1, 1))



In [100]:
X_train, X_valid, y_train, y_valid = train_test_split(X_transformed, y_encoded, test_size=0.2, random_state=42)

In [None]:
# 0.713
# with higher n nodes

In [200]:

model = keras.Sequential([
    #layers.BatchNormalization(input_shape=[X_train.shape[1]]),
    layers.Dropout(rate=0.2),
    layers.Dense(256, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(256, activation='relu'),
    layers.Dense(3, activation='softmax'),  # Use 'softmax' for multi-class classification
])
model.compile(
    loss='categorical_crossentropy',
    optimizer='sgd',
    metrics=['accuracy'],
)

early_stopping = keras.callbacks.EarlyStopping(
    patience=10,
    min_delta=0.001,
    restore_best_weights=True,
)

history = model.fit(
    X_train, y_train,
    validation_data=(X_valid, y_valid),
    batch_size=32,
    epochs=100,
    verbose=0,
    callbacks=[early_stopping],
)
history_df = pd.DataFrame(history.history)
print(f"Best validation accuracy: {history_df['val_accuracy'].max():.3f}")
#history_df.loc[:, ['loss', 'val_loss']].plot(title="Cross-entropy")
#history_df.loc[:, ['accuracy', 'val_accuracy']].plot(title="Accuracy")
# print best validation accuracy and epoch

Best validation accuracy: 0.696


In [142]:
X_test['pain'] = X_test['pain'].replace('moderate', 'slight')
X_test_transformed = pd.DataFrame(preprocessor.transform(X_test), index=X_test.index)
y_pred = model.predict(X_test_transformed)
y_pred = y_pred.argmax(axis=1)
y_pred = label_encoder.inverse_transform(y_pred)
output = pd.DataFrame({'id': X_test.index, 'outcome': y_pred})
print(output.head())
output.to_csv('data/submission_keras.csv', index=False)

     id     outcome
0  1235       lived
1  1236        died
2  1237       lived
3  1238  euthanized
4  1239       lived
