In [3]:
!pip install skorch

# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import torch
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from skorch import NeuralNetClassifier
from skorch.callbacks import EpochScoring
from torch import nn
from torch import optim
import math


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session



class TitanicSurvivalClassifier(nn.Module):
  def __init__(self, input_size=7, num_classes=1, *args, **kwargs):
    super().__init__(*args, **kwargs)

    self.sequential = nn.Sequential(
      nn.Linear(in_features=input_size, out_features=256),
      nn.ReLU(),
      nn.Dropout(0.2),
      nn.Linear(in_features=256, out_features=256),
      nn.ReLU(),
      nn.Dropout(0.2),
      nn.Linear(in_features=256, out_features=256),
      nn.ReLU(),
      nn.Dropout(0.2),
      nn.Linear(in_features=256, out_features=num_classes)
    )

  def forward(self, x):
    # print(f"forward -> { x = }")
    return self.sequential(x)


# from sklearn import
# import skorch
def cabin_ohe(cabin: str) -> int:
  try:
    if cabin == float("nan"):
      print("cabin == nan, returning")
      return 0
    number = int(cabin[1:])
    character = (1 + ord(cabin[0]) - ord('A'))
    output = int(character * 1000 + number)
    return output
  except Exception as x:
    mynan = float("nan")
    print(f"{ cabin = }, { type(cabin) = }, { mynan = },{ type(mynan) = }, { x = }")
    return 0


def cabins_ohe(cabins: pd.Series) -> list:
  output: list = [cabin_ohe(cabin) for cabin in cabins]
  return output


def replace_nan_with_avg(input: pd.Series) -> pd.Series:
  output = input.fillna(input.mean())
  return output


def generic_ohe(input: pd.Series) -> list:
  mset = input.unique()
  dict = {}
  i = 0
  for item in mset:
    dict[item] = i

  mlist: list = [dict[item] for item in input]
  return mlist


def get_callbacks() -> list:
  # metric.auc ( uses trapezoidal rule) gave an error: x is neither increasing, nor decreasing. so I had to remove it
  return [
    ("tr_acc", EpochScoring(
      metrics.accuracy_score,
      lower_is_better=False,
      on_train=True,
      name="train_acc",
    )),

    ("tr_recall", EpochScoring(
      metrics.recall_score,
      lower_is_better=False,
      on_train=True,
      name="train_recall",
    )),
    # ("tr_precision", EpochScoring(
    #   metrics.precision_score,
    #   lower_is_better=False,
    #   on_train=True,
    #   name="train_precision",
    # )),
    ("tr_roc_auc", EpochScoring(
      metrics.roc_auc_score,
      lower_is_better=False,
      on_train=False,
      name="tr_auc"
    )),
    ("tr_f1", EpochScoring(
      metrics.f1_score,
      lower_is_better=False,
      on_train=False,
      name="tr_f1"
    )),
    # ("valid_acc1", EpochScoring(
    #   metrics.accuracy_score,
    #   lower_is_better=False,
    #   on_train=False,
    #   name="valid_acc1",
    # )),
    ("valid_recall", EpochScoring(
      metrics.recall_score,
      lower_is_better=False,
      on_train=False,
      name="valid_recall",
    )),
    # ("valid_precision", EpochScoring(
    #   metrics.precision_score,
    #   lower_is_better=False,
    #   on_train=False,
    #   name="valid_precision",
    # )),
    ("valid_roc_auc", EpochScoring(
      metrics.roc_auc_score,
      lower_is_better=False,
      on_train=False,
      name="valid_auc"
    )),
    ("valid_f1", EpochScoring(
      metrics.f1_score,
      lower_is_better=False,
      on_train=False,
      name="valid_f1"
    ))
  ]


def test(net: NeuralNetClassifier):
  print("inside test")
  df = pd.read_csv("/kaggle/input/titanic/test.csv")
  headers = [
    "PassengerId", "Pclass", "Name", "Sex", "Age", "SibSp", "Parch", "Ticket", "Fare", "Cabin", "Embarked"
  ]

  # df["Cabin"] = cabins_ohe(df["Cabin"])  # too complex, ignore for now
  # print(df["Cabin"])

  generic_ohe_needed_headers = [
    "Sex", "Embarked"
  ]

  replace_nan_with_avg_headers = [
    "Age", "SibSp", "Parch", "Fare"
  ]

  for column in replace_nan_with_avg_headers:
    df[column] = replace_nan_with_avg(df[column])

  drop_columns = [
    "Name", "Ticket", "Cabin"
  ]

  df = df.drop(drop_columns, axis=1)
  # for column_name in drop_columns:

  for column_name in generic_ohe_needed_headers:
    df[column_name] = generic_ohe(df[column_name])

  print(df.head())

  # train, val = train_test_split(df, train_size=0.8)

  # X1 = df[["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]]
  X1 = df[["Sex", "Age"]]

  # print(f"{len(X1.columns) = }")
  # return
  X = torch.tensor(X1.values)

  passengerIds = df["PassengerId"]

  y = net.predict(X)
  print(f"{ y.shape = }")
  y = y.squeeze(1)
  print(f"{ y.shape = }")

  result = pd.DataFrame()
  result["PassengerId"] = passengerIds
  result["Survived"] = y

  print(f"{ result.head() = }")
  result.to_csv("../working/titanic-results.csv", index=False)
  pass


def start():
  df = pd.read_csv("/kaggle/input/titanic/train.csv")
  headers = [
    "PassengerId", "Survived", "Pclass", "Name", "Sex", "Age", "SibSp", "Parch", "Ticket", "Fare", "Cabin", "Embarked"
  ]

  # df["Cabin"] = cabins_ohe(df["Cabin"])  # too complex, ignore for now
  # print(df["Cabin"])

  generic_ohe_needed_headers = [
    "Sex", "Embarked"
  ]

  replace_nan_with_avg_headers = [
    "Age", "SibSp", "Parch", "Fare"
  ]

  for column in replace_nan_with_avg_headers:
    df[column] = replace_nan_with_avg(df[column])

  drop_columns = [
    "Name", "Ticket", "Cabin"
  ]

  df = df.drop(drop_columns, axis=1)
  # for column_name in drop_columns:

  for column_name in generic_ohe_needed_headers:
    df[column_name] = generic_ohe(df[column_name])

  print(df.head())

  # train, val = train_test_split(df, train_size=0.8)

  # X1 = df[["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]]
  X1 = df[["Sex", "Age"]]

  # print(f"{len(X1.columns) = }")
  # return
  X = X1.values
  y1 = df["Survived"].values * 1.0
  y = np.expand_dims(y1, axis=1)

  # print(f"{ type(X) = },\n{ X = }")
  # print(y)

  # return

  # val_x = val["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
  # val_y = val["Survived"]

  titanic_survival_model = TitanicSurvivalClassifier(input_size=len(X1.columns)).double()

  net = NeuralNetClassifier(
    titanic_survival_model,
    max_epochs=20,
    criterion=nn.BCEWithLogitsLoss(),
    optimizer=torch.optim.Adam,
    # lr=0.01,
    lr=0.005,
    optimizer__weight_decay=1e-5,  # this is the correct way of passing the
    # optimizer__momentum_decay=0.5,  # weight_decay, momentum_decay etc to NAdam optimizer
    batch_size=16,
    # Shuffle training data on each epoch
    iterator_train__shuffle=True,
    # train_split=0.8,
    verbose=True,
    callbacks=get_callbacks()
  )

  net.fit(X, y)

  # test the model

  # titanic_survival_model.eval()
  # predicted = net.predict(X)
  # print(f"{ predicted = }")
  test(net=net)

  pass


# Press the green button in the gutter to run the script.
if __name__ == "__main__":
  start()
  # print(cabin_ohe("C454"))
  pass

# See PyCharm help at https://www.jetbrains.com/help/pycharm/


/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv
   PassengerId  Survived  Pclass  Sex   Age  SibSp  Parch     Fare  Embarked
0            1         0       3    0  22.0      1      0   7.2500         0
1            2         1       1    0  38.0      1      0  71.2833         0
2            3         1       3    0  26.0      0      0   7.9250         0
3            4         1       1    0  35.0      1      0  53.1000         0
4            5         0       3    0  35.0      0      0   8.0500         0
  epoch    tr_auc    tr_f1    train_acc    train_loss    train_recall    valid_acc    valid_auc    valid_f1    valid_loss    valid_recall     dur
-------  --------  -------  -----------  ------------  --------------  -----------  -----------  ----------  ------------  --------------  ------
      1    [36m0.5172[0m   [32m0.0822[0m       [35m0.5801[0m        [31m0.9181[0m          [94m0.2418[0m       [36m0.6257[0m   