In [None]:
# Read the training and test CSV files into pandas DataFrames

import pandas as pd

train, test = (
  pd.read_csv(csv_path)
  for csv_path in ('./data/train.csv', './data/test.csv')
)

In [None]:
# Report how many rows and columns each of the Titanic datasets holds

train_rows, train_cols = train.shape
print('train has {} rows and {} columns'.format(train_rows, train_cols))

test_rows, test_cols = test.shape
print('test has {} rows and {} columns'.format(test_rows, test_cols))

In [None]:
# The structure of the train dataset:
# DataFrame.info() prints every column's name, non-null count and dtype,
# which shows at a glance which columns contain missing values.

train.info()

In [None]:
# The structure of the test dataset:
# DataFrame.info() prints every column's name, non-null count and dtype,
# which shows at a glance which columns contain missing values.

test.info()

In [None]:
# Preview the first five rows of the training data

train.head(n=5)

In [None]:
# Cope with missing values
#
# Three different strategies are shown, one per affected column.
# All three steps are executed; they are not mutually exclusive alternatives.

# Strategy 1: drop the affected rows.
# Only two passengers have no embarkation port, so losing them is bearable.
train = train.dropna(subset=["Embarked"])

# Strategy 2: drop the whole column.
# Very little information about the cabin is available, so the column goes.
# errors="ignore" keeps this cell re-runnable: on a second run the column is
# already gone, which would otherwise raise a KeyError.
train = train.drop(columns="Cabin", errors="ignore")

# Strategy 3: impute.
# The age is missing quite often, but intuition says it might be important
# for someone's chance to survive, so the gaps are filled with the mean age
# (Series.mean() skips missing values by default).
mean = train["Age"].mean()
train["Age"] = train["Age"].fillna(mean)

# Print the column summary again to confirm the result of the three steps
train.info()

In [None]:
# Count the distinct values in the columns that look like identifiers

for template, column in (
  ('There are {} different (unique) PassengerIds in the data', "PassengerId"),
  ('There are {} different (unique) names in the data', "Name"),
  ('There are {} different (unique) ticket numbers in the data', "Ticket"),
):
  print(template.format(train[column].nunique()))

In [None]:
# Remove identifying data
#
# All three identifier columns are dropped in a single call.
# errors="ignore" keeps this cell re-runnable: on a second run the columns
# are already gone, which would otherwise raise a KeyError.
train = train.drop(columns=["PassengerId", "Name", "Ticket"], errors="ignore")

# Print the column summary to confirm which columns remain
train.info()

In [None]:
# Transforming textual data into numbers

from sklearn.preprocessing import LabelEncoder

# One fitted encoder is kept per column. Re-fitting a single shared encoder
# would overwrite the mapping learned for the previous column, making it
# impossible to apply the same encoding to other data or to decode the
# integers back to the original labels (encoders[col].inverse_transform).
encoders = {}

for col in ['Sex', 'Embarked']:
  le = LabelEncoder()
  # fit_transform learns the label -> integer mapping and applies it in one go
  train[col] = le.fit_transform(train[col])
  encoders[col] = le

train.head()

In [None]:
# Show the largest age and fare to illustrate how far the column ranges differ

oldest_age = train["Age"].max()
print('The maximum age is {}'.format(oldest_age))

highest_fare = train["Fare"].max()
print('The maximum fare is {}'.format(highest_fare))

In [None]:
# Normalization of the data: rescale every remaining column with a MinMaxScaler

from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler

scaler = MinMaxScaler()
#scaler = MaxAbsScaler()

# fit_transform learns the per-column scaling and applies it in one call.
# From here on `train` holds the scaler's output (by default a NumPy array),
# no longer the pandas DataFrame.
train = scaler.fit_transform(train)

# Overall extremes across all columns after scaling
smallest, largest = train.min(), train.max()
print('The minimum value is {} and the maximum value is {}'
  .format(smallest, largest))