In [13]:
import pandas as pd
import awswrangler as wr
import boto3
from sklearn.ensemble import RandomForestClassifier

#### Load Titanic training data set

In [14]:
# Do not hardcode AWS keys in the notebook. With no arguments, boto3 resolves
# credentials and region through its default provider chain (environment
# variables such as AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY /
# AWS_DEFAULT_REGION, the shared ~/.aws config, or an attached IAM role).
boto3.setup_default_session()
# Read the training CSV from S3 into a DataFrame.
df_train = wr.s3.read_csv(path='s3://dm-containers-webinar-data/part1/titanic/train.csv')

#### Add Sex feature

In [15]:
# Distinct values of the 'Sex' column, in sorted order.
sexes = sorted(df_train['Sex'].unique())
# Assign each distinct value an integer code following that sorted order.
genders_mapping = {label: code for code, label in enumerate(sexes)}
# Translate the column through the mapping and store the codes as integers.
sex_codes = df_train['Sex'].map(genders_mapping)
df_train['Sex_Val'] = sex_codes.astype(int)

#### Add Age feature

In [16]:
# Fill missing ages with the median age of the rows that share the same
# (Sex_Val, Class) pair. transform('median') returns a Series aligned to
# df_train's own index, so the column assignment does not depend on how the
# installed pandas version treats group keys in groupby().apply() (newer
# versions prepend them to the index, which breaks the assignment).
group_median_age = df_train.groupby(['Sex_Val', 'Class'])['Age'].transform('median')
df_train['AgeFill'] = df_train['Age'].fillna(group_median_age)

#### Add Family Size feature

In [17]:
# Element-wise sum of the 'Siblings' and 'ParentsChildren' columns.
family_size = df_train['Siblings'].add(df_train['ParentsChildren'])
df_train['FamilySize'] = family_size

#### Get rid of unused columns for the classifier

In [18]:
# Raw columns that are not passed on to the classifier.
unused_columns = [
    'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked',
    'Age', 'Siblings', 'ParentsChildren', 'Fare',
]
df_train = df_train.drop(columns=unused_columns)
# Convert to a NumPy array, leaving out the frame's first column
# (presumably the passenger id - confirm against the CSV header).
train_data = df_train.values[:, 1:]

#### Create and train a classifier

In [19]:
# Seed the forest so that Restart & Run All reproduces the same model and
# therefore the same predictions.
classifier = RandomForestClassifier(n_estimators=100, random_state=42)
# Column 0 of train_data is used as the target; the remaining columns are the
# feature matrix.
train_features = train_data[:, 1:]
# train_data is a single NumPy array, so the target column shares its dtype
# with the features (presumably float because of AgeFill). Cast it to int so
# the classifier learns, and later predicts, integer class labels.
train_target = train_data[:, 0].astype(int)
classifier = classifier.fit(train_features, train_target)

#### Load the test data set and apply the same data preparation

In [20]:
# Read the test CSV from S3 and apply the same feature steps as for training.
df_test = wr.s3.read_csv(path='s3://dm-containers-webinar-data/part1/titanic/test.csv')
# Reuse the mapping built from the training data so the integer codes agree.
df_test['Sex_Val'] = df_test['Sex'].map(genders_mapping).astype(int)
# Fill missing ages with the median age of the test rows sharing the same
# (Sex_Val, Class) pair. transform('median') stays aligned to df_test's index,
# unlike groupby().apply(), whose result index gains the group keys on newer
# pandas versions and can no longer be assigned back as a column.
test_group_median_age = df_test.groupby(['Sex_Val', 'Class'])['Age'].transform('median')
df_test['AgeFill'] = df_test['Age'].fillna(test_group_median_age)
df_test['FamilySize'] = df_test['Siblings'] + df_test['ParentsChildren']
df_test = df_test.drop(['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked', 'Age', 'Siblings', 'ParentsChildren', 'Fare'], axis=1)
# Convert to a NumPy array, leaving out the frame's first column.
test_data = df_test.values[:,1:]

#### Get classifier prediction for test data set and save the results

In [21]:
# test_data is what the classifier is given for prediction, so it is already
# the feature matrix (presumably the test CSV has no target column - confirm).
# Slicing off another column here would remove a feature, so use it unchanged.
test_x = test_data
test_y = classifier.predict(test_x)
# Store the predictions as integer labels so the CSV does not contain values
# such as "1.0" when the classifier was trained on a float target.
df_test['Survived'] = test_y.astype(int)
# Write only the passenger id and the predicted label.
df_test[['PassengerId', 'Survived']].to_csv('data/results.csv', index=False)