<a href="https://colab.research.google.com/github/cybercat17/MLDL/blob/master/Titanic_Basic_Manipulation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Downloading datasets from Kaggle

In [1]:
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"cybercat","key":"0bdbeb40485d70cbb45bd530ae54e5dd"}'}

In [0]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle
!chmod 600 /root/.kaggle/kaggle.json

In [3]:
!kaggle competitions download -c titanic

Downloading train.csv to /content
  0% 0.00/59.8k [00:00<?, ?B/s]
100% 59.8k/59.8k [00:00<00:00, 21.6MB/s]
Downloading test.csv to /content
  0% 0.00/28.0k [00:00<?, ?B/s]
100% 28.0k/28.0k [00:00<00:00, 37.3MB/s]
Downloading gender_submission.csv to /content
  0% 0.00/3.18k [00:00<?, ?B/s]
100% 3.18k/3.18k [00:00<00:00, 2.72MB/s]


## Storing datasets

In [9]:
import pandas as pd

train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

train_df.dropna(subset=['Survived'], axis=0, inplace=True)
y = train_df.Survived
train_df.drop(['Survived'], axis=1, inplace=True)

train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 11 columns):
PassengerId    891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


## Transforming each column

In [0]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error as MAE

In [10]:
unrelated_cols = ['PassengerId', 'Name', 'Ticket', 'Cabin']

train_df.drop(unrelated_cols, axis=1, inplace=True)
test_df.drop(unrelated_cols, axis=1, inplace=True)

print('Train : {}, Test : {}'.format(train_df.shape, test_df.shape))

Train : (891, 7), Test : (418, 7)


### Pclass

In [0]:
pclass_trans = OneHotEncoder(handle_unknown='ignore')

### Gender

In [0]:
gender_trans = OneHotEncoder(handle_unknown='ignore')

### Age

In [0]:
age_trans = SimpleImputer(strategy='mean')

### Embarked

In [0]:
embarked_trans = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

### Fare

In [0]:
fare_trans = SimpleImputer(strategy='constant', fill_value=0)

## Training the Model

In [0]:
prep = ColumnTransformer(
  transformers=[
      ('pclass', pclass_trans, ['Pclass']),
      ('gender', gender_trans, ['Sex']),
      ('age', age_trans, ['Age']),
      ('embarked', embarked_trans, ['Embarked']),
      ('fare', fare_trans, ['Fare'])
  ]
)

model = RandomForestClassifier(n_estimators=100, random_state=0)

ppl = Pipeline(steps=[
    ('prep', prep),
    ('model', model)
])

ppl.fit(train_df, y)

preds = ppl.predict(test_df)

## Create Submission

In [0]:
output = pd.DataFrame({'PassengerId':test_df.index+892, 'Survived':preds})
output.to_csv('submission.csv', index=False)