# Titanic example

## Main imports

In [1]:
import pandas as pd

import recipipe as r

## Download data

In this example we are going to use the Titanic dataset.
Train and test CSVs should be in `data/titanic/*.csv`.

You can download it from the Kaggle web page (https://www.kaggle.com/c/titanic/data) or, if you have the Kaggle CLI installed (https://github.com/Kaggle/kaggle-api), you can execute the next cell.

In [2]:
%%capture

from shutil import which


if which("kaggle") is not None:
    !kaggle competitions download titanic -p data
else:
    pass  # Kaggle CLI not installed :( Download the data manually.

In [3]:
# List the data folder; the "Load data" cell reads train.csv and test.csv from it.
!ls data/titanic

test.csv  train.csv


## Load data

In [4]:
# Read the train and test splits from the CSV files under data/titanic/.
df_train = pd.read_csv("data/titanic/train.csv")
df_test = pd.read_csv("data/titanic/test.csv")

# Preview the first rows of the training frame.
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Apply transformations with Recipipe

In [5]:
# Build the preprocessing pipeline; the steps are listed in the order given.
pipe = r.recipipe([
    # These three columns no longer appear in the transformed output.
    r.drop("PassengerId", "Ticket", "Cabin"),
    # Fill missing values: most frequent value for Embarked, mean for Age.
    r.impute("Embarked", strategy="most_frequent"),
    r.impute("Age", strategy="mean"),
    # One indicator column per category, named "<column>=<value>".
    r.onehot("Embarked", "Pclass", "Sex"),
    # Rescale the numeric columns.
    r.scale("Age", "Fare"),
])

In [6]:
# Fit the pipeline on the training data and preview the transformed frame.
pipe.fit_transform(df_train).head()

Unnamed: 0,Survived,Pclass=1,Pclass=2,Pclass=3,Name,Sex=female,Sex=male,Age,SibSp,Parch,Fare,Embarked=C,Embarked=Q,Embarked=S
0,0,0.0,0.0,1.0,"Braund, Mr. Owen Harris",0.0,1.0,-0.592481,1,0,-0.502445,0.0,0.0,1.0
1,1,1.0,0.0,0.0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1.0,0.0,0.638789,1,0,0.786845,1.0,0.0,0.0
2,1,0.0,0.0,1.0,"Heikkinen, Miss. Laina",1.0,0.0,-0.284663,0,0,-0.488854,0.0,0.0,1.0
3,1,1.0,0.0,0.0,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1.0,0.0,0.407926,1,0,0.42073,0.0,0.0,1.0
4,0,0.0,0.0,1.0,"Allen, Mr. William Henry",0.0,1.0,0.407926,0,0,-0.486337,0.0,0.0,1.0


In [7]:
# Transform the test data with the pipeline fitted in the previous cell.
pipe.transform(df_test).head()

Unnamed: 0,Pclass=1,Pclass=2,Pclass=3,Name,Sex=female,Sex=male,Age,SibSp,Parch,Fare,Embarked=C,Embarked=Q,Embarked=S
0,0.0,0.0,1.0,"Kelly, Mr. James",0.0,1.0,0.369449,0,0,-0.490783,0.0,1.0,0.0
1,0.0,0.0,1.0,"Wilkes, Mrs. James (Ellen Needs)",1.0,0.0,1.331378,1,0,-0.507479,0.0,0.0,1.0
2,0.0,1.0,0.0,"Myles, Mr. Thomas Francis",0.0,1.0,2.485693,0,0,-0.453367,0.0,1.0,0.0
3,0.0,0.0,1.0,"Wirz, Mr. Albert",0.0,1.0,-0.207709,0,0,-0.474005,0.0,0.0,1.0
4,0.0,0.0,1.0,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1.0,0.0,-0.592481,1,1,-0.401017,0.0,0.0,1.0


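## Custom transformers

You can also write your own transformers by subclassing the Recipipe base classes. The next two cells define one transformer that flags regex matches in a string column and another that sums a group of columns.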



In [8]:
import re


class ExtractTransformer(r.ColumnTransformer):
    """Flag which regular expressions are found in string columns.

    Every input column yields one ``int8`` indicator column per pattern,
    named ``<column>=<pattern>``.  The patterns are joined into a single
    alternation, so for a given row only the alternative that produces the
    first match is flagged.
    """

    def __init__(self, *args, extract=None, **kwargs):
        """Create a ExtractTransformer.

        Args:
            *args: Forwarded unchanged to the parent constructor.
            extract (list): Non-empty list (or tuple) of regex expressions
                to look for.
            **kwargs: Forwarded unchanged to the parent constructor.

        Raises:
            ValueError: If `extract` is not a non-empty list or tuple.
            re.error: If any of the expressions is not a valid regex.
        """

        super().__init__(*args, **kwargs)
        # Explicit check instead of `assert`: asserts vanish under `python -O`.
        if not isinstance(extract, (list, tuple)) or len(extract) == 0:
            raise ValueError(
                "extract must be a non-empty list of regex expressions"
            )
        self.extract = extract
        self._re = re.compile("|".join(f"({i})" for i in extract))
        # Column position, in the `str.extract` output, of the wrapper group
        # added around each pattern.  Patterns that contain their own capture
        # groups create extra output columns that must be skipped so the
        # result still has exactly one column per pattern.
        positions = []
        next_position = 0
        for pattern in extract:
            positions.append(next_position)
            next_position += 1 + re.compile(pattern).groups
        self._group_positions = positions

    def get_column_mapping(self):
        """Map each input column to the names of its indicator columns.

        Returns:
            dict: ``{column: ["column=pattern", ...]}`` for every column in
            ``self.cols`` (not set in this class; presumably provided by the
            parent class).
        """
        return {
            i: [f"{i}={j}" for j in self.extract] for i in self.cols
        }

    def _transform_column(self, df, c):
        """Return a 2-D ``int8`` array with one 0/1 column per pattern.

        Rows where nothing matches (including missing values) are all zeros.
        """
        # expand=True guarantees a DataFrame even when there is one group.
        matches = df[c].str.extract(self._re, expand=True)
        wrappers = matches.iloc[:, self._group_positions]
        return wrappers.notna().astype("int8").values

# Class alias.
extract = ExtractTransformer

In [9]:
class AddColumnGroup(r.ColumnGroupsTransformer):
    """Collapse a group of columns into their row-wise sum."""

    def _transform_group(self, df, group_cols):
        """Return, for each row of ``df``, the sum over ``group_cols``."""
        selected = df[group_cols]
        row_totals = selected.sum(axis=1)
        return row_totals

# Function-style alias for the class.
add_column_group = AddColumnGroup

You can add transformers to your pipeline at any time.

In [10]:
# Raw strings keep the regex escape `\.` intact; in a normal string literal
# `\.` is an invalid escape sequence that newer Python versions warn about.
pipe += extract("Name", extract=[r"Mr\.", r"Mrs\.", r"Miss\."])
# Sum SibSp and Parch; the result appears as the FamilyOnboard column.
pipe += add_column_group("SibSp", "Parch", col_format="FamilyOnboard")

Fit the same pipe again, now including the extra transformers, and transform the training data.

In [11]:
# Refit the extended pipeline on the training data and display the result.
pipe.fit_transform(df_train)

Unnamed: 0,Survived,Pclass=1,Pclass=2,Pclass=3,Name=Mr\.,Name=Mrs\.,Name=Miss\.,Sex=female,Sex=male,Age,FamilyOnboard,Fare,Embarked=C,Embarked=Q,Embarked=S
0,0,0.0,0.0,1.0,1,0,0,0.0,1.0,-0.592481,1,-0.502445,0.0,0.0,1.0
1,1,1.0,0.0,0.0,0,1,0,1.0,0.0,0.638789,1,0.786845,1.0,0.0,0.0
2,1,0.0,0.0,1.0,0,0,1,1.0,0.0,-0.284663,0,-0.488854,0.0,0.0,1.0
3,1,1.0,0.0,0.0,0,1,0,1.0,0.0,0.407926,1,0.420730,0.0,0.0,1.0
4,0,0.0,0.0,1.0,1,0,0,0.0,1.0,0.407926,0,-0.486337,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,0.0,1.0,0.0,0,0,0,0.0,1.0,-0.207709,0,-0.386671,0.0,0.0,1.0
887,1,1.0,0.0,0.0,0,0,1,1.0,0.0,-0.823344,0,-0.044381,0.0,0.0,1.0
888,0,0.0,0.0,1.0,0,0,1,1.0,0.0,0.000000,3,-0.176263,0.0,0.0,1.0
889,1,1.0,0.0,0.0,1,0,0,0.0,1.0,-0.284663,0,-0.044381,1.0,0.0,0.0


In [12]:
# Transform the test data with the extended pipeline fitted in the previous cell.
pipe.transform(df_test)

Unnamed: 0,Pclass=1,Pclass=2,Pclass=3,Name=Mr\.,Name=Mrs\.,Name=Miss\.,Sex=female,Sex=male,Age,FamilyOnboard,Fare,Embarked=C,Embarked=Q,Embarked=S
0,0.0,0.0,1.0,1,0,0,0.0,1.0,0.369449,0,-0.490783,0.0,1.0,0.0
1,0.0,0.0,1.0,0,1,0,1.0,0.0,1.331378,1,-0.507479,0.0,0.0,1.0
2,0.0,1.0,0.0,1,0,0,0.0,1.0,2.485693,0,-0.453367,0.0,1.0,0.0
3,0.0,0.0,1.0,1,0,0,0.0,1.0,-0.207709,0,-0.474005,0.0,0.0,1.0
4,0.0,0.0,1.0,0,1,0,1.0,0.0,-0.592481,2,-0.401017,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,0.0,0.0,1.0,1,0,0,0.0,1.0,0.000000,0,-0.486337,0.0,0.0,1.0
414,1.0,0.0,0.0,0,0,0,1.0,0.0,0.715743,0,1.544246,1.0,0.0,0.0
415,0.0,0.0,1.0,1,0,0,0.0,1.0,0.677266,0,-0.502445,0.0,0.0,1.0
416,0.0,0.0,1.0,1,0,0,0.0,1.0,0.000000,0,-0.486337,0.0,0.0,1.0


## What's next?

You can continue learning about Recipipe by:

* Following this [fun tutorial about Paranoids](https://github.com/guiferviz/recipipe/blob/master/examples/paranoids.ipynb).
* Reading the [official documentation](https://guiferviz.com/recipipe/).
* Reading the source code from the [GitHub repository](https://github.com/guiferviz/recipipe) :).