In [1]:
from datetime import datetime

import pandas as pd

from prep_flow import BaseFlow, Column, ReferenceColumn, String, DateTime, Integer, modifier, creator, data_filter

## Test data

In [8]:
# Sample member table used by every example in this notebook.
# Birthdays are plain "YYYY/MM/DD" strings here; the flows declare the column as DateTime.
df_member = pd.DataFrame(
    [
        ("Taro Yamada", "man", "1995/10/19", "JP"),
        ("John Smith", "man", "1990/03/20", "US"),
        ("Li Wei", "man", "2003/02/01", "CN"),
        ("Hanako Tanaka", "woman", "1985/11/18", "JP"),
    ],
    columns=["name", "gender", "birthday", "company_code"],
)

df_member

Unnamed: 0,name,gender,birthday,company_code
0,Taro Yamada,man,1995/10/19,JP
1,John Smith,man,1990/03/20,US
2,Li Wei,man,2003/02/01,CN
3,Hanako Tanaka,woman,1985/11/18,JP


## 01: Simple usage

- Validation rules are declared as class attributes, in the same style as [pandera](https://pandera.readthedocs.io/en/stable/index.html).

In [22]:
class MemberFlow(BaseFlow):
    """Minimal flow: one ``Column`` declaration per column of the input frame.

    Each attribute is named after an input column and states the dtype the
    column is declared as, plus optional constraints (``category``, ``regexp``).
    """

    name = Column(dtype=String)
    # `category` lists the values declared for this column.
    gender = Column(dtype=String, category=["man", "woman"])
    # The sample data holds "YYYY/MM/DD" strings; the flow declares them as DateTime.
    birthday = Column(dtype=DateTime)
    # NOTE(review): whether `regexp` must match the whole value or only a prefix
    # depends on prep_flow -- confirm before relying on it for strict validation.
    company_code = Column(dtype=String, regexp=r"[A-Z]{2}")


member = MemberFlow(df_member)
# End the cell with the bare expression (not print) so the frame gets Jupyter's
# rich display, consistent with the other examples in this notebook.
member.data

            name gender   birthday company_code
0    Taro Yamada    man 1995-10-19           JP
1     John Smith    man 1990-03-20           US
2         Li Wei    man 2003-02-01           CN
3  Hanako Tanaka  woman 1985-11-18           JP


## 02: Use modifier

- A modifier transforms the values of an existing column.
- There are two types of modifiers: `inline-modifier` and `decorator-modifier`.
- The `inline-modifier` is used for processing that can be completed in a single column, such as converting a string to lowercase or adding a prefix.
- The `decorator-modifier` is used for processing that also references other columns.

In [10]:
# inline-modifier example
class MemberFlow(BaseFlow):
    """Flow whose ``name`` column is lower-cased through an inline modifier."""

    # here! -- the callable passed as `modifier` only needs the column's own value
    name = Column(
        dtype=String,
        modifier=lambda x: x.lower(),
    )
    gender = Column(
        dtype=String,
        category=["man", "woman"],
    )
    birthday = Column(dtype=DateTime)
    company_code = Column(
        dtype=String,
        regexp=r"[A-Z]{2}",
    )


member = MemberFlow(df_member)
member.data

Unnamed: 0,name,gender,birthday,company_code
0,taro yamada,0,1995-10-19,JP
1,john smith,0,1990-03-20,US
2,li wei,0,2003-02-01,CN
3,hanako tanaka,1,1985-11-18,JP


In [11]:
# decorator-modifier example
class MemberFlow(BaseFlow):
    """Flow whose ``name`` column is rewritten using the ``gender`` column."""

    name = Column(dtype=String, description='Add "Mr." or "Ms." depending on the gender.')
    gender = Column(dtype=String, category=["man", "woman"])
    birthday = Column(dtype=DateTime)
    company_code = Column(dtype=String, regexp=r"[A-Z]{2}")

    # here!
    @modifier("name")
    def modify_name(self, data: pd.DataFrame) -> pd.Series:
        """Return the names prefixed with an honorific chosen from ``gender``.

        Args:
            data: Frame containing at least the ``gender`` and ``name`` columns.

        Returns:
            A Series of ``"Mr." + name`` where gender is ``"man"`` and
            ``"Ms." + name`` for every other gender value.
        """
        # Keep the honorific in a local Series instead of writing a scratch
        # "prefix" column into `data`: the frame is owned by the caller and
        # must not gain columns as a side effect of computing `name`.
        prefix = data["gender"].apply(lambda x: "Mr." if x == "man" else "Ms.")
        return prefix + data["name"]


member = MemberFlow(df_member)
member.data

Unnamed: 0,name,gender,birthday,company_code
0,Mr.Taro Yamada,man,1995-10-19,JP
1,Mr.John Smith,man,1990-03-20,US
2,Mr.Li Wei,man,2003-02-01,CN
3,Ms.Hanako Tanaka,woman,1985-11-18,JP


## 03: Use creator

- A creator builds a new column from one or more existing columns.

In [12]:
class MemberFlow(BaseFlow):
    """Flow that derives an ``age`` column from ``birthday``."""

    name = Column(dtype=String)
    birthday = Column(dtype=DateTime)
    gender = Column(dtype=String, category=["man", "woman"])
    company_code = Column(dtype=String, regexp=r"[A-Z]{2}")
    age = Column(dtype=Integer)

    # here!
    @creator("age")
    def create_age(self, data: pd.DataFrame) -> pd.Series:
        """Return each member's age in completed calendar years as of today.

        Args:
            data: Frame containing a ``birthday`` column of datetime-like values.

        Returns:
            A Series with one age per row.
        """
        # Read the clock once so every row is measured against the same date.
        today = datetime.now()

        def completed_years(born) -> int:
            # Compare calendar dates rather than dividing elapsed days by 365:
            # leap days make `days // 365` tick over a few days before the
            # actual birthday. Subtract one year while this year's birthday
            # is still ahead.
            birthday_pending = (today.month, today.day) < (born.month, born.day)
            return today.year - born.year - birthday_pending

        return data["birthday"].apply(completed_years)


member = MemberFlow(df_member)
member.data

Unnamed: 0,name,birthday,gender,company_code,age
0,Taro Yamada,1995-10-19,man,JP,28
1,John Smith,1990-03-20,man,US,34
2,Li Wei,2003-02-01,man,CN,21
3,Hanako Tanaka,1985-11-18,woman,JP,38


## 04: Use data-filter

- A data-filter (`@data_filter`) selects the rows to keep in the resulting data.

In [18]:
class MemberFlow(BaseFlow):
    """Flow that keeps only the members whose company code is "JP"."""

    name = Column(dtype=String)
    birthday = Column(dtype=DateTime)
    gender = Column(
        dtype=String,
        category=["man", "woman"],
    )
    company_code = Column(
        dtype=String,
        regexp=r"[A-Z]{2}",
    )

    # here!
    @data_filter()
    def filter_japanese(self, data: pd.DataFrame) -> pd.DataFrame:
        """Return the rows whose ``company_code`` equals "JP", re-indexed from 0."""
        is_japanese = data["company_code"] == "JP"
        japanese_members = data[is_japanese]
        # Drop the original row labels so the result is numbered 0..n-1.
        return japanese_members.reset_index(drop=True)


member = MemberFlow(df_member)
member.data

Unnamed: 0,name,birthday,gender,company_code
0,Taro Yamada,1995-10-19,man,JP
1,Hanako Tanaka,1985-11-18,woman,JP


## 05: Merge a flow

- Use `ReferenceColumn` to bring a column in from another flow by joining on a shared key column.

In [19]:
# Lookup table mapping each company code to its display name.
df_company_code_master = pd.DataFrame(
    [
        ("JP", "JAPAN"),
        ("US", "AMERICA"),
        ("CN", "CHINA"),
    ],
    columns=["company_code", "company_name"],
)
df_company_code_master

Unnamed: 0,company_code,company_name
0,JP,JAPAN
1,US,AMERICA
2,CN,CHINA


In [21]:
class CompanyCodeMasterFlow(BaseFlow):
    """Flow for the company-code lookup table (code and display name)."""

    company_code = Column(
        dtype=String,
        regexp=r"[A-Z]{2}",
    )
    company_name = Column(dtype=String)


class MemberFlow(BaseFlow):
    """Member flow that takes ``company_name`` from ``CompanyCodeMasterFlow``."""

    name = Column(dtype=String)
    birthday = Column(dtype=DateTime)
    gender = Column(
        dtype=String,
        category=["man", "woman"],
    )
    company_code = Column(
        dtype=String,
        regexp=r"[A-Z]{2}",
    )
    # here! -- the referenced column is joined in on `company_code` with how="left"
    company_name = ReferenceColumn(
        CompanyCodeMasterFlow.company_name,
        how="left",
        on="company_code",
    )


# The master flow is built first, then handed to the member flow via `reference`.
company_code_master = CompanyCodeMasterFlow(df_company_code_master)
member = MemberFlow(df_member, reference=[company_code_master])
member.data

Unnamed: 0,name,birthday,gender,company_code,company_name
0,Taro Yamada,1995-10-19,man,JP,JAPAN
1,John Smith,1990-03-20,man,US,AMERICA
2,Li Wei,2003-02-01,man,CN,CHINA
3,Hanako Tanaka,1985-11-18,woman,JP,JAPAN
