In [1]:
from datetime import datetime

import pandas as pd

from prep_flow import BaseFlow, Column, ReferenceColumn, String, DateTime, Integer, modifier, creator, data_filter

## 01: Simple usage

In [26]:
# Sample member table, written one row per member:
# (full name, birthday as "YYYY/MM/DD" text, two-letter company code).
df_member = pd.DataFrame(
    [
        ("Taro Yamada", "1995/10/19", "JP"),
        ("John Smith", "1990/03/20", "US"),
        ("Li Wei", "2003/02/01", "CN"),
        ("Jiro Tanaka", "1985/11/18", "JP"),
    ],
    columns=["name", "birthday", "company_code"],
)

df_member

Unnamed: 0,name,birthday,company_code
0,Taro Yamada,1995/10/19,JP
1,John Smith,1990/03/20,US
2,Li Wei,2003/02/01,CN
3,Jiro Tanaka,1985/11/18,JP


In [27]:
class MemberFlow(BaseFlow):
    """Flow over the member table that lower-cases ``name`` and adds ``age``.

    Declared columns:
        name: String column (explicitly named ``"name"``).
        birthday: DateTime column; ``original_dtype`` is also DateTime.
        age: Integer column, filled in by :meth:`create_age`.
        company_code: String column declared with the pattern ``[A-Z]{2}``.
    """

    name = Column(dtype=String, name="name")
    birthday = Column(dtype=DateTime, original_dtype=DateTime)
    age = Column(dtype=Integer)
    company_code = Column(dtype=String, original_regexp=r"[A-Z]{2}")

    @modifier("name")
    def modify_name(self, data: pd.DataFrame) -> pd.Series:
        """Return the ``name`` column converted to lower case."""
        return data["name"].str.lower()

    @creator("age")
    def create_age(self, data: pd.DataFrame) -> pd.Series:
        """Return each member's age in completed calendar years as of today.

        Args:
            data: Frame whose ``birthday`` column holds datetime-like values
                (anything exposing ``year``/``month``/``day``).

        Returns:
            One age per row, aligned with ``data``'s index.
        """
        # Take a single snapshot so every row is measured against the same instant.
        today = datetime.now()

        def completed_years(born):
            # Dividing elapsed days by 365 ignores leap days and bumps the age
            # a few days before the actual birthday; compare calendar dates
            # instead. A Feb 29 birthday counts from Mar 1 in non-leap years.
            birthday_pending = (today.month, today.day) < (born.month, born.day)
            return today.year - born.year - birthday_pending

        return data["birthday"].apply(completed_years)

In [46]:
# Build the flow from the raw member frame, then display the frame it exposes as `.data`.
member = MemberFlow(df_member)
member.data

Unnamed: 0,name,birthday,age,company_code
0,taro yamada,1995-10-19,28,JP
1,jiro tanaka,1985-11-18,38,JP


## 02: Filter

In [44]:
class MemberFlow(BaseFlow):
    """Member flow that lower-cases ``name``, adds ``age`` and keeps only JP rows.

    Declared columns:
        name: String column (explicitly named ``"name"``).
        birthday: DateTime column; ``original_dtype`` is also DateTime.
        age: Integer column, filled in by :meth:`create_age`.
        company_code: String column declared with the pattern ``[A-Z]{2}``.
    """

    name = Column(dtype=String, name="name")
    birthday = Column(dtype=DateTime, original_dtype=DateTime)
    age = Column(dtype=Integer)
    company_code = Column(dtype=String, original_regexp=r"[A-Z]{2}")

    @modifier("name")
    def modify_name(self, data: pd.DataFrame) -> pd.Series:
        """Return the ``name`` column converted to lower case."""
        return data["name"].str.lower()

    @creator("age")
    def create_age(self, data: pd.DataFrame) -> pd.Series:
        """Return each member's age in completed calendar years as of today.

        Args:
            data: Frame whose ``birthday`` column holds datetime-like values
                (anything exposing ``year``/``month``/``day``).

        Returns:
            One age per row, aligned with ``data``'s index.
        """
        # Take a single snapshot so every row is measured against the same instant.
        today = datetime.now()

        def completed_years(born):
            # Dividing elapsed days by 365 ignores leap days and bumps the age
            # a few days before the actual birthday; compare calendar dates
            # instead. A Feb 29 birthday counts from Mar 1 in non-leap years.
            birthday_pending = (today.month, today.day) < (born.month, born.day)
            return today.year - born.year - birthday_pending

        return data["birthday"].apply(completed_years)

    # Add these lines!
    @data_filter()
    def filter_japanese(self, data: pd.DataFrame) -> pd.DataFrame:
        """Keep only rows whose ``company_code`` is ``"JP"``, with a fresh 0..n-1 index."""
        return data.query('company_code == "JP"').reset_index(drop=True)

In [45]:
# Rebuild the flow with the MemberFlow class defined in this section (the one with the
# data filter) and display the frame it exposes as `.data`.
member = MemberFlow(df_member)
member.data

Unnamed: 0,name,birthday,age,company_code
0,taro yamada,1995-10-19,28,JP
1,jiro tanaka,1985-11-18,38,JP


## 03: Merge another flow

In [39]:
# Lookup table from two-letter company code to company name, one row per code.
df_company_code_master = pd.DataFrame(
    [
        ("JP", "JAPAN"),
        ("US", "AMERICA"),
        ("CN", "CHINA"),
    ],
    columns=["company_code", "company_name"],
)
df_company_code_master

Unnamed: 0,company_code,company_name
0,JP,JAPAN
1,US,AMERICA
2,CN,CHINA


In [48]:
class CompanyCodeMasterFlow(BaseFlow):
    """Flow over the company-code master table (code -> company name)."""
    # Two-letter upper-case code; `original_regexp` presumably validates the raw
    # input values against this pattern -- confirm against the prep_flow docs.
    company_code = Column(dtype=String, original_regexp=r"[A-Z]{2}")
    # Company name as a String column; referenced from MemberFlow below.
    company_name = Column(dtype=String)

class MemberFlow(BaseFlow):
    """Member flow that pulls ``company_name`` in from ``CompanyCodeMasterFlow``."""
    name = Column(dtype=String, name="name")
    birthday = Column(dtype=DateTime, original_dtype=DateTime)
    company_code = Column(dtype=String, original_regexp=r"[A-Z]{2}")
    # Add this line!
    # Declares `company_name` as coming from CompanyCodeMasterFlow.company_name,
    # joined with how="left" on "company_code" (presumably a pandas-style left
    # merge, so unmatched members are kept -- confirm against the prep_flow docs).
    company_name = ReferenceColumn(CompanyCodeMasterFlow.company_name, how="left", on="company_code")

In [49]:
# Build the master flow first: its instance is passed to MemberFlow on the next line.
company_code_master = CompanyCodeMasterFlow(df_company_code_master)
# `reference` takes a list of flow instances; presumably these are what the
# ReferenceColumn declarations on MemberFlow are resolved against -- confirm in prep_flow docs.
member = MemberFlow(df_member, reference=[company_code_master])

member.data

Unnamed: 0,name,birthday,company_code,company_name
0,Taro Yamada,1995-10-19,JP,JAPAN
1,John Smith,1990-03-20,US,AMERICA
2,Li Wei,2003-02-01,CN,CHINA
3,Jiro Tanaka,1985-11-18,JP,JAPAN
