# Pandas Best Practices - df.pipe() method chaining

[![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dylanhogg/jupyter-experiments/blob/master/notebooks/best-practices/pandas-pipe-method.ipynb)    


## References
https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.pipe.html  
https://calmcode.io/pandas-pipe/introduction.html  
https://github.com/koaning/calm-notebooks  
https://github.com/wesm/pydata-book  
https://tomaugspurger.github.io/method-chaining.html  


In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
from functools import wraps
from IPython.display import display, HTML

In [2]:
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

## Load input data (popular python libraries)

In [3]:
df_raw = pd.read_json("https://www.awesomepython.org/github_data.json", orient="table")

In [4]:
df_raw[0:2]

Unnamed: 0,category,githuburl,featured,links,description,_repopath,_reponame,_stars,_forks,_watches,_topics,_language,_homepage,_description,_organization,_updated_at,_last_commit_date,_created_at,_age_weeks,_stars_per_week,_readme_filename,_readme_giturl,_readme_localurl,_requirements_filenames,_requirements_giturls,_requirements_localurls
89,ml-dl,https://github.com/tensorflow/tensorflow,,,,tensorflow/tensorflow,tensorflow,168890,87443,7802,"[tensorflow, machine-learning, python, deep-le...",C++,https://tensorflow.org,tensorflow: An Open Source Machine Learning Fr...,tensorflow,2022-11-05 00:00:00+00:00,2022-11-05T00:00:00.000Z,2015-11-07 00:00:00+00:00,365,462.71,README.md,https://raw.githubusercontent.com/tensorflow/t...,tensorflow~tensorflow~README.md,[],[],[]
405,study,https://github.com/thealgorithms/python,,,,thealgorithms/python,Python,147809,38118,5926,"[python, algorithm, algorithms-implemented, al...",Python,https://the-algorithms.com/,Python: All Algorithms implemented in Python,thealgorithms,2022-11-05 00:00:00+00:00,2022-11-04T00:00:00.000Z,2016-07-16 00:00:00+00:00,329,449.27,README.md,https://raw.githubusercontent.com/thealgorithm...,thealgorithms~python~README.md,"[requirements.txt, pyproject.toml]",[https://raw.githubusercontent.com/thealgorith...,"[thealgorithms~python~requirements.txt, thealg..."


## Decorator helpers

In [5]:
def log_pipeline_step(func):
    @wraps(func)
    def wrapper(*args, **kwargs) -> pd.DataFrame:
        input_shape = args[0].shape
        print(f"{datetime.now()} {func.__name__}")
        tic = datetime.now()
        df_result = func(*args, **kwargs)
        output_shape = df_result.shape
        print(f"{datetime.now()}  ╰╴took {datetime.now() - tic}s in: {input_shape} out: {output_shape} diff: ({output_shape[0] - input_shape[0]}, {output_shape[1] - input_shape[1]})")
        return df_result
    return wrapper

def log_columns(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
        df_result = func(*args, **kwargs)
        print(f"{datetime.now()} {func.__name__} cols ({len(df_result.columns)}): [{', '.join(list(df_result.columns))}]")
        return df_result
    return wrapper

## Generic pipe functions

In [6]:
@log_pipeline_step
@log_columns
def start_pipeline(dataf):
    return dataf.copy()

@log_columns
def end_pipeline(dataf):
    return dataf

@log_pipeline_step
def filter_rows(dataf: pd.DataFrame, column, min_value) -> pd.DataFrame:
    return (dataf[dataf[column] >= min_value])

@log_pipeline_step
def sort_values(dataf: pd.DataFrame, col_names, ascending=False) -> pd.DataFrame:
    return dataf.sort_values(by=col_names, ascending=ascending)

@log_pipeline_step
def move_col(dataf, col_name, index=0):
    cols = dataf.columns.tolist()
    cols.insert(index, cols.pop(cols.index(col_name)))
    return dataf.loc[:, cols]

@log_pipeline_step
def calc_sum(dataf: pd.DataFrame, index_name= "total") -> pd.DataFrame:
    def _numeric_sum(col):
        return col.sum() if np.issubdtype(col.dtype, np.number) else None
    dataf.loc[index_name] = dataf.apply(_numeric_sum, axis=0) # over columns
    return dataf

## Custom pipe functions

In [7]:
@log_pipeline_step
def set_dtypes(dataf: pd.DataFrame) -> pd.DataFrame:
    return (dataf
            .assign(last_commit_date=lambda d: pd.to_datetime(d['last_commit_date']).dt.tz_localize(None))
            .assign(updated_at=lambda d: pd.to_datetime(d['updated_at']).dt.tz_localize(None))
            .assign(created_at=lambda d: pd.to_datetime(d['created_at']).dt.tz_localize(None))
           )

@log_pipeline_step
def remove_zero_age_weeks(dataf: pd.DataFrame) -> pd.DataFrame:
    dataf.loc[dataf["age_weeks"] == 0, "age_weeks"] = 1
    return dataf

@log_pipeline_step
def rename_columns(dataf: pd.DataFrame) -> pd.DataFrame:
    dataf.columns = dataf.columns.str.lstrip('_')
    dataf.columns = dataf.columns.str.replace(' ', '')
    return dataf

@log_pipeline_step
def calc_popularity_measure(dataf: pd.DataFrame, col_name = "popularity_measure") -> pd.DataFrame:
    def _calc_popularity_measure(row):
        return (row["stars"] + row["forks"] + row["watches"]) / row["age_weeks"]
    kwargs = {col_name: _calc_popularity_measure}
    return dataf.assign(**kwargs)

@log_pipeline_step
def expand_per_week_measures(dataf: pd.DataFrame) -> pd.DataFrame:
    def _calc_per_week_measures(row):
        return {
            "watches_per_week": row["watches"] / row["age_weeks"],
            "forks_per_week": row["forks"] / row["age_weeks"]
        }
    res = dataf.apply(_calc_per_week_measures, axis=1, result_type='expand')
    dataf[res.columns] = res
    return dataf

@log_pipeline_step
def expand_percentages(dataf: pd.DataFrame) -> pd.DataFrame:
    def _calc_percentages(row):
        return {
            "stars_percent": row["stars"] * 100 / dataf["stars"].sum(), 
            "watches_percent": row["watches"] * 100 / dataf["watches"].sum(), 
            "forks_percent": row["forks"] * 100 / dataf["forks"].sum()
        }
    res = dataf.apply(_calc_percentages, axis=1, result_type='expand')
    dataf[res.columns] = res
    return dataf

## Pipeline example

In [8]:
df = (df_raw
      .pipe(start_pipeline)
      .pipe(rename_columns)
      .pipe(set_dtypes)
      .pipe(remove_zero_age_weeks)
      .pipe(filter_rows, column="stars", min_value=100)
      .pipe(expand_per_week_measures)
      .pipe(expand_percentages)
      .pipe(calc_popularity_measure, col_name="popularity_measure")
      .pipe(sort_values, col_names=["popularity_measure"])
      .pipe(calc_sum)
      .pipe(move_col, "popularity_measure", 0)
      .pipe(end_pipeline)
     )

2022-11-05 21:35:35.147900 start_pipeline
2022-11-05 21:35:35.148562 start_pipeline cols (26): [category, githuburl, featured, links, description, _repopath, _reponame, _stars, _forks, _watches, _topics, _language, _homepage, _description, _organization, _updated_at, _last_commit_date, _created_at, _age_weeks, _stars_per_week, _readme_filename, _readme_giturl, _readme_localurl, _requirements_filenames, _requirements_giturls, _requirements_localurls]
2022-11-05 21:35:35.148591  ╰╴took 0:00:00.000559s in: (832, 26) out: (832, 26) diff: (0, 0)
2022-11-05 21:35:35.148605 rename_columns
2022-11-05 21:35:35.148891  ╰╴took 0:00:00.000280s in: (832, 26) out: (832, 26) diff: (0, 0)
2022-11-05 21:35:35.148909 set_dtypes
2022-11-05 21:35:35.154125  ╰╴took 0:00:00.005205s in: (832, 26) out: (832, 26) diff: (0, 0)
2022-11-05 21:35:35.154326 remove_zero_age_weeks
2022-11-05 21:35:35.155196  ╰╴took 0:00:00.000859s in: (832, 26) out: (832, 26) diff: (0, 0)
2022-11-05 21:35:35.155249 filter_rows
2022-1

In [9]:
display(df)

Unnamed: 0,popularity_measure,category,githuburl,featured,links,description,description.1,repopath,reponame,stars,forks,watches,topics,language,homepage,description.2,description.3,organization,updated_at,last_commit_date,created_at,age_weeks,stars_per_week,readme_filename,readme_giturl,readme_localurl,requirements_filenames,requirements_giturls,requirements_localurls,watches_per_week,forks_per_week,stars_percent,watches_percent,forks_percent
736,3222.333333,diffusion,https://github.com/compvis/stable-diffusion,,,,stable-diffusion: A latent text-to-image diffu...,compvis/stable-diffusion,stable-diffusion,33293.0,5053.0,322.0,[],Jupyter Notebook,https://ommer-lab.com/research/latent-diffusio...,,stable-diffusion: A latent text-to-image diffu...,compvis,2022-11-05,2022-08-22,2022-08-10,12.0,2678.75,README.md,https://raw.githubusercontent.com/compvis/stab...,compvis~stable-diffusion~README.md,[setup.py],[https://raw.githubusercontent.com/compvis/sta...,[compvis~stable-diffusion~setup.py],26.833333,421.083333,0.727211,0.264474,0.520697
758,797.125000,diffusion,https://github.com/divamgupta/diffusionbee-sta...,,,,diffusionbee-stable-diffusion-ui: Diffusion Be...,divamgupta/diffusionbee-stable-diffusion-ui,diffusionbee-stable-diffusion-ui,6076.0,238.0,63.0,"[electron-app, macos, stable-diffusion]",JavaScript,https://diffusionbee.com,,diffusionbee-stable-diffusion-ui: Diffusion Be...,divamgupta,2022-11-05,2022-11-03,2022-09-06,8.0,708.87,README.md,https://raw.githubusercontent.com/divamgupta/d...,divamgupta~diffusionbee-stable-diffusion-ui~RE...,[],[],[],7.875000,29.750000,0.132717,0.051745,0.024525
790,760.750000,diffusion,https://github.com/ashawkey/stable-dreamfusion,,,,stable-dreamfusion: A pytorch implementation o...,ashawkey/stable-dreamfusion,stable-dreamfusion,2799.0,178.0,66.0,"[text-to-3d, gui, nerf, stable-diffusion, drea...",Python,,,stable-dreamfusion: A pytorch implementation o...,ashawkey,2022-11-04,2022-11-04,2022-10-06,4.0,653.10,readme.md,https://raw.githubusercontent.com/ashawkey/sta...,ashawkey~stable-dreamfusion~readme.md,[requirements.txt],[https://raw.githubusercontent.com/ashawkey/st...,[ashawkey~stable-dreamfusion~requirements.txt],16.500000,44.500000,0.061138,0.054209,0.018342
89,723.657534,ml-dl,https://github.com/tensorflow/tensorflow,,,,tensorflow: An Open Source Machine Learning Fr...,tensorflow/tensorflow,tensorflow,168890.0,87443.0,7802.0,"[tensorflow, machine-learning, python, deep-le...",C++,https://tensorflow.org,,tensorflow: An Open Source Machine Learning Fr...,tensorflow,2022-11-05,2022-11-05,2015-11-07,365.0,462.71,README.md,https://raw.githubusercontent.com/tensorflow/t...,tensorflow~tensorflow~README.md,[],[],[],21.375342,239.569863,3.689025,6.408161,9.010748
405,583.139818,study,https://github.com/thealgorithms/python,,,,Python: All Algorithms implemented in Python,thealgorithms/python,Python,147809.0,38118.0,5926.0,"[python, algorithm, algorithms-implemented, al...",Python,https://the-algorithms.com/,,Python: All Algorithms implemented in Python,thealgorithms,2022-11-05,2022-11-04,2016-07-16,329.0,449.27,README.md,https://raw.githubusercontent.com/thealgorithm...,thealgorithms~python~README.md,"[requirements.txt, pyproject.toml]",[https://raw.githubusercontent.com/thealgorith...,"[thealgorithms~python~requirements.txt, thealg...",18.012158,115.860182,3.228558,4.867311,3.927949
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
745,0.643423,perf,https://github.com/blosc/python-blosc,,,,python-blosc: A Python wrapper for the extreme...,blosc/python-blosc,python-blosc,319.0,75.0,12.0,"[python, wrapper, blosc, compression]",C,https://www.blosc.org/python-blosc/python-blos...,,python-blosc: A Python wrapper for the extreme...,blosc,2022-11-01,2022-10-27,2010-09-30,631.0,0.51,README.rst,https://raw.githubusercontent.com/blosc/python...,blosc~python-blosc~README.rst,"[requirements.txt, setup.py, pyproject.toml]",[https://raw.githubusercontent.com/blosc/pytho...,"[blosc~python-blosc~requirements.txt, blosc~py...",0.019017,0.118859,0.006968,0.009856,0.007729
123,0.619048,util,https://github.com/mgedmin/check-manifest,,,,check-manifest: Tool to check the completeness...,mgedmin/check-manifest,check-manifest,269.0,36.0,7.0,[],Python,https://pypi.org/p/check-manifest,,check-manifest: Tool to check the completeness...,mgedmin,2022-11-02,2022-10-27,2013-03-05,504.0,0.53,README.rst,https://raw.githubusercontent.com/mgedmin/chec...,mgedmin~check-manifest~README.rst,[setup.py],[https://raw.githubusercontent.com/mgedmin/che...,[mgedmin~check-manifest~setup.py],0.013889,0.071429,0.005876,0.005749,0.003710
775,0.613272,sim,https://github.com/activitysim/activitysim,,,,activitysim: An Open Platform for Activity-Bas...,activitysim/activitysim,activitysim,143.0,83.0,42.0,"[python, travel-modeling, data-science, bsd-3-...",Jupyter Notebook,https://activitysim.github.io,,activitysim: An Open Platform for Activity-Bas...,activitysim,2022-10-29,2022-09-14,2014-06-18,437.0,0.33,README.md,https://raw.githubusercontent.com/activitysim/...,activitysim~activitysim~README.md,"[setup.py, pyproject.toml]",[https://raw.githubusercontent.com/activitysim...,"[activitysim~activitysim~setup.py, activitysim...",0.096110,0.189931,0.003124,0.034497,0.008553
774,0.485900,sim,https://github.com/openfisca/openfisca-core,,,,openfisca-core: OpenFisca core engine. See oth...,openfisca/openfisca-core,openfisca-core,128.0,72.0,24.0,"[legislation-as-code, rules-as-code, better-ru...",Python,https://openfisca.org,,openfisca-core: OpenFisca core engine. See oth...,openfisca,2022-10-20,2022-08-26,2013-12-29,461.0,0.28,README.md,https://raw.githubusercontent.com/openfisca/op...,openfisca~openfisca-core~README.md,[setup.py],[https://raw.githubusercontent.com/openfisca/o...,[openfisca~openfisca-core~setup.py],0.052061,0.156182,0.002796,0.019712,0.007419
