# Pandas Best Practices (WIP)

### References
https://calmcode.io/pandas-pipe/introduction.html  
https://github.com/koaning/calm-notebooks  
https://github.com/wesm/pydata-book  
TODO: more...


In [1]:
import numpy as np
import pandas as pd
from IPython.display import display, HTML

In [2]:
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

In [3]:
# csv = 'notebooks/best-practices/github_data.csv'
csv = 'https://raw.githubusercontent.com/dylanhogg/crazy-awesome-python/master/github_data.csv'
df_raw = pd.read_csv(csv)

In [4]:
df_raw[0:3]

Unnamed: 0.1,Unnamed: 0,category,githuburl,featured,links,description,_repopath,_reponame,_stars,_forks,_watches,_topics,_language,_homepage,_description,_organization,_updated_at,_last_commit_date,_created_at,_age_weeks,_stars_per_week
0,89,ml-dl,https://github.com/tensorflow/tensorflow,,,,tensorflow/tensorflow,tensorflow,167198,87081,7819,"['tensorflow', 'machine-learning', 'python', '...",C++,https://tensorflow.org,tensorflow: An Open Source Machine Learning Fr...,tensorflow,2022-08-19,2022-08-19,2015-11-07,353,472.501413
1,407,util,https://github.com/TheAlgorithms/Python,,,,TheAlgorithms/Python,Python,142291,36460,6016,"['python', 'algorithm', 'algorithms-implemente...",Python,https://the-algorithms.com/,Python: All Algorithms implemented in Python,TheAlgorithms,2022-08-19,2022-08-16,2016-07-16,317,447.657079
2,112,nlp,https://github.com/huggingface/transformers,,,,huggingface/transformers,transformers,68665,15893,845,"['nlp', 'natural-language-processing', 'pytorc...",Python,https://huggingface.co/transformers,🤗 Transformers: State-of-the-art Machine Learn...,huggingface,2022-08-19,2022-08-18,2018-10-29,198,345.794964


In [5]:
def set_dtypes(dataf: pd.DataFrame) -> pd.DataFrame:
    """Example of datetime conversion"""
    return (dataf
            .assign(last_commit_date=lambda d: pd.to_datetime(d['last_commit_date']))
            .assign(updated_at=lambda d: pd.to_datetime(d['updated_at']))
            .assign(created_at=lambda d: pd.to_datetime(d['created_at']))
           )

def drop_unnamed_columns(dataf: pd.DataFrame) -> pd.DataFrame:
    return dataf.loc[:, ~dataf.columns.str.contains('^Unnamed')]

def rename_columns(dataf: pd.DataFrame) -> pd.DataFrame:
    dataf.columns = dataf.columns.str.lstrip('_')
    dataf.columns = dataf.columns.str.replace(' ', '')
    return dataf

def filter_rows(dataf: pd.DataFrame, column, min_value) -> pd.DataFrame:
    return (dataf[dataf[column] >= min_value])
                
def calc_popularity_measure(dataf: pd.DataFrame, col_name = "popularity_measure") -> pd.DataFrame:
    """Example of assign() with string name"""
    def _calc_popularity_measure(row):
        return (row["stars"] + row["forks"] + row["watches"]) / row["age_weeks"]
    kwargs = {col_name: _calc_popularity_measure}
    return dataf.assign(**kwargs)

def calc_per_week_measures(dataf: pd.DataFrame) -> pd.DataFrame:
    """Example of apply() with result_type of expand"""
    def _calc_per_week_measures(row):
        return {
            "watches_per_week": row["watches"] / row["age_weeks"], 
            "forks_per_week": row["forks"] / row["age_weeks"] 
        }
    res = dataf.apply(_calc_per_week_measures, axis=1, result_type='expand')
    dataf[res.columns] = res
    return dataf

def calc_percentages(dataf: pd.DataFrame) -> pd.DataFrame:
    """Example of apply() with result_type of expand"""
    def _calc_percentages(row):
        return {
            "stars_percent": row["stars"] * 100 / dataf["stars"].sum(), 
            "watches_percent": row["watches"] * 100 / dataf["watches"].sum(), 
            "forks_percent": row["forks"] * 100 / dataf["forks"].sum()
        }
    res = dataf.apply(_calc_percentages, axis=1, result_type='expand')
    dataf[res.columns] = res
    return dataf

def calc_totals(dataf: pd.DataFrame, index_name= "total") -> pd.DataFrame:
    """Example """
    def _numeric_sum(col):
        return col.sum() if np.issubdtype(col.dtype, np.number) else None
    dataf.loc[index_name] = dataf.apply(_numeric_sum, axis=0) # over columns
    return dataf
    
def sort(dataf: pd.DataFrame, col_names=["popularity_measure"]) -> pd.DataFrame:
    return dataf.sort_values(by=col_names, ascending=False)

def move_col(dataf, col_name, index=0):
    cols = dataf.columns.tolist()
    cols.insert(0, cols.pop(cols.index(col_name)))
    return dataf.loc[:, cols]

In [6]:
df = (df_raw
      .pipe(drop_unnamed_columns)
      .pipe(rename_columns)
      .pipe(set_dtypes)
      .pipe(filter_rows, column="stars", min_value=100)
      .pipe(calc_per_week_measures)
      .pipe(calc_percentages)
      .pipe(calc_popularity_measure, col_name="popularity_measure")
      .pipe(sort, col_names=["popularity_measure"])
      .pipe(calc_totals)
      .pipe(move_col, "popularity_measure")
     )

In [7]:
display(df)

Unnamed: 0,popularity_measure,category,githuburl,featured,links,description,description.1,repopath,reponame,stars,forks,watches,topics,language,homepage,description.2,description.3,organization,updated_at,last_commit_date,created_at,age_weeks,stars_per_week,watches_per_week,forks_per_week,stars_percent,watches_percent,forks_percent
206,5887.000000,ml-dl,https://github.com/CompVis/stable-diffusion,,,,CompVis/stable-diffusion,CompVis/stable-diffusion,stable-diffusion,5376.0,397.0,114.0,[],Jupyter Notebook,,,CompVis/stable-diffusion,CompVis,2022-08-19,2022-08-18,2022-08-10,1.0,4181.333333,114.000000,397.000000,0.127762,0.098864,0.043818
0,742.487252,ml-dl,https://github.com/tensorflow/tensorflow,,,,tensorflow: An Open Source Machine Learning Fr...,tensorflow/tensorflow,tensorflow,167198.0,87081.0,7819.0,"['tensorflow', 'machine-learning', 'python', '...",C++,https://tensorflow.org,,tensorflow: An Open Source Machine Learning Fr...,tensorflow,2022-08-19,2022-08-19,2015-11-07,353.0,472.501413,22.150142,246.688385,3.973496,6.780852,9.611335
74,618.000000,web,https://github.com/pyscript/pyscript,,,,pyscript: Home Page: https://pyscript.net Exa...,pyscript/pyscript,pyscript,14260.0,1013.0,177.0,"['python', 'html', 'javascript']",TypeScript,https://community.anaconda.cloud/c/tech-topics...,,pyscript: Home Page: https://pyscript.net Exa...,pyscript,2022-08-19,2022-08-18,2022-02-21,25.0,557.653631,7.080000,40.520000,0.338892,0.153499,0.111807
1,582.861199,util,https://github.com/TheAlgorithms/Python,,,,Python: All Algorithms implemented in Python,TheAlgorithms/Python,Python,142291.0,36460.0,6016.0,"['python', 'algorithm', 'algorithms-implemente...",Python,https://the-algorithms.com/,,Python: All Algorithms implemented in Python,TheAlgorithms,2022-08-19,2022-08-16,2016-07-16,317.0,447.657079,18.977918,115.015773,3.381576,5.217240,4.024176
124,493.842105,perf,https://github.com/bloomberg/memray,,,,Memray is a memory profiler for Python,bloomberg/memray,memray,9086.0,244.0,53.0,"['memory', 'memory-leak', 'memory-leak-detecti...",Python,https://bloomberg.github.io/memray/,,Memray is a memory profiler for Python,bloomberg,2022-08-19,2022-08-18,2022-04-08,19.0,478.210526,2.789474,12.842105,0.215931,0.045963,0.026931
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
698,0.711656,util,https://github.com/irmen/pyminiaudio,,,,pyminiaudio: python interface to the miniaudio...,irmen/pyminiaudio,pyminiaudio,102.0,11.0,3.0,[],C,,,pyminiaudio: python interface to the miniaudio...,irmen,2022-07-28,2022-08-13,2019-06-30,163.0,0.623037,0.018405,0.067485,0.002424,0.002602,0.001214
657,0.674009,geo,https://github.com/openaddresses/pyesridump,,,,pyesridump: Scrapes an ESRI MapServer REST end...,openaddresses/pyesridump,pyesridump,234.0,57.0,15.0,[],Python,,,pyesridump: Scrapes an ESRI MapServer REST end...,openaddresses,2022-08-17,2022-06-19,2013-12-06,454.0,0.515419,0.033040,0.125551,0.005561,0.013008,0.006291
688,0.666667,graph,https://github.com/guyallard/markov_clustering,,,,markov_clustering: markov clustering in python,guyallard/markov_clustering,markov_clustering,127.0,34.0,9.0,"['markov-clustering', 'clustering', 'python', ...",Python,,,markov_clustering: markov clustering in python,guyallard,2022-08-11,2018-12-11,2017-09-27,255.0,0.497482,0.035294,0.133333,0.003018,0.007805,0.003753
642,0.618661,util,https://github.com/mgedmin/check-manifest,,,,check-manifest: Tool to check the completeness...,mgedmin/check-manifest,check-manifest,263.0,36.0,6.0,[],Python,https://pypi.org/p/check-manifest,,check-manifest: Tool to check the completeness...,mgedmin,2022-08-07,2022-05-30,2013-03-05,493.0,0.533005,0.012170,0.073022,0.006250,0.005203,0.003973


In [8]:
print(f"{len(df)=}")
print(f"{df.dtypes=}")
display(pd.concat([df[0:10], df[-1:]]))

len(df)=700
df.dtypes=popularity_measure           float64
category                      object
githuburl                     object
featured                     float64
links                         object
description                  float64
description                   object
repopath                      object
reponame                      object
stars                        float64
forks                        float64
watches                      float64
topics                        object
language                      object
homepage                      object
description                  float64
description                   object
organization                  object
updated_at            datetime64[ns]
last_commit_date      datetime64[ns]
created_at            datetime64[ns]
age_weeks                    float64
stars_per_week               float64
watches_per_week             float64
forks_per_week               float64
stars_percent                float64
watches_percent 

Unnamed: 0,popularity_measure,category,githuburl,featured,links,description,description.1,repopath,reponame,stars,forks,watches,topics,language,homepage,description.2,description.3,organization,updated_at,last_commit_date,created_at,age_weeks,stars_per_week,watches_per_week,forks_per_week,stars_percent,watches_percent,forks_percent
206,5887.0,ml-dl,https://github.com/CompVis/stable-diffusion,,,,CompVis/stable-diffusion,CompVis/stable-diffusion,stable-diffusion,5376.0,397.0,114.0,[],Jupyter Notebook,,,CompVis/stable-diffusion,CompVis,2022-08-19,2022-08-18,2022-08-10,1.0,4181.333333,114.0,397.0,0.127762,0.098864,0.043818
0,742.487252,ml-dl,https://github.com/tensorflow/tensorflow,,,,tensorflow: An Open Source Machine Learning Fr...,tensorflow/tensorflow,tensorflow,167198.0,87081.0,7819.0,"['tensorflow', 'machine-learning', 'python', '...",C++,https://tensorflow.org,,tensorflow: An Open Source Machine Learning Fr...,tensorflow,2022-08-19,2022-08-19,2015-11-07,353.0,472.501413,22.150142,246.688385,3.973496,6.780852,9.611335
74,618.0,web,https://github.com/pyscript/pyscript,,,,pyscript: Home Page: https://pyscript.net Exa...,pyscript/pyscript,pyscript,14260.0,1013.0,177.0,"['python', 'html', 'javascript']",TypeScript,https://community.anaconda.cloud/c/tech-topics...,,pyscript: Home Page: https://pyscript.net Exa...,pyscript,2022-08-19,2022-08-18,2022-02-21,25.0,557.653631,7.08,40.52,0.338892,0.153499,0.111807
1,582.861199,util,https://github.com/TheAlgorithms/Python,,,,Python: All Algorithms implemented in Python,TheAlgorithms/Python,Python,142291.0,36460.0,6016.0,"['python', 'algorithm', 'algorithms-implemente...",Python,https://the-algorithms.com/,,Python: All Algorithms implemented in Python,TheAlgorithms,2022-08-19,2022-08-16,2016-07-16,317.0,447.657079,18.977918,115.015773,3.381576,5.21724,4.024176
124,493.842105,perf,https://github.com/bloomberg/memray,,,,Memray is a memory profiler for Python,bloomberg/memray,memray,9086.0,244.0,53.0,"['memory', 'memory-leak', 'memory-leak-detecti...",Python,https://bloomberg.github.io/memray/,,Memray is a memory profiler for Python,bloomberg,2022-08-19,2022-08-18,2022-04-08,19.0,478.210526,2.789474,12.842105,0.215931,0.045963,0.026931
2,431.328283,nlp,https://github.com/huggingface/transformers,,,,🤗 Transformers: State-of-the-art Machine Learn...,huggingface/transformers,transformers,68665.0,15893.0,845.0,"['nlp', 'natural-language-processing', 'pytorc...",Python,https://huggingface.co/transformers,,🤗 Transformers: State-of-the-art Machine Learn...,huggingface,2022-08-19,2022-08-18,2018-10-29,198.0,345.794964,4.267677,80.267677,1.631838,0.732807,1.754148
163,412.052632,ml-dl,https://github.com/lucidrains/DALLE2-pytorch,,,,"DALLE2-pytorch: Implementation of DALL-E 2, Op...",lucidrains/DALLE2-pytorch,DALLE2-pytorch,7206.0,520.0,103.0,"['artificial-intelligence', 'deep-learning', '...",Python,,,"DALLE2-pytorch: Implementation of DALL-E 2, Op...",lucidrains,2022-08-19,2022-08-17,2022-04-07,19.0,376.432836,5.421053,27.368421,0.171252,0.089324,0.057394
36,334.405405,ml,https://github.com/TencentARC/GFPGAN,,,,GFPGAN aims at developing Practical Algorithms...,TencentARC/GFPGAN,GFPGAN,21215.0,3210.0,321.0,"['pytorch', 'gan', 'deep-learning', 'super-res...",Python,,,GFPGAN aims at developing Practical Algorithms...,TencentARC,2022-08-19,2022-07-13,2021-03-19,74.0,286.689189,4.337838,43.378378,0.504179,0.27838,0.354295
14,284.9375,term,https://github.com/willmcgugan/rich,,,,Rich is a Python library for rich text and bea...,willmcgugan/rich,rich,39176.0,1324.0,531.0,"['python', 'python3', 'python-library', 'termi...",Python,https://rich.readthedocs.io/en/latest/,,Rich is a Python library for rich text and bea...,willmcgugan,2022-08-19,2022-08-17,2019-11-10,144.0,270.712734,3.6875,9.194444,0.931026,0.460498,0.146133
9,275.317708,web,https://github.com/tiangolo/fastapi,,,,"FastAPI framework, high performance, easy to l...",tiangolo/fastapi,fastapi,48422.0,3845.0,594.0,"['python', 'json', 'swagger-ui', 'redoc', 'sta...",Python,https://fastapi.tiangolo.com/,,"FastAPI framework, high performance, easy to l...",tiangolo,2022-08-19,2022-08-18,2018-12-08,192.0,251.077037,3.09375,20.026042,1.150759,0.515133,0.424382
