# BigFrames API Coverage Report

This notebook estimates BigFrame's coverage of Pandas APIs by:

1. Extracting the members from pandas, pandas.DataFrame, pandas.Index, pandas.Series
2. Checking if they are present in BigFrames
3. Searching them all on a dataset of 170k unique ipython notebooks queried from GitHub

TODO(bmil): add estimates based on Kaggle notebooks

In [43]:
import pandas as pd
import bigframes as bf
import inspect

token_map = {}
targets = [("pandas", pd, bf), ("dataframe", pd.DataFrame, bf.DataFrame), ("series", pd.Series, bf.Series), ("index", pd.Index, None)]
indexers = ['loc', 'iloc', 'iat', 'ix', 'at']
for name, pandas_obj, bigframes_obj in targets:
    for member in dir(pandas_obj):
        # skip private functions and properties
        if member[0] == "_" and member[1] != "_":
            continue

        if inspect.isfunction(getattr(pandas_obj, member)):
            # Function, match .member(
            token = f"\\.{member}\\("
            token_type = "function"
        elif member in indexers:
            # Indexer, match .indexer[
            token = f"\\.{member}\\["
            token_type = "indexer"
        else:
            # Property
            token = f"\\.{member}\\b"
            token_type = "property"

        is_in_bigframes = hasattr(bigframes_obj, member)

        # Special case: bigframes also implements some top level APIs on 'session'
        if name == "pandas":
            is_in_bigframes = is_in_bigframes or hasattr(bf.Session, member)

        if token not in token_map:
            token_map[token] = ([], token_type, is_in_bigframes)

        token_map[token][0].append(name)

header = ['pattern', 'token_type', 'is_pandas', 'is_dataframe', 'is_series', 'is_index', 'is_in_bigframes']
rows = [[
        k, v[1], 'pandas' in v[0], 'dataframe' in v[0], 'series' in v[0], 'index' in v[0], v[2]
                ] for k, v in token_map.items()]

# Wishlist: constructors for BigFrames...
pandas_df = pd.DataFrame(rows, columns=header)
pandas_df

  if name is "pandas":
  if inspect.isfunction(getattr(pandas_obj, member)):
  if inspect.isfunction(getattr(pandas_obj, member)):
  if inspect.isfunction(getattr(pandas_obj, member)):


Unnamed: 0,pattern,token_type,is_pandas,is_dataframe,is_series,is_index,is_in_bigframes
0,\.ArrowDtype\b,property,True,False,False,False,False
1,\.BooleanDtype\b,property,True,False,False,False,False
2,\.Categorical\b,property,True,False,False,False,False
3,\.CategoricalDtype\b,property,True,False,False,False,False
4,\.CategoricalIndex\b,property,True,False,False,False,False
...,...,...,...,...,...,...,...
495,\.symmetric_difference\(,function,False,False,False,True,False
496,\.to_flat_index\(,function,False,False,False,True,False
497,\.to_native_types\(,function,False,False,False,True,False
498,\.to_series\(,function,False,False,False,True,False


This query will count the occurrence of the API patterns in a dump of 170,000 deduped IPython notebooks taken from the public GitHub dataset.

In [44]:
session = bf.connect()
df = session.read_pandas(pandas_df)

# Soon, we could do all this in BigFrames...... 🤞
# TODO: see how much we can rewrite
sql = f"""
WITH
  token_patterns AS ( {df.sql} ),
  github_notebooks AS (SELECT content FROM `bigframes-dev.coverage_report.github_notebooks`),
  hit_counts AS (
    SELECT
      token_patterns.pattern,
      COUNTIF(REGEXP_CONTAINS(github_notebooks.content, token_patterns.pattern)) AS matches,
      COUNT(*) AS scanned
    FROM
      token_patterns, github_notebooks
    WHERE CONTAINS_SUBSTR(github_notebooks.content, 'import pandas')
    GROUP BY token_patterns.pattern
    ORDER BY matches DESC
)
SELECT token_patterns.*, hit_counts.matches, hit_counts.scanned
FROM token_patterns, hit_counts
WHERE token_patterns.pattern = hit_counts.pattern
"""

df = session.read_gbq(sql)

df

                pattern token_type  is_pandas  is_dataframe  is_series  \
0        \.ArrowDtype\b   property       True         False      False   
1      \.BooleanDtype\b   property       True         False      False   
2       \.Categorical\b   property       True         False      False   
3  \.CategoricalDtype\b   property       True         False      False   
4  \.CategoricalIndex\b   property       True         False      False   
5        \.DateOffset\b   property       True         False      False   
6     \.DatetimeIndex\b   property       True         False      False   
7   \.DatetimeTZDtype\b   property       True         False      False   
8         \.ExcelFile\b   property       True         False      False   
9       \.ExcelWriter\b   property       True         False      False   

   is_index  is_in_bigframes  matches  scanned  
0     False            False        0    40337  
1     False            False        1    40337  
2     False            False      233 

In [45]:
# TODO: rewrite everything in bigframes. Too much missing right now :(
df = df.to_pandas()

In [46]:
df["api"] = df["pattern"].str.replace('[\W(\\\\b)]', '')

# Weighted total API coverage
covered = df[df["is_in_bigframes"]]["matches"].sum()
total = df[df["is_in_bigframes"] == False]["matches"].sum()
weighted_percentage = 100 * covered / total
print(f"Weighted by use, BigFrames implements {round(weighted_percentage)}% of Pandas APIs")

Weighted by use, BigFrames implements 50% of Pandas APIs


  df["api"] = df["pattern"].str.replace('[\W(\\\\b)]', '')


In [47]:
df["pct"] = df["matches"] * 100 / df["scanned"]
df = df.sort_values("pct", ascending=False)

print("The top missing APIs, and the rate at which they appear are:")
df[df["is_in_bigframes"] == False].head(50)

The top missing APIs, and the rate at which they appear are:


Unnamed: 0,pattern,token_type,is_pandas,is_dataframe,is_series,is_index,is_in_bigframes,matches,scanned,api,pct
213,\.plot\b,property,False,True,True,False,False,16546,40337,plot,41.019411
322,\.append\(,function,False,True,True,True,False,15782,40337,append,39.125369
346,\.values\b,property,False,True,True,True,False,11397,40337,values,28.254456
202,\.mean\(,function,False,True,True,False,False,10768,40337,mean,26.695094
372,\.format\(,function,False,False,False,True,False,10152,40337,format,25.16796
351,\.array\b,property,False,False,True,True,False,8875,40337,array,22.002132
50,\.array\(,function,True,False,False,False,False,8716,40337,array,21.607953
197,\.loc\[,indexer,False,True,True,False,False,8217,40337,loc,20.370875
245,\.sum\(,function,False,True,True,False,False,8217,40337,sum,20.370875
332,\.groupby\(,function,False,True,True,True,False,6740,40337,groupy,16.709225
