# BigFrames Scikit-Learn API Coverage Report

This notebook estimates BigFrames' coverage of Scikit-Learn APIs by:

1. Loading a list of Scikit-Learn APIs
2. Checking if they are present in BigFrames
3. Matching them to a scrape of Kaggle's search
4. Searching for them all in a dataset of 170k unique IPython notebooks queried from GitHub

Note: Because of the way the library is set up, Scikit-Learn API usage can be inferred from import statements. This is a different method from the one used in the Pandas API coverage report, which examines key classes.

TODO(bmil): add estimates based on Kaggle notebooks


In [1]:
import importlib

import pandas as pd

# scikit-learn modules whose `from <module> import ...` statements are searched for.
sklearn_modules = [
    "sklearn",
    "sklearn.model_selection","sklearn.preprocessing","sklearn.metrics","sklearn.linear_model","sklearn.ensemble",
    "sklearn.tree","sklearn.neighbors","sklearn.svm","sklearn.naive_bayes","sklearn.pipeline","sklearn.decomposition",
    "sklearn.impute","sklearn.cluster","sklearn.feature_selection","sklearn.utils","sklearn.compose","sklearn.neural_network",
    "sklearn.datasets","sklearn.base","sklearn.manifold","sklearn.discriminant_analysis","sklearn.experimental",
    "sklearn.multiclass","sklearn.kernel_ridge","sklearn.feature_extraction","sklearn.dummy","sklearn.mixture",
    "sklearn.gaussian_process","sklearn.calibration","sklearn.multioutput","sklearn.inspection","sklearn.exceptions",
    "sklearn.cross_decomposition","sklearn.random_projection","sklearn.covariance","sklearn.semi_supervised","sklearn.isotonic",
    "sklearn.kernel_approximation"]

# Column names of the pattern table; each row appended below follows this order.
header = ["pattern", "api", "kind", "is_in_bigframes"]
api_patterns = []
for module in sklearn_modules:
    # importlib gives the same module object as `exec("import ...")` + `eval("dir(...)")`
    # did, without building and executing code from strings.
    members = dir(importlib.import_module(module))

    # The BigFrames counterpart is looked up under the same dotted path with
    # "sklearn" swapped for "bigframes.ml". A failed import means the module is
    # not covered, so none of its members can be either.
    bigframes_module = module.replace("sklearn", "bigframes.ml")
    try:
        bigframes_members = dir(importlib.import_module(bigframes_module))
        bigframes_has_module = True
    except ImportError:
        bigframes_members = []
        bigframes_has_module = False

    # One row for the module itself: any `from <module> import ...` statement.
    api_patterns.append([
        f"from {module} import ",
        module,
        "module",
        bigframes_has_module,
    ])

    # One row per public member: the member name anywhere on the same import line.
    # NOTE: the pattern text is also the join key against the precomputed Kaggle
    # hit counts in the query cell, so its exact form must not change.
    for member in members:
        # skip private functions and properties
        if member.startswith("_"):
            continue

        api_patterns.append([
            rf"from {module} import [^\n]*\b{member}\b",
            member,
            "api",
            member in bigframes_members
        ])

pandas_df = pd.DataFrame(api_patterns, columns=header)
pandas_df

Unnamed: 0,pattern,api,kind,is_in_bigframes
0,from sklearn import,sklearn,module,True
1,from sklearn import [^\n]*\bbase\b,base,api,False
2,from sklearn import [^\n]*\bclone\b,clone,api,False
3,from sklearn import [^\n]*\bconfig_context\b,config_context,api,False
4,from sklearn import [^\n]*\bexceptions\b,exceptions,api,False
...,...,...,...,...
815,from sklearn.kernel_approximation import [^\n]...,pairwise_kernels,api,False
816,from sklearn.kernel_approximation import [^\n]...,safe_sparse_dot,api,False
817,from sklearn.kernel_approximation import [^\n]...,sp,api,False
818,from sklearn.kernel_approximation import [^\n]...,svd,api,False


This query will count the occurrences of the API patterns in a dump of 170,000 deduplicated IPython notebooks taken from the public GitHub dataset.

In [2]:
import bigframes as bf
# Open a BigFrames session and hand it the locally built pattern table.
session = bf.connect()
df = session.read_pandas(pandas_df)

# Soon, we could do all this in BigFrames... 🤞
# TODO: see how much we can rewrite
#
# The query below interpolates `df.sql` as the body of the `token_patterns`
# CTE and then:
#   * github_hit_counts: for every pattern, counts the notebooks whose content
#     matches the regex (`matches`) among the notebooks that contain the
#     substring 'from sklearn' (`scanned`).
#   * kaggle_hit_counts: reads per-pattern hit counts from an existing table.
#   * The final SELECT left-joins both onto token_patterns by pattern text, so
#     every pattern is kept even when a source has no row for it.
sql = f"""
WITH
  token_patterns AS ( {df.sql} ),
  kaggle_hit_counts AS (
    SELECT pattern, private_matches, public_matches, private_scanned, public_scanned
    FROM `bigframes-dev.coverage_report.kaggle_sklearn_hit_counts`),
  github_notebooks AS (SELECT content FROM `bigframes-dev.coverage_report.github_notebooks`),
  github_hit_counts AS (
    SELECT
      token_patterns.pattern,
      COUNTIF(REGEXP_CONTAINS(github_notebooks.content, token_patterns.pattern)) AS matches,
      COUNT(*) AS scanned
    FROM
      token_patterns, github_notebooks
    WHERE CONTAINS_SUBSTR(github_notebooks.content, 'from sklearn')
    GROUP BY token_patterns.pattern
    ORDER BY matches DESC
)
SELECT
  token_patterns.*,
  github_hit_counts.matches AS github_matches,
  github_hit_counts.scanned AS github_scanned,
  kaggle_hit_counts.private_matches AS kaggle_priv_matches,
  kaggle_hit_counts.private_scanned AS kaggle_priv_scanned,
  kaggle_hit_counts.public_matches AS kaggle_pub_matches,
  kaggle_hit_counts.public_scanned AS kaggle_pub_scanned
FROM token_patterns
LEFT JOIN kaggle_hit_counts ON token_patterns.pattern = kaggle_hit_counts.pattern
LEFT JOIN github_hit_counts ON token_patterns.pattern = github_hit_counts.pattern
"""

# `df` is rebound here: from this point on it holds the query result rather
# than the uploaded pattern table.
df = session.read_gbq(sql)

df

                                             pattern api kind  \
0       from sklearn.naive_bayes import [^\n]*\bnp\b  np  api   
1          from sklearn.pipeline import [^\n]*\bnp\b  np  api   
2             from sklearn.utils import [^\n]*\bnp\b  np  api   
3              from sklearn.base import [^\n]*\bnp\b  np  api   
4  from sklearn.discriminant_analysis import [^\n...  np  api   
5        from sklearn.multiclass import [^\n]*\bnp\b  np  api   
6      from sklearn.kernel_ridge import [^\n]*\bnp\b  np  api   
7             from sklearn.dummy import [^\n]*\bnp\b  np  api   
8       from sklearn.calibration import [^\n]*\bnp\b  np  api   
9       from sklearn.multioutput import [^\n]*\bnp\b  np  api   

   is_in_bigframes  github_matches  github_scanned  kaggle_priv_matches  \
0            False               1           18087                 <NA>   
1            False              25           18087                 <NA>   
2            False               1           18087         

In [4]:
# TODO: rewrite everything in bigframes. Too much missing right now :(

# Patterns with fewer GitHub matches than this are treated as noise.
MIN_GITHUB_MATCHES = 10

# Materialise the query result as a local pandas DataFrame. The guard keeps
# this cell re-runnable: after the first run `df` is already a pandas
# DataFrame, which has no `to_pandas()` method.
if hasattr(df, "to_pandas"):
    df = df.to_pandas()

# This import-checking method picks up a lot of noise from other names it
# finds in the modules (e.g. `np`). Let's remove rows that had fewer than
# MIN_GITHUB_MATCHES hits.
df = df[df["github_matches"] >= MIN_GITHUB_MATCHES]
df

Unnamed: 0,pattern,api,kind,is_in_bigframes,github_matches,github_scanned,kaggle_priv_matches,kaggle_priv_scanned,kaggle_pub_matches,kaggle_pub_scanned
1,from sklearn.pipeline import [^\n]*\bnp\b,np,api,False,25,18087,,,,
20,from sklearn.cross_decomposition import [^\n]*...,CCA,api,False,12,18087,2908,16757969,208,2674415
22,from sklearn.manifold import [^\n]*\bMDS\b,MDS,api,False,82,18087,3876,16757969,731,2674415
23,from sklearn.decomposition import [^\n]*\bNMF\b,NMF,api,False,98,18087,55781,16757969,4454,2674415
25,from sklearn.decomposition import [^\n]*\bPCA\b,PCA,api,False,1515,18087,605568,16757969,105404,2674415
...,...,...,...,...,...,...,...,...,...,...
815,from sklearn.random_projection import,sklearn.random_projection,module,False,34,18087,,,,
816,from sklearn.feature_extraction import,sklearn.feature_extraction,module,False,295,18087,,,,
817,from sklearn.cross_decomposition import,sklearn.cross_decomposition,module,False,31,18087,,,,
818,from sklearn.kernel_approximation import,sklearn.kernel_approximation,module,False,39,18087,,,,


In [5]:
# Split the filtered results by row kind: individual APIs vs. whole modules.
row_kind = df["kind"]
apis_df = df[row_kind == "api"].copy()
modules_df = df[row_kind == "module"].copy()

# Weighted total API coverage: for each data source, the share of all API
# matches that belongs to APIs present in BigFrames.
match_columns = ["github_matches", "kaggle_priv_matches", "kaggle_pub_matches"]
matches_stats = apis_df[match_columns]
in_bigframes = apis_df["is_in_bigframes"]

total = matches_stats.sum(axis=0)
covered = matches_stats[in_bigframes].sum(axis=0)

weighted_percentage = covered.mul(100).div(total)
# Append the plain mean of the three per-source percentages as an extra entry.
weighted_percentage["average"] = weighted_percentage.mean()
weighted_percentage

github_matches         4.303175
kaggle_priv_matches    2.183974
kaggle_pub_matches     2.661133
average                3.049427
dtype: float64

In [7]:
# Turn raw match counts into the percentage of scanned notebooks, per source.
for source in ("github", "kaggle_priv", "kaggle_pub"):
    apis_df[f"{source}_pct"] = apis_df[f"{source}_matches"] * 100 / apis_df[f"{source}_scanned"]

# Unweighted mean of the three rates. Plain `+` propagates missing values, so
# a row lacking any one of the rates gets a missing average.
apis_df["avg_pct"] = (
    apis_df["github_pct"] + apis_df["kaggle_priv_pct"] + apis_df["kaggle_pub_pct"]
) / 3

# Keep only the columns worth reporting, most frequently used APIs first.
summary_columns = ["api", "avg_pct", "github_pct", "kaggle_priv_pct", "kaggle_pub_pct", "is_in_bigframes"]
summary = apis_df[summary_columns].sort_values("avg_pct", ascending=False)

print("The top missing APIs, and the rate at which they appear are:")
not_in_bigframes = summary["is_in_bigframes"].eq(False)
summary[not_in_bigframes].head(50)

The top missing APIs, and the rate at which they appear are:


Unnamed: 0,api,avg_pct,github_pct,kaggle_priv_pct,kaggle_pub_pct,is_in_bigframes
539,train_test_split,26.256856,18.145629,28.84599,31.77895,False
408,StandardScaler,12.915765,9.863438,10.5809,18.302956,False
691,RandomForestClassifier,10.591562,12.594681,7.043294,12.13671,False
413,accuracy_score,9.808392,9.802621,7.961985,11.660569,False
580,LogisticRegression,9.78837,12.699729,5.841669,10.823713,False
524,confusion_matrix,9.051271,9.913197,6.795716,10.444901,False
57,KFold,8.095705,2.60408,14.598046,7.084989,False
295,LabelEncoder,7.477444,4.423066,8.867453,9.141812,False
608,mean_squared_error,7.125417,5.042296,8.622548,7.711406,False
455,StratifiedKFold,7.013412,2.217062,13.188895,5.634279,False
