# BigFrames Scikit-Learn API Coverage Report

This notebook estimates BigFrames' coverage of Scikit-Learn APIs by:

1. Loading a list of Scikit-Learn APIs
2. Checking if they are present in BigFrames
3. Matching them to a scrape of Kaggle's search
4. Searching for them all in a dataset of 170k unique IPython notebooks queried from GitHub

Note: Because of the way the library is set up, Scikit-Learn API usage can be inferred from import statements. This is a different method from the one used in the Pandas API coverage report, which examines key classes.

TODO(bmil): add estimates based on Kaggle notebooks


In [1]:
# scikit-learn modules to inspect: the bare package first, followed by each
# submodule. The order is preserved because it determines the row order of the
# pattern table built from this list.
sklearn_modules = ["sklearn"] + [
    f"sklearn.{submodule}"
    for submodule in (
        "model_selection",
        "preprocessing",
        "metrics",
        "linear_model",
        "ensemble",
        "tree",
        "neighbors",
        "svm",
        "naive_bayes",
        "pipeline",
        "decomposition",
        "impute",
        "cluster",
        "feature_selection",
        "utils",
        "compose",
        "neural_network",
        "datasets",
        "base",
        "manifold",
        "discriminant_analysis",
        "experimental",
        "multiclass",
        "kernel_ridge",
        "feature_extraction",
        "dummy",
        "mixture",
        "gaussian_process",
        "calibration",
        "multioutput",
        "inspection",
        "exceptions",
        "cross_decomposition",
        "random_projection",
        "covariance",
        "semi_supervised",
        "isotonic",
        "kernel_approximation",
    )
]

import importlib

import pandas as pd

# Build one row per sklearn module and one row per public member of that module.
# Each row holds a regex that recognises the corresponding import statement and
# a flag saying whether bigframes.ml exposes a module/member of the same name.
header = ["pattern", "api", "kind", "is_in_bigframes"]
api_patterns = []
for module in sklearn_modules:
    # Import by name instead of exec/eval-ing generated source text.
    members = dir(importlib.import_module(module))

    # The bigframes counterpart is looked up under the same submodule name, with
    # the leading `sklearn` swapped for `bigframes.ml`.
    bigframes_module = module.replace("sklearn", "bigframes.ml", 1)
    try:
        bigframes_members = dir(importlib.import_module(bigframes_module))
        bigframes_has_module = True
    except ImportError:
        # No importable counterpart: every member of this module counts as missing.
        bigframes_members = []
        bigframes_has_module = False

    # NOTE: `module` is interpolated unescaped, so its dots act as regex
    # wildcards when these patterns are later evaluated with REGEXP_CONTAINS.
    api_patterns.append([
        f"from {module} import ",
        module,
        "module",
        bigframes_has_module,
    ])
    for member in members:
        # skip private functions and properties
        if member.startswith("_"):
            continue

        api_patterns.append([
            rf"from {module} import [^\n]*\b{member}\b",
            member,
            "api",
            member in bigframes_members,
        ])

pandas_df = pd.DataFrame(api_patterns, columns=header)
pandas_df

Unnamed: 0,pattern,api,kind,is_in_bigframes
0,from sklearn import,sklearn,module,True
1,from sklearn import [^\n]*\bbase\b,base,api,False
2,from sklearn import [^\n]*\bclone\b,clone,api,False
3,from sklearn import [^\n]*\bconfig_context\b,config_context,api,False
4,from sklearn import [^\n]*\bexceptions\b,exceptions,api,False
...,...,...,...,...
815,from sklearn.kernel_approximation import [^\n]...,pairwise_kernels,api,False
816,from sklearn.kernel_approximation import [^\n]...,safe_sparse_dot,api,False
817,from sklearn.kernel_approximation import [^\n]...,sp,api,False
818,from sklearn.kernel_approximation import [^\n]...,svd,api,False


This query counts, for each API pattern, the number of notebooks in which it occurs, using a dump of 170,000 deduped IPython notebooks taken from the public GitHub dataset.

In [2]:
import bigframes as bf
# `session` is used both to hand the local pattern table to BigFrames and to run
# the query below.
session = bf.connect()
# Wrap the local pandas pattern table so that its `df.sql` can be embedded in the
# query text as the `token_patterns` CTE.
df = session.read_pandas(pandas_df)

# Soon, we could do all this in BigFrames...... 🤞
# TODO: see how much we can rewrite
#
# Query outline:
#   * token_patterns   - the pattern table, inlined through `df.sql`
#   * github_notebooks - the `content` column of the notebook dump table
#   * hit_counts       - per pattern: `matches` is the number of rows whose
#                        content matches the regex, `scanned` is the number of
#                        rows considered; only rows containing the substring
#                        'from sklearn' are considered at all
#   * final SELECT     - joins the counts back onto the pattern metadata by
#                        pattern text
sql = f"""
WITH
  token_patterns AS ( {df.sql} ),
  github_notebooks AS (SELECT content FROM `bigframes-dev.coverage_report.github_notebooks`),
  hit_counts AS (
    SELECT
      token_patterns.pattern,
      COUNTIF(REGEXP_CONTAINS(github_notebooks.content, token_patterns.pattern)) AS matches,
      COUNT(*) AS scanned
    FROM
      token_patterns, github_notebooks
    WHERE CONTAINS_SUBSTR(github_notebooks.content, 'from sklearn')
    GROUP BY token_patterns.pattern
    ORDER BY matches DESC
)
SELECT token_patterns.*, hit_counts.matches, hit_counts.scanned
FROM token_patterns, hit_counts
WHERE token_patterns.pattern = hit_counts.pattern
"""

# NOTE: `df` is rebound here. From this point on it holds the query result
# (patterns plus `matches` / `scanned`), not the pattern table read above.
df = session.read_gbq(sql)

df

                                             pattern api kind  \
0       from sklearn.naive_bayes import [^\n]*\bnp\b  np  api   
1          from sklearn.pipeline import [^\n]*\bnp\b  np  api   
2             from sklearn.utils import [^\n]*\bnp\b  np  api   
3              from sklearn.base import [^\n]*\bnp\b  np  api   
4  from sklearn.discriminant_analysis import [^\n...  np  api   
5        from sklearn.multiclass import [^\n]*\bnp\b  np  api   
6      from sklearn.kernel_ridge import [^\n]*\bnp\b  np  api   
7             from sklearn.dummy import [^\n]*\bnp\b  np  api   
8       from sklearn.calibration import [^\n]*\bnp\b  np  api   
9       from sklearn.multioutput import [^\n]*\bnp\b  np  api   

   is_in_bigframes  matches  scanned  
0            False        1    18087  
1            False       25    18087  
2            False        1    18087  
3            False        2    18087  
4            False        1    18087  
5            False        2    18087  
6          

In [3]:
# TODO: rewrite everything in bigframes; too much is missing right now.
# Convert the query result into a local pandas DataFrame for the analysis below.
df = df.to_pandas()

# Matching on import statements is noisy: dir() on a module also lists names it
# merely imported itself (e.g. `np`). Keep only patterns with at least 10 hits.
df = df.loc[df["matches"].ge(10)]
df

Unnamed: 0,pattern,api,kind,is_in_bigframes,matches,scanned
1,from sklearn.pipeline import [^\n]*\bnp\b,np,api,False,25,18087
20,from sklearn.cross_decomposition import [^\n]*...,CCA,api,False,12,18087
22,from sklearn.manifold import [^\n]*\bMDS\b,MDS,api,False,82,18087
23,from sklearn.decomposition import [^\n]*\bNMF\b,NMF,api,False,98,18087
25,from sklearn.decomposition import [^\n]*\bPCA\b,PCA,api,False,1515,18087
...,...,...,...,...,...,...
815,from sklearn.random_projection import,sklearn.random_projection,module,False,34,18087
816,from sklearn.feature_extraction import,sklearn.feature_extraction,module,False,295,18087
817,from sklearn.cross_decomposition import,sklearn.cross_decomposition,module,False,31,18087
818,from sklearn.kernel_approximation import,sklearn.kernel_approximation,module,False,39,18087


In [4]:
# Split the hit counts into per-member ("api") rows and per-module rows.
apis_df = df[df.kind == "api"].copy()
modules_df = df[df.kind == "module"].copy()

# Weighted total API coverage: of all API import hits, the share that belongs
# to APIs which also exist in bigframes.ml.
covered = apis_df[apis_df["is_in_bigframes"]]["matches"].sum()
# The denominator must include covered AND uncovered APIs; summing only the
# uncovered rows would yield a covered-to-uncovered ratio instead of a percentage.
total = apis_df["matches"].sum()
# Guard against an empty table (no hits at all) to avoid dividing by zero.
weighted_percentage = 100 * covered / total if total else 0.0
print(f"Weighted by use, BigFrames ML implements {round(weighted_percentage)}% of SKLearn APIs")

Weighted by use, BigFrames ML implements 4% of SKLearn APIs


In [5]:
# Percentage of scanned notebooks in which each API pattern was found, ordered
# from the most to the least frequently imported API.
apis_df = apis_df.assign(
    pct=apis_df["matches"] * 100 / apis_df["scanned"]
).sort_values("pct", ascending=False)

print("The top missing APIs, and the rate at which they appear are:")
# Show the 50 most used APIs that have no bigframes.ml counterpart.
apis_df.loc[apis_df["is_in_bigframes"].eq(False)].head(50)

The top missing APIs, and the rate at which they appear are:


Unnamed: 0,pattern,api,kind,is_in_bigframes,matches,scanned,pct
539,from sklearn.model_selection import [^\n]*\btr...,train_test_split,api,False,3282,18087,18.145629
580,from sklearn.linear_model import [^\n]*\bLogis...,LogisticRegression,api,False,2297,18087,12.699729
691,from sklearn.ensemble import [^\n]*\bRandomFor...,RandomForestClassifier,api,False,2278,18087,12.594681
524,from sklearn.metrics import [^\n]*\bconfusion_...,confusion_matrix,api,False,1793,18087,9.913197
408,from sklearn.preprocessing import [^\n]*\bStan...,StandardScaler,api,False,1784,18087,9.863438
413,from sklearn.metrics import [^\n]*\baccuracy_s...,accuracy_score,api,False,1773,18087,9.802621
25,from sklearn.decomposition import [^\n]*\bPCA\b,PCA,api,False,1515,18087,8.376182
27,from sklearn.svm import [^\n]*\bSVC\b,SVC,api,False,1380,18087,7.629789
148,from sklearn.pipeline import [^\n]*\bPipeline\b,Pipeline,api,False,1154,18087,6.380273
676,from sklearn.metrics import [^\n]*\bclassifica...,classification_report,api,False,1135,18087,6.275225
