<a href="https://colab.research.google.com/github/cuboidandroid/googletl/blob/main/sklearn_pipelines_poc.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Preamble

In [None]:
# Data handling plus the scikit-learn base classes and Pipeline used by the
# custom transformers defined later in this notebook.
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from gspread_dataframe import set_with_dataframe

# Sample data, kept commented out; presumably an offline stand-in for the
# Google Sheets source -- TODO confirm before relying on it.
# df = pd.DataFrame()
# df['Name'] = ['Jane', 'Stanley', 'Anthony', 'Marcus', 'Tommy', 'Alice', 'Ragun']
# df['points'] = [45, 61, 21, np.nan, 91, 81, np.nan]
# df['Exams Taken'] = [1, 1, 1, 0, 2, 1, np.nan]

# Authenticate the Colab user first; the credential lookup below runs after it.
from google.colab import auth
auth.authenticate_user()

# Fetch the default Google credentials; the second return value is discarded.
import gspread
from google.auth import default
creds, _ = default()

# gspread client used by later cells to open the workbook.
gc = gspread.authorize(creds)

Opening the Google Sheets workbook and loading its data into a pandas DataFrame

In [None]:
# Open the 'Classroom' sheet of the 'Class_Example' workbook and read all of its cells.
worksheet = gc.open('Class_Example').worksheet('Classroom')
rows = worksheet.get_all_values()
# The first row supplies the column names; the remaining rows are the records.
# Empty-string cells are replaced with NaN so they count as missing values.
# NOTE(review): get_all_values presumably returns cell contents as strings, so
# numeric columns such as 'points' may need an explicit conversion -- confirm.
df = pd.DataFrame.from_records(rows[1:], columns=rows[0]).replace('', np.nan)

Defining pipeline steps

In [None]:
class NaNFilter(BaseEstimator, TransformerMixin):
  """Pipeline step that drops rows or columns containing missing values.

  Parameters
  ----------
  axi : int, default 0
    Forwarded to ``DataFrame.dropna`` as ``axis``: 0 drops rows that contain
    a missing value, 1 drops columns that contain one.
  """

  def __init__(self, axi=0):
    self.axi = axi

  def fit(self, X, y=None):
    """Learn nothing; present so the step can be used inside a Pipeline.

    Returns
    -------
    self
    """
    return self

  def transform(self, X):
    """Return ``X`` without the rows/columns that hold missing values.

    Parameters
    ----------
    X : pd.DataFrame
      Frame to filter. It is not modified.

    Returns
    -------
    pd.DataFrame
      Filtered copy with its row index reset to 0..n-1.

    Raises
    ------
    NotImplementedError
      If ``X`` is not a pandas DataFrame.
    """
    if not isinstance(X, pd.DataFrame):
      # Raise explicitly: silently returning None would only surface later
      # as a confusing failure inside the next pipeline step.
      raise NotImplementedError(
          f'NaNFilter only supports pandas DataFrame input, got {type(X).__name__}')

    return X.dropna(axis=self.axi).reset_index(drop=True)

In [None]:
class Transposer(BaseEstimator, TransformerMixin):
  """Pipeline step that transposes a DataFrame (rows become columns)."""

  def __init__(self):
    # No hyper-parameters; kept so the estimator has an explicit constructor.
    pass

  def fit(self, X, y=None):
    """Learn nothing; present so the step can be used inside a Pipeline.

    Returns
    -------
    self
    """
    return self

  def transform(self, X):
    """Return the transpose of ``X``.

    Parameters
    ----------
    X : pd.DataFrame
      Frame to transpose.

    Returns
    -------
    pd.DataFrame
      ``X.T``.

    Raises
    ------
    NotImplementedError
      If ``X`` is not a pandas DataFrame.
    """
    if not isinstance(X, pd.DataFrame):
      # Raise explicitly: silently returning None would hide the unsupported
      # input until some later consumer fails on it.
      raise NotImplementedError(
          f'Transposer only supports pandas DataFrame input, got {type(X).__name__}')

    return X.T

In [None]:
class SampleQuantiler(BaseEstimator, TransformerMixin):
  """Pipeline step that adds a within-sample quantile column for one column.

  For the column named by ``sample`` the rows are ranked 1..n in ascending
  order of value and the rank is divided by n. The result is stored in a new
  column called ``'<sample>_SampleQuantile'``.

  Parameters
  ----------
  sample : str
    Name of the column to rank.

  Attributes
  ----------
  sample_mean : float
    Mean of the ``sample`` column seen by ``fit`` (NaN before fitting).
    ``transform`` does not read it.
  """

  def __init__(self, sample):
    self.sample = sample
    self.sample_mean = np.nan

  def fit(self, X, y=None):
    """Record the mean of the ``sample`` column of ``X``.

    Returns
    -------
    self
    """
    self.sample_mean = X[self.sample].mean()
    return self

  def transform(self, X):
    """Return a copy of ``X`` with the quantile column appended.

    Parameters
    ----------
    X : pd.DataFrame
      Frame containing the ``sample`` column. It is not modified.

    Returns
    -------
    pd.DataFrame
      Copy of ``X`` with the extra ``'<sample>_SampleQuantile'`` column.

    Raises
    ------
    NotImplementedError
      If ``X`` is not a pandas DataFrame.
    """
    if not isinstance(X, pd.DataFrame):
      # Raise explicitly: silently returning None would only surface later
      # as a confusing failure inside the next pipeline step.
      raise NotImplementedError(
          f'SampleQuantiler only supports pandas DataFrame input, got {type(X).__name__}')

    col = f'{self.sample}_SampleQuantile'
    X2 = X.copy()

    # Walk the index labels in ascending order of the sample values and hand
    # out ranks 1..n. The assignment is label-based, so it assumes the index
    # labels are unique (true after NaNFilter resets the index).
    X2.loc[X2[self.sample].sort_values().index, col] = range(1, int(len(X2[self.sample]))+1)
    # Scale ranks into (0, 1]; the largest value maps to 1.0.
    X2[col] = X2[col] / len(X2[col])

    return X2

Combining steps into a Pipeline object

In [None]:
# Steps run in the order listed: drop rows with missing values, add the
# quantile column for 'points', then transpose the result.
# NOTE(review): the step key 'meanscorer' holds a SampleQuantiler, so the name
# looks stale -- renaming it would change the key used to look the step up.
pipe = Pipeline([
    ('nanfilter', NaNFilter(axi=0)),
    ('meanscorer', SampleQuantiler(sample='points')),
    ('transposer', Transposer())
])

In [None]:
# Display the pipeline object to inspect its configured steps.
pipe

Running the pipeline on data

In [None]:
# Display the raw input frame before any pipeline step is applied.
df

Unnamed: 0,Name,points,Exams Taken
0,Jane,45.0,1.0
1,Stanley,61.0,1.0
2,Anthony,21.0,1.0
3,Marcus,,0.0
4,Tommy,91.0,2.0
5,Alice,81.0,1.0
6,Ragun,,
7,Iza,95.0,1.0


In [None]:
# Fit every step on df and display the transformed frame.
pipe.fit_transform(df)

Unnamed: 0,0,1,2,3,4,5
Name,Jane,Stanley,Anthony,Tommy,Alice,Iza
points,45,61,21,91,81,95
Exams Taken,1,1,1,2,1,1
points_SampleQuantile,0.333333,0.5,0.166667,0.833333,0.666667,1.0


Saving processed data to Statistics worksheet

In [None]:
# Open the 'Statistics' sheet of the same workbook and write the transformed
# frame to it, including the index.
# Note: this fits and transforms df again rather than reusing an earlier result.
worksheet2 = gc.open('Class_Example').worksheet('Statistics')
set_with_dataframe(worksheet2, pipe.fit_transform(df), include_index=True)