In [1]:
import pandas as pd

# Creating a data frame
df = pd.read_csv("../datasets/bank-additional-full.csv", delimiter=';')
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


### Apply River classification models

In [2]:
from river import ensemble
from river import evaluate
from river import metrics
from river.datasets import synth

dataset = synth.ConceptDriftStream(seed=42,
                                   position=500,
                                   width=40).take(1000)

model = ensemble.AdaptiveRandomForestClassifier(seed=8, leaf_prediction="mc")

metric = metrics.Accuracy()

evaluate.progressive_val_score(dataset, model, metric)

Accuracy: 76.68%

In [3]:
from river import stream
from river.datasets import base


class BankDataset(base.FileDataset):
    def __init__(self):
        super().__init__(
            filename="bank-additional-full.csv",
            directory="../datasets",
            n_features=20,
            task=base.BINARY_CLF,
        )

    def __iter__(self):
        return stream.iter_csv(
            self.path,
            target="y",
            delimiter=';',
            converters={
                "age": int,
                "duration": int,
                "campaign": int,
                "pdays": int,
                "previous": int,
                "emp.var.rate": float,
                "cons.price.idx": float,
                "cons.conf.idx": float,
                "euribor3m": float,
                "nr.employed": int
            },
        )

In [12]:
from river import compose
from river import preprocessing
from river import metrics

from pprint import pprint

categorical_features = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']
numerical_features = (feature for feature in df.columns if feature not in categorical_features + ['y'])

# pp = compose.Select('job') | preprocessing.OneHotEncoder()
model = compose.Select(categorical_features[0])
for feature in categorical_features[1:]:
    model += compose.Select(feature)
model |= preprocessing.OneHotEncoder()

for feature in numerical_features:
    model += compose.Select(feature)
model |= preprocessing.StandardScaler()
model |= ensemble.AdaptiveRandomForestClassifier(seed=8, leaf_prediction="mc")

bank_dataset = BankDataset()
metric = metrics.Accuracy()

for idx, (x, y) in enumerate(bank_dataset):
    model = model.learn_one(x)
    pprint(model.transform_one(x))
    if idx == 2:
        break

{'age': 0.0,
 'campaign': 0.0,
 'cons.conf.idx': 0.0,
 'cons.price.idx': 0.0,
 'contact_telephone': 0.0,
 'day_of_week_mon': 0.0,
 'default_no': 0.0,
 'duration': 0.0,
 'education_basic.4y': 0.0,
 'emp.var.rate': 0.0,
 'euribor3m': 0.0,
 'housing_no': 0.0,
 'job_housemaid': 0.0,
 'loan_no': 0.0,
 'marital_married': 0.0,
 'month_may': 0.0,
 'nr.employed': 0.0,
 'pdays': 0.0,
 'poutcome_nonexistent': 0.0,
 'previous': 0.0}
{'age': 1.0,
 'campaign': 0.0,
 'cons.conf.idx': 0.0,
 'cons.price.idx': 0.0,
 'contact_telephone': 0.0,
 'day_of_week_mon': 0.0,
 'default_no': -1.0,
 'default_unknown': 0.0,
 'duration': -1.0,
 'education_basic.4y': -1.0,
 'education_high.school': 0.0,
 'emp.var.rate': 0.0,
 'euribor3m': 0.0,
 'housing_no': 0.0,
 'job_housemaid': -1.0,
 'job_services': 0.0,
 'loan_no': 0.0,
 'marital_married': 0.0,
 'month_may': 0.0,
 'nr.employed': 0.0,
 'pdays': 0.0,
 'poutcome_nonexistent': 0.0,
 'previous': 0.0}
{'age': -1.4128209342043259,
 'campaign': 0.0,
 'cons.conf.idx': 0.0

In [15]:
for idx, (x, y) in enumerate(bank_dataset):
    # Obtain the prior prediction and update the model in one go
    y_pred = model.predict_one(x)
    model.learn_one(x, y)

    # Update the error metric
    metric.update(y, y_pred)
    if (idx + 1) % 100:
        print(metric)

TypeError: '<' not supported between instances of 'NoneType' and 'str'

In [14]:
evaluate.progressive_val_score(bank_dataset, model, metric, print_every=100)

TypeError: '<' not supported between instances of 'NoneType' and 'str'