In [1]:
!pip install river



In [2]:
import pandas as pd

from river import compose
from river import linear_model
from river import metrics
from river import optim
from river import preprocessing
from river import stream

from sklearn import datasets
from sklearn import metrics as sk_metrics

# Part 1

In [3]:
# Part 1 building blocks, kept as separate objects so the streaming loop can
# drive each step by hand.
scaler = preprocessing.StandardScaler()

# Logistic regression fitted with SGD at a learning rate of 0.01.
optimizer = optim.SGD(lr=0.01)
log_reg = linear_model.LogisticRegression(optimizer=optimizer)

In [4]:
# Buffers filled by the streaming loop: ground-truth labels and the matching
# predicted probabilities, in the order the samples are seen.
y_true, y_pred = [], []

In [5]:
# Load the scikit-learn breast-cancer dataset. Despite its name, `sklearn_df`
# is the loader's container object, not a DataFrame.
sklearn_df = datasets.load_breast_cancer()

# Wrap the feature matrix in a DataFrame, one column per feature name, purely
# for inspection; the streaming loops below read the dataset directly.
df = pd.DataFrame(sklearn_df.data, columns=sklearn_df.feature_names)
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [6]:
# Column labels of the feature frame (one per feature name).
df.columns

Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error', 'fractal dimension error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concavity',
       'worst concave points', 'worst symmetry', 'worst fractal dimension'],
      dtype='object')

In [7]:
# (rows, columns) of the feature frame.
df.shape

(569, 30)

In [8]:
# Test-then-train evaluation over the breast-cancer data, one sample at a time.
# stream.iter_sklearn_dataset iterates rows from a scikit-learn dataset and
# yields a (features, label) pair per row; shuffle=True with seed=42 fixes the
# order in which the rows are visited.
#
# NOTE: this cell appends to y_true / y_pred and keeps training scaler and
# log_reg, so re-create those objects before running it a second time.
for xi, yi in stream.iter_sklearn_dataset(datasets.load_breast_cancer(), shuffle=True, seed=42):

    # Scale the features: first update the scaler with the new sample, then
    # transform it. These are two separate statements on purpose: chaining
    # `learn_one(...).transform_one(...)` only works on River versions where
    # learn_one returns the estimator; newer releases return None.
    scaler.learn_one(xi)
    xi_scaled = scaler.transform_one(xi)

    # Test the current model on the new "unobserved" sample
    yi_pred = log_reg.predict_proba_one(xi_scaled)
    # Train the model with the new sample
    log_reg.learn_one(xi_scaled, yi)

    # Store the truth and the predicted probability of the True class
    y_true.append(yi)
    y_pred.append(yi_pred[True])

# Score all collected predictions at once with scikit-learn's ROC AUC.
print(f'ROC AUC: {sk_metrics.roc_auc_score(y_true, y_pred):.3f}')

ROC AUC: 0.990


In [10]:
# First five ground-truth labels collected by the streaming loop.
print(y_true[:5])

[1, 0, 1, 0, 1]


In [11]:
# First five predicted probabilities for the True class, in the same order as y_true.
print(y_pred[:5])

[0.5, 0.5012499973958399, 0.5150967551331861, 0.4570193388417207, 0.5167806906005182]


# Part 2

In [16]:
# Part 2: the same scaler + logistic regression as Part 1, bundled into a
# single pipeline object so the loop only has to call the pipeline.
model = compose.Pipeline(
    preprocessing.StandardScaler(),
    linear_model.LogisticRegression(optimizer=optim.SGD(lr=0.01)),
)

# Running ROC AUC, updated one prediction at a time.
metric = metrics.ROCAUC()

In [17]:
# Same shuffled stream as in Part 1 (identical seed), consumed by the pipeline.
breast_cancer_stream = stream.iter_sklearn_dataset(
    datasets.load_breast_cancer(), shuffle=True, seed=42
)

for xi, yi in breast_cancer_stream:
    # Predict first, while the sample is still unseen by the model ...
    yi_pred = model.predict_proba_one(xi)
    # ... score that prediction against the ground truth ...
    metric.update(yi, yi_pred)
    # ... and only then let the model learn from the sample.
    model.learn_one(xi, yi)

# The metric's own string form already carries its name, so the printed line
# reads "ROC AUC: ROCAUC: ...".
print(f'ROC AUC: {metric}')

ROC AUC: ROCAUC: 98.74%


Dataset Info:

In [19]:
# Load the breast-cancer dataset object once more to inspect its metadata.
data = datasets.load_breast_cancer()

In [20]:
# Container type returned by the loader.
type(data)

sklearn.utils.Bunch

In [21]:
# Class names of the target. Presumably the name at index i describes label i
# (0 -> first name, 1 -> second) — confirm against the scikit-learn docs.
list(data.target_names)

['malignant', 'benign']