In [1]:
import sklearn

# Report the installed scikit-learn release so readers can match it to the API used below.
version_banner = "Scikit-Learn Version : {}".format(sklearn.__version__)
print(version_banner)

Scikit-Learn Version : 1.1.2


参考链接：<https://coderzcolumn.com/tutorials/machine-learning/scikit-learn-incremental-learning-for-large-datasets>

# 模型汇总（支持 `partial_fit` 增量学习的估计器）

- Regression
    -   sklearn.linear_model.SGDRegressor
    -   sklearn.linear_model.PassiveAggressiveRegressor
    -   sklearn.neural_network.MLPRegressor
- Classification
    -   sklearn.naive_bayes.MultinomialNB
    -   sklearn.naive_bayes.BernoulliNB
    -   sklearn.linear_model.Perceptron
    -   sklearn.linear_model.SGDClassifier
    -   sklearn.linear_model.PassiveAggressiveClassifier
    -   sklearn.neural_network.MLPClassifier
- Clustering
    -   sklearn.cluster.MiniBatchKMeans
    -   sklearn.cluster.Birch
- Preprocessing
    -   sklearn.preprocessing.StandardScaler
    -   sklearn.preprocessing.MinMaxScaler
    -   sklearn.preprocessing.MaxAbsScaler
- Decomposition / Dimensionality Reduction
    -   sklearn.decomposition.MiniBatchDictionaryLearning
    -   sklearn.decomposition.IncrementalPCA
    -   sklearn.decomposition.LatentDirichletAllocation

# 回归案例

In [2]:
from sklearn import datasets
from sklearn.model_selection import train_test_split

# Synthetic regression problem with 240,000 samples; the seed keeps the data repeatable.
X, Y = datasets.make_regression(random_state=123, n_samples=240000)

# Keep 90% of the rows for training and hold out the rest for evaluation.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, random_state=123, train_size=0.9
)

# Display the shape of every split as the cell output.
tuple(arr.shape for arr in (X_train, X_test, Y_train, Y_test))

((216000, 100), (24000, 100), (216000,), (24000,))

In [3]:
# Group the rows into mini-batches of 24 samples with 100 features each, so the
# first axis indexes batches.
X_train = X_train.reshape(-1, 24, 100)
X_test = X_test.reshape(-1, 24, 100)
Y_train = Y_train.reshape(-1, 24)
Y_test = Y_test.reshape(-1, 24)

# Display the batched shapes as the cell output.
tuple(arr.shape for arr in (X_train, X_test, Y_train, Y_test))

((9000, 24, 100), (1000, 24, 100), (9000, 24), (1000, 24))

In [4]:
from sklearn.linear_model import SGDRegressor

# Seed the estimator: random_state drives the sample shuffling SGD performs
# (shuffle=True by default), so without it the learned weights and the test
# metrics change from run to run. 123 matches the seed used elsewhere in this notebook.
regressor = SGDRegressor(random_state=123)
epochs = 10

for k in range(epochs):  # Number of full passes over the training batches
    for i in range(X_train.shape[0]):  # One incremental update per mini-batch
        X_batch, Y_batch = X_train[i], Y_train[i]
        # partial_fit updates the current weights instead of refitting from scratch.
        regressor.partial_fit(X_batch, Y_batch)

In [5]:
from sklearn.metrics import mean_squared_error, r2_score

# Predict one test batch at a time and gather everything into a flat Python list.
Y_test_preds = []
for X_test_batch in X_test:
    Y_test_preds += regressor.predict(X_test_batch).tolist()

# Flatten the batched targets so they line up one-to-one with the predictions.
Y_test_flat = Y_test.reshape(-1)
print("Test MSE      : {}".format(mean_squared_error(Y_test_flat, Y_test_preds)))
print("Test R2 Score : {}".format(r2_score(Y_test_flat, Y_test_preds)))

Test MSE      : 0.0002487686327610577
Test R2 Score : 0.9999999907247031


# 分类案例

In [6]:
from sklearn import datasets
from sklearn.model_selection import train_test_split

# Binary classification problem: 32,000 samples, 30 features of which 20 are informative.
# random_state pins the generated data; without it every run draws a different
# dataset and the accuracy reported for the classifier cannot be reproduced.
X, Y = datasets.make_classification(
    n_samples=32000, n_features=30, n_informative=20, n_classes=2, random_state=123
)

# Keep 90% of the rows for training and hold out the rest for evaluation.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.9, random_state=123)
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape

((28800, 30), (3200, 30), (28800,), (3200,))

In [7]:
# Group the rows into mini-batches of 32 samples with 30 features each, so the
# first axis indexes batches.
X_train = X_train.reshape(-1, 32, 30)
X_test = X_test.reshape(-1, 32, 30)
Y_train = Y_train.reshape(-1, 32)
Y_test = Y_test.reshape(-1, 32)

# Display the batched shapes as the cell output.
tuple(arr.shape for arr in (X_train, X_test, Y_train, Y_test))

((900, 32, 30), (100, 32, 30), (900, 32), (100, 32))

In [9]:
from sklearn.linear_model import SGDClassifier

classifier = SGDClassifier(random_state=123)
epochs = 10

# The complete label set (0 and 1) is handed to every partial_fit call.
class_labels = list(range(2))

for epoch in range(epochs):  # Full passes over the training batches
    for X_batch, Y_batch in zip(X_train, Y_train):  # One incremental update per batch
        classifier.partial_fit(X_batch, Y_batch, classes=class_labels)

In [10]:
from sklearn.metrics import accuracy_score

# Predict one test batch at a time and gather everything into a flat Python list.
Y_test_preds = []
for X_test_batch in X_test:
    Y_test_preds += classifier.predict(X_test_batch).tolist()

# Compare against the flattened ground-truth labels.
test_accuracy = accuracy_score(Y_test.reshape(-1), Y_test_preds)
print("Test Accuracy      : {}".format(test_accuracy))

Test Accuracy      : 0.7278125


# 聚类案例

In [11]:
from sklearn import datasets
from sklearn.model_selection import train_test_split

# Five isotropic blobs in 30 dimensions; Y holds the generating blob index of each sample.
X, Y = datasets.make_blobs(
    n_samples=32000,
    n_features=30,
    centers=5,
    cluster_std=0.7,
    random_state=12345,
)

# Keep 90% of the rows for training and hold out the rest for evaluation.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, random_state=123, train_size=0.9
)

# Display the shape of every split as the cell output.
tuple(arr.shape for arr in (X_train, X_test, Y_train, Y_test))

((28800, 30), (3200, 30), (28800,), (3200,))

In [12]:
# Group the rows into mini-batches of 32 samples with 30 features each, so the
# first axis indexes batches.
X_train = X_train.reshape(-1, 32, 30)
X_test = X_test.reshape(-1, 32, 30)
Y_train = Y_train.reshape(-1, 32)
Y_test = Y_test.reshape(-1, 32)

# Display the batched shapes as the cell output.
tuple(arr.shape for arr in (X_train, X_test, Y_train, Y_test))

((900, 32, 30), (100, 32, 30), (900, 32), (100, 32))

In [13]:
from sklearn.cluster import MiniBatchKMeans

clustering_algo = MiniBatchKMeans(n_clusters=5, random_state=123)
epochs = 10
for k in range(epochs):  # Full passes over the training batches
    for i in range(X_train.shape[0]):  # One incremental update per mini-batch
        # Clustering is unsupervised: MiniBatchKMeans.partial_fit ignores y, so only
        # the features are passed. The labels are kept solely for the later evaluation.
        clustering_algo.partial_fit(X_train[i])

In [15]:
from sklearn.metrics import adjusted_rand_score

# Assign every test sample to a cluster, one batch at a time.
Y_test_preds = []
for X_test_batch in X_test:
    Y_test_preds += clustering_algo.predict(X_test_batch).tolist()

# Score the cluster assignments against the flattened blob labels.
ari = adjusted_rand_score(Y_test.reshape(-1), Y_test_preds)
print("rand_score      : {}".format(ari))

rand_score      : 1.0


# 预处理案例

In [16]:
from sklearn import datasets
from sklearn.model_selection import train_test_split

# Synthetic regression problem with 240,000 samples; the seed keeps the data repeatable.
X, Y = datasets.make_regression(random_state=123, n_samples=240000)

# Keep 90% of the rows for training and hold out the rest for evaluation.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, random_state=123, train_size=0.9
)

# Display the shape of every split as the cell output.
tuple(arr.shape for arr in (X_train, X_test, Y_train, Y_test))

((216000, 100), (24000, 100), (216000,), (24000,))

In [17]:
# Group the rows into mini-batches of 24 samples with 100 features each, so the
# first axis indexes batches.
X_train = X_train.reshape(-1, 24, 100)
X_test = X_test.reshape(-1, 24, 100)
Y_train = Y_train.reshape(-1, 24)
Y_test = Y_test.reshape(-1, 24)

# Display the batched shapes as the cell output.
tuple(arr.shape for arr in (X_train, X_test, Y_train, Y_test))

((9000, 24, 100), (1000, 24, 100), (9000, 24), (1000, 24))

In [18]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Accumulate the per-feature statistics one mini-batch at a time.
for i in range(X_train.shape[0]):
    # StandardScaler.partial_fit ignores y, so the targets are not passed.
    scaler.partial_fit(X_train[i])

# 降维案例

In [20]:
from sklearn import datasets
from sklearn.model_selection import train_test_split

# random_state pins the generated data; without it every run draws a different
# dataset, so the fitted decomposition cannot be reproduced.
X, Y = datasets.make_classification(n_samples=32000, n_features=30, random_state=123)

# Keep 90% of the rows for training and hold out the rest for evaluation.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.9, random_state=123)

X_train.shape, X_test.shape, Y_train.shape, Y_test.shape

((28800, 30), (3200, 30), (28800,), (3200,))

In [21]:
# Group the rows into mini-batches of 32 samples with 30 features each, so the
# first axis indexes batches.
X_train = X_train.reshape(-1, 32, 30)
X_test = X_test.reshape(-1, 32, 30)
Y_train = Y_train.reshape(-1, 32)
Y_test = Y_test.reshape(-1, 32)

# Display the batched shapes as the cell output.
tuple(arr.shape for arr in (X_train, X_test, Y_train, Y_test))

((900, 32, 30), (100, 32, 30), (900, 32), (100, 32))

In [22]:
from sklearn.linear_model import SGDClassifier
from sklearn.decomposition import IncrementalPCA

# Project the 30 input features onto 20 principal components.
pca = IncrementalPCA(n_components=20)

# Update the decomposition one mini-batch at a time.
# NOTE: each batch must hold at least n_components rows; the batches here hold 32.
for i in range(X_train.shape[0]):
    # IncrementalPCA.partial_fit ignores y, so the labels are not passed.
    pca.partial_fit(X_train[i])