In [37]:
import pandas as pd
import numpy as np
import pickle
import sklearn
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, RobustScaler, StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA

# Display the installed scikit-learn version as the cell output, for reference
# alongside the model files this notebook writes.
sklearn.__version__

'0.21.3'

# 训练一个模型

In [34]:
# Load the Titanic training set, using the first CSV column as the index, and
# cast 'Embarked' to str in the same expression so the cell is a single
# load-and-prepare step.
# NOTE(review): astype(str) presumably turns missing ports into the literal
# string 'nan', giving the encoder a uniform string column -- confirm.
train_data = (
    pd.read_csv('../example3+4/titanic_train.csv', index_col=0)
    .assign(Embarked=lambda frame: frame['Embarked'].astype(str))
)
train_data.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [53]:
# Train a model with a scikit-learn Pipeline:
#   encode (per-column preprocessing) -> embedding (PCA + KernelPCA features)
#   -> tree (gradient boosting classifier).

# Fixed seed so that re-running this cell reproduces the same fitted model
# (and therefore the same saved model files).
RANDOM_STATE = 42

# Only the columns listed here are used; every other column is discarded by
# remainder='drop'.
encoder = ColumnTransformer(
    [
        ('Pclass', 'passthrough', ['Pclass']),
        ('Age', SimpleImputer(strategy='most_frequent'), ['Age']),
        ('Fare', SimpleImputer(strategy='mean'), ['Fare']),
        ('SibSp', 'passthrough', ['SibSp']),
        # handle_unknown='ignore': a category that was not seen during fit is
        # encoded as all zeros at predict time instead of raising, so the
        # persisted pipeline keeps working on new data.
        ('Embarked', OneHotEncoder(handle_unknown='ignore'), ['Embarked']),
        ('Sex', OneHotEncoder(handle_unknown='ignore'), ['Sex']),
    ],
    remainder='drop')

# Concatenate the outputs of both decompositions into one feature matrix.
estimators = [('linear_pca', PCA()), ('kernel_pca', KernelPCA())]
embedding = FeatureUnion(estimators)
model = GradientBoostingClassifier(random_state=RANDOM_STATE)

pipeline = Pipeline(steps=[
    ('encode', encoder),
    ('embedding', embedding),
    ('tree', model)
])

# Keep the target out of the feature frame. The encoder never selected
# 'Survived', so the features used for fitting are unchanged, but the fitted
# pipeline no longer depends on a label column being present in its input.
X, y = train_data.drop(columns=['Survived']), train_data['Survived']

pipeline.fit(X, y)

Pipeline(memory=None,
         steps=[('encode',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('Pclass', 'passthrough',
                                                  ['Pclass']),
                                                 ('Age',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='most_frequent',
                                                                verbose=0),
                                                  ['Age']),
                     

# 使用不同方式存储模型

In [55]:
# Option 1: serialize the fitted pipeline with the standard-library pickle module.
import pickle
with open('./sklearn_model.pickle', mode='wb') as pickle_out:
    pickle.dump(pipeline, file=pickle_out)

In [57]:
# Option 2: serialize the fitted pipeline with joblib. The value returned by
# dump() is the last expression, so it is shown as the cell output.
import joblib
joblib.dump(value=pipeline, filename='./sklearn_model.joblib')

['./sklearn_model.joblib']

# 载入存储的模型

In [59]:
# Restore the pipeline from the pickle file and display it.
# Unpickling can execute arbitrary code, so only load files from a trusted source.
import pickle
with open('./sklearn_model.pickle', mode='rb') as pickle_in:
    pipeline1 = pickle.load(pickle_in)
pipeline1

Pipeline(memory=None,
         steps=[('encode',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('Pclass', 'passthrough',
                                                  ['Pclass']),
                                                 ('Age',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='most_frequent',
                                                                verbose=0),
                                                  ['Age']),
                     

In [60]:
# Restore the pipeline from the joblib file and display it.
import joblib
pipeline2 = joblib.load(filename='./sklearn_model.joblib')
pipeline2

Pipeline(memory=None,
         steps=[('encode',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('Pclass', 'passthrough',
                                                  ['Pclass']),
                                                 ('Age',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='most_frequent',
                                                                verbose=0),
                                                  ['Age']),
                     