In [58]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn2pmml import sklearn2pmml
from feature_engine.imputation import CategoricalImputer
from sklearn2pmml.preprocessing import LookupTransformer, DaysSinceYearTransformer, PMMLLabelEncoder
from sklearn2pmml.decoration import ContinuousDomain, CategoricalDomain, DateTimeDomain
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml.feature_extraction.text import Splitter
from sklearn_pandas import DataFrameMapper

In [59]:
# Display pipelines as interactive diagrams instead of their plain-text repr
from sklearn import set_config
set_config(display="diagram")

In [60]:
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation and convergence
# warnings raised while fitting the model; consider narrowing it to the specific
# categories that are known to be noise.
import warnings
warnings.filterwarnings('ignore')

In [61]:
# Load the mpg dataset (path is relative to the working directory) and show it.
df = pd.read_csv("../data/mpg.csv")
df

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class
0,chevrolet,malibu,2.4,2008,4,auto(l4),f,22,30,r,midsize
1,volkswagen,new beetle,1.9,1999,4,manual(m5),f,35,44,d,subcompact
2,mercury,mountaineer 4wd,4.0,1999,6,auto(l5),4,14,17,r,suv
3,jeep,grand cherokee 4wd,6.1,2008,8,auto(l5),4,11,14,p,suv
4,volkswagen,passat,2.0,2008,4,manual(m6),f,21,29,p,midsize
...,...,...,...,...,...,...,...,...,...,...,...
229,subaru,impreza awd,2.2,1999,4,auto(l4),4,21,26,r,subcompact
230,toyota,corolla,1.8,2008,4,auto(l4),f,26,35,r,compact
231,jeep,grand cherokee 4wd,3.7,2008,6,auto(l5),4,15,19,r,suv
232,land rover,range rover,4.0,1999,8,auto(l4),4,11,15,p,suv


In [62]:
# Inspect the column dtypes as loaded, before any encoding of the target.
df.dtypes

manufacturer     object
model            object
displ           float64
year              int64
cyl               int64
trans            object
drv              object
cty               int64
hwy               int64
fl               object
class            object
dtype: object

In [63]:
# Fill missing values in the target column `class` with its most frequent label
# (imputation_method='frequent'), then list the distinct labels that remain.
# NOTE(review): imputing the target assigns a label to rows that had none;
# confirm this is preferred over dropping those rows.
ci = CategoricalImputer(imputation_method='frequent', variables=['class'])
df = ci.fit_transform(df)
list(df['class'].unique())

['midsize', 'subcompact', 'suv', 'compact', 'minivan', 'pickup', '2seater']

In [64]:
# Integer code for each vehicle class; codes follow the position of the label
# in the tuple below, starting at 0.
target_mapper = {
    label: code
    for code, label in enumerate(
        ("midsize", "subcompact", "suv", "compact", "minivan", "pickup", "2seater")
    )
}

In [65]:
# Encode the string class labels as the integer codes defined in target_mapper.
#
# Fail fast on labels that target_mapper does not cover: the transformer's
# default_value of 0 is also the code for "midsize", so an unmapped label would
# otherwise be silently relabelled as "midsize". The check also catches an
# accidental re-run of this cell after `class` already holds integer codes,
# which would collapse the whole target to 0.
unmapped_labels = set(df['class'].unique()) - set(target_mapper)
if unmapped_labels:
    raise ValueError(
        f"class labels missing from target_mapper: {sorted(unmapped_labels, key=str)}"
    )

lt = LookupTransformer(target_mapper, default_value = 0)
df['class'] = lt.fit_transform(df['class'])
df.dtypes

manufacturer     object
model            object
displ           float64
year              int64
cyl               int64
trans            object
drv              object
cty               int64
hwy               int64
fl               object
class             int64
dtype: object

In [66]:
# Convert the integer class codes to a categorical dtype whose categories are
# the sorted codes actually present in the data.
# NOTE(review): ordered=True declares a ranking between codes, but the codes in
# target_mapper look arbitrary rather than ordinal; confirm the ordering is
# intended.
target_type = pd.CategoricalDtype(categories=sorted(df['class'].unique()), ordered=True)
df["class"] = df["class"].astype(target_type)
df.dtypes

manufacturer      object
model             object
displ            float64
year               int64
cyl                int64
trans             object
drv               object
cty                int64
hwy                int64
fl                object
class           category
dtype: object

In [67]:
# Column groups consumed by the DataFrameMapper; each group gets its own
# preprocessing chain.
numeric_features = [
    "displ",
    "year",
    "cyl",
    "cty",
    "hwy",
]
categorical_features = [
    "manufacturer",
    "trans",
    "drv",
    "fl",
]
text_features = [
    "model",
]
# No datetime columns and nothing to drop in this dataset.
datetime_features = list()
drop_features = list()

In [68]:
# Column-wise preprocessing. One (columns, transformer-chain) entry is built per
# column in each feature group and the four lists are concatenated:
#   - numeric:     domain decoration -> median imputation -> standard scaling
#   - categorical: domain decoration -> most-frequent imputation -> label encoding
#                  -> standard scaling
#   - datetime:    domain decoration -> most-frequent imputation -> days since 1968
#                  -> standard scaling (datetime_features is empty here, so this
#                  group contributes no entries)
#   - text:        domain decoration -> bag-of-words counts limited to 5 features
# NOTE(review): label-encoding nominal columns and then scaling them feeds
# arbitrary integer codes to the linear model as if they were numeric; consider
# one-hot encoding instead — confirm against PMML export requirements.
mapper = DataFrameMapper(
    [([col_num], [ContinuousDomain(), SimpleImputer(strategy='median'), StandardScaler()]) for col_num in numeric_features] +
    [([col_cat], [CategoricalDomain(), SimpleImputer(strategy='most_frequent'), PMMLLabelEncoder(), StandardScaler()]) for col_cat in categorical_features] +
    [([col_dat], [DateTimeDomain(), SimpleImputer(strategy='most_frequent'), DaysSinceYearTransformer(year = 1968), StandardScaler()]) for col_dat in datetime_features] +
    # The text column is selected with a bare string rather than a one-element
    # list, unlike the other groups.
    [(col_txt,   [CategoricalDomain(), CountVectorizer(tokenizer=Splitter(), max_features=5)]) for col_txt in text_features],
    drop_cols=drop_features,
    input_df=True,
    df_out=True
)

In [69]:
# Full model: column-wise preprocessing followed by a multiclass logistic
# regression classifier. The final step keeps its original name "regressor",
# so fit parameters are addressed as `regressor__<param>`.
pipeline = PMMLPipeline([
    ("mapper", mapper),
    # multi_class is left at its default: with the lbfgs solver a multiclass
    # target is already fitted as a multinomial model, and passing the argument
    # explicitly is deprecated in recent scikit-learn releases.
    ("regressor", LogisticRegression(solver='lbfgs', max_iter=1000))
])

In [70]:
# Render the pipeline; with set_config(display="diagram") this is shown as a diagram.
display(pipeline)

In [71]:
# Separate features from the target and hold out 20% of the rows for evaluation.
# The split is seeded so that Restart & Run All reproduces the same partition
# (and therefore the same fitted model and predictions).
X = df.drop('class', axis=1)
y = df["class"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [72]:
# Fit preprocessing and classifier on the training split.
# To weight samples, pass regressor__sample_weight=... : fit parameters are
# prefixed with the name of the pipeline step, which is "regressor" here.
pipeline.fit(X_train,y_train)

In [73]:
# Predict class codes for the held-out rows.
y_pred = pipeline.predict(X_test)

In [74]:
# Show the predicted integer class codes (target_mapper gives the label for each code).
y_pred

array([0, 0, 3, 2, 5, 2, 2, 2, 5, 0, 1, 3, 0, 2, 2, 2, 3, 0, 2, 3, 5, 2,
       2, 2, 3, 5, 3, 3, 1, 1, 3, 0, 3, 1, 4, 2, 1, 2, 2, 3, 0, 2, 2, 0,
       2, 5, 3])