## Lection_3.Pipeline.Regularization

In [1]:
import os
print(os.getcwd())

/Users/nickkon/projects/pet/ITEA/Lection_3


In [2]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.model_selection import train_test_split

In [3]:
# Read the CSV file into a DataFrame: df
df = pd.read_csv('../data/gm_2008_region.csv')

In [4]:
df.head()

Unnamed: 0,population,fertility,HIV,CO2,BMI_male,GDP,BMI_female,life,child_mortality,Region
0,34811059.0,2.73,0.1,3.328945,24.5962,12314.0,129.9049,75.3,29.5,Middle East & North Africa
1,19842251.0,6.43,2.0,1.474353,22.25083,7103.0,130.1247,58.3,192.0,Sub-Saharan Africa
2,40381860.0,2.24,0.5,4.78517,27.5017,14646.0,118.8915,75.5,15.4,America
3,2975029.0,1.4,0.1,1.804106,25.35542,7383.0,132.8108,72.5,20.0,Europe & Central Asia
4,21370348.0,1.96,0.1,18.016313,27.56373,41312.0,117.3755,81.5,5.2,East Asia & Pacific


### Prepare data

In [6]:
y = df.life
X_all = df.drop(columns='life')

In [7]:
X_all.dtypes

population         float64
fertility          float64
HIV                float64
CO2                float64
BMI_male           float64
GDP                float64
BMI_female         float64
child_mortality    float64
Region              object
dtype: object

In [23]:
# split data
X_train, X_test, y_train, y_test = train_test_split(X_all, y, test_size=0.3, random_state=42)

In [32]:
X_train.head(2)

Unnamed: 0,population,fertility,HIV,CO2,BMI_male,GDP,BMI_female,child_mortality,Region
40,1339941.0,1.62,1.2,13.031379,26.26446,24743.0,129.5161,5.5,Europe & Central Asia
24,11139740.0,6.81,3.4,0.047839,21.48569,1753.0,127.864,168.0,Sub-Saharan Africa


#### Custom transformers

In [8]:
from sklearn.base import BaseEstimator, TransformerMixin

In [9]:
class ItemSelector(BaseEstimator, TransformerMixin):
    """For data grouped by feature, select subset of data at a provided key.

    The data is expected to be stored in a 2D data structure, where the first
    index is over features and the second is over samples.  i.e.

    >> len(data[key]) == n_samples

    Please note that this is the opposite convention to scikit-learn feature
    matrixes (where the first index corresponds to sample).

    ItemSelector only requires that the collection implement getitem
    (data[key]).  Examples include: a dict of lists, 2D numpy array, Pandas
    DataFrame, numpy record array, etc.

    >> data = {'a': [1, 5, 2, 5, 2, 8],
               'b': [9, 4, 1, 4, 1, 3]}
    >> ds = ItemSelector(key='a')
    >> data['a'] == ds.transform(data)

    ItemSelector is not designed to handle data grouped by sample.  (e.g. a
    list of dicts).  If your data is structured this way, consider a
    transformer along the lines of `sklearn.feature_extraction.DictVectorizer`.

    Parameters
    ----------
    key : hashable, required
        The key corresponding to the desired value in a mappable.
    """
    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        return self

    def transform(self, data_dict):
        return data_dict[self.key]

In [None]:
class ItemSelector(BaseEstimator, TransformerMixin):
    """For data grouped by feature, select subset of data at a provided key.

    ItemSelector only requires that the collection implement getitem
    (data[key]).  Examples include: a dict of lists, 2D numpy array, Pandas
    DataFrame, numpy record array, etc.

    >> data = {'a': [1, 5, 2, 5, 2, 8],
               'b': [9, 4, 1, 4, 1, 3]}
    >> ds = ItemSelector(key='a')
    >> data['a'] == ds.transform(data)

    Parameters
    ----------
    key : hashable, required
        The key corresponding to the desired value in a mappable.
    """
    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        return self

    def transform(self, data_dict):
        return data_dict[self.key]

In [10]:
from pandas.api.types import CategoricalDtype

class DummyTransformer(BaseEstimator, TransformerMixin):
    """Custom transformer to implement pandas 'get_dummies' into sklearn pipeline
    As an alternative:
    http://contrib.scikit-learn.org/categorical-encoding/onehot.html
    can be used
    """

    def __init__(self, columns_to_dummies, n_minus_one=False, sep='=', sparse=False,
                 add_unseen_column=False, unseen_name='unseen'):

        """
        :param unseen_name: the name of unseen column
        :param add_unseen_column: bool. True if you want to add 'unseen' column
        :param columns_to_dummies: list of columns that needs to be one-hot encoded.
        :param n_minus_one: takes n-1 columns for one-hot encoding if True, and n columns if False.
        :param sep: separator. prefix_sep in pd.get_dummies
        """
        # TODO: adjust documentation

        self.train_data_categories = dict()
        self.columns_to_dummies = columns_to_dummies
        self.n_minus_one = n_minus_one
        self.train_columns = None
        self.raw_columns = None
        self.sep = sep
        self.sparse = sparse

        self.unseen_name = unseen_name
        self.add_unseen_column = add_unseen_column

    def fit(self, X, y=None):
        """
        :param X: input dataframe.
        :return: {categorical value: encoded value}
        """

        for column in self.columns_to_dummies:
            self.train_data_categories[column] = X[column].unique().tolist()
            if self.add_unseen_column:
                self.train_data_categories[column].append(self.unseen_name)

        return self

    def transform(self, X):
        """
        :param X: input dataframe.
        :return: data frame where each selected column is one-hot encoded.
        """

        for col in self.columns_to_dummies:

            if self.add_unseen_column:
                all_values = X[col].unique().tolist()
                new_values = list(set(all_values) - set(self.train_data_categories[col]))
                X[col] = X[col].replace({k: self.unseen_name for k in new_values})  # TODO

            X[col] = X[col].astype(CategoricalDtype(categories=self.train_data_categories[col]))

        X_dum = pd.get_dummies(X, columns=self.columns_to_dummies,
                               drop_first=self.n_minus_one, prefix_sep=self.sep,
                               sparse=self.sparse)

        return X_dum

In [39]:
# we may get the description for the custom transformer via:
DummyTransformer?

#### Create pipelines

In [20]:
from sklearn.pipeline import Pipeline, FeatureUnion

In [34]:
dt = DummyTransformer(columns_to_dummies=['Region'])

ohe_pipeline = Pipeline(
    steps=[
        ("select", ItemSelector(key=['Region'])),
        ("dummy", dt),
    ])

In [35]:
from sklearn.preprocessing import StandardScaler

num_col = ['population', 'fertility', 'HIV', 'CO2', 'BMI_male', 'GDP', 'BMI_female', 'child_mortality']

scale_pipeline = Pipeline(
    steps=[
        ('selector', ItemSelector(key=num_col)),
        ("scaling", StandardScaler())
    ])

In [36]:
data_prep_pipe = FeatureUnion([
    ("scaling", scale_pipeline),
    ("ohe", ohe_pipeline)
])

In [37]:
data_prep_pipe.fit(X_train)

FeatureUnion(n_jobs=None,
             transformer_list=[('scaling',
                                Pipeline(memory=None,
                                         steps=[('selector',
                                                 ItemSelector(key=['population',
                                                                   'fertility',
                                                                   'HIV', 'CO2',
                                                                   'BMI_male',
                                                                   'GDP',
                                                                   'BMI_female',
                                                                   'child_mortality'])),
                                                ('scaling',
                                                 StandardScaler(copy=True,
                                                                with_mean=True,
                                    

In [27]:
X_train_prep = data_prep_pipe.transform(X_train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [29]:
X_train_prep[:1]

array([[-0.3115296 , -0.88261103, -0.18734712,  1.91368118,  0.79135547,
         0.59948962,  0.72975617, -0.89467342,  1.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ]])

In [51]:
dummy_columns=list(dt.transform(X_train[['Region']].head()).columns)
dummy_columns

['Region=Europe & Central Asia',
 'Region=Sub-Saharan Africa',
 'Region=America',
 'Region=East Asia & Pacific',
 'Region=Middle East & North Africa',
 'Region=South Asia']

In [53]:
prepared_columns = num_col + dummy_columns
prepared_columns

['population',
 'fertility',
 'HIV',
 'CO2',
 'BMI_male',
 'GDP',
 'BMI_female',
 'child_mortality',
 'Region=Europe & Central Asia',
 'Region=Sub-Saharan Africa',
 'Region=America',
 'Region=East Asia & Pacific',
 'Region=Middle East & North Africa',
 'Region=South Asia']

In [54]:
X_train_prep_df = pd.DataFrame(X_train_prep, columns=prepared_columns)
X_train_prep_df.head()

Unnamed: 0,population,fertility,HIV,CO2,BMI_male,GDP,BMI_female,child_mortality,Region=Europe & Central Asia,Region=Sub-Saharan Africa,Region=America,Region=East Asia & Pacific,Region=Middle East & North Africa,Region=South Asia
0,-0.31153,-0.882611,-0.187347,1.913681,0.791355,0.59949,0.729756,-0.894673,1.0,0.0,0.0,0.0,0.0,0.0
1,-0.235233,2.437518,0.303866,-0.781455,-1.364965,-0.84073,0.36343,2.641951,0.0,1.0,0.0,0.0,0.0,0.0
2,-0.300806,-0.390029,-0.075708,0.120198,-0.228535,-0.389808,-0.189906,-0.603038,0.0,0.0,1.0,0.0,0.0,0.0
3,0.195409,-0.972171,-0.165019,0.004709,-0.678041,-0.185271,-1.270104,-0.674859,0.0,0.0,0.0,1.0,0.0,0.0
4,-0.219858,1.842582,2.581311,-0.76046,-1.727067,-0.760168,1.380145,1.051014,0.0,1.0,0.0,0.0,0.0,0.0


### Train the model

In [58]:
lr = LinearRegression()
lasso = Lasso()
ridge = Ridge()

In [76]:
# choose any of lr, lasso, ridge
estimator = lasso

In [77]:
full_pipeline = Pipeline(steps=[
    ("prepare", data_prep_pipe),
    ("predict", estimator)
])

In [78]:
full_pipeline.fit(X_train, y_train);

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [79]:
y_pred = full_pipeline.predict(X_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


### Metrics

In [80]:
from sklearn.metrics import mean_absolute_error, r2_score

In [81]:
mean_absolute_error(y_test, y_pred)

2.273842506734322

In [82]:
print("r2: {:.2f}".format(r2_score(y_test, y_pred)))

r2: 0.87


In [83]:
print(estimator.coef_)

[-0.         -0.         -2.54730985  0.          0.59981739  1.40776429
 -0.75327118 -4.41888409  0.         -0.          0.         -0.
  0.          0.        ]


#### Notice the diff between lasso coefs and lr coefs

In [85]:
coefs = pd.Series(estimator.coef_, index=prepared_columns)
coefs

population                          -0.000000
fertility                           -0.000000
HIV                                 -2.547310
CO2                                  0.000000
BMI_male                             0.599817
GDP                                  1.407764
BMI_female                          -0.753271
child_mortality                     -4.418884
Region=Europe & Central Asia         0.000000
Region=Sub-Saharan Africa           -0.000000
Region=America                       0.000000
Region=East Asia & Pacific          -0.000000
Region=Middle East & North Africa    0.000000
Region=South Asia                    0.000000
dtype: float64