In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler


df = pd.read_csv(r"C:\Users\i.ignatiev\Lection 3\kc_house_data.csv")

y = df.price
X_all = df.drop(columns=['price','id', 'date', 'view', 'zipcode', 'lat', 'long'])
X_all['yr_renovated'] = X_all['yr_renovated'].apply(lambda x: 1 if x > 0 else 0)

#skf = StratifiedKFold(n_splits=1)
X_train, X_test, y_train, y_test = train_test_split(X_all, y, test_size=0.3, random_state=42)
X_all.head()


Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,sqft_living15,sqft_lot15
0,3,1.0,1180,5650,1.0,0,3,7,1180,0,1955,0,1340,5650
1,3,2.25,2570,7242,2.0,0,3,7,2170,400,1951,1,1690,7639
2,2,1.0,770,10000,1.0,0,3,6,770,0,1933,0,2720,8062
3,4,3.0,1960,5000,1.0,0,5,7,1050,910,1965,0,1360,5000
4,3,2.0,1680,8080,1.0,0,3,8,1680,0,1987,0,1800,7503


In [17]:
from sklearn.base import BaseEstimator, TransformerMixin
from pandas.api.types import CategoricalDtype

class ItemSelector(BaseEstimator, TransformerMixin):
    """For data grouped by feature, select subset of data at a provided key.

    The data is expected to be stored in a 2D data structure, where the first
    index is over features and the second is over samples.  i.e.

    >> len(data[key]) == n_samples

    Please note that this is the opposite convention to scikit-learn feature
    matrixes (where the first index corresponds to sample).

    ItemSelector only requires that the collection implement getitem
    (data[key]).  Examples include: a dict of lists, 2D numpy array, Pandas
    DataFrame, numpy record array, etc.

    >> data = {'a': [1, 5, 2, 5, 2, 8],
               'b': [9, 4, 1, 4, 1, 3]}
    >> ds = ItemSelector(key='a')
    >> data['a'] == ds.transform(data)

    ItemSelector is not designed to handle data grouped by sample.  (e.g. a
    list of dicts).  If your data is structured this way, consider a
    transformer along the lines of `sklearn.feature_extraction.DictVectorizer`.

    Parameters
    ----------
    key : hashable, required
        The key corresponding to the desired value in a mappable.
    """
    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        return self

    def transform(self, data_dict):
        return data_dict[self.key]
    
class DummyTransformer(BaseEstimator, TransformerMixin):
    """Custom transformer to implement pandas 'get_dummies' into sklearn pipeline
    As an alternative:
    http://contrib.scikit-learn.org/categorical-encoding/onehot.html
    can be used
    """

    def __init__(self, columns_to_dummies, n_minus_one=False, sep='=', sparse=False,
                 add_unseen_column=False, unseen_name='unseen'):

        """
        :param unseen_name: the name of unseen column
        :param add_unseen_column: bool. True if you want to add 'unseen' column
        :param columns_to_dummies: list of columns that needs to be one-hot encoded.
        :param n_minus_one: takes n-1 columns for one-hot encoding if True, and n columns if False.
        :param sep: separator. prefix_sep in pd.get_dummies
        """
        # TODO: adjust documentation

        self.train_data_categories = dict()
        self.columns_to_dummies = columns_to_dummies
        self.n_minus_one = n_minus_one
        self.train_columns = None
        self.raw_columns = None
        self.sep = sep
        self.sparse = sparse

        self.unseen_name = unseen_name
        self.add_unseen_column = add_unseen_column

    def fit(self, X, y=None):
        """
        :param X: input dataframe.
        :return: {categorical value: encoded value}
        """

        for column in self.columns_to_dummies:
            self.train_data_categories[column] = X[column].unique().tolist()
            if self.add_unseen_column:
                self.train_data_categories[column].append(self.unseen_name)

        return self

    def transform(self, X):
        """
        :param X: input dataframe.
        :return: data frame where each selected column is one-hot encoded.
        """

        for col in self.columns_to_dummies:

            if self.add_unseen_column:
                all_values = X[col].unique().tolist()
                new_values = list(set(all_values) - set(self.train_data_categories[col]))
                X[col] = X[col].replace({k: self.unseen_name for k in new_values})  # TODO

            X[col] = X[col].astype(CategoricalDtype(categories=self.train_data_categories[col]))

        X_dum = pd.get_dummies(X, columns=self.columns_to_dummies,
                               drop_first=self.n_minus_one, prefix_sep=self.sep,
                               sparse=self.sparse)

        return X_dum

In [18]:
from sklearn.pipeline import Pipeline, FeatureUnion

dt = DummyTransformer(columns_to_dummies=['waterfront', 'yr_renovated'])

ohe_pipeline = Pipeline(
    steps=[
        ("select", ItemSelector(key=['waterfront', 'yr_renovated'])),
        ("dummy", dt),
    ])



In [19]:
from sklearn.preprocessing import StandardScaler

num_col = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
           'sqft_above', 'sqft_basement', 'sqft_living15', 'sqft_lot15']

scale_pipeline = Pipeline(
    steps=[
        ('selector', ItemSelector(key=num_col)),
        ("scaling", StandardScaler())
    ])


In [20]:

data_prep_pipe = FeatureUnion([
    ("scaling", scale_pipeline),
    ("ohe", ohe_pipeline)
])

In [21]:
data_prep_pipe.fit(X_train)

FeatureUnion(n_jobs=None,
             transformer_list=[('scaling',
                                Pipeline(memory=None,
                                         steps=[('selector',
                                                 ItemSelector(key=['bedrooms',
                                                                   'bathrooms',
                                                                   'sqft_living',
                                                                   'sqft_lot',
                                                                   'floors',
                                                                   'sqft_above',
                                                                   'sqft_basement',
                                                                   'sqft_living15',
                                                                   'sqft_lot15'])),
                                                ('scaling',
                          

In [22]:
X_train_prep = data_prep_pipe.transform(X_train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [23]:
dummy_columns=list(dt.transform(X_train[['waterfront', 'yr_renovated']].head()).columns)
prepared_columns = num_col + dummy_columns
prepared_columns

['bedrooms',
 'bathrooms',
 'sqft_living',
 'sqft_lot',
 'floors',
 'sqft_above',
 'sqft_basement',
 'sqft_living15',
 'sqft_lot15',
 'waterfront=0',
 'waterfront=1',
 'yr_renovated=0',
 'yr_renovated=1']

In [24]:
X_train_prep_df = pd.DataFrame(X_train_prep, columns=prepared_columns)
X_train_prep_df.head()

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,sqft_above,sqft_basement,sqft_living15,sqft_lot15,waterfront=0,waterfront=1,yr_renovated=0,yr_renovated=1
0,0.679213,0.506735,0.66518,-0.266061,0.920075,1.080196,-0.654856,1.371813,-0.304804,1.0,0.0,1.0,0.0
1,0.679213,-0.472371,0.292233,-0.251329,-0.921292,-0.744979,2.012453,-1.041654,-0.349846,1.0,0.0,1.0,0.0
2,0.679213,0.180367,0.127697,-0.151429,0.920075,0.48792,-0.654856,0.274782,-0.16199,1.0,0.0,1.0,0.0
3,-1.462473,-0.472371,0.204481,6.743767,0.920075,0.125303,0.188652,1.284051,7.569851,1.0,0.0,1.0,0.0
4,0.679213,-0.472371,0.039945,6.357198,-0.921292,0.391222,-0.654856,0.18702,3.580378,1.0,0.0,1.0,0.0


In [28]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
import xgboost as xgb
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score

xgbst = xgb.XGBClassifier()
lr = LinearRegression()
lasso = Lasso()
tree = DecisionTreeClassifier()
#bagging = BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=5, max_features=.5)
randomforest = RandomForestClassifier(n_estimators=2, random_state=1)

In [None]:
estimator = xgbst
full_pipeline = Pipeline(steps=[
    ("prepare", data_prep_pipe),
    ("predict", estimator)
])
full_pipeline.fit(X_train, y_train);
y_pred = full_pipeline.predict(X_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [None]:
from sklearn.metrics import mean_absolute_error, r2_score
mean_absolute_error(y_test, y_pred)

In [None]:
r2_score(y_test, y_pred)

In [1]:
# !no score results 