In [358]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn import metrics
from sklearn.impute import KNNImputer
import datetime
import math
import random

import warnings
warnings.simplefilter('ignore')

import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

In [359]:
# Show the pandas version this notebook is running against.
pd.__version__

'1.2.4'

In [360]:
# Load the avocado data set from the working directory into the notebook-wide frame `df`.
df = pd.read_csv('avocado.csv')

In [361]:
#--- MISSING VALUES---
#
# the data set has no missing values originally, so we create some
# artificially: values are removed at random from selected columns.
# The chosen columns are 'AveragePrice', 'type' and, for each affected row,
# one of '4046', '4225', '4770' together with 'Total Volume'
#
#
# 'type' will be imputed using KNNImputer,
# which relies on the other numeric values - organic avocados
# obviously have different statistics than
# conventional ones
#
# 4046, 4225, 4770 and the total amount - using the KNN imputer,
# because these values are strongly tied
# to economic conditions, so e.g. equally wealthy areas are
# very likely to have similar purchase statistics
#
# AveragePrice will be imputed by using mean value of
# all table (tbh, for diversity)
#
#---PREPROCESSING---
#
# 1. type has 2 values, thus we can use one hot encoder
# feature - nice way to turn 'object' into numeric here
#
# 2. index tells us nothing and should be dropped
#
# 3. date is not useful while it's object
# no visible sense in storing day of observation
# all the more there are a few unique day values per month
# so let's create a column how many months passed since some moment
# - object becomes ordinal feature
#
# 4. add column 'money spent' that is composition of 'Total Volume'
# and 'AveragePrice' - gives us more details about money
#
# 5. amount of regions will be reduced to 10
# say we have 100 avocado purchase stats where total money spent on avocado
# is between 250 000 and 500 000; 70 of them is Texas
# then we match 30 remaining as Texas too
# and lets use one hot encoder after this step
#
# we lose some region data in this step, but it looks like
# a nice alternative to plain one hot encoding (too many columns),
# to mean target encoding (we would lose the region data completely)
# and to label encoding (there are no 'better' regions)

In [362]:
# Random data deletion: blank out 'AveragePrice' in roughly 15% of the rows
# (rows are drawn with replacement, so fewer distinct rows may be hit).
# random.randrange excludes its upper bound, so every drawn label exists in
# the frame's default 0..n-1 index; random.randint is inclusive and could
# draw the non-existent label df.shape[0].
df.loc[[random.randrange(df.shape[0]) for i in range(0, int(0.15*df.shape[0]))], 'AveragePrice'] = np.nan

In [363]:
# Blank out 'type' in roughly 15% of the rows. random.randrange keeps the drawn
# labels inside 0..n-1 (random.randint would also allow the missing label n).
df.loc[[random.randrange(df.shape[0]) for i in range(0, int(0.15*df.shape[0]))], 'type'] = np.nan

In [364]:
# For roughly 15% of the rows: remove one randomly chosen size column and the
# matching 'Total Volume' (which can later be recomputed from the sizes).
cols_to_del_from = ['4046', '4225', '4770']

for i in range(0, int(0.15*df.shape[0])):
    curr_col_to_del_ind = random.randrange(len(cols_to_del_from))
    # randrange excludes the upper bound; an inclusive draw could return
    # df.shape[0], and assigning to that missing label through .loc would
    # silently append a new all-NaN row to the frame.
    curr_row_to_del_ind = random.randrange(df.shape[0])
    df.loc[curr_row_to_del_ind, cols_to_del_from[curr_col_to_del_ind]] = np.nan
    df.loc[curr_row_to_del_ind, 'Total Volume'] = np.nan

In [365]:
df

# Now we have data frame with missing values to impute

Unnamed: 0,index,Date,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,type,year,region
0,0,2015-12-27,,64236.62,1036.74,54454.85,48.16,8696.87,8603.62,93.25,0.0,,2015,Albany
1,1,2015-12-20,1.35,54876.98,674.28,44638.81,58.33,9505.56,9408.07,97.49,0.0,conventional,2015,Albany
2,2,2015-12-13,0.93,118220.22,794.70,109149.67,130.50,8145.35,8042.21,103.14,0.0,,2015,Albany
3,3,2015-12-06,1.08,78992.15,1132.00,71976.41,72.58,5811.16,5677.40,133.76,0.0,conventional,2015,Albany
4,4,2015-11-29,1.28,51039.60,941.48,43838.39,75.78,6183.95,5986.26,197.69,0.0,conventional,2015,Albany
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18244,7,2018-02-04,1.63,17074.83,2046.96,1529.20,0.00,13498.67,13066.82,431.85,0.0,,2018,WestTexNewMexico
18245,8,2018-01-28,1.71,,1191.70,3431.50,,9264.84,8940.04,324.80,0.0,organic,2018,WestTexNewMexico
18246,9,2018-01-21,1.87,13766.76,1191.92,2452.79,727.94,9394.11,9351.80,42.31,0.0,organic,2018,WestTexNewMexico
18247,10,2018-01-14,1.93,16205.22,1527.63,2981.04,727.01,10969.54,10919.54,50.00,0.0,organic,2018,WestTexNewMexico


# Custom classes for pipeline construction

In [366]:
# numeric_only=True restricts the median to numeric columns explicitly, so
# the string columns ('Date', 'region') are skipped on purpose instead of
# relying on pandas dropping them implicitly.
df.groupby('type').median(numeric_only=True)

# The medians differ strongly between the two types, which suggests that
# 'type' can be imputed from the 4225, 4046 and 4770 volumes - that is how
# we 'estimate the probability' that a given row of avocado statistics
# describes organic or conventional avocados.

Unnamed: 0_level_0,index,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,year
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
conventional,23,1.13,412213.0,105339.58,138937.95,6107.57,100570.55,76008.08,14816.18,135.85,2016
organic,24,1.63,10676.48,889.605,3048.57,0.0,5093.575,2880.095,380.075,0.0,2016


In [367]:
class ImputeAvPriceByRegionMean(TransformerMixin):
    """
    Imputes missing 'AveragePrice' values with the mean 'AveragePrice'
    of the row's 'region', as learned during fit.

    Rows whose region was not seen during fit (or whose region had no
    known price at all) fall back to the overall mean price of the
    fit data.
    """

    def __init__(self):
        """
        Initializes a new instance of the imputer
        """
        TransformerMixin.__init__(self)

    def fit(self, X, y=None):
        """
        Learns the per-region mean of 'AveragePrice' and the overall mean.

        Parameters
        ----------
        X: pandas.DataFrame,
            must contain the columns 'region' and 'AveragePrice'.

        Returns self.
        """
        # Overall mean: fallback for regions without a usable regional mean.
        self.missing_key_val = X['AveragePrice'].mean()
        region_means = X.groupby('region')['AveragePrice'].mean()
        # A region whose prices are all missing has a NaN mean; leave it out
        # so that such rows use the fallback instead of staying NaN.
        self.impute_dict = region_means.dropna().to_dict()
        return self

    def transform(self, X, y=None):
        """
        Returns a copy of X in which every missing 'AveragePrice' is
        replaced by the learned mean of its region (or by the overall
        mean when the region is unknown).
        """
        X_copy = X.copy()
        missing = X_copy['AveragePrice'].isna()
        regions = X_copy.loc[missing, 'region']
        # Known region -> regional mean, anything else -> overall mean.
        fill_values = pd.Series(
            [self.impute_dict.get(reg, self.missing_key_val) for reg in regions],
            index=regions.index,
            dtype=float,
        )
        X_copy.loc[missing, 'AveragePrice'] = fill_values
        return X_copy

In [368]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    """
    Pipeline step that keeps only a fixed subset of the
    dataset's columns.
    """

    def __init__(self, feature_names):
        """
        Store the list of column names that transform() will keep.
        """
        for base in (BaseEstimator, TransformerMixin):
            base.__init__(self)
        self.feature_names = feature_names

    def fit(self, X, y=None):
        """
        Nothing is learned from X; present only to satisfy the
        transformer interface. Return self.
        """
        return self

    def transform(self, X, y=None):
        """
        Return the part of X made of the configured columns.
        """
        wanted = self.feature_names
        subset = X[wanted]
        return subset

In [369]:
class ImputeUpdateTotalVolume(TransformerMixin):
    """
    Recomputes 'Total Volume' as the sum of the size columns
    '4046', '4225' and '4770' of the frame being transformed.
    """

    def __init__(self):
        """
        Initializes a new instance of the imputer
        """
        TransformerMixin.__init__(self)

    def fit(self, X, y=None):
        """
        Nothing is learned: the new 'Total Volume' depends only on the
        rows passed to transform().
        Returns self
        """
        return self

    def transform(self, X, y=None):
        """
        Returns a copy of X whose 'Total Volume' column is the row-wise
        sum '4046' + '4225' + '4770' (NaN if any of the three is NaN).
        """
        X_copy = X.copy()
        # Compute the sum from the frame given here. Re-using a Series
        # computed in fit() would be aligned by index label, so a frame
        # with different rows (e.g. a test split) would get NaN.
        X_copy['Total Volume'] = X_copy['4046'] + X_copy['4225'] + X_copy['4770']
        return X_copy

In [370]:
class DateTransformer(BaseEstimator, TransformerMixin):
    """
    The class provides functionality for converting date columns to numeric.
    Converts dates to a number indicating the amount of time
    that has elapsed from a certain point in time.
    """

    def __init__(self, timepoint, transform_to, drop):
        """
        Initialize class instance by setting convert options.

        Parameters
        ----------
        timepoint : pandas.Timestamp,
            the time point from which the count is taken.
        transform_to: str,
            unit of time to use for calculating the result.
            options:
            - 'y' -- years (365 days);
            - 'm' -- months (30 days);
            - 'w' -- weeks (7 days);
            - 'd' -- days.
            Any other value is treated like 'd'.
        drop: bool,
            if True, remove the original columns from the result.
        """
        BaseEstimator.__init__(self)
        TransformerMixin.__init__(self)
        self.timepoint = timepoint
        self.transform_to = transform_to
        self.drop = drop

    def fit(self, X, y=None):
        """
        Fit DateTransformer to X, but really do nothing.
        Return self.
        """
        return self

    def transform(self, X, y=None):
        """
        Transform X using the parameters set in the constructor.

        Every column of X is parsed as a date; for each one a column named
        '<column>_<transform_to>' is added, holding the number of whole days
        since `timepoint` divided by the length of the chosen unit.

        Return a new dataframe; X itself is left unchanged.
        """
        options = dict(d=1, w=7, m=30, y=365)
        div = options.get(self.transform_to, 1)
        origin = pd.to_datetime(self.timepoint)
        # Remember the source columns before any new ones are added.
        source_columns = list(X.columns)
        # Work on a copy so the frame handed in by the caller is not modified.
        result = X.copy()
        for col in source_columns:
            # Parse the whole column at once instead of element by element.
            elapsed = pd.to_datetime(result[col]) - origin
            result[f'{col}_{self.transform_to}'] = elapsed.dt.days / div
        if self.drop:
            result = result.drop(columns=source_columns)
        return result

In [371]:
class ColumnDropTransformer(TransformerMixin):
    """
    Drops given columns from dataset
    """

    def __init__(self, columns, copy):
        """
        Initializes a new instance of ColumnDropTransformer

        Parameters
        ----------
        columns: list,
            list of column names to delete.
        copy: Boolean,
            kept for interface compatibility. transform() returns a new
            frame and leaves its input untouched for either value,
            because DataFrame.drop does not modify the frame it is
            called on.
        """
        TransformerMixin.__init__(self)
        self.cols_to_del = columns
        self.copy = copy

    def fit(self, X, y=None):
        """
        Fit ColumnDropTransformer to X, but really do nothing.
        Return self.
        """
        return self

    def transform(self, X, y=None):
        """
        Returns a new frame equal to X without the configured columns.
        Raises KeyError if one of the columns is not present in X.
        """
        # Pass the labels through the `columns=` keyword: the positional
        # axis argument of DataFrame.drop is not accepted by newer pandas.
        return X.drop(columns=list(self.cols_to_del))
    

In [372]:
class ColumnTotalSumAddTransformer(TransformerMixin):
    """
    Adds a 'TotalSum' column holding the product of
    'Total Volume' and 'AveragePrice'.
    """

    def __init__(self, copy=True):
        """
        Creates a new instance of ColumnTotalSumAddTransformer.

        Parameters
        ----------
        copy: Boolean,
            when True the column is added to a copy of the input,
            otherwise it is written into the input frame itself.
        """
        TransformerMixin.__init__(self)
        self.copy = copy

    def fit(self, X, y=None):
        """
        Nothing to learn from X; returns self.
        """
        return self

    def transform(self, X, y=None):
        """
        Returns the frame (a copy of X or X itself, depending on `copy`)
        with the 'TotalSum' column set to 'Total Volume' * 'AveragePrice'.
        """
        target = X.copy() if self.copy == True else X
        target.loc[:, 'TotalSum'] = target['Total Volume']*target['AveragePrice']
        return target

In [373]:
class KeyValueRowDropTransformer(TransformerMixin):
    """
    Filters a dataframe by removing every row whose `key` column
    holds the given `value`.
    """

    def __init__(self, key, value):
        """
        Initializes a new instance of KeyValueRowDropTransformer

        Parameters
        ----------
        key: str,
            name of the column to inspect.
        value: str,
            rows with this value in the `key` column are removed.
        """
        TransformerMixin.__init__(self)
        self.value = value
        self.key = key

    def fit(self, X, y=None):
        """
        Nothing to learn from X; returns self.
        """
        return self

    def transform(self, X, y=None):
        """
        Returns the rows of X whose `key` column differs from `value`.
        """
        keep_mask = X[self.key] != self.value
        return X[keep_mask]

In [374]:
class FeatureDecreaseAmountTransformer(TransformerMixin):
    """
    Decreases the amount of unique values in a chosen feature.

    During fit the rows are ordered by the `estimator` feature and split
    into `new_col_amount` equally sized bins; each bin remembers its most
    frequent `feature` value and the largest estimator value it contains.
    During transform every row is assigned the value of the bin its
    estimator value falls into.
    """

    def __init__(self,  feature, estimator, copy=True, new_col_amount=10):
        """
        Initializes a new instance of FeatureDecreaseAmountTransformer

        Parameters
        ----------
        feature: str,
            feature whose amount of unique values
            needs to be decreased.
        estimator: str,
            feature used to order the rows and to split them into bins.
        new_col_amount: int,
            amount of bins to split the rows into.
            The new amount of unique values will be less than or equal to
            new_col_amount.
        copy: Boolean,
            kept for interface compatibility; transform() returns a new
            frame and leaves its input untouched for either value.
        """
        TransformerMixin.__init__(self)
        self.copy = copy
        self.new_col_amount = new_col_amount
        self.column = feature
        self.estimator = estimator

    def fit(self, X, y=None):
        """
        Learns, for every bin, the prevailing value of `feature` and the
        upper edge of the bin on the `estimator` scale.

        Raises ValueError when no bin with at least one known feature
        value can be built (e.g. X is empty).
        Returns self.
        """
        # Rows with a missing estimator value are ordered last.
        ordered = X.sort_values(self.estimator)
        n_rows = ordered.shape[0]
        estimator_values = ordered[self.estimator].to_numpy()
        labels = ordered[self.column]

        self.bin_upper_edges = []
        self.bin_labels = []
        for bin_idx in range(self.new_col_amount):
            start = n_rows * bin_idx // self.new_col_amount
            stop = n_rows * (bin_idx + 1) // self.new_col_amount
            counts = labels.iloc[start:stop].value_counts()
            if counts.empty:
                # Bin without rows or without any known feature value:
                # its value range is absorbed by the next learned bin.
                continue
            self.bin_labels.append(counts.idxmax())
            self.bin_upper_edges.append(estimator_values[stop - 1])

        if not self.bin_labels:
            raise ValueError(
                'FeatureDecreaseAmountTransformer could not build any bin '
                'from the data passed to fit')
        return self

    def transform(self, X, y=None):
        """
        Returns a copy of X (same rows, same order) in which `feature`
        is replaced by the prevailing value of the bin matching each
        row's `estimator` value.
        """
        values = X[self.estimator].to_numpy()
        if len(self.bin_labels) > 1:
            # The last bin is open-ended, so only the inner edges are used.
            # Values above every inner edge, including NaN, land in the
            # last bin.
            inner_edges = np.asarray(self.bin_upper_edges[:-1])
            positions = np.searchsorted(inner_edges, values, side='left')
        else:
            positions = np.zeros(len(values), dtype=int)

        result = X.copy()
        result[self.column] = np.asarray(self.bin_labels, dtype=object)[positions]
        return result

In [375]:
def my_round(X):
    """
    Binarizes the sequence X in place:
        elements >= 0.5 become 1,
        all other elements become 0.
    Returns the same (modified) collection.
    """
    for idx in range(len(X)):
        X[idx] = 1 if X[idx] >= 0.5 else 0
    return X

def my_map(X, mapper):
    """
    Replaces, in place, every element of the sequence X by
    mapper[element].
    Returns the same (modified) collection.
    """
    for idx in range(len(X)):
        key = X[idx]
        X[idx] = mapper[key]
    return X

class AvocadoTypeImputer(KNNImputer):
    """
    Imputes 'type' feature labels. Based on KNNImputer.

    'type' is encoded as 0 ('conventional') / 1 ('organic'), imputed
    together with the numeric `features` by an inner KNNImputer, rounded
    back to 0/1 and decoded to the original labels.
    """
    def __init__(self, features):
        """
        Initializes a new instance of AvocadoTypeImputer

        Parameters
        ----------
        features: list of str,
            numeric columns the imputer uses, next to 'type' itself,
            to find neighbouring rows.
        """
        KNNImputer.__init__(self)
        # Store the list as given. Appending 'type' to it here would modify
        # the caller's list and add 'type' once more every time the
        # estimator is re-created from its own parameters.
        self.features = features

    def _knn_columns(self):
        """
        Returns the columns handed to the inner KNNImputer:
        the configured features followed by a single 'type' column.
        """
        return [col for col in self.features if col != 'type'] + ['type']

    def fit(self, X, y=None):
        """
        Fits to X. Maps 'type' labels to numbers and fits the inner
        KNNImputer on the selected columns.
        Return self
        """
        self.type_remapper = {'conventional' : 0, 'organic' : 1}
        knn_input = X[self._knn_columns()].copy()
        knn_input['type'] = knn_input['type'].map(self.type_remapper)
        self.knn_imputer = KNNImputer()
        self.knn_imputer.fit(knn_input)
        return self

    def transform(self, X, y=None):
        """
        Transforms X: lets the fitted KNNImputer fill the encoded 'type'
        column and maps the result back to labels.
        Returns a copy of X with imputed 'type' values.
        """
        X_copy = X.copy()
        # Explicit copy: the encoded column is written into this frame.
        knn_input = X_copy[self._knn_columns()].copy()
        type_col_pos = knn_input.columns.get_loc('type')
        knn_input['type'] = knn_input['type'].map(self.type_remapper)

        imputed = self.knn_imputer.transform(knn_input)
        # An imputed value is an average over neighbours, i.e. somewhere
        # between 0 and 1; >= 0.5 counts as 'organic' (1).
        type_codes = np.where(imputed[:, type_col_pos] >= 0.5, 1, 0)
        type_remapper_reverse = {0: 'conventional', 1: 'organic'}
        X_copy['type'] = pd.Series(type_codes, index=X_copy.index).map(type_remapper_reverse)
        return X_copy

In [376]:
class KNNInlineImpute(TransformerMixin):
    """
    Thin wrapper around KNNImputer that writes the imputed values
    back into the original columns of the data frame.
    """
    def __init__(self, features):
        """
        Initializes a new instance of KNNInlineImpute

        Parameters
        ----------
        features: list of str,
            columns that are passed to the KNNImputer.
        """
        TransformerMixin.__init__(self)
        self.features = features
        self.knn_imputer = KNNImputer()

    def fit(self, X, y=None):
        """
        Fits the inner KNNImputer on X[features].
        Returns self
        """
        training_columns = X[self.features]
        self.knn_imputer.fit(training_columns)
        return self

    def transform(self, X, y=None):
        """
        Runs the inner KNNImputer on X[features] and stores the result
        in the same columns of a copy of X.
        Returns the transformed copy
        """
        result = X.copy()
        filled = self.knn_imputer.transform(result[self.features])
        result[self.features] = filled
        return result

# Building pipelines


In [377]:
# Imputation stage: fills the artificially removed values, one column
# group per step. The steps run in the listed order, so the size columns
# are filled before 'Total Volume' and 'type' are derived from them.
impute_pipeline = Pipeline(
    steps=[
        # KNN imputation of the three size columns.
        ('sizes_impute', KNNInlineImpute(['4046', '4225', '4770'])),
        # Fills missing 'AveragePrice' values.
        ('av_price_impute', ImputeAvPriceByRegionMean()),
        # Rewrites 'Total Volume' from the size columns.
        ('total_volume_impute', ImputeUpdateTotalVolume()),
        # KNN imputation of 'type', using the size columns as neighbours' features.
        ('type_impute', AvocadoTypeImputer(features=['4046', '4225', '4770']))
    ]
)

In [427]:
# Feature pipelines: each one selects a group of columns from the imputed
# frame and turns it into numeric features.

# 'Date' -> elapsed time since 2015-01-01, expressed in 30-day months
# (transform_to='m'); the original 'Date' column is dropped.
date_pipeline = Pipeline(
    steps=[
        ('date_selector', FeatureSelector(['Date'])),
        ('date_transformer', DateTransformer(pd.Timestamp(2015,1,1), transform_to='m', drop=True)),
    ]
)

# 'AveragePrice' is passed through unchanged.
av_price_pipeline = Pipeline(
    steps=[
        ('average_price_select', FeatureSelector(['AveragePrice']))
    ]
)

# Size columns and 'Total Volume' are passed through unchanged.
volume_pipeline = Pipeline(
    steps=[
        ('select_sizes_volume', FeatureSelector(['4046', '4225', '4770', 'Total Volume'])),
    ]
)

# Adds 'TotalSum' ('Total Volume' * 'AveragePrice') and keeps only that column.
add_columns_pipeline = Pipeline(
    steps=[
        ('add_total_sum', ColumnTotalSumAddTransformer()),
        ('select_total_sum', FeatureSelector(['TotalSum']))
    ]
)

# One-hot encoding of 'type' as a dense array.
type_pipeline = Pipeline(
    steps=[
        ('type_select_to_ohe', FeatureSelector(['type'])),
        ('type_one_hot', OneHotEncoder(sparse=False))
    ]
)

# Reduces the number of distinct 'region' values (ranging the rows by
# 'TotalSum', which is added first) and one-hot encodes the result.
region_pipeline = Pipeline(
    steps=[
        ('add_total_sum_to_range', ColumnTotalSumAddTransformer()),
        ('region_reduce', FeatureDecreaseAmountTransformer(feature='region',
                                                           estimator='TotalSum')),
        ('region_select', FeatureSelector(['region'])),
        ('region_one_hot', OneHotEncoder(sparse=False))
    ]
)

# Bag-count columns are passed through unchanged.
bags_pipeline = Pipeline(
    steps=[
        ('select_bags', FeatureSelector(['Total Bags', 'Small Bags', 'Large Bags', 'XLarge Bags']))
    ]
)


In [504]:
# Runs the feature pipelines side by side and concatenates their outputs
# column-wise into one feature matrix.
gather_and_transform_pipeline = FeatureUnion(transformer_list=[
    # ('volumes', volume_pipeline) is disabled: with this branch enabled the
    # full pipeline failed, although the imputed columns showed no NaN when
    # checked on the data the pipeline was fitted on.
    # NOTE(review): 'Total Volume' is written by the 'total_volume_impute'
    # step; before re-enabling this branch, confirm that this column is also
    # NaN-free on data that was not used for fitting (e.g. the test split).
    ('date', date_pipeline),
    ('av_price', av_price_pipeline),
    ('bags', bags_pipeline),
    ('type', type_pipeline),
    ('region', region_pipeline)
])


In [513]:
# Complete preparation: impute the missing values first, then build the
# numeric feature matrix from the imputed frame.
full_pipeline = Pipeline(
    steps=[
        ('impute', impute_pipeline),
        ('transform', gather_and_transform_pipeline)
    ]
)


Unnamed: 0,4046,4225,4770,Total Volume
0,1036.74,54454.85,48.160,55539.750
1,674.28,44638.81,58.330,45371.420
2,794.70,109149.67,130.500,110074.870
3,1132.00,71976.41,72.580,73180.990
4,941.48,43838.39,75.780,44855.650
...,...,...,...,...
18244,2046.96,1529.20,0.000,3576.160
18245,1191.70,3431.50,23.224,4646.424
18246,1191.92,2452.79,727.940,4372.650
18247,1527.63,2981.04,727.010,5235.680


In [507]:
from sklearn.neighbors import KNeighborsClassifier

Now let's use the preprocessed data to try to "predict" something
(since we are only expected to try out the API, the target labels will be random)

In [508]:
def gc(method, ctor_params, params, preparation, data_path='avocado.csv', seed=None):
    """
    Builds a pipeline of full data preparation followed by a grid search,
    fits it on half of the data set with random labels and reports the
    result on the other half.

    Parameters
    ----------
    method: <class 'type'>,
        estimator class to tune (e.g. KNeighborsClassifier).
    ctor_params: dict,
        parameters for `method` initialization (__init__ method).
    params: dict or list of dict,
        grid definition for GridSearchCV.
    preparation: pipeline, transformer,
        preparation step for pipeline.
    data_path: str,
        path of the CSV file to load. Defaults to 'avocado.csv'.
    seed: int or None,
        seed for the random labels. With None (default) the labels are
        drawn from the global `random` module state; with an int they
        are reproducible from run to run.

    Returns
    -------
    The fitted pipeline; the grid search is available as pip['gc'].
    """
    # A dedicated generator keeps a seeded run from touching the global
    # random state; without a seed the global module is used as before.
    rng = random if seed is None else random.Random(seed)

    pip = Pipeline(
        steps=[
            ('preparation', preparation),
            ('gc', GridSearchCV(method(**ctor_params), params, n_jobs=-1,
                                scoring='accuracy', cv=5, refit=True, verbose=2))
        ]
    )

    data = pd.read_csv(data_path).dropna()
    # Even rows for training, odd rows for testing, at most 8000 rows each.
    X_train = data[::2][0:8000]
    X_test = data[1::2][0:8000]
    # The labels are random on purpose: only the API is exercised here.
    y_train = [rng.randint(0, 100) for _ in range(X_train.shape[0])]
    y_test = [rng.randint(0, 100) for _ in range(X_test.shape[0])]

    pip.fit(X_train, y_train)
    y_true, y_pred = y_test, pip.predict(X_test)

    print(metrics.classification_report(y_true, y_pred))
    print('Best params found:\n', pip['gc'].best_params_)
    return pip

In [509]:
# Run the search with default KNeighborsClassifier settings and an empty
# parameter grid (a single candidate), using the full preparation pipeline.
gc(KNeighborsClassifier, {}, {}, full_pipeline)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
              precision    recall  f1-score   support

           0       0.01      0.04      0.02        76
           1       0.01      0.05      0.01        62
           2       0.01      0.07      0.02        80
           3       0.01      0.03      0.01        76
           4       0.02      0.07      0.03        72
           5       0.01      0.03      0.01        97
           6       0.02      0.06      0.03        83
           7       0.01      0.02      0.01        97
           8       0.02      0.05      0.02        83
           9       0.00      0.01      0.01        85
          10       0.02      0.05      0.03        83
          11       0.00      0.01      0.01        70
          12       0.01      0.03      0.01        79
          13       0.01      0.03      0.01        65
          14       0.02      0.05      0.03        74
          15       0.01      0.03      0.01        76
          16       0.0

Pipeline(steps=[('preparation',
                 Pipeline(steps=[('impute',
                                  Pipeline(steps=[('sizes_impute',
                                                   <__main__.KNNInlineImpute object at 0x00000229002A28E0>),
                                                  ('av_price_impute',
                                                   <__main__.ImputeAvPriceByRegionMean object at 0x00000229002A2E80>),
                                                  ('total_volume_impute',
                                                   <__main__.ImputeUpdateTotalVolume object at 0x00000229002A2760>),
                                                  ('type_impute',
                                                   AvocadoTypeImputer(featu...
                                                                                   <__main__.ColumnTotalSumAddTransformer object at 0x0000022900274BE0>),
                                                                     