In [111]:
import numpy as np
from scipy import stats
import datetime
import matplotlib.dates as mdates
from matplotlib import pyplot as plt
from workalendar.usa import UnitedStates
from operator import itemgetter

import pandas as pd

In [115]:
products = list(np.load('office_products_data.npy', allow_pickle=True))
# products = list(np.load('product_electronics_50_price_history.npy', allow_pickle=True))
sample_product = products[24]

In [133]:
class Product:
    '''
    Modularize each product to store each feature as an attribute of a class. 
    Attributes: 
        Mean, Max, Mode, Standard Deviation, Normalized Amazon time Price History
    Methods:
        Holidays overlaid, Ability to detect sales, Derivative of Price History, SCatter plot with various features
        Regression Function
    '''
    
    def __init__(self, product_dict):
        #TODO: Add more asserts
        assert isinstance(product_dict, dict)
        
        self.product_dict = product_dict
#         self.df['amazon_price'] = product_dict['data']['AMAZON']
        
        d = {'amazon_time': product_dict['data']['AMAZON_time'], 'amazon_price': product_dict['data']['AMAZON']}
        self.df = pd.DataFrame(data=d)
        self.df = self.remove_nan(self.df)
        self.df['normalized'] = self.normalize_prices(self.df['amazon_price'])
        self.df['standardized'] = self.standardize_prices(self.df['amazon_price'])
        
#         self.mean = np.average(self.df['amazon_price'])
#         self.max = max(self.df['amazon_price'])
#         self.mode = stats.mode(self.df['amazon_price'])
#         self.std = stats.tstd(self.df['amazon_price'])
        self.mean = self.df['amazon_price'].mean()
        self.max = self.df['amazon_price'].max()
    
    def price_holiday_correlation(self, year=2018):
        '''Plot price history's correlation with a country's holidays

        :param year: Year for which prices are to be plotted 
        :type product: int
        :return: List of dates for which prices are available and their corresponding prices
        :rtype: list
        '''
        
        #Plot the amazon time price history for the given year and overlay a graph of holidays on top of that
        year_index = [iter_date.year == year for iter_date in self.df['amazon_time']]
        
        year_prices = self.df['amazon_price'][np.where(year_index)]
        year_dates = self.product_dict['data']['AMAZON_time'][np.where(year_index)]
        print('Number of price data points for the year %d: %d'% (year, len(year_dates)))
        
        #Plot the dates against prices
        fig, ax = plt.subplots(constrained_layout=True)
        locator = mdates.AutoDateLocator()
        formatter = mdates.AutoDateFormatter(locator)
        ax.xaxis.set_major_locator(locator)
        ax.xaxis.set_major_formatter(formatter)

        ax.plot(year_dates, year_prices, '.')
        ax.set_title('%d Price vs Holidays plotter' % (year))

        cal=UnitedStates()
        us_holidays, _ = zip(*cal.holidays(year))
        for holiday in us_holidays:
            plt.axvline(holiday, color = 'r')
            
        return year_dates, year_prices

    def remove_nan(self, product):
        '''Remove nan prices from price history

        :param product: price history 
        :type product: pd.DataFrame or pd.Series
        :return: product with nan entries removed
        :rtype: pd.DataFrame or pd.Series
        '''
        assert isinstance(product, (pd.DataFrame,pd.Series))

        return product.dropna()

    def standardize_prices(self, x):
        '''Given an iterable list of prices, standardize the prices
        Data is centered at 0 mean with unit variance

        :param x: price history
        :type x: (list, np.ndarray, pd.Series)
        :return: z-score standardized price history
        :rtype: np.ndarray
        '''

        assert isinstance(x, (list, np.ndarray, pd.Series))
        assert all(~np.isnan(i) for i in x)

        return stats.zscore(x)

    def normalize_prices(self, x):
        '''Normalize an iterable list of prices to range (0,1)

        :param x: price history
        :type x: (list, np.ndarray, pd.Series)
        :return: normalized price history
        :rtype: np.ndarray
        '''
        assert isinstance(x, (list, np.ndarray, pd.Series))
        assert all(~np.isnan(i) for i in x)

        return (x-np.min(x))/(np.max(x) - np.min(x))
    
    def derivative_prices(self, x):
        '''Derivative of price history
        
        :param x: price history
        :type x: (list, np.ndarray, pd.Series)
        :return: derivative of price history
        :rtype: pd.Series
        '''
        assert isinstance(x, (list, np.ndarray, pd.Series))
        assert all(~np.isnan(i) for i in x)
        
        return pd.Series(np.gradient(x.values), x.index, name='gradient')

In [135]:
product_object = Product(sample_product)
# print(product_object.amazon_price_history)
print(product_object.df.head())
print(product_object.mean)
print(product_object.max)
print(product_object.derivative_prices(product_object.df['amazon_price']))

# print(product_object.mode)
# print(product_object.std)
year = 2018
product_object.price_holiday_correlation(year)

          amazon_time  amazon_price  normalized  standardized
0 2011-03-24 16:00:00         11.88    0.282900     -0.566087
2 2011-04-22 00:00:00         16.99    0.685579      1.944550
3 2011-05-03 13:00:00         16.79    0.669819      1.846287
4 2011-05-03 22:00:00         16.28    0.629630      1.595714
5 2011-05-06 11:00:00         16.99    0.685579      1.944550
13.032179775280834
20.98
0       5.110
2       2.455
3      -0.355
4       0.100
5       0.255
6      -0.150
7      -0.255
8      -0.460
9       0.000
10     -2.390
11     -1.025
12      2.135
13      0.975
14      0.715
15      0.250
16     -0.165
17      0.000
18     -0.545
19      0.000
20      0.710
21     -0.290
22     -0.400
23     -0.320
24     -0.325
25     -0.225
26     -0.465
27     -0.470
28     -0.245
29     -0.250
30     -0.255
        ...  
1771   -1.255
1772    0.000
1773    1.540
1774   -0.040
1775   -0.150
1776    0.030
1777   -0.150
1778    0.010
1779    0.300
1780   -0.060
1781    0.010
1782   -0.070
1

ValueError: Can only tuple-index with a MultiIndex

In [30]:
a = np.array([1, 3, 4, 5, 6, -1])
print(a[np.where(a > 1)])
del(dates)

[3 4 5 6]
