Demonstrate that a simple interface can be specified for loading the data sources.

In [1]:
import numpy as np
import pandas as pd

%load_ext autoreload

In [2]:
%autoreload 2

from ldr import Schema, Filter, Selector
from ingresso import Sales0

In [3]:
# Registry of input files: each key maps to [CSV path, Schema for that file].
# The Schema `desc` values are the names the loaded series are matched on later
# (e.g. "weather-metoffice").
sources = {
    "fx": ["tests/media/gbp-usd.csv", Schema(desc="fx")],
    "fx2": ["tests/media/usd-gbp.csv", Schema(desc="fx-datahub")],
    "sales": ["tests/media/sales.csv", Schema(desc="sales")],
    "weather": ["tests/media/london.csv", Schema(desc="weather")],
    "weather2": ["tests/media/metoffice.csv", Schema(desc="weather-metoffice")],
}

In [4]:
# One Filter per registered source, built from its (path, schema) pair.
filters = [Filter(entry[0], entry[1]) for entry in sources.values()]

In [5]:
def postprocess0(f0):
    """Return the "value" series of a loaded source.

    Calls ``f0.series("datetime", index=True)`` first and discards its
    result, then returns ``f0.series("value")``.

    NOTE(review): the first call presumably installs the datetime column as
    the index of the returned series -- confirm against ``ldr.Filter``.

    Parameters
    ----------
    f0 : object exposing ``series(name, index=...)``; in this notebook these
        are the ``Filter`` instances built from ``sources``.

    Returns
    -------
    Whatever ``f0.series("value")`` returns.
    """
    f0.series("datetime", index=True)
    values = f0.series("value")
    return values

In [6]:
# Extract the "value" series from every filter, in the same order as `filters`.
series = [postprocess0(source_filter) for source_filter in filters]

Finally, merge the series together; the monthly Met Office series is passed separately.

In [7]:
# Split the loaded series by name: everything except the Met Office series is
# combined into one frame; the Met Office series is handed over on its own
# (as a list, exactly as the filter produces it).
s0s = [s for s in series if s.name != "weather-metoffice"]
s1 = [s for s in series if s.name == "weather-metoffice"]

# A DataFrame built from a list of Series has one row per series; transposing
# turns each series into a column.
df = pd.DataFrame(s0s).transpose()

s0 = Sales0(df, metoffice=s1)
print(s0)

'Sales0: fx, fx-datahub, sales, weather'


In [8]:
# constrain() must run before s0._cdf is read below.
# NOTE(review): presumably it builds _cdf by restricting _df to a date range -- confirm in ingresso.Sales0.
s0.constrain()

# Summary statistics per column, then the number of rows, then the first rows.
print(s0._cdf.describe())
print("series-length: {0:d}".format(len(s0._cdf)))
s0._cdf.head()

               fx   fx-datahub        sales      weather
count  788.000000  1122.000000  1864.000000  1773.000000
mean     1.366371     1.473153  1115.015558    12.029724
std      0.098996     0.148440   927.505853     5.406125
min      1.204800     1.211827     0.000000    -4.100000
25%      1.292975     1.316569   310.000000     7.900000
50%      1.338350     1.512745   852.500000    11.900000
75%      1.437975     1.594070  1805.250000    16.300000
max      1.577600     1.716444  4864.000000    27.900000
series-length: 1864


Unnamed: 0_level_0,fx,fx-datahub,sales,weather
dt0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2013-06-11,,1.558603,0.0,12.3
2013-06-12,,1.569612,3.0,5.8
2013-06-13,,1.569859,12.0,
2013-06-14,,1.568627,17.0,
2013-06-15,,,14.0,


In [35]:
# Row count of the constrained frame, for comparison with the null counts below.
print(s0._cdf.shape[0])
# Per-column null summary; the rendered table has columns N and R
# (presumably null count and null ratio -- confirm in ldr.Selector).
Selector.nulls(s0._cdf)

1864


Unnamed: 0,N,R
fx,1076,0.6
fx-datahub,742,0.4
weather,91,0.0
sales,0,0.0


In [30]:
# Fraction of missing values per column of the constrained frame.
df = s0._cdf
df.isna().sum() / df.shape[0]

fx            0.577253
fx-datahub    0.398069
sales         0.000000
weather       0.048820
dtype: float64

In [10]:
# First index label carrying a non-null sales value in s0._df.
x0 = s0._df['sales']
x1 = x0.dropna().index[0]

# Last index label carrying a non-null sales value in s0._cdf (displayed).
x0 = s0._cdf['sales']
x0.dropna().index[-1]

Timestamp('2018-07-18 00:00:00')

In [11]:
# Call weather() first, then display the `weather` attribute.
# NOTE(review): the rendered output is a date-indexed series rather than a bound
# method, so the call presumably replaces or populates `s0.weather` -- confirm in
# ingresso.Sales0. If so, this cell is not safe to run twice.
s0.weather()
s0.weather

dt0
1971-01-31          NaN
1971-02-28          NaN
1971-03-31          NaN
1971-04-30          NaN
1971-05-31          NaN
1971-06-30          NaN
1971-07-31          NaN
1971-08-31          NaN
1971-09-30          NaN
1971-10-31          NaN
1971-11-30          NaN
1971-12-31          NaN
1972-01-31          NaN
1972-02-29          NaN
1972-03-31          NaN
1972-04-30          NaN
1972-05-31          NaN
1972-06-30          NaN
1972-07-31          NaN
1972-08-31          NaN
1972-09-30          NaN
1972-10-31          NaN
1972-11-30          NaN
1972-12-31          NaN
1973-01-31          NaN
1973-02-28          NaN
1973-03-31          NaN
1973-04-30          NaN
1973-05-31          NaN
1973-06-30          NaN
                ...    
2016-07-31    17.038710
2016-08-31    16.829032
2016-09-30    15.136667
2016-10-31    11.532258
2016-11-30     8.979310
2016-12-31     8.693548
2017-01-31     6.670000
2017-02-28     9.992857
2017-03-31    10.961290
2017-04-30    10.776667
2017-05-31  

In [12]:
# Monthly mean of the 'sales' column of s0._df (bins labelled by month end).
sales_by_month = s0._df['sales'].resample("M")
sales_by_month.mean()

dt0
1971-01-31            NaN
1971-02-28            NaN
1971-03-31            NaN
1971-04-30            NaN
1971-05-31            NaN
1971-06-30            NaN
1971-07-31            NaN
1971-08-31            NaN
1971-09-30            NaN
1971-10-31            NaN
1971-11-30            NaN
1971-12-31            NaN
1972-01-31            NaN
1972-02-29            NaN
1972-03-31            NaN
1972-04-30            NaN
1972-05-31            NaN
1972-06-30            NaN
1972-07-31            NaN
1972-08-31            NaN
1972-09-30            NaN
1972-10-31            NaN
1972-11-30            NaN
1972-12-31            NaN
1973-01-31            NaN
1973-02-28            NaN
1973-03-31            NaN
1973-04-30            NaN
1973-05-31            NaN
1973-06-30            NaN
                 ...     
2016-07-31    1319.612903
2016-08-31    1604.032258
2016-09-30    1686.300000
2016-10-31    2079.612903
2016-11-30    2582.233333
2016-12-31    2437.709677
2017-01-31    1726.774194
2017-02-

In [13]:
# Monthly mean of the 'fx' column of s0._df (bins labelled by month end).
fx_by_month = s0._df['fx'].resample("M")
fx_by_month.mean()

dt0
1971-01-31         NaN
1971-02-28         NaN
1971-03-31         NaN
1971-04-30         NaN
1971-05-31         NaN
1971-06-30         NaN
1971-07-31         NaN
1971-08-31         NaN
1971-09-30         NaN
1971-10-31         NaN
1971-11-30         NaN
1971-12-31         NaN
1972-01-31         NaN
1972-02-29         NaN
1972-03-31         NaN
1972-04-30         NaN
1972-05-31         NaN
1972-06-30         NaN
1972-07-31         NaN
1972-08-31         NaN
1972-09-30         NaN
1972-10-31         NaN
1972-11-30         NaN
1972-12-31         NaN
1973-01-31         NaN
1973-02-28         NaN
1973-03-31         NaN
1973-04-30         NaN
1973-05-31         NaN
1973-06-30         NaN
                ...   
2016-07-31    1.314776
2016-08-31    1.310461
2016-09-30    1.314859
2016-10-31    1.233462
2016-11-30    1.244145
2016-12-31    1.247350
2017-01-31    1.235336
2017-02-28    1.248875
2017-03-31    1.234743
2017-04-30    1.264385
2017-05-31    1.292213
2017-06-30    1.281214
2017-07

In [14]:
# Pick the filter whose schema description marks it as the Met Office source
# (raises IndexError if there is none).
fw = [f for f in filters if f._schema.desc == "weather-metoffice"][0]

# Add a constant day-of-month column so year/month/day can be assembled into a
# date. This writes into fw._data itself: the frame displayed below shows the
# new `day` column.
x0 = fw._data
x0['day'] = 1
fw._data

Unnamed: 0_level_0,yyyy,mm,tmax,tmin,af,rain,sun,day
dt0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1948-01,1948,1,8.9,3.3,,85.0,,1
1948-02,1948,2,7.9,2.2,,26.0,,1
1948-03,1948,3,14.2,3.8,,14.0,,1
1948-04,1948,4,15.4,5.1,,35.0,,1
1948-05,1948,5,18.1,6.9,,57.0,,1
1948-06,1948,6,19.1,10.3,,67.0,,1
1948-07,1948,7,21.7,12.0,,21.0,,1
1948-08,1948,8,20.8,11.7,,67.0,,1
1948-09,1948,9,19.6,10.2,,35.0,,1
1948-10,1948,10,14.9,6.0,,50.0,,1


In [15]:
# Rename the Met Office year/month columns to `year`/`month`, the names used when
# the dates are assembled; index=str also converts every index label to a string.
x1 = x0.rename(index=str, columns={"yyyy":"year", "mm":"month"})

In [16]:
# Assemble timestamps from the year/month/day columns, then display them as
# monthly periods.
x2 = pd.to_datetime(x1.loc[:, ['year', 'month', 'day']])
x2.dt.to_period('M')

dt0
1948-01   1948-01
1948-02   1948-02
1948-03   1948-03
1948-04   1948-04
1948-05   1948-05
1948-06   1948-06
1948-07   1948-07
1948-08   1948-08
1948-09   1948-09
1948-10   1948-10
1948-11   1948-11
1948-12   1948-12
1949-01   1949-01
1949-02   1949-02
1949-03   1949-03
1949-04   1949-04
1949-05   1949-05
1949-06   1949-06
1949-07   1949-07
1949-08   1949-08
1949-09   1949-09
1949-10   1949-10
1949-11   1949-11
1949-12   1949-12
1950-01   1950-01
1950-02   1950-02
1950-03   1950-03
1950-04   1950-04
1950-05   1950-05
1950-06   1950-06
            ...  
2016-01   2016-01
2016-02   2016-02
2016-03   2016-03
2016-04   2016-04
2016-05   2016-05
2016-06   2016-06
2016-07   2016-07
2016-08   2016-08
2016-09   2016-09
2016-10   2016-10
2016-11   2016-11
2016-12   2016-12
2017-01   2017-01
2017-02   2017-02
2017-03   2017-03
2017-04   2017-04
2017-05   2017-05
2017-06   2017-06
2017-07   2017-07
2017-08   2017-08
2017-09   2017-09
2017-10   2017-10
2017-11   2017-11
2017-12   2017-12
2018-0

In [17]:
# Scratch check: comparing a string literal with itself using != yields False.
# NOTE(review): looks like leftover experimentation unrelated to the data loading -- consider removing.
"x" != "x"

False