In [1]:
import numpy as np
import h5py
import dask.array as da
import glob
import timeit

In [2]:
import matplotlib.pyplot as plt

In [3]:
%matplotlib inline

In [None]:
!du -h data/weather-big/*

In [None]:
!du -h data/weather-small/2014-01-01.csv

In [None]:
!du -h data/weather-small/2014-01-01.hdf5

In [None]:
%timeit temps = np.loadtxt('data/weather-small/2014-01-01.csv', delimiter=',')

In [None]:
%timeit temps = h5py.File('data/weather-small/2014-01-02.hdf5')['/t2m'][...]

In [4]:
dset = h5py.File('data/weather-big/2014-01-01.hdf5')['/t2m']

In [None]:
print(dset)

In [None]:
temps = dset[...]

In [None]:
temps.shape

In [None]:
print(temps)

Next line takes a couple of minute to run

In [None]:
#plt.imshow(temps)
#plt.colorbar()

We can get away with plotting a smaller array:

In [None]:
plt.imshow(temps[::100, ::100])
plt.colorbar()

What if we wanted the mean at each day?

In [10]:
file_list = glob.glob('data/weather-big/*.hdf5')
num_files = len(file_list)

t1 = timeit.default_timer()
for filename in file_list:
    print(filename)
    print(h5py.File(filename)['/t2m'][...].mean())
    
t2 = timeit.default_timer()
print(t2-t1)

data/weather-big/2014-01-06.hdf5
168.855123457
data/weather-big/2014-01-02.hdf5
154.539969136
data/weather-big/2014-01-01.hdf5
157.049490741
data/weather-big/2014-01-04.hdf5
159.862962963
data/weather-big/2014-01-03.hdf5
160.815709877
data/weather-big/2014-01-07.hdf5
163.755972222
data/weather-big/2014-01-05.hdf5
165.088395062
85.92660486500245


In [14]:
file_list = glob.glob('data/weather-big/*.hdf5')
num_files = len(file_list)

t1 = timeit.default_timer()
for filename in file_list:
    print(filename)
    print(da.from_array(h5py.File(filename)['/t2m'], chunks=(5000, 5000)).mean().compute())
t2 = timeit.default_timer()
print(t2-t1)

data/weather-big/2014-01-06.hdf5
168.855123457
data/weather-big/2014-01-02.hdf5
154.539969136
data/weather-big/2014-01-01.hdf5
157.049490741
data/weather-big/2014-01-04.hdf5


KeyboardInterrupt: 

In [6]:
temps = da.stack(dsets, axis=0)

In [7]:
temps

dask.array<stack, shape=(7, 23040, 46080), dtype=float64, chunksize=(1, 5000, 5000)>

In [8]:
t1 = timeit.default_timer()
mean_temps = temps.mean(axis=0).compute()
t2 = timeit.default_timer()
print(t2-t1)

151.08153882005718


In [9]:
print(mean_temps)

[[ 74.42857143  74.42857143  74.42857143 ...,  74.57142857  74.57142857
   74.57142857]
 [ 74.42857143  74.42857143  74.42857143 ...,  74.57142857  74.57142857
   74.57142857]
 [ 74.42857143  74.42857143  74.42857143 ...,  74.57142857  74.57142857
   74.57142857]
 ..., 
 [ 91.42857143  91.42857143  91.42857143 ...,  91.57142857  91.57142857
   91.57142857]
 [ 91.42857143  91.42857143  91.42857143 ...,  91.57142857  91.57142857
   91.57142857]
 [ 91.42857143  91.42857143  91.42857143 ...,  91.57142857  91.57142857
   91.57142857]]


In [None]:
print(mean_temps)

In [None]:
print(temps)

In [None]:
6.6777*31


In [None]:
%whos

In [None]:
mean_temps.shape

In [None]:
%clear mean_temps

In [None]:
del mean_temps

In [None]:
%whos

In [None]:
%reset