In [None]:
import h5py
import numpy as np
import pandas as pd
import cv2

In [2]:
# Read the aerosol-index array and its geolocation arrays straight from the
# HDF5 file, then keep only the samples that fall inside a fixed lat/long box.
H5_PATH = '../data/raw/pollution_data.h5'

# Bounding box limits; both edges are inclusive in the mask below.
LAT_MIN, LAT_MAX = -20, -10
LONG_MIN, LONG_MAX = -60, -50

# The context manager closes the file handle once the arrays have been copied
# into memory. Indexing with f[...] raises KeyError for a missing dataset,
# whereas f.get(...) would hand None on to np.array and fail later with a
# much less helpful error.
with h5py.File(H5_PATH, 'r') as f:
    pollution_data = np.array(f["SCIENCE_DATA/UVAerosolIndex354and388"])
    lat = np.array(f["GEOLOCATION_DATA/Latitude"])
    long = np.array(f["GEOLOCATION_DATA/Longitude"])

# Flatten all three arrays the same way so that position k refers to the same
# sample in each of them.
pol_data = pollution_data.flatten()
lat_idx = lat.flatten()
long_idx = long.flatten()

# Boolean mask of the samples inside the box. lat_idx and long_idx are left
# unfiltered; apply in_box to them as well if matching coordinates are needed.
in_box = (
    (lat_idx >= LAT_MIN) & (lat_idx <= LAT_MAX)
    & (long_idx >= LONG_MIN) & (long_idx <= LONG_MAX)
)
pol_data = pol_data[in_box]

In [5]:
# Stretch the filtered samples onto a grid_size x grid_size grid with OpenCV
# and tabulate the result as one record per grid cell.
# NOTE(review): pol_data looks like a 1-D vector at this point (it was
# flattened and boolean-masked upstream), so the resize appears to work on a
# single-column image rather than on a lat/long raster; the recorded output
# repeats one value across the columns of a row. Confirm this is intended.

colname = "pollution_data"
grid_size = 500
data = pol_data

grid = cv2.resize(data, dsize=(grid_size, grid_size), interpolation = cv2.INTER_AREA)

# Cell k of the flattened grid sits at (k // grid_size, k % grid_size).
cell_ids = np.arange(grid_size * grid_size)
row, col = np.divmod(cell_ids, grid_size)

h5_df = pd.DataFrame(
    {"row": row, "col": col, colname: grid.flatten()},
    index=cell_ids,
)

Unnamed: 0,row,col,pollution_data
0,0,0,-0.293491
1,0,1,-0.293491
2,0,2,-0.293491
3,0,3,-0.293491
4,0,4,-0.293491
...,...,...,...
249995,499,495,-0.082437
249996,499,496,-0.082437
249997,499,497,-0.082437
249998,499,498,-0.082437


In [8]:
# Load the CSV exports (downloaded from Panoply) of the same three variables
# and stack them into one long table with a record per array element.
# NOTE(review): the "row" label below cycles fastest, while ndarray.flatten()
# defaults to C order (last axis fastest). Whether the labels line up with the
# flattened values depends on the layout of the CSV files -- TODO confirm.

p = pd.read_csv("../data/raw/UVAerosolIndex354and388.csv", header = None)
lat = pd.read_csv("../data/raw/Latitude.csv", header = None)
long = pd.read_csv("../data/raw/Longitude.csv", header = None)

# Number of distinct "row" and "col" labels; their product is the table length.
row_count, col_count = 1643, 60
flat_ids = np.arange(row_count * col_count)

row = flat_ids % row_count    # 0..1642, repeated col_count times
col = flat_ids // row_count   # each of 0..59 held for row_count entries

pol_df = pd.DataFrame(
    {
        "row": row,
        "col": col,
        "lat": lat.to_numpy().flatten(),
        "long": long.to_numpy().flatten(),
        "pollution": p.to_numpy().flatten(),
    },
    index=flat_ids,
)

In [10]:
# Resize the CSV-derived pollution column onto a grid_size x grid_size grid
# with OpenCV and tabulate the result as one record per grid cell.
# NOTE(review): the input is a 1-D column, so the resize presumably treats it
# as a single-column image rather than a lat/long raster -- TODO confirm.
# NOTE(review): in the recorded run the resulting column has 197000 non-null
# values out of 250000; presumably missing values in the input carry through
# the resize -- verify and decide how they should be handled.
colname = "pollution_data"
grid_size = 500
data = pol_df["pollution"].to_numpy(copy=True)

grid = cv2.resize(data, dsize=(grid_size, grid_size), interpolation = cv2.INTER_AREA)

# Cell k of the flattened grid sits at (k // grid_size, k % grid_size).
cell_ids = np.arange(grid_size * grid_size)
row, col = np.divmod(cell_ids, grid_size)

pd_df = pd.DataFrame(index=cell_ids)
pd_df = pd_df.assign(row=row, col=col)
pd_df[colname] = grid.flatten()

Unnamed: 0,row,col,pollution_data
0,0,0,
1,0,1,
2,0,2,
3,0,3,
4,0,4,
...,...,...,...
249995,499,495,
249996,499,496,
249997,499,497,
249998,499,498,


In [16]:
# Summary statistics of the gridded values derived from the CSV files.
pd_df.loc[:, "pollution_data"].describe()

count    197000.000000
mean          0.067100
std           0.470259
min          -1.795964
25%          -0.197786
50%           0.067858
75%           0.366142
max           1.835708
Name: pollution_data, dtype: float64

In [14]:
# Summary statistics of the gridded values derived from the HDF5 file.
h5_df.loc[:, "pollution_data"].describe()

count    250000.000000
mean         -0.150478
std           0.404238
min          -2.056443
25%          -0.363740
50%          -0.119307
75%           0.112140
max           1.770803
Name: pollution_data, dtype: float64