In [1]:
import os

# numerical
import numpy as np
import pandas as pd
from datetime import datetime, timedelta

# viz
import matplotlib.pyplot as plt
from matplotlib import cm
from mpl_toolkits.mplot3d import Axes3D
style_list = ['default', 'classic'] + sorted(style for style in plt.style.available)
plt.style.use(style_list[7])
import folium
%matplotlib inline

# others
import json

In [2]:
gfs_gra_path = '/home/slimbook/git-repos/eolo-project/data/.raw/GFS_data/GFS_2016060212_f006.gra'
array = np.fromfile(gfs_gra_path, dtype=np.float32)

print('n_features: {n}'.format(n=array.shape[0]))

n_features: 23634


In [4]:
with open('gfs_info.json') as f:
    gfs_features = json.load(f)
    
n_var_3d = 7
n_var_2d = 20

features_3d = list(gfs_features.keys())[0:n_var_3d]
features_2d = list(gfs_features.keys())[n_var_3d:(n_var_3d+n_var_2d)]

In [7]:

def get_file(file_path):
    new_time = []
    for root, subdirs, files in os.walk(file_path):
        for file in files:
            match=file.split("_")[1]
            date = pd.to_datetime(match, format = "%Y%m%d%H").strftime('%d/%m/%Y') 
            time = (datetime.strptime(match, "%Y%m%d%H") + timedelta(hours=6)).strftime('%H:%M')
            new_time.append(date + " " + time) #, '%d/%m/%Y %H:%M')
    return sorted(new_time, key=lambda x: datetime.strptime(x, '%d/%m/%Y %H:%M'))




In [11]:

def loading(file_path, nz=26): 
    """
    Given GFS data in several .gra files, this function iterate over a given folder path importing and 
    organising its content in a dictionary format.
    """
    total={}
    file_data = []
    new_time = []
    
    for root, subdirs, files in os.walk(file_path):
        for file in files:
            array = np.fromfile(file_path +"/"+ file, dtype=np.float32)
    
    for root, subdirs, files in os.walk(file_path):
        for file in files:
            match=file.split("_")[1]
            date = pd.to_datetime(match, format = "%Y%m%d%H").strftime('%d/%m/%Y') 
            time = (datetime.strptime(match, "%Y%m%d%H") + timedelta(hours=6)).strftime('%H:%M')
            new_time.append(date + " " + time)
    
    
    for file in os.listdir(file_path): 
        start = 0
        step_3d = 13*9*26
        end = step_3d
        features_3d = {
                  "HGTprs": {'dimesiones': [13, 9, 26], 'data': None}, 
                  "CLWMRprs": {'dimesiones': [13, 9, 26], 'data': None},
                  "RHprs": {'dimesiones': [13, 9, 26], 'data': None},
                  "Velprs": {'dimesiones': [13, 9, 26], 'data': None},
                  "UGRDprs": {'dimesiones': [13, 9, 26], 'data': None},
                  "VGRDprs": {'dimesiones': [13, 9, 26], 'data': None},
                  "TMPprs": {'dimesiones': [13, 9, 26], 'data': None}
               }

        end = end - step_3d
        step_2d = 13*9
        end = end +step_2d
        features_2d = {
                   "HGTsfc": {'dimesiones': [13, 9, 1], 'data': None},
                   "MSLETmsl": {'dimesiones': [13, 9, 1], 'data': None},
                   "PWATclm": {'dimesiones': [13, 9, 1], 'data': None},
                   "RH2m": {'dimesiones': [13, 9, 1], 'data': None},
                   "Vel100m": {'dimesiones': [13, 9, 1], 'data': None},
                   "UGRD100m": {'dimesiones': [13, 9, 1], 'data': None},
                   "VGRD100m": {'dimesiones': [13, 9, 1], 'data': None},
                   "Vel80m": {'dimesiones': [13, 9, 1], 'data': None},
                   "UGRD80m": {'dimesiones': [13, 9, 1], 'data': None},
                   "VGRD80m": {'dimesiones': [13, 9, 1], 'data': None},
                   "Vel10m":{'dimesiones': [13, 9, 1], 'data': None},
                   "UGRD10m": {'dimesiones': [13, 9, 1], 'data': None},
                   "VGRD10m": {'dimesiones': [13, 9, 1], 'data': None},
                   "GUSTsfc": {'dimesiones': [13, 9, 1], 'data': None},
                   "TMPsfc": {'dimesiones': [13, 9, 1], 'data': None},
                   "TMP2m": {'dimesiones': [13, 9, 1], 'data': None},
                   "no4LFTXsfc":{'dimesiones': [13, 9, 1], 'data': None},
                   "CAPEsfc": {'dimesiones': [13, 9, 1], 'data': None},
                   "SPFH2m": {'dimesiones': [13, 9, 1], 'data': None},
                   "SPFH80m": {'dimesiones': [13, 9, 1], 'data': None},
               }   

        size_3d = 13*9*nz
        array_3d = array[:size_3d*7]
        for variable, length in zip(features_3d.keys(), range(len(features_3d))):
            features_3d[variable]["data"] = array_3d[length*size_3d:(length +1)*size_3d]#.reshape((len(features_3d.keys()), corte))

        size_2d = 13*9
        array_2d = array[size_3d*7:]
        for variable, length in zip(features_2d.keys(), range(len(features_2d))):
            features_2d[variable]["data"] = array_2d[length*size_2d:(length +1)*size_2d]#.reshape((len(features_2d.keys()), corte))
        
        
        
        file_data.append( {
            "file_name": file,
            #"datetime": new_time,
            "var_3d": features_3d,
            "var_2d":features_2d,
        }) 

    for i in range(len(new_time)):
        total.update({new_time[i]:file_data[i]})
        
    print("It's done!")
    return  total

In [12]:
all_data = loading("/home/slimbook/git-repos/eolo-project/data/.raw/GFS_data")

It's done!


In [5]:
len(all_data)

4324

In [None]:
# Ahora mismo tienesun diccionario con:
    # Identificador (id) -> datetime
    # Dentro de datetime:
        # Otro diccionario > file_data
            # En file_data tienes:
                # File name
                # Variables 3d
                # Variables 2d
                
                ## Para cada variable tienes las dimensiones y su array de datos

# Objetivo: 
    # Montar un dataframe que haga join con el dataframe de las potencias
    # Estructura:
        # id(datetime) - power - array (este array contendrátodos los elementos de las variables que hayamos decidido)
        
 # Model training:
    # X -> 2015-2016

In [31]:
all_data["11/05/2016 00:00"]["var_3d"].keys()

dict_keys(['HGTprs', 'CLWMRprs', 'RHprs', 'Velprs', 'UGRDprs', 'VGRDprs', 'TMPprs'])

In [None]:
# Ahora mismo tienesun diccionario con:
    # Identificador (id) -> datetime
    # Dentro de datetime:
        # Otro diccionario > file_data
            # En file_data tienes:
                # File name
                # Variables 3d
                # Variables 2d
                
                ## Para cada variable tienes las dimensiones y su array de datos

# Objetivo: 
    # Montar un dataframe que haga join con el dataframe de las potencias
    # Estructura:
        # id(datetime) - power - array (este array contendrátodos los elementos de las variables que hayamos decidido)
        
 # Model training:
    # X -> 2015-2016

In [99]:
len(all_data.keys())

4324

In [256]:
for datetime_key in all_data: #Quiero iterar sobre las keys de 1º nivel
        print(all_data[datetime_key]["var_3d"]["data"])    


KeyError: 'data'

In [211]:
test_dic = all_data.get("11/05/2016 00:00")
test_dic

{'file_name': 'GFS_2016051118_f006.gra',
 'var_3d': {'HGTprs': {'dimesiones': [13, 9, 26],
   'data': array([  162.55891,   160.92691,   157.23091, ..., 31379.352  ,
          31378.871  , 31379.031  ], dtype=float32)},
  'CLWMRprs': {'dimesiones': [13, 9, 26],
   'data': array([0., 0., 0., ..., 0., 0., 0.], dtype=float32)},
  'RHprs': {'dimesiones': [13, 9, 26],
   'data': array([53.8     , 48.8     , 47.100002, ...,  0.1     ,  0.1     ,
           0.1     ], dtype=float32)},
  'Velprs': {'dimesiones': [13, 9, 26],
   'data': array([1.9991999, 1.8582786, 3.93459  , ..., 6.963476 , 6.9856997,
          6.9856997], dtype=float32)},
  'UGRDprs': {'dimesiones': [13, 9, 26],
   'data': array([ 1.88     ,  1.8399999,  3.9299998, ..., -6.8      , -6.8      ,
          -6.8      ], dtype=float32)},
  'VGRDprs': {'dimesiones': [13, 9, 26],
   'data': array([0.68, 0.26, 0.19, ..., 1.5 , 1.6 , 1.6 ], dtype=float32)},
  'TMPprs': {'dimesiones': [13, 9, 26],
   'data': array([293.1    , 293.80002

In [6]:
list_var = ['CLWMRprs', 'Velprs']#,'GUSTsfc']
nz = 5

#######
#################
#####################

#FUNCION ESTABLE

#####################
#################
#######


def get_var(main_dic, list_var, nz=26):
    """This function provides the selected variables in a nested dictionary with the given array
    and level (consider that each level is around 50m heigth). Output is given as dictionary
    """
    dict_final = {}
    size_3d = 13*9*nz
    res = []
   
    for datetime_key in main_dic: #Quiero iterar sobre las keys de 1º nivel
        
        for var in list_var:  # compruebo que la variable que voy a sacar está en mi lista

            if var in main_dic.get(datetime_key).get("var_3d").keys():
                 # compruebo que esa variable está en las de 2º nivel
                array_3d = main_dic[datetime_key]["var_3d"][var]["data"]
                # Asigno el array del value de 4º nivel a una variable
                arr_3d_nz = []
                for j in range(0,len(array_3d), size_3d):
                    res.extend(array_3d[j: j+size_3d])
                
        for var in list_var:
            
            if var in main_dic.get(datetime_key).get("var_2d").keys():
                array_2d = main_dic[datetime_key]["var_3d"][var]["data"] 
                res.extend(array_2d)
                
                
    print(res)        
    """for i in main_dic(range(len(dic.keys()))):   #.items():
        dict_final.update({datetime_key[i]:dic_2[i]})"""
    

In [7]:
x=get_var(all_data, list_var, nz=26)
x

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [199]:
def get_var(dic, list_var, nz=26):
    """
    This function provides the selected variables in a nested dictionary with the given array
    and level (consider that each level is around 50m heigth). Output is given as dictionary
    """ 
    dict_final = {}
    size_3d = 13*9*nz
    dic_2 = {}
    for ids, v in dic.items(): #Este itera las keys del diccionario grande (1ºnivel - datetime)
        
        for e in list_var: # Este itera sobre la lista de variables para comprobar que estén en el 2º nivel 
            if e in dic[ids]["var_3d"].keys(): # después se mete en las keys de var_3d y saca el 
                var3d_array = dic[ids]["var_3d"][e]["data"]
                d3_array = []
                
                for j in range(0,len(var3d_array), size_3d):
                    d3_array.append(var3d_array[j: j+size_3d])

                dic_2.update({e:d3_array})    
            
            else:
                d2_array = dic[ids]["var_2d"][e]["data"]
                dic_2.update({e:d2_array})
    
    
        for i in dic(range(len(dic.keys()))):   #.items():
            dict_final.update({dic[i]:dic_2[i]})    
    
    return dict_final

In [200]:
get_var(test_dic, list_var)

TypeError: string indices must be integers

In [161]:
'''
def func_1(dic, var):
    for i in dic.keys():
        for e in dic[i]["var_3d"].keys():
                return dic[i]["var_3d"][e]["data"]
         
'''
data = all_data.get("11/05/2016 00:00").get('var_3d').get('HGTprs')

z = data.get('dimesiones')[2]
array = data.get('data')
len(array)
nz = 5
suelo = array[:nz*len(array)//z]
len(array), len(suelo)



def suelo(array, z, nz): 
    return array[:nz*len(array)//z]

def get_suelo(archivo, lst_vars_3d, nz): 
    res = []
    for var in lst_vars_3d: 
        z =  archivo.get('var_3d').get(var).get('dimesiones')[2]
        data = archivo.get('var_3d').get(var).get('data')
        res.extend(data)
    return res

archivo = all_data.get("11/05/2016 00:00")
lst= ['CLWMRprs', 'Velprs']
nz = 4
get_suelo(archivo, lst, nz)        

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1e-07,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1e-07,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 4.1e-06,
 7.8e-06,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 5.6e-06,
 7.3e-06,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 6.6000002e-06,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 8e-07,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 5e-07,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 9.2e-06,
 1.95e-05,
 0.0,
 0.

In [None]:
def func_1(dic, var):
    for i in dic.keys(): #Este itera las keys del diccionario grande (1ºnivel - datetime)
        
        for e in dic[i]["var_3d"].keys(): # Este itera a través del 2º nivel y se mete en las keys de var_3d
            aux_var = dic[i]["var_3d"][e]["data"] # estas keys son strings que deben coiincidir con el kargs: var 
            
            for j in range(0,len(var3d_array), size_3d): # Este último bucle, itera sobre el array y te devuelve
                    d3_array.append(var3d_array[j: j+size_3d]) # el slice del array en función del Nz
            return 
         

In [130]:
all_data["11/05/2016 00:00"]   

{'file_name': 'GFS_2016051118_f006.gra',
 'var_3d': {'HGTprs': {'dimesiones': [13, 9, 26],
   'data': array([  162.55891,   160.92691,   157.23091, ..., 31379.352  ,
          31378.871  , 31379.031  ], dtype=float32)},
  'CLWMRprs': {'dimesiones': [13, 9, 26],
   'data': array([0., 0., 0., ..., 0., 0., 0.], dtype=float32)},
  'RHprs': {'dimesiones': [13, 9, 26],
   'data': array([53.8     , 48.8     , 47.100002, ...,  0.1     ,  0.1     ,
           0.1     ], dtype=float32)},
  'Velprs': {'dimesiones': [13, 9, 26],
   'data': array([1.9991999, 1.8582786, 3.93459  , ..., 6.963476 , 6.9856997,
          6.9856997], dtype=float32)},
  'UGRDprs': {'dimesiones': [13, 9, 26],
   'data': array([ 1.88     ,  1.8399999,  3.9299998, ..., -6.8      , -6.8      ,
          -6.8      ], dtype=float32)},
  'VGRDprs': {'dimesiones': [13, 9, 26],
   'data': array([0.68, 0.26, 0.19, ..., 1.5 , 1.6 , 1.6 ], dtype=float32)},
  'TMPprs': {'dimesiones': [13, 9, 26],
   'data': array([293.1    , 293.80002

In [137]:
func_1(all_data, "HGTprs")

array([  162.55891,   160.92691,   157.23091, ..., 31379.352  ,
       31378.871  , 31379.031  ], dtype=float32)

In [126]:
testing_func = get_var(all_data, list_var)
testing_func

TypeError: 'dict' object is not callable

In [112]:
df = pd.DataFrame(testing_func)

In [113]:
df.T

In [None]:
# NO TE DEVUELVE TODO EL ARCHIVO

# Comprobar que algunos arrays no tengan doble [] -> por qué?