# ETL

Será feito por ano, gerando cada ano como um arquivo separado pra memória poder aguentar

In [1]:
import numpy as np
import pandas as pd
import json
import os
import matplotlib.pyplot as plt
import seaborn as sns
import re
from datetime import datetime

In [2]:
def is_csv(x):
    if x[-4:] == '.csv':
        return True
    return False

def read_all_files(path, files):
    df = pd.DataFrame()
    for arquivo in files:
        df_aux = pd.read_csv(f'{path}/{arquivo}')
        df = df.append(df_aux)

    return df

def kelvin_to_celsius(k):
    return k - 273

def print_status(df):
    print(df.shape)
    print('='*10)
    print(df.info())

def str_to_datime(x):
    return datetime.strptime(x[:-5].replace('T', ' '), '%Y-%m-%d %H:%M:%S')

In [3]:
forecast_path = 'forecasts/2.5'
gridpp_path = 'forecasts/1'
observation_path = 'observation'

In [33]:
desired_columns = ['station_id', 'lat', 'long', 'forecast', 'gridpp', 'observations', 'year', 'month', 'day', 'hour']

## 2019

In [4]:
year = 2019

In [5]:
to_transform = ['year', 'month', 'day', 'hour'] # to int

In [22]:
forecast_files = os.listdir(forecast_path)
forecast_files = list(filter(is_csv, forecast_files))

forecasts = read_all_files(forecast_path, forecast_files)
forecasts = forecasts[forecasts.year == year]

forecasts.reset_index(drop=True, inplace=True)
forecasts['forecast'] = forecasts['forecast'].apply(lambda x: kelvin_to_celsius(x))

for item in to_transform:
    forecasts = forecasts.astype({item: 'int32'})

forecasts['datetime'] = forecasts.apply(lambda x: datetime(x['year'], x['month'], x['day'], x['hour']), axis=1)

print_status(forecasts)

(1068594, 12)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1068594 entries, 0 to 1068593
Data columns (total 12 columns):
 #   Column                 Non-Null Count    Dtype         
---  ------                 --------------    -----         
 0   nearest_forecast_long  1068594 non-null  float64       
 1   nearest_forecast_lat   1068594 non-null  float64       
 2   long                   1068594 non-null  float64       
 3   station_id             1068594 non-null  object        
 4   lat                    1068594 non-null  float64       
 5   forecast               1068594 non-null  float64       
 6   indexes                1068594 non-null  object        
 7   year                   1068594 non-null  int32         
 8   month                  1068594 non-null  int32         
 9   day                    1068594 non-null  int32         
 10  hour                   1068594 non-null  int32         
 11  datetime               1068594 non-null  datetime64[ns]
dtypes: datetime64[

In [18]:
gridpp_files = os.listdir(gridpp_path)
gridpp_files = list(filter(is_csv, gridpp_files))

gridpp = read_all_files(gridpp_path, gridpp_files)
gridpp = gridpp[gridpp.year == year]

gridpp.reset_index(drop=True, inplace=True)
gridpp['forecast'] = gridpp['forecast'].apply(lambda x: kelvin_to_celsius(x))

for item in to_transform:
    gridpp = gridpp.astype({item: 'int32'})

gridpp['datetime'] = gridpp.apply(lambda x: datetime(x['year'], x['month'], x['day'], x['hour']), axis=1)

# Only for gridpp
gridpp.rename(columns={'forecast': 'gridpp'}, inplace=True)

print_status(gridpp)

(1045685, 14)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1045685 entries, 0 to 1045684
Data columns (total 14 columns):
 #   Column                 Non-Null Count    Dtype         
---  ------                 --------------    -----         
 0   long                   1045685 non-null  float64       
 1   lat                    1045685 non-null  float64       
 2   gridpp                 1045685 non-null  float64       
 3   station_id             1045685 non-null  object        
 4   nearest_forecast_lat   0 non-null        float64       
 5   indexes                1045685 non-null  object        
 6   nearest_forecast_long  0 non-null        float64       
 7   nearest_gridpp_lat     1045685 non-null  float64       
 8   nearest_gridpp_long    1045685 non-null  float64       
 9   year                   1045685 non-null  int32         
 10  month                  1045685 non-null  int32         
 11  day                    1045685 non-null  int32         
 12  hour          

In [8]:
observation = pd.read_csv(f'{observation_path}/observation_{year}.csv')

observation['observations'] = observation['observations'].apply(lambda x: x.split(':'))
re_to_extract_numbers = r'\-*\d+\.*\d*'
observation['observations'] = observation['observations'].apply(lambda x: float(re.findall(re_to_extract_numbers, x[-1])[0]))

observation['datetime'] = observation.referenceTime.apply(lambda x: str_to_datime(x))

observation['sourceId'] = observation['sourceId'].apply(lambda x: x.split(':')[0])

print_status(observation)


(6405876, 4)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6405876 entries, 0 to 6405875
Data columns (total 4 columns):
 #   Column         Dtype         
---  ------         -----         
 0   sourceId       object        
 1   referenceTime  object        
 2   observations   float64       
 3   datetime       datetime64[ns]
dtypes: datetime64[ns](1), float64(1), object(2)
memory usage: 195.5+ MB
None


### Merge

In [23]:
final_data = forecasts.merge(gridpp[['station_id', 'datetime', 'gridpp']], how='inner', on=['station_id', 'datetime'], copy=False, suffixes=('_f', '_g'))

In [27]:
final_data = final_data.merge(observation, how='inner', left_on=['station_id', 'datetime'], right_on=['sourceId', 'datetime'])

In [34]:
final_data[desired_columns]

Unnamed: 0,station_id,lat,long,forecast,gridpp,observations,year,month,day,hour
0,SN18700,59.9423,10.7200,22.148926,20.850006,20.7,2019,6,28,18
1,SN78910,64.6933,12.3295,10.692871,11.356781,11.4,2019,6,28,18
2,SN50810,60.4742,5.3418,15.672363,18.380768,15.6,2019,6,28,18
3,SN51010,60.5205,5.7243,16.461426,17.142456,17.8,2019,6,28,18
4,SN44640,58.9563,5.7278,16.282715,16.750000,16.6,2019,6,28,18
...,...,...,...,...,...,...,...,...,...,...
308027,SN89213,69.0720,17.9633,-3.258301,-3.631897,-4.8,2019,12,12,12
308028,SN70680,63.9425,11.4255,3.166504,4.050293,5.5,2019,12,12,12
308029,SN9160,62.1282,9.9947,-1.467285,0.350006,0.2,2019,12,12,12
308030,SN26996,59.6473,10.1052,0.749023,0.530396,0.7,2019,12,12,12
