In [33]:
from datetime import datetime
from google.cloud import bigquery
from google.oauth2 import service_account
from google.cloud import bigquery
from google.cloud.exceptions import NotFound
import pandas as pd
import yahoo_fin.stock_info as si
from utils.utils_bigquery import *
from datetime import *

In [34]:
key_path = key_path
project = project_id

# Sources
table = 'bronze_fred_macro_data'
dataset = 'bronze'

# Tables id
table_conca = f'{project}.{dataset}.{table}'

# Loading tables
table_to_save = 'silver_fred_macro_data'
dataset_to_save = 'silver'

In [35]:
# Conectamos con Bigquery
bigquery = BigQueryUtils(key_path)

In [36]:
macro_data = bigquery.run_query(
    f"""
    SELECT
        *
    FROM {project}.{dataset}.{table}
    """
)
macro_data

Unnamed: 0,date,realtime_start,realtime_end,value,serie_id,id
0,2015-01-01 00:00:00+00:00,2024-08-28,2024-08-28,18063.529,GDP,c9c54655bf3f26795cde838f06a25bd7
1,2015-04-01 00:00:00+00:00,2024-08-28,2024-08-28,18279.784,GDP,14c74cebfff6bb3f9917359c00a3a0bb
2,2015-07-01 00:00:00+00:00,2024-08-28,2024-08-28,18401.626,GDP,ffff25f5e55f40990b2e90a4c7165ceb
3,2015-10-01 00:00:00+00:00,2024-08-28,2024-08-28,18435.137,GDP,f1ed1f9a501d4609d184eafeb2574206
4,2016-01-01 00:00:00+00:00,2024-08-28,2024-08-28,18525.933,GDP,54b3d90583a5b0d5287deae543708ac6
...,...,...,...,...,...,...
1677,2022-02-01 00:00:00+00:00,2024-08-28,2024-08-28,0.080,FEDFUNDS,33c7cb21eec7146e44c44faed1c1502f
1678,2015-04-01 00:00:00+00:00,2024-08-28,2024-08-28,0.120,FEDFUNDS,5087f7c987dd7835dd7c4d096a222869
1679,2015-05-01 00:00:00+00:00,2024-08-28,2024-08-28,0.120,FEDFUNDS,78a9856ba0cc194abe3ec7f814cebd7b
1680,2015-10-01 00:00:00+00:00,2024-08-28,2024-08-28,0.120,FEDFUNDS,ca2c7daffc0603f812c1a6ffad2f435b


In [37]:
# Pivotar el DataFrame
df_pivot = macro_data.pivot(index=['date'], columns='serie_id', values='value')

# Opcional: Resetear el índice si deseas que 'date' sea una columna en lugar de un índice
df_pivot = df_pivot.reset_index()

df_pivot

ValueError: Index contains duplicate entries, cannot reshape

In [38]:
# Convertimos 'date' a tipo datetime
macro_data['date'] = pd.to_datetime(macro_data['date'])

# Forzamos que sea sin zona horaria
macro_data['date'] = macro_data['date'].dt.tz_localize(None)

# Obtiene la fecha mínima
min_date = macro_data['date'].min()

# Obtiene la fecha de hoy como naive
end_date = pd.Timestamp.today().normalize()  # normalize() para eliminar la hora

# Generar una serie de fechas desde la fecha mínima hasta hoy
date_range = pd.date_range(start=min_date, end=end_date)

# Crear un nuevo DataFrame con la columna 'date'
df_dates = pd.DataFrame(date_range, columns=['date'])

df_dates

Unnamed: 0,date
0,2015-01-01
1,2015-01-02
2,2015-01-03
3,2015-01-04
4,2015-01-05
...,...
3523,2024-08-24
3524,2024-08-25
3525,2024-08-26
3526,2024-08-27


In [39]:
df = pd.merge(df_dates, df_pivot, on=['date'], how='left')
df


Unnamed: 0,date,BOPGSTB,CPIAUCSL,FEDFUNDS,GDP,PPIACO,RSAFS,UMCSENT,UNRATE
0,2015-01-01,-38865.0,234.747,0.11,18063.529,192.0,436125.0,98.1,5.7
1,2015-01-02,,,,,,,,
2,2015-01-03,,,,,,,,
3,2015-01-04,,,,,,,,
4,2015-01-05,,,,,,,,
...,...,...,...,...,...,...,...,...,...
3523,2024-08-24,,,,,,,,
3524,2024-08-25,,,,,,,,
3525,2024-08-26,,,,,,,,
3526,2024-08-27,,,,,,,,


In [40]:
# Asegúrate de que la columna 'date' esté en formato datetime y ordena el DataFrame
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values(by='date')

# Rellenar los valores nulos en 'USDCNY' con el valor anterior
df['BOPGSTB'] = df['BOPGSTB'].fillna(method='ffill')
df['CPIAUCSL'] = df['CPIAUCSL'].fillna(method='ffill')
df['FEDFUNDS'] = df['FEDFUNDS'].fillna(method='ffill')
df['GDP'] = df['GDP'].fillna(method='ffill')
df['PPIACO'] = df['PPIACO'].fillna(method='ffill')
df['RSAFS'] = df['RSAFS'].fillna(method='ffill')
df['UMCSENT'] = df['UMCSENT'].fillna(method='ffill')
df['UNRATE'] = df['UNRATE'].fillna(method='ffill')

df

Unnamed: 0,date,BOPGSTB,CPIAUCSL,FEDFUNDS,GDP,PPIACO,RSAFS,UMCSENT,UNRATE
0,2015-01-01,-38865.0,234.747,0.11,18063.529,192.000,436125.0,98.1,5.7
1,2015-01-02,-38865.0,234.747,0.11,18063.529,192.000,436125.0,98.1,5.7
2,2015-01-03,-38865.0,234.747,0.11,18063.529,192.000,436125.0,98.1,5.7
3,2015-01-04,-38865.0,234.747,0.11,18063.529,192.000,436125.0,98.1,5.7
4,2015-01-05,-38865.0,234.747,0.11,18063.529,192.000,436125.0,98.1,5.7
...,...,...,...,...,...,...,...,...,...
3523,2024-08-24,-73109.0,313.534,5.33,28629.153,257.723,709668.0,68.2,4.3
3524,2024-08-25,-73109.0,313.534,5.33,28629.153,257.723,709668.0,68.2,4.3
3525,2024-08-26,-73109.0,313.534,5.33,28629.153,257.723,709668.0,68.2,4.3
3526,2024-08-27,-73109.0,313.534,5.33,28629.153,257.723,709668.0,68.2,4.3


In [41]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3528 entries, 0 to 3527
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   date      3528 non-null   datetime64[ns]
 1   BOPGSTB   3528 non-null   float64       
 2   CPIAUCSL  3528 non-null   float64       
 3   FEDFUNDS  3528 non-null   float64       
 4   GDP       3528 non-null   float64       
 5   PPIACO    3528 non-null   float64       
 6   RSAFS     3528 non-null   float64       
 7   UMCSENT   3528 non-null   float64       
 8   UNRATE    3528 non-null   float64       
dtypes: datetime64[ns](1), float64(8)
memory usage: 248.2 KB


In [42]:
# En el caso de tener un df en Bigquery, lo leemos y guardamos solo los nuevos registros
try:
    # Filtramos solamente los nuevos registros
    df_incremental = bigquery.select_for_incremental(id='date', table=table_conca, new_df=df)

    # Guardamos los datos en bigquery
    bigquery.save_dataframe(df_incremental, project, dataset, table, if_exists='append', schema=None)

# En el caso de no tener datos en Bigquery, guardamos todo el df
except:
    bigquery.save_dataframe(df, project, dataset_to_save, table_to_save, if_exists='append', schema=None)

100%|██████████| 1/1 [00:00<?, ?it/s]
