In [20]:
from datetime import datetime
from google.cloud import bigquery
from google.oauth2 import service_account
from google.cloud import bigquery
from google.cloud.exceptions import NotFound
import pandas as pd
import yahoo_fin.stock_info as si
from utils.utils_bigquery import *
from datetime import *

In [21]:
key_path = key_path
project = project_id

# Sources
table = 'bronze_fred_macro_data'
dataset = 'bronze'

# Tables id
table_conca = f'{project}.{dataset}.{table}'

# Loading tables
table_to_save = 'silver_fred_macro_data'
dataset_to_save = 'silver'
table_conca_to_save = f'{project}.{dataset_to_save}.{table_to_save}'

In [22]:
# Conectamos con Bigquery
bigquery = BigQueryUtils(key_path)

In [4]:
macro_data = bigquery.run_query(
    f"""
    SELECT
        *
    FROM {project}.{dataset}.{table}
    """
)
macro_data

Unnamed: 0,date,realtime_start,realtime_end,value,serie_id,id
0,2015-01-01 00:00:00+00:00,2024-08-29,2024-08-29,18063.529,GDP,c9c54655bf3f26795cde838f06a25bd7
1,2015-04-01 00:00:00+00:00,2024-08-29,2024-08-29,18279.784,GDP,14c74cebfff6bb3f9917359c00a3a0bb
2,2015-07-01 00:00:00+00:00,2024-08-29,2024-08-29,18401.626,GDP,ffff25f5e55f40990b2e90a4c7165ceb
3,2015-10-01 00:00:00+00:00,2024-08-29,2024-08-29,18435.137,GDP,f1ed1f9a501d4609d184eafeb2574206
4,2016-01-01 00:00:00+00:00,2024-08-29,2024-08-29,18525.933,GDP,54b3d90583a5b0d5287deae543708ac6
...,...,...,...,...,...,...
836,2022-02-01 00:00:00+00:00,2024-08-29,2024-08-29,0.080,FEDFUNDS,33c7cb21eec7146e44c44faed1c1502f
837,2015-04-01 00:00:00+00:00,2024-08-29,2024-08-29,0.120,FEDFUNDS,5087f7c987dd7835dd7c4d096a222869
838,2015-05-01 00:00:00+00:00,2024-08-29,2024-08-29,0.120,FEDFUNDS,78a9856ba0cc194abe3ec7f814cebd7b
839,2015-10-01 00:00:00+00:00,2024-08-29,2024-08-29,0.120,FEDFUNDS,ca2c7daffc0603f812c1a6ffad2f435b


In [5]:
# Pivotar el DataFrame
df_pivot = macro_data.pivot(index=['date'], columns='serie_id', values='value')

# Resetear el índice para que 'date' sea una columna en lugar de un índice
df_pivot = df_pivot.reset_index()

# Normalizamos la fecha
df_pivot['date'] = df_pivot['date'].dt.tz_localize(None)


df_pivot

serie_id,date,BOPGSTB,CPIAUCSL,FEDFUNDS,GDP,PPIACO,RSAFS,UMCSENT,UNRATE
0,2015-01-01,-38865.0,234.747,0.11,18063.529,192.000,436125.0,98.1,5.7
1,2015-02-01,-33903.0,235.342,0.11,,191.100,435450.0,95.4,5.5
2,2015-03-01,-48732.0,235.976,0.11,,191.500,441895.0,93.0,5.4
3,2015-04-01,-39836.0,236.222,0.12,18279.784,190.900,442967.0,95.9,5.4
4,2015-05-01,-38502.0,237.001,0.12,,193.400,446114.0,90.7,5.6
...,...,...,...,...,...,...,...,...,...
110,2024-03-01,-68582.0,312.230,5.33,,255.095,703738.0,79.4,3.8
111,2024-04-01,-74462.0,313.207,5.33,28652.337,256.984,702681.0,77.2,3.9
112,2024-05-01,-75006.0,313.225,5.33,,255.268,704309.0,69.1,4.0
113,2024-06-01,-73109.0,313.049,5.33,,255.779,702862.0,68.2,4.1


In [6]:
# Convertimos 'date' a tipo datetime
macro_data['date'] = pd.to_datetime(macro_data['date'])

# Forzamos que sea sin zona horaria
macro_data['date'] = macro_data['date'].dt.tz_localize(None)

# Obtiene la fecha mínima
min_date = macro_data['date'].min()

# Obtiene la fecha de hoy como naive
end_date = pd.Timestamp.today().normalize()  # normalize() para eliminar la hora

# Generar una serie de fechas desde la fecha mínima hasta hoy
date_range = pd.date_range(start=min_date, end=end_date)

# Crear un nuevo DataFrame con la columna 'date'
df_dates = pd.DataFrame(date_range, columns=['date'])

df_dates

Unnamed: 0,date
0,2015-01-01
1,2015-01-02
2,2015-01-03
3,2015-01-04
4,2015-01-05
...,...
3525,2024-08-26
3526,2024-08-27
3527,2024-08-28
3528,2024-08-29


In [7]:
df = pd.merge(df_dates, df_pivot, on=['date'], how='left')

# Resetear el índice para que 'date' sea una columna en lugar de un índice
df = df.reset_index(drop=True)

df.columns

Index(['date', 'BOPGSTB', 'CPIAUCSL', 'FEDFUNDS', 'GDP', 'PPIACO', 'RSAFS',
       'UMCSENT', 'UNRATE'],
      dtype='object')

In [8]:
# Asegúrate de que la columna 'date' esté en formato datetime y ordena el DataFrame
df = df.sort_values(by='date')

# Rellenar los valores nulos en 'USDCNY' con el valor anterior
df['BOPGSTB'] = df['BOPGSTB'].fillna(method='ffill')
df['CPIAUCSL'] = df['CPIAUCSL'].fillna(method='ffill')
df['FEDFUNDS'] = df['FEDFUNDS'].fillna(method='ffill')
df['GDP'] = df['GDP'].fillna(method='ffill')
df['PPIACO'] = df['PPIACO'].fillna(method='ffill')
df['RSAFS'] = df['RSAFS'].fillna(method='ffill')
df['UMCSENT'] = df['UMCSENT'].fillna(method='ffill')
df['UNRATE'] = df['UNRATE'].fillna(method='ffill')

df

Unnamed: 0,date,BOPGSTB,CPIAUCSL,FEDFUNDS,GDP,PPIACO,RSAFS,UMCSENT,UNRATE
0,2015-01-01,-38865.0,234.747,0.11,18063.529,192.000,436125.0,98.1,5.7
1,2015-01-02,-38865.0,234.747,0.11,18063.529,192.000,436125.0,98.1,5.7
2,2015-01-03,-38865.0,234.747,0.11,18063.529,192.000,436125.0,98.1,5.7
3,2015-01-04,-38865.0,234.747,0.11,18063.529,192.000,436125.0,98.1,5.7
4,2015-01-05,-38865.0,234.747,0.11,18063.529,192.000,436125.0,98.1,5.7
...,...,...,...,...,...,...,...,...,...
3525,2024-08-26,-73109.0,313.534,5.33,28652.337,257.723,709668.0,68.2,4.3
3526,2024-08-27,-73109.0,313.534,5.33,28652.337,257.723,709668.0,68.2,4.3
3527,2024-08-28,-73109.0,313.534,5.33,28652.337,257.723,709668.0,68.2,4.3
3528,2024-08-29,-73109.0,313.534,5.33,28652.337,257.723,709668.0,68.2,4.3


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3530 entries, 0 to 3529
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   date      3530 non-null   datetime64[ns]
 1   BOPGSTB   3530 non-null   float64       
 2   CPIAUCSL  3530 non-null   float64       
 3   FEDFUNDS  3530 non-null   float64       
 4   GDP       3530 non-null   float64       
 5   PPIACO    3530 non-null   float64       
 6   RSAFS     3530 non-null   float64       
 7   UMCSENT   3530 non-null   float64       
 8   UNRATE    3530 non-null   float64       
dtypes: datetime64[ns](1), float64(8)
memory usage: 248.3 KB


In [10]:
# Fields that make up the ID
id_fields = ['date']

# Apply the function to the DataFrame to create the 'id' column
df['id'] = df.apply(generate_id, axis=1, fields=id_fields)

df

Unnamed: 0,date,BOPGSTB,CPIAUCSL,FEDFUNDS,GDP,PPIACO,RSAFS,UMCSENT,UNRATE,id
0,2015-01-01,-38865.0,234.747,0.11,18063.529,192.000,436125.0,98.1,5.7,4c6f3a3ad2c3295df78d540b586fa7a3
1,2015-01-02,-38865.0,234.747,0.11,18063.529,192.000,436125.0,98.1,5.7,d046a975255510ffcbbf9698294fdf40
2,2015-01-03,-38865.0,234.747,0.11,18063.529,192.000,436125.0,98.1,5.7,839cd1c08a392d57d500f00285268f2d
3,2015-01-04,-38865.0,234.747,0.11,18063.529,192.000,436125.0,98.1,5.7,0518f2b63236a9bc3119f02cabae4fd4
4,2015-01-05,-38865.0,234.747,0.11,18063.529,192.000,436125.0,98.1,5.7,71e7b2696f675336f79208247cdc5594
...,...,...,...,...,...,...,...,...,...,...
3525,2024-08-26,-73109.0,313.534,5.33,28652.337,257.723,709668.0,68.2,4.3,ef84b4618ba298b6579599a9133cb02c
3526,2024-08-27,-73109.0,313.534,5.33,28652.337,257.723,709668.0,68.2,4.3,d03c334c76ec10668a8689d5490c227f
3527,2024-08-28,-73109.0,313.534,5.33,28652.337,257.723,709668.0,68.2,4.3,c03f67e2119c230ed5198baf394982cf
3528,2024-08-29,-73109.0,313.534,5.33,28652.337,257.723,709668.0,68.2,4.3,a2125ebc5a8ad74a15dbf2b516e983c5


In [23]:
# Filtramos solamente los nuevos registros
df_incremental = bigquery.select_for_incremental(id='id', table=table_conca_to_save, new_df=df)
print(df_incremental)

AttributeError: 'BigQueryUtils' object has no attribute 'select_for_incremental'

In [42]:
# En el caso de tener un df en Bigquery, lo leemos y guardamos solo los nuevos registros
try:
    # Filtramos solamente los nuevos registros
    df_incremental = bigquery.select_for_incremental(id='date', table=table_conca_to_save, new_df=df)

    # Guardamos los datos en bigquery
    bigquery.save_dataframe(df_incremental, project, dataset_to_save, table_to_save, if_exists='append', schema=None)

# En el caso de no tener datos en Bigquery, guardamos todo el df
except:
    bigquery.save_dataframe(df, project, dataset_to_save, table_to_save, if_exists='append', schema=None)

100%|██████████| 1/1 [00:00<?, ?it/s]


In [19]:
import pandas as pd

def select_incremental(id: str, table: str, new_df: pd.DataFrame) -> pd.DataFrame:
    current_df = bigquery.run_query(
        f"""
        SELECT
            {id} 
        FROM {table}
        """
    )

    # Filtrar los registros que no están en current_df
    df_incremental = new_df[~new_df[id].isin(current_df[id])]

    return df_incremental


# Llamada a la función
df_incremental = select_incremental(id='id', table=table_conca_to_save, new_df=df)
df_incremental


Unnamed: 0,date,BOPGSTB,CPIAUCSL,FEDFUNDS,GDP,PPIACO,RSAFS,UMCSENT,UNRATE,id
