In [1]:
from datetime import datetime
from google.cloud import bigquery
from google.oauth2 import service_account
from google.cloud import bigquery
from google.cloud.exceptions import NotFound
import pandas as pd
import yahoo_fin.stock_info as si
from utils.utils_bigquery import *
from datetime import *
import hashlib

In [2]:
# Configuration: destination of the bronze ticker table in BigQuery.
# NOTE(review): `key_path` and `project_id` are not defined in this notebook; presumably
# they are provided by the wildcard import from utils.utils_bigquery - TODO confirm.
# The self-assignment below is a no-op when `key_path` is already bound and raises
# NameError otherwise.
key_path = key_path
project = project_id
dataset = 'bronze'
table = 'bronze_ticker_data'
# Fully qualified table id (project.dataset.table) used by the queries and helpers below.
table_conca = f'{project}.{dataset}.{table}'

# NOTE(review): disabled explicit schema, kept for reference only (the save calls in this
# notebook pass schema=None). It looks stale: the fetched frame shown further down has
# columns such as `adjclose` and `id` and float prices, whereas this draft uses
# 'adj close', 'dd' and INTEGER types. Verify before re-enabling.
# schema = [
#     {'name': 'dd', 'type': 'STRING'},
#     {'name': 'date', 'type': 'DATE'},
#     {'name': 'ticker', 'type': 'STRING'},
#     {'name': 'adj close', 'type': 'INTEGER'},
#     {'name': 'close', 'type': 'INTEGER'},
#     {'name': 'high', 'type': 'INTEGER'},
#     {'name': 'low', 'type': 'INTEGER'},
#     {'name': 'Open', 'type': 'INTEGER'},
#     {'name': 'Volume', 'type': 'INTEGER'},
# ]

In [3]:
# Connect to BigQuery through the project's BigQueryUtils wrapper.
# NOTE(review): this rebinds the name `bigquery`, shadowing the `google.cloud.bigquery`
# module imported in the first cell; from here on `bigquery` is the wrapper instance.
bigquery = BigQueryUtils(key_path)

In [4]:
# Load the ticker universe from the bronze S&P 500 tickers table.
# The table id is built from the `project` / `dataset` configuration values instead of a
# hard-coded project id, so the notebook keeps working when the configuration changes.
# It is back-quoted because the project id contains hyphens.
sp500_tickers_table = f'{project}.{dataset}.bronze_sp500_tickers'

unique_tickers = bigquery.run_query(
    f"""
    SELECT
        ticker
    FROM `{sp500_tickers_table}`
    """
)
unique_tickers

Unnamed: 0,ticker
0,COP
1,EOG
2,OXY
3,HES
4,FANG
...,...
498,TTWO
499,GOOGL
500,GOOG
501,META


In [5]:
# Assign the same initial date ('2015-01-01') to every ticker.
# NOTE(review): `initial_date_by_ticker` is not defined in this notebook (presumably it
# comes from the wildcard import of utils.utils_bigquery). A later cell reads the columns
# `ticker` and `initial_date` from its result and uses the initial date as the lower bound
# of the per-ticker start date.
unique_tickers_initial_date = initial_date_by_ticker(unique_tickers, initial_date='2015-01-01')

In [8]:
# Fetch the most recent loaded date per ticker from the destination table.
# If the query fails (for example because the table does not exist yet), fall back to the
# initial dates so that the full history is requested.
try:
    max_date_by_ticker = bigquery.run_query(
        f"""
        SELECT
            ticker,
            max(date) as date 
        FROM {table_conca} 
        GROUP BY 
            ticker
        """
    )
    # Normalise to timezone-naive datetimes so they can be compared with the initial dates.
    max_date_by_ticker['date'] = pd.to_datetime(max_date_by_ticker['date'])
    max_date_by_ticker['date'] = max_date_by_ticker['date'].dt.tz_localize(None)

except Exception as e:
    # Report why the fallback is used instead of hiding the error.
    print(e)
    # The following cell merges this frame on 'ticker' and reads a 'date' column from it.
    # Assigning `unique_tickers_initial_date` unchanged would yield duplicated
    # 'initial_date_x' / 'initial_date_y' columns and no 'date' column after that merge,
    # so expose the initial date under the expected name. One row per ticker mirrors the
    # GROUP BY of the query above and keeps the merge from multiplying rows.
    max_date_by_ticker = (
        unique_tickers_initial_date
        .rename(columns={'initial_date': 'date'})
        .drop_duplicates(subset='ticker')
    )

print(max_date_by_ticker)

     ticker       date
0      AAPL 2024-08-13
1      AMGN 2024-08-13
2      AMZN 2024-08-13
3       AXP 2024-08-13
4        BA 2024-08-13
...     ...        ...
4446  PDYNW 2024-08-13
4447  PGYWW 2024-08-13
4448  RENEW 2024-08-13
4449  AIMAW 2024-08-12
4450  BAERW 2024-08-12

[4451 rows x 2 columns]


In [9]:
# Left-join the latest loaded dates onto the ticker universe, so every ticker is kept
# even when it has no loaded rows yet (its 'date' is then missing).
dates_joined = unique_tickers_initial_date.merge(max_date_by_ticker, on='ticker', how='left')

# Per ticker, keep the later of the loaded date and the initial date (missing values are
# skipped by max), then discard the auxiliary 'initial_date' column.
max_date_by_ticker = (
    dates_joined
    .assign(date=dates_joined[['date', 'initial_date']].max(axis=1))
    .drop(columns=['initial_date'])
)

max_date_by_ticker

Unnamed: 0,ticker,date
0,COP,2015-01-01
1,EOG,2015-01-01
2,OXY,2015-01-01
3,HES,2015-01-01
4,FANG,2024-08-13
...,...,...
498,TTWO,2024-08-13
499,GOOGL,2024-08-13
500,GOOG,2024-08-13
501,META,2024-08-13


In [10]:
# Incremental update: request daily ('1d') history for every ticker, using the per-ticker
# 'date' column of `max_date_by_ticker` as the start date.
# NOTE(review): `fetch_historical_data` is defined outside this notebook (presumably a
# wrapper around yahoo_fin - TODO confirm). The output below shows it logs one line per
# ticker and returns a single frame with date/open/high/low/close/adjclose/volume/ticker.
new_df = fetch_historical_data(max_date_by_ticker, start_date_col='date', interval='1d')
new_df

Datos obtenidos para COP.
Datos obtenidos para EOG.
Datos obtenidos para OXY.
Datos obtenidos para HES.
Datos obtenidos para FANG.
Datos obtenidos para DVN.
Datos obtenidos para EQT.
Datos obtenidos para CTRA.
Datos obtenidos para MRO.
Datos obtenidos para APA.
Datos obtenidos para WMB.
Datos obtenidos para OKE.
Datos obtenidos para KMI.
Datos obtenidos para TRGP.
Datos obtenidos para XOM.
Datos obtenidos para CVX.
Datos obtenidos para SLB.
Datos obtenidos para BKR.
Datos obtenidos para HAL.
Datos obtenidos para MPC.
Datos obtenidos para PSX.
Datos obtenidos para VLO.
Datos obtenidos para CEG.
Datos obtenidos para GEV.
Datos obtenidos para SRE.
Datos obtenidos para AES.
Datos obtenidos para ATO.
Datos obtenidos para NI.
Datos obtenidos para AWK.
Datos obtenidos para NEE.
Datos obtenidos para SO.
Datos obtenidos para DUK.
Datos obtenidos para AEP.
Datos obtenidos para PCG.
Datos obtenidos para D.
Datos obtenidos para PEG.
Datos obtenidos para EXC.
Datos obtenidos para ED.
Datos obtenido

Unnamed: 0,date,open,high,low,close,adjclose,volume,ticker
0,2015-01-02,68.500000,69.480003,68.230003,68.919998,50.309769,5701800,COP
1,2015-01-05,67.709999,67.980003,65.430000,65.639999,47.915482,10938900,COP
2,2015-01-06,65.290001,66.599998,62.880001,62.930000,45.937244,18054700,COP
3,2015-01-07,64.010002,64.230003,62.849998,63.349998,46.243835,12350500,COP
4,2015-01-08,64.849998,65.489998,63.900002,64.930000,47.397194,10348300,COP
...,...,...,...,...,...,...,...,...
776567,2024-08-19,34.910000,36.700001,34.910000,36.580002,36.580002,3884700,MTCH
776568,2024-08-20,36.360001,36.595001,36.169998,36.529999,36.529999,3375900,MTCH
776569,2024-08-21,36.689999,37.580002,36.439999,37.139999,37.139999,4550400,MTCH
776570,2024-08-22,37.099998,37.160000,36.360001,36.419998,36.419998,4800500,MTCH


In [12]:
# Columns that together identify a row: the ticker and the trading date.
id_fields = ['ticker', 'date']

# Compute one id per row by handing each row (plus the id fields) to generate_id,
# then attach the result to the frame as the 'id' column.
row_ids = new_df.apply(generate_id, axis=1, fields=id_fields)
new_df['id'] = row_ids

new_df

Unnamed: 0,date,open,high,low,close,adjclose,volume,ticker,id
0,2015-01-02,68.500000,69.480003,68.230003,68.919998,50.309769,5701800,COP,4342948b0ecbd3d9b2bd691bb3d0f45a
1,2015-01-05,67.709999,67.980003,65.430000,65.639999,47.915482,10938900,COP,8035c040b4c16aaae94334be92d773fd
2,2015-01-06,65.290001,66.599998,62.880001,62.930000,45.937244,18054700,COP,273e2f6115430f61fc8d6cfe366d260e
3,2015-01-07,64.010002,64.230003,62.849998,63.349998,46.243835,12350500,COP,4581c3e5a836782fddb2a4505e45fb66
4,2015-01-08,64.849998,65.489998,63.900002,64.930000,47.397194,10348300,COP,2475f7a4282d8fdbe55aefd82b119046
...,...,...,...,...,...,...,...,...,...
776567,2024-08-19,34.910000,36.700001,34.910000,36.580002,36.580002,3884700,MTCH,cd0484f31059c8724b858370c1dc9b8f
776568,2024-08-20,36.360001,36.595001,36.169998,36.529999,36.529999,3375900,MTCH,d7b903d7ef9849e21d50e60813dbfebd
776569,2024-08-21,36.689999,37.580002,36.439999,37.139999,37.139999,4550400,MTCH,c2e2f3489b2be3381ced46016e4ed21d
776570,2024-08-22,37.099998,37.160000,36.360001,36.419998,36.419998,4800500,MTCH,7ff4851947e5aec2c4f32365c70f10a3


In [13]:
# Decide which rows to load: only the records not yet in BigQuery when the destination
# table can be read, or the whole frame when it cannot (e.g. the table has no data yet).
try:
    # Keep only the new records.
    df_incremental = select_for_incremental(id='id', table=table_conca, new_df=new_df)
    df_to_save = df_incremental
except Exception as e:
    # `except Exception` (not a bare `except:`) lets KeyboardInterrupt / SystemExit
    # propagate, and the reason for the fallback is reported instead of being hidden.
    print(f'Incremental selection failed, falling back to a full load: {e}')
    df_to_save = new_df

# Save exactly once, outside the try block: a failed save must surface as an error rather
# than trigger a second append of the full frame, which would hide the real cause and
# could duplicate rows.
# NOTE: an append only succeeds when the frame's columns match the destination table. A
# table created before the 'id' column existed rejects the load with InvalidSchema
# ("Cannot add fields (field: id)") and has to be migrated or recreated first.
bigquery.save_dataframe(df_to_save, project, dataset, table, if_exists='append', schema=None)

InvalidSchema: Reason: Provided Schema does not match Table sara-carles-keepcoding:bronze.bronze_ticker_data. Cannot add fields (field: id)