In [1]:
from datetime import datetime
from google.cloud import bigquery
from google.oauth2 import service_account
from google.cloud import bigquery
from google.cloud.exceptions import NotFound
import pandas as pd
import yahoo_fin.stock_info as si
from utils.utils_bigquery import *
from datetime import *

In [2]:
# Credentials and target GCP project.
# NOTE(review): `key_path` and `project_id` are not defined in the visible
# cells; presumably they are injected by the wildcard import of
# utils.utils_bigquery - TODO confirm. The self-assignment below is a no-op
# that only succeeds when `key_path` already exists.
key_path = key_path
project = project_id

# Source tables. For each input: table name, dataset, and the fully
# qualified `project.dataset.table` id interpolated into the query.

# Currency data
currency_table = 'silver_currency_data'
currency_dataset = 'silver'
table_conca_currency = f'{project}.{currency_dataset}.{currency_table}'

# Indicators (the driving table of the query)
indicators_table = 'silver_indicators'
indicators_dataset = 'silver'
table_conca_indicators = f'{project}.{indicators_dataset}.{indicators_table}'

# Ticker information
ticker_info_table = 'silver_ticker_info'
ticker_info_dataset = 'silver'
table_conca_ticker_info = f'{project}.{ticker_info_dataset}.{ticker_info_table}'

# Macro data
macro_data_table = 'silver_fred_macro_data'
macro_data_dataset = 'silver'
table_conca_macro_data = f'{project}.{macro_data_dataset}.{macro_data_table}'

# Cluster assignments
cluster_table = 'gold_clustering_sp500'
cluster_dataset = 'gold'
table_conca_cluster = f'{project}.{cluster_dataset}.{cluster_table}'

# Destination table for the assembled dataset
table_to_save = 'gold_main_sp500'
dataset_to_save = 'gold'

In [3]:
# Connect to BigQuery through the project helper, constructed from key_path.
# NOTE(review): this rebinds the name `bigquery`, shadowing the
# `google.cloud.bigquery` module imported in the first cell. Later cells call
# methods on this object, so renaming it requires a coordinated change.
bigquery = BigQueryUtils(key_path)

In [4]:
# Build the main dataset: every column of the indicators table (indi.*)
# plus the cluster label, the sector/industry groups, selected currency
# columns and selected macro columns.
#   - ticker_info and cluster tables are joined on ticker.
#   - currency and macro tables are joined on date.
# All joins are INNER, so an indicator row is kept only when it has a match
# in each of the four other tables.
# NOTE(review): if any joined table holds more than one row per join key,
# indicator rows are duplicated - verify key uniqueness in the source tables.
df = bigquery.run_query(
    f"""
    SELECT
        indi.*,
        cl.cluster,
        info.sector_group,
        info.industry_group,
        cu.USDEUR,
        cu.USDJPY,
        cu.USDGBP,
        cu.USDCHF,
        cu.USDCNY,
        mc.BOPGSTB,
        mc.CPIAUCSL,
        mc.FEDFUNDS,
        mc.GDP,
        mc.PPIACO,
        mc.RSAFS,
        mc.UMCSENT,
        mc.UNRATE
    FROM {table_conca_indicators} AS indi
    INNER JOIN {table_conca_ticker_info} AS info
        ON info.ticker = indi.ticker    
    INNER JOIN {table_conca_currency} AS cu
        ON indi.date = cu.Date
    INNER JOIN {table_conca_macro_data} AS mc
        ON indi.date = mc.Date
    INNER JOIN {table_conca_cluster} AS cl
        ON indi.ticker = cl.ticker        
    """
)

# Summarize the columns, dtypes and non-null counts of the query result.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1249071 entries, 0 to 1249070
Data columns (total 47 columns):
 #   Column            Non-Null Count    Dtype              
---  ------            --------------    -----              
 0   date              1249071 non-null  datetime64[us, UTC]
 1   Open              1249071 non-null  float64            
 2   High              1249071 non-null  float64            
 3   Low               1249071 non-null  float64            
 4   Close             1249071 non-null  float64            
 5   adjclose          1249071 non-null  float64            
 6   Volume            1249071 non-null  Int64              
 7   Ticker            1249071 non-null  object             
 8   id                1249071 non-null  object             
 9   SMA_20            1249071 non-null  float64            
 10  EMA_50            1249071 non-null  float64            
 11  ADX               1249071 non-null  float64            
 12  RSI               1249071 no

In [5]:
# Count the missing values in every column of the query result
nulls_by_column = df.isna().sum()

# Rank the columns from most to fewest missing values
nulls_by_column_sorted = nulls_by_column.sort_values(ascending=False)

# Show the ranked counts as the cell output
nulls_by_column_sorted

Volume_Change       722
date                  0
USDJPY                0
stoch_d               0
Target                0
Log_Return            0
Volatility            0
cluster               0
sector_group          0
industry_group        0
USDEUR                0
USDGBP                0
macd_hist             0
USDCHF                0
USDCNY                0
BOPGSTB               0
CPIAUCSL              0
FEDFUNDS              0
GDP                   0
PPIACO                0
RSAFS                 0
UMCSENT               0
stoch                 0
macd_signal           0
Open                  0
ADX                   0
High                  0
Low                   0
Close                 0
adjclose              0
Volume                0
Ticker                0
id                    0
SMA_20                0
EMA_50                0
RSI                   0
macd                  0
CDL_DOJI              0
CDL_HAMMER            0
CDL_MORNING_STAR      0
CDL_ENGULFING         0
CDL_LONGLINE    

In [6]:
# One-hot encode the categorical columns.
columns_to_encode = ['sector_group', 'industry_group']

# Build one float (0.0/1.0) dummy frame per categorical column, with columns
# named '<column>_<category>', and concatenate them side by side in a single
# call. This replaces growing a frame with pd.concat inside a loop that was
# seeded from an empty DataFrame; dtype=float replaces the `* 1.0` conversion.
categorical_df = pd.concat(
    [pd.get_dummies(df[col], prefix=col, dtype=float) for col in columns_to_encode],
    axis=1,
)

# Swap the original categorical columns for their dummies. Both inputs carry
# df's index, so the inner join keeps every row; the remaining columns stay
# in their original order and the dummy columns are appended at the end.
df_final = pd.concat(
    [df.drop(columns=columns_to_encode), categorical_df],
    axis=1,
    join='inner',
)

df_final.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1249071 entries, 0 to 1249070
Data columns (total 57 columns):
 #   Column                                          Non-Null Count    Dtype              
---  ------                                          --------------    -----              
 0   date                                            1249071 non-null  datetime64[us, UTC]
 1   Open                                            1249071 non-null  float64            
 2   High                                            1249071 non-null  float64            
 3   Low                                             1249071 non-null  float64            
 4   Close                                           1249071 non-null  float64            
 5   adjclose                                        1249071 non-null  float64            
 6   Volume                                          1249071 non-null  Int64              
 7   Ticker                                          1249071 non-nul

In [7]:
import pandas as pd
import re

# Define a function to clean column names
def clean_column_names(df):
    """Rename the columns of ``df`` in place so they are valid BigQuery names.

    Each column label is processed as follows:

    1. It is converted to ``str`` (so non-string labels such as integers
       no longer raise ``TypeError``).
    2. Every character outside ``[a-zA-Z0-9_]`` is replaced by ``_``.
    3. If the result does not start with a letter or underscore (for example
       a leading digit, or an empty name), ``_`` is prepended, because
       BigQuery column names must start with a letter or underscore.
    4. The result is truncated to 300 characters.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``columns`` attribute is rewritten. It is mutated in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in (not a copy).

    Notes
    -----
    Distinct labels can collapse to the same cleaned name (e.g. ``'a b'`` and
    ``'a-b'``); such collisions are not de-duplicated here.
    """
    cleaned = []
    for col in df.columns:
        # Replace invalid characters with an underscore
        name = re.sub(r'[^a-zA-Z0-9_]', '_', str(col))
        # Guarantee a valid first character before truncating, so the length
        # limit still holds after the prefix is added
        if not re.match(r'[a-zA-Z_]', name):
            name = '_' + name
        # Ensure column names do not exceed 300 characters
        cleaned.append(name[:300])
    df.columns = cleaned
    return df

# Clean the column names of df_final. clean_column_names assigns the new
# names on the frame it receives and returns that same object, so df_cleaned
# and df_final refer to one DataFrame after this line.
df_cleaned = clean_column_names(df_final)

In [8]:
# Save the data to BigQuery, passing the project, destination dataset and
# destination table configured above.
# NOTE(review): if_exists='replace' presumably overwrites an existing table and
# schema=None presumably lets the helper infer the schema - confirm against
# BigQueryUtils.save_dataframe.
bigquery.save_dataframe(df_cleaned, project, dataset_to_save, table_to_save, if_exists='replace', schema=None)

100%|██████████| 1/1 [00:00<?, ?it/s]
