In [None]:
# pyright: reportMissingModuleSource=false
import sys
import os
import time
import logging
import datetime
import json
from datetime import datetime, timedelta
import yaml
import importlib
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import requests
import pandas_gbq
from dreams_core.googlecloud import GoogleCloud as dgc
from dreams_core import core as dc
import matplotlib.pyplot as plt
from scipy.signal import argrelextrema
import progressbar


# Load environment variables via dotenv before anything reads them.
load_dotenv()


# Make the sibling src/ directory importable, then import the project modules.
# Each module is reloaded immediately after import so that re-running this cell in a
# live kernel picks up edits made to the module files on disk.
# pyright: reportMissingImports=false
sys.path.append('..//src')
from utils import load_config, cw_filter_df, create_progress_bar
import training_data as td
importlib.reload(td)
import feature_engineering as fe
importlib.reload(fe)
import coin_wallet_metrics as cwm
importlib.reload(cwm)
import modeling as m
importlib.reload(m)
import insights as i
importlib.reload(i)
import utils as u
importlib.reload(u)
# NOTE(review): reloading `u` does not rebind names brought in with `from utils import ...`
# above; load_config, cw_filter_df and create_progress_bar keep pointing at the objects
# from the import that ran before the reload.

# Load the four YAML config files used throughout the notebook.
config = load_config('../config/config.yaml')
metrics_config = load_config('../config/metrics_config.yaml')
modeling_config = load_config('../config/modeling_config.yaml')
experiments_config = load_config('../config/experiments_config.yaml')

# Configure the logger returned by dreams_core and set it to INFO.
logger = dc.setup_logger()
logger.setLevel(logging.INFO)

# Render floats with up to 12 significant digits (general format) in pandas output.
pd.set_option('display.float_format', lambda x: f'{x:.12g}')
# pd.reset_option('display.float_format')

In [None]:
# Refresh the local modules in place so on-disk code edits apply without a kernel restart.
for _module in (td, cwm, fe, m, i):
    importlib.reload(_module)

# Re-read every YAML config so edits to the files are reflected in this session.
config = load_config('../config/config.yaml')
metrics_config = load_config('../config/metrics_config.yaml')
modeling_config = load_config('../config/modeling_config.yaml')
experiments_config = load_config('../config/experiments_config.yaml')


In [None]:
def retrieve_google_trends_data():
    """
    Query the Google Trends table and expand it to one row per calendar day.

    Every column of `macro_trends.google_trends` is selected, ordered by date. The result
    is indexed by date, upsampled to daily frequency, and the rows introduced by the
    upsampling are filled by linear interpolation between the neighbouring source rows.

    Returns:
    - google_trends_df: DataFrame with a daily `date` column plus every other column
        returned by the query, with interpolated values on the added days
    """
    trends_query = '''
        select *
        from `macro_trends.google_trends`
        order by date
    '''

    # Execute the query through the GoogleCloud helper and log the size of the raw result
    raw_trends_df = dgc().run_sql(trends_query)
    logger.info('retrieved Google Trends data with shape %s',raw_trends_df.shape)

    # Parse dates, index on them, upsample to daily rows, interpolate, and restore the column
    daily_trends_df = (
        raw_trends_df
        .assign(date=lambda frame: pd.to_datetime(frame['date']))
        .set_index('date')
        .resample('D')
        .interpolate(method='linear')
        .reset_index()
    )

    return daily_trends_df

# Fetch the trends data through the training_data module.
# NOTE(review): this calls td.retrieve_google_trends_data, not the function of the same
# name defined in this cell — confirm the two implementations agree.
google_trends_df = td.retrieve_google_trends_data()
# Show the (rows, columns) shape as the cell output.
google_trends_df.shape

In [None]:
# Refresh local modules and re-read the configs so the lookups below use current files.
for _module in (td, cwm, fe, m, i):
    importlib.reload(_module)
config = load_config('../config/config.yaml')
metrics_config = load_config('../config/metrics_config.yaml')
modeling_config = load_config('../config/modeling_config.yaml')
experiments_config = load_config('../config/experiments_config.yaml')


# Dataset to inspect
dataset_category = 'macro_series'
dataset_name = 'google_trends'


# Look up that dataset's entry in the main config and in the metrics config
dataset_config = config['datasets'][dataset_category][dataset_name]
dataset_metrics_config = metrics_config[dataset_category][dataset_name]


# A cell only renders its final expression, so show both mappings explicitly.
display(dataset_config, dataset_metrics_config)

## Prices Metrics

In [None]:
# Refresh local modules and re-read the configs so this cell runs against current files.
for _module in (td, cwm, fe, m, i):
    importlib.reload(_module)
config = load_config('../config/config.yaml')
metrics_config = load_config('../config/metrics_config.yaml')
modeling_config = load_config('../config/modeling_config.yaml')
experiments_config = load_config('../config/experiments_config.yaml')


# Select the prices dataset explicitly so the metrics call below does not depend on
# dataset_name / dataset_config left behind by whichever cell ran last.
dataset_category = 'time_series'
dataset_name = 'prices'
dataset_config = config['datasets'][dataset_category][dataset_name]


# retrieve and clean prices data
prices_df = td.retrieve_prices_data()
prices_df, prices_log = td.fill_prices_gaps(prices_df, config['data_cleaning']['max_gap_days'])


# generate prices metrics
# NOTE(review): assumes the prices entry in config defines 'value_column' — TODO confirm.
# NOTE(review): other cells in this notebook call generate_time_series_metrics as
# (df, config, value_column_metrics_config, value_column); confirm which signature the
# module currently exposes.
prices_metrics_df, partial_prices_metrics_df = cwm.generate_time_series_metrics(
    prices_df,
    config,
    metrics_config,
    dataset_key=dataset_name,
    value_column=dataset_config['value_column'],
)

# # flatten, save, and preprocess the flattened df
# flattened_output_directory = os.path.join(modeling_config['modeling']['modeling_folder'],'outputs/flattened_outputs')

# flattened_prices_metrics_df = fe.flatten_coin_date_df(
#     prices_metrics_df,
#     dataset_metrics_config,
#     config['training_data']['training_period_end']
# )
# flattened_prices_metrics_df, flattened_prices_metrics_filepath = fe.save_flattened_outputs(
#     flattened_prices_metrics_df,
#     flattened_output_directory,
#     dataset_config['description'],
#     config['training_data']['modeling_period_start']
# )
# prices_preprocessed_df, prices_preprocessed_filepath = fe.preprocess_coin_df(
#     flattened_prices_metrics_filepath
#     ,modeling_config
#     ,dataset_config
#     ,dataset_metrics_config
# )


# prices_tuple = (prices_preprocessed_filepath.split('preprocessed_outputs/')[1], dataset_config['fill_method'])

In [None]:
# Refresh local modules and re-read the configs before resolving the dataset entries.
for _module in (td, cwm, fe, m, i):
    importlib.reload(_module)
config = load_config('../config/config.yaml')
metrics_config = load_config('../config/metrics_config.yaml')
modeling_config = load_config('../config/modeling_config.yaml')
experiments_config = load_config('../config/experiments_config.yaml')


# Resolve the prices dataset's entries in the main config and the metrics config.
dataset_category = 'time_series'
dataset_name = 'prices'
dataset_config = config['datasets'][dataset_category][dataset_name]
dataset_metrics_config = metrics_config[dataset_category][dataset_name]

# Take the first key of the dataset's metrics config as the value column and display
# the config stored under it.
value_column = list(dataset_metrics_config)[0]
value_column_metrics_config = dataset_metrics_config[value_column]
value_column_metrics_config

In [None]:
# Refresh local modules and re-read the configs so the metrics call uses current files.
for _module in (td, cwm, fe, m, i):
    importlib.reload(_module)
config = load_config('../config/config.yaml')
metrics_config = load_config('../config/metrics_config.yaml')
modeling_config = load_config('../config/modeling_config.yaml')
experiments_config = load_config('../config/experiments_config.yaml')


# Resolve the prices dataset's config entries.
dataset_category = 'time_series'
dataset_name = 'prices'
dataset_config = config['datasets'][dataset_category][dataset_name]
dataset_metrics_config = metrics_config[dataset_category][dataset_name]

# Use the first configured value column (could be iterated through in a loop) and pull
# the 'metrics' section stored under it.
value_column = list(dataset_metrics_config)[0]
value_column_metrics_config = dataset_metrics_config[value_column]['metrics']


# Generate the prices metrics for that value column.
prices_metrics_df, partial_prices_metrics_df = cwm.generate_time_series_metrics(
    prices_df,
    config,
    value_column_metrics_config,
    value_column,
)

# Report the sizes of both returned frames, then preview the first one.
print(prices_metrics_df.shape)
print(partial_prices_metrics_df.shape)
prices_metrics_df.head()

In [None]:
# Display the metrics config currently bound to value_column_metrics_config.
value_column_metrics_config

In [None]:
# Refresh local modules and re-read the configs so the metrics call uses current files.
for _module in (td, cwm, fe, m, i):
    importlib.reload(_module)
config = load_config('../config/config.yaml')
metrics_config = load_config('../config/metrics_config.yaml')
modeling_config = load_config('../config/modeling_config.yaml')
experiments_config = load_config('../config/experiments_config.yaml')


# load configs
dataset_category = 'macro_series'
dataset_name = 'google_trends'
dataset_config = config['datasets'][dataset_category][dataset_name]
dataset_metrics_config = metrics_config[dataset_category][dataset_name]
macro_series_df = google_trends_df[['date','altcoin_worldwide','cryptocurrency_worldwide']]

# Use the first configured value column and the 'metrics' section stored under it.
value_column = list(dataset_metrics_config.keys())[0] # could be iterated through in a loop
value_column_metrics_config = metrics_config[dataset_category][dataset_name][value_column]['metrics']

# generate metrics (no id column: the series is not split per coin)
gt_metrics_df,partial_gt_metrics_df = cwm.generate_time_series_metrics(
    macro_series_df,
    config,
    value_column_metrics_config,
    value_column,
    id_column=None
)


print(gt_metrics_df.shape)
# With no id column there may be no partial frame at all; print None instead of failing
# on `.shape` in that case.
print(None if partial_gt_metrics_df is None else partial_gt_metrics_df.shape)
gt_metrics_df.head()



In [None]:
# Subset the Google Trends frame to the date column and two of its value columns.
macro_series_df = google_trends_df[['date','altcoin_worldwide','cryptocurrency_worldwide']]

In [None]:
# Walk every value column configured for the dataset; after the loop the two variables
# hold the metrics config and the [date, column] subset for the last key visited.
for key in dataset_metrics_config:
    print(key)
    value_column_metrics_config = metrics_config[dataset_category][dataset_name][key]['metrics']
    macro_series_df = google_trends_df[['date', key]]

In [None]:
# Refresh local modules and re-read the configs so the metrics calls use current files.
for _module in (td, cwm, fe, m, i):
    importlib.reload(_module)
config = load_config('../config/config.yaml')
metrics_config = load_config('../config/metrics_config.yaml')
modeling_config = load_config('../config/modeling_config.yaml')
experiments_config = load_config('../config/experiments_config.yaml')


# Resolve the google_trends dataset's config entries.
dataset_category = 'macro_series'
dataset_name = 'google_trends'
dataset_config = config['datasets'][dataset_category][dataset_name]
dataset_metrics_config = metrics_config[dataset_category][dataset_name]

# Generate metrics once per configured value column and collect the resulting frames.
all_metrics = []
for key in dataset_metrics_config:
    print(key)
    value_column_metrics_config = dataset_metrics_config[key]['metrics']
    macro_series_df = google_trends_df[['date', key]]

    gt_metrics_df, partial_gt_metrics_df = cwm.generate_time_series_metrics(
        macro_series_df,
        config,
        value_column_metrics_config,
        key,
        id_column=None,
    )

    all_metrics.append(gt_metrics_df)

# Stack the per-column frames row-wise under a fresh 0..n-1 index.
# NOTE(review): if each frame carries differently named metric columns, stacking rows
# leaves NaN blocks and repeated dates — confirm a join on date is not what is wanted.
all_metrics_df = pd.concat(all_metrics, ignore_index=True)

print(all_metrics_df.shape)
all_metrics_df.head()



In [None]:
# Display the metrics config left bound to value_column_metrics_config.
value_column_metrics_config

In [None]:
# Print the shape of each prices metrics frame on its own line, then preview the first.
print(prices_metrics_df.shape, partial_prices_metrics_df.shape, sep='\n')
prices_metrics_df.head()

In [None]:
# Refresh local modules and re-read the configs so this cell runs against current files.
for _module in (td, cwm, fe, m, i):
    importlib.reload(_module)
config = load_config('../config/config.yaml')
metrics_config = load_config('../config/metrics_config.yaml')
modeling_config = load_config('../config/modeling_config.yaml')
experiments_config = load_config('../config/experiments_config.yaml')



# Inputs for stepping through the time series metrics logic on the prices data.
# The dataset is named explicitly (instead of being read from dataset_name /
# dataset_config) so the cell does not depend on which dataset another cell selected last.
time_series_df = prices_df.copy()
dataset_key = 'prices'
# assumes the prices entry in config defines 'value_column' — TODO confirm
value_column = config['datasets']['time_series'][dataset_key]['value_column']
date_column = 'date'
id_column = 'coin_id'


# Data Quality Checks and Formatting
if value_column not in time_series_df.columns:
    raise KeyError(f"Input DataFrame does not include column '{value_column}'.")
if time_series_df[value_column].isnull().any():
    raise ValueError(f"The '{value_column}' column contains null values, which are not allowed.")

time_series_metrics_config = metrics_config['time_series'].get(dataset_key)
if not time_series_metrics_config:
    raise KeyError(f"No metrics are specified for key [{dataset_key}] in metrics_config.")

# Parse dates and order the rows chronologically.
time_series_df[date_column] = pd.to_datetime(time_series_df[date_column])
time_series_df = time_series_df.sort_values(by=[date_column])

# Keep only rows from training_period_start through modeling_period_end (both inclusive).
training_period_start = pd.to_datetime(config['training_data']['training_period_start'])
modeling_period_end = pd.to_datetime(config['training_data']['modeling_period_end'])

time_series_df = time_series_df[
    (time_series_df[date_column] >= training_period_start) &
    (time_series_df[date_column] <= modeling_period_end)
]

# Metric Calculations
# metrics_df is only defined when id_column is present in the frame.
if id_column and id_column in time_series_df.columns:
    grouped = time_series_df.groupby(id_column)
    metrics_df = grouped.apply(lambda x: cwm.calculate_metrics(x[value_column], time_series_metrics_config))
    metrics_df = metrics_df.reset_index()
    # NOTE(review): 'level_1' is whatever index calculate_metrics returns for each group.
    # The series passed in carries the frame's row index, not the dates — confirm this
    # column really holds dates before relying on it as date_column.
    metrics_df = metrics_df.rename(columns={'level_1': date_column})

#     # Split into full and partial dataframes
#     full_metrics_df, partial_metrics_df = cwm.split_full_partial(metrics_df, id_column, date_column,
#                                                                 training_period_start, modeling_period_end)

#     log_coin_counts(time_series_df, full_metrics_df, partial_metrics_df, id_column)

#     return full_metrics_df, partial_metrics_df
# else:
#     metrics_df = calculate_metrics(time_series_df[value_column], time_series_metrics_config)
#     metrics_df[date_column] = time_series_df[date_column]
#     return metrics_df, None

# Display the date-sorted, window-filtered input frame as the cell output.
time_series_df

In [None]:
# Display the metrics frame produced by the grouped calculation.
metrics_df

In [None]:


# Check the type that pd.to_datetime produced for the training period start.
type(training_period_start)

In [None]:
# Display the metrics frame again for inspection.
metrics_df

In [None]:
# Display the prices frame as the cell output.
prices_df

In [None]:
# Work on a copy so the split below leaves metrics_df untouched.
df = metrics_df.copy()
df[date_column] = pd.to_datetime(df[date_column])

# Window that an id's data must span to count as "full".
training_period_start = pd.to_datetime(config['training_data']['training_period_start'])
training_period_end = pd.to_datetime(config['training_data']['training_period_end'])
start_date, end_date = training_period_start, training_period_end

# First and last date present for each id, and the ids whose range covers the window.
data_range = df.groupby(id_column)[date_column].agg(['min', 'max'])
covers_window = (data_range['min'] <= start_date) & (data_range['max'] >= end_date)
full_ids = data_range[covers_window].index

# Rows whose id covers the window go to full_df; all remaining rows go to partial_df.
is_full = df[id_column].isin(full_ids)
full_df = df[is_full]
partial_df = df[~is_full]

print(full_df.shape)
print(partial_df.shape)