In [16]:
import kfp
import kfp.dsl as dsl
import kfp.components as comp

In [17]:
# Endpoint of the Kubeflow Pipelines deployment that runs are submitted to.
KFP_HOST = 'https://3bfdcd9631f1c32f-dot-us-central2.pipelines.googleusercontent.com'
client = kfp.Client(host=KFP_HOST)

In [18]:
def download_raw_data(raw_data_path: str) -> str:
    '''Download the '^GSPC' price history from Yahoo Finance and save it as parquet.

    Args:
        raw_data_path: Destination passed to ``DataFrame.to_parquet``; the
            frame is written there with GZIP compression.

    Returns:
        ``raw_data_path`` unchanged, so it can be handed to a later step.
    '''
    # Imported inside the function on purpose: the function is packaged as a
    # standalone component, so it has to carry its own imports.
    import yfinance

    prices = yfinance.download('^GSPC', progress=False)
    print('Downloaded data...')
    print(prices.head())

    print('trying to write to GS')
    prices.to_parquet(raw_data_path, compression='GZIP')
    print('Done!')

    return raw_data_path

In [19]:
# Wrap download_raw_data as a pipeline component; the component spec is also
# written to download_raw_data.yaml.
download_raw_data_op = comp.create_component_from_func(
    download_raw_data,
    output_component_file='download_raw_data.yaml',
    packages_to_install=[
        'yfinance',
        'fastparquet',
        'fsspec',
        'gcsfs',
    ],
)

In [20]:
def feature_processing(raw_data_path: str, feature_data_path: str) -> None:
    '''Build rolling-window and calendar features from the raw price table.

    Reads the raw frame from ``raw_data_path``, derives the features listed
    below and writes them to ``feature_data_path`` as GZIP-compressed parquet.

    Features, in column order:
      * ``Close__rolling_{mean,std,max,min,range}__<w>_days`` for
        w in 5, 30, 120, 365.
      * ``Volume__rolling_{max,sum}__<w>_days`` for w in 5, 10, 15.
      * One-hot ``day_of_week_*`` and ``quarter_*`` columns taken from the
        index.

    Every rolling statistic is shifted by one row, so a row only contains
    information from the rows before it. Windows count rows, not calendar
    days.

    Args:
        raw_data_path: Parquet location of the raw frame. It must provide
            ``Close`` and ``Volume`` columns and an index exposing
            ``dayofweek`` and ``quarter`` (i.e. a datetime index).
        feature_data_path: Parquet location the feature frame is written to.
    '''
    # Imported inside the function on purpose: the function is packaged as a
    # standalone component, so it has to carry its own imports.
    import pandas as pd

    sp500_df = pd.read_parquet(raw_data_path)
    close = sp500_df['Close']
    volume = sp500_df['Volume']

    # Collect the columns first and build the frame once; insertion order of
    # the dict is the column order of the result.
    feature_columns = {}

    # Closing-price statistics over windows of different lengths.
    for window in (5, 30, 120, 365):
        rolling_close = close.rolling(window)
        rolling_max = rolling_close.max().shift(periods=1)
        rolling_min = rolling_close.min().shift(periods=1)
        feature_columns[f'Close__rolling_mean__{window}_days'] = rolling_close.mean().shift(periods=1)
        feature_columns[f'Close__rolling_std__{window}_days'] = rolling_close.std().shift(periods=1)
        feature_columns[f'Close__rolling_max__{window}_days'] = rolling_max
        feature_columns[f'Close__rolling_min__{window}_days'] = rolling_min
        feature_columns[f'Close__rolling_range__{window}_days'] = rolling_max - rolling_min

    # Traded-volume statistics. These are computed from the Volume column
    # (they were previously derived from Close by mistake).
    for window in (5, 10, 15):
        rolling_volume = volume.rolling(window)
        feature_columns[f'Volume__rolling_max__{window}_days'] = rolling_volume.max().shift(periods=1)
        feature_columns[f'Volume__rolling_sum__{window}_days'] = rolling_volume.sum().shift(periods=1)

    # Calendar attributes as Series on the original index, so the one-hot
    # frames line up row by row with the rolling features.
    day_of_week = pd.Series(sp500_df.index.dayofweek, index=sp500_df.index)
    quarter = pd.Series(sp500_df.index.quarter, index=sp500_df.index)

    sp500_features_df = pd.concat(
        [
            pd.DataFrame(feature_columns),
            pd.get_dummies(day_of_week, prefix='day_of_week'),
            # The quarter dummies are built from the quarter, not the weekday.
            pd.get_dummies(quarter, prefix='quarter'),
        ],
        axis=1,  # keyword form: pandas 2.x no longer accepts axis positionally
    )

    sp500_features_df.to_parquet(feature_data_path, compression='GZIP')
    print('Done!')

In [21]:
# Wrap feature_processing as a pipeline component; the component spec is also
# written to feature_processing.yaml.
feature_processing_op = comp.create_component_from_func(
    feature_processing,
    output_component_file='feature_processing.yaml',
    packages_to_install=[
        'fastparquet',
        'fsspec',
        'gcsfs',
    ],
)

In [22]:
@dsl.pipeline(
    name='SP500 Random Forest',
    description='Predicting closing value of SP500 with Random Forest',
)
def sp500_pipeline(raw_data_path, feature_data_path):
    '''Run the download step, then feed its output path to feature processing.

    Args:
        raw_data_path: Location the download step writes the raw data to.
        feature_data_path: Location the feature step writes its result to.
    '''
    raw_data = download_raw_data_op(raw_data_path=raw_data_path)
    # Consuming the download step's output makes this step run after it.
    feature_processing_op(
        raw_data_path=raw_data.output,
        feature_data_path=feature_data_path,
    )
    
# Values bound to the pipeline parameters for this run.
arguments = {
    'raw_data_path': 'gs://mlops-stock-prediction/raw/sp500.parquet',
    'feature_data_path': 'gs://mlops-stock-prediction/feature_store/sp500_features.parquet',
}

# Submit a run through the client created earlier; kept as the cell's last
# expression so the returned run result is displayed.
client.create_run_from_pipeline_func(pipeline_func=sp500_pipeline, arguments=arguments)

RunPipelineResult(run_id=6d9b4fa1-458a-49bb-bd7a-c68b787401c9)