In [None]:
from datetime import datetime, timedelta

import yfinance as yf
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_date, date_format

In [None]:
# Reuse the already-running Spark session when one exists, otherwise start one.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

# Define tickers, date range and output directory
TICKERS = ['AAPL', 'MSFT', 'GOOG', 'AMZN', 'V']
start_date = '2022-12-13'
# Last calendar day that should be part of the download.
yesterday = datetime.now() - timedelta(days=1)
# yfinance treats `end` as exclusive, so the bound passed to it must be the
# day *after* the last wanted day; otherwise yesterday's bar is dropped.
end_date = (yesterday + timedelta(days=1)).strftime('%Y-%m-%d')
# NOTE(review): the namenode address is hard-coded; consider making it configurable.
HDFS_PATH = 'hdfs://10.84.129.52:9000/trab/g05'
output_dir = HDFS_PATH + '/data/'

In [None]:
# Fetch recent data and save to CSV
def fetch_recent_data():
    """Download price history for every ticker and write it to HDFS as CSV.

    For each symbol in ``TICKERS`` the rows between ``start_date`` and
    ``end_date`` are fetched with yfinance, converted to a Spark DataFrame,
    the ``Date`` column is rendered as a ``dd-MM-yyyy`` string, and the
    columns are written in the order Date, Low, Open, Volume, High, Close,
    Adjusted Close to ``<output_dir><ticker>/recent_data.csv`` (overwriting
    any previous output at that path).

    Tickers for which the download returns no rows are reported and skipped,
    so a single failed symbol does not abort the remaining ones.

    Returns:
        None. Progress is reported via ``print``.
    """
    # Output column order (before 'Adj Close' is renamed); loop-invariant.
    columns_order = ['Date', 'Low', 'Open', 'Volume', 'High', 'Close', 'Adj Close']

    for ticker in TICKERS:
        # Fetch data using yfinance. auto_adjust is pinned to False because
        # newer yfinance releases default it to True, which removes the
        # 'Adj Close' column this function selects below.
        data = yf.download(ticker, start=start_date, end=end_date, auto_adjust=False)

        # An empty frame cannot be converted by Spark (no schema can be
        # inferred), so report it and move on to the next ticker.
        if data is None or data.empty:
            print(f"No data returned for {ticker}, skipping")
            continue

        # Newer yfinance releases may return (field, ticker) MultiIndex
        # columns even for a single symbol; keep only the field level so the
        # columns can be selected by plain name.
        if data.columns.nlevels > 1:
            data.columns = data.columns.get_level_values(0)

        # Convert the pandas DataFrame to a Spark DataFrame; reset_index()
        # turns the date index into a regular 'Date' column.
        data_spark = spark.createDataFrame(data.reset_index())

        # Convert the 'Date' column to a date type and format it as "dd-MM-yyyy"
        data_spark = data_spark.withColumn('Date', date_format(to_date('Date', 'yyyy-MM-dd'), 'dd-MM-yyyy'))

        # Reorder columns
        data_spark = data_spark.select(columns_order)

        # Rename columns to match the required format
        data_spark = data_spark.withColumnRenamed('Adj Close', 'Adjusted Close')

        # Save to CSV (Spark writes a directory of part files at this path)
        output_file = output_dir + ticker + '/recent_data.csv'
        data_spark.write.csv(output_file, header=True, mode='overwrite')
        print(f"Saved {ticker} file")

In [None]:
# Download every ticker in TICKERS and write one CSV output per ticker.
fetch_recent_data()

In [None]:
# Stop the Spark session once the notebook no longer needs it.
spark.stop()