In [108]:
import subprocess

def install(package):
    subprocess.check_call(["pip", "install", package])

# Try to import packages; if they don't exist, install them
try:
    import yfinance as yf
    import pandas as pd
except ImportError:
    install("yfinance")
    import yfinance as yf
    import pandas as pd

def download_historical_stock_data(ticker: str, start_date, end_date):
    try:
        print(f'>>> Downloading {ticker} Stock Data from yfinance Website.')
        
        # Download historical data
        data = yf.download(ticker, start=start_date, end=end_date)
        print(f'>>> Downloading {ticker} is Complete.')
        print(f"{'-'}"*40)

        # Reset the index to get the 'Date' as a column
        data.reset_index(inplace=True)

        # Sort the DataFrame by the 'Date' column in ascending order
        data.sort_values(by='Date', inplace=True)

        # Convert the 'Date' column to datetime format
        data['Date'] = pd.to_datetime(data['Date'])

        # Write the merged DataFrame to a CSV file
        save_path = f"data/{ticker}_stock_price_data.csv"
        data.to_csv(save_path, index=False)
        print(f"{'-'}"*40)
        print(f'>>> Saving {ticker} is Complete and Saved at: {save_path}')
        print(f"{'='}"*80)
        
        return data

    except Exception as e:
        print(f"Error: {e}")
        return None

In [110]:
ticker = "GOOGL"  # Replace with the desired stock symbol
start_date = "2022-01-01"
end_date = "2022-12-31"

# from download_historical_stock_data import *
stock_data = download_historical_stock_data(ticker, start_date, end_date)

if stock_data is not None:
    print(stock_data.info())  # Check the data types, 'Date' should be datetime64[ns]
    print(stock_data.head())


>>> Downloading GOOGL Stock Data from yfinance Website.
[*********************100%%**********************]  1 of 1 completed
>>> Downloading GOOGL is Complete.
----------------------------------------
----------------------------------------
>>> Saving GOOGL is Complete and Saved at: data/GOOGL_stock_price_data.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 251 entries, 0 to 250
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Date       251 non-null    datetime64[ns]
 1   Open       251 non-null    float64       
 2   High       251 non-null    float64       
 3   Low        251 non-null    float64       
 4   Close      251 non-null    float64       
 5   Adj Close  251 non-null    float64       
 6   Volume     251 non-null    int64         
dtypes: datetime64[ns](1), float64(5), int64(1)
memory usage: 13.9 KB
None
        Date        Open        High         Low       Close   Adj Close  \
0 202

In [114]:
# ticker_symbol = "AAPL"  # Replace with the desired stock symbol
# from download_historical_stock_data import *
start_date = "2020-01-01"
end_date = "2023-12-31"
tickers = ["GOOGL", "AAPL", "MSFT", "TSLA"]

for ticker in tickers:
    download_historical_stock_data(ticker, start_date, end_date)

# download_historical_stock_data(ticker=symbol, start=start_date, end=end_date)
print()
print(f'>>> You have downloaded {len(ticker)} datasets.')

>>> Downloading GOOGL Stock Data from yfinance Website.
[*********************100%%**********************]  1 of 1 completed
>>> Downloading GOOGL is Complete.
----------------------------------------
----------------------------------------
>>> Saving GOOGL is Complete and Saved at: data/GOOGL_stock_price_data.csv
>>> Downloading AAPL Stock Data from yfinance Website.
[*********************100%%**********************]  1 of 1 completed
>>> Downloading AAPL is Complete.
----------------------------------------
----------------------------------------
>>> Saving AAPL is Complete and Saved at: data/AAPL_stock_price_data.csv
>>> Downloading MSFT Stock Data from yfinance Website.
[*********************100%%**********************]  1 of 1 completed
>>> Downloading MSFT is Complete.
----------------------------------------
----------------------------------------
>>> Saving MSFT is Complete and Saved at: data/MSFT_stock_price_data.csv
>>> Downloading TSLA Stock Data from yfinance Website.
[*


1 Failed download:
['TSLA']: Exception('%ticker%: No price data found, symbol may be delisted (1d 2020-01-01 -> 2023-12-31)')



>>> Downloading TSLA is Complete.
----------------------------------------
----------------------------------------
>>> Saving TSLA is Complete and Saved at: data/TSLA_stock_price_data.csv

>>> You have downloaded 4 datasets.


In [122]:
from glob import glob
import os

# read file paths and printout
file_paths = glob("./data/*_stock_price_data.csv")

# For loop to read data from file paths
dataframes = []
for file in file_paths:
    data = pd.read_csv(file)
    
    # Extract the stock symbol from the file name
    symbol = os.path.basename(file).split('_')[0]
    
    # Create a new column 'symbol' based on the stock symbol
    data["symbol"] = symbol
    
    # Rearrange columns to place 'symbol' as the first column
    columns_order = ['symbol'] + [col for col in data.columns if col != 'symbol']
    data = data[columns_order]
    
    dataframes.append(data)

# Concatenate the dataframes
df = pd.concat(dataframes)

# Print the dimension of the dataframe and view the first 5 rows
print(df.shape)
print()
df.tail()


(3018, 8)



Unnamed: 0,symbol,Date,Open,High,Low,Close,Adj Close,Volume
1001,MSFT,2023-12-22,373.679993,375.179993,372.709991,374.579987,374.579987,17091100
1002,MSFT,2023-12-26,375.0,376.940002,373.5,374.660004,374.660004,12673100
1003,MSFT,2023-12-27,373.690002,375.059998,372.809998,374.070007,374.070007,14905400
1004,MSFT,2023-12-28,375.369995,376.459991,374.160004,375.279999,375.279999,14327000
1005,MSFT,2023-12-29,376.0,377.160004,373.480011,376.040009,376.040009,18723000


In [None]:
from glob import glob
import os

file_paths = glob("./data/*_stock_price_data.csv")

def merge_stock_data():
    
    # For loop to read data from file paths
    dataframes = []
    
    for file in file_paths:
        
        data = pd.read_csv(file)
    
        # Extract the stock symbol from the file name
        symbol = os.path.basename(file).split('_')[0]

        # Create a new column 'symbol' based on the stock symbol
        data["symbol"] = symbol

        # Rearrange columns to place 'symbol' as the first column
        columns_order = ['symbol'] + [col for col in data.columns if col != 'symbol']
        data = data[columns_order]

        dataframes.append(data)
        
        # Concatenate the dataframes
        df = pd.concat(dataframes)
        
        return df

In [126]:
import pytimetk as tk

summary_symbol_df = (df
                     .set_index("Date", inplace=True)
                     .groupby("symbol") 
                     .summarize_by_time(
                         date_column  = 'Date', 
                         value_column = 'Close',
                         freq         = "MS",
                         agg_func     = 'sum',
                         wide_format  = False
                     ))

# First 5 rows shown
summary_symbol_df.head()

AttributeError: 'NoneType' object has no attribute 'groupby'

In [97]:
import pandas as pd

print(f">>> Importing data.....")
print(f"{'-'}"*40)
data = pd.read_csv("./data/GOOGL_stock_price_data.csv")
print(data.info())
data.head()

>>> Importing data.....
----------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 251 entries, 0 to 250
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       251 non-null    object 
 1   Open       251 non-null    float64
 2   High       251 non-null    float64
 3   Low        251 non-null    float64
 4   Close      251 non-null    float64
 5   Adj Close  251 non-null    float64
 6   Volume     251 non-null    int64  
dtypes: float64(5), int64(1), object(1)
memory usage: 13.9+ KB
None


Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2022-01-03,145.054993,145.850998,143.712997,144.991501,144.991501,28646000
1,2022-01-04,145.395996,146.485001,143.716507,144.399506,144.399506,28400000
2,2022-01-05,144.419998,144.499496,137.688004,137.774994,137.774994,54618000
3,2022-01-06,136.998505,139.940002,136.558502,137.747498,137.747498,37348000
4,2022-01-07,138.145493,138.448502,135.766495,137.016998,137.016998,29760000


In [98]:
from create_daily_stock_returns import *
tickers = "GOOGL"
file_path = "data/GOOGL_returns_data.csv"
returns = pd.read_csv(file_path)
returns['date'] = pd.to_datetime(returns['date'])
# returns.date = [x.date() for x in returns.date]
returns.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1748 entries, 0 to 1747
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   date     1748 non-null   object 
 1   price    1748 non-null   float64
 2   returns  1748 non-null   float64
dtypes: float64(2), object(1)
memory usage: 41.1+ KB


In [28]:
returns.plot_timeseries('date', 'returns')

TypeError: float() argument must be a string or a real number, not 'Timestamp'