In [3]:
import pandas as pd
import numpy as np
import requests
import re
import os
from time import sleep
from tqdm import tqdm
from functools import reduce

import datetime

In [13]:
# --- Download configuration -------------------------------------------------

# Polygon.io API key. Supply it through the POLYGON_API_KEY environment variable;
# the literal below is kept only as a backward-compatible fallback.
# SECURITY: the fallback key is stored in plain text in this notebook. Rotate it
# and remove the fallback once every environment sets POLYGON_API_KEY.
POLYGON_API_KEY = os.environ.get("POLYGON_API_KEY") or "MNxlbyNXPpKmAki1jcScqmXxzPvdXWYT"

# URL template for the aggregates endpoint. The {PLACEHOLDER} parts are filled in
# by get_url(). The constant's name is historical: the host is api.polygon.io.
BASE_GOOGLE_FINANCE_URL = (
    "https://api.polygon.io/v2/aggs/ticker/{TICKER}/range/{INTERVAL}/"
    "{RANGE}/{FROM_DATE}/{TO_DATE}?limit=50000&apiKey=" + POLYGON_API_KEY
)

# Tickers to download.
TOKENS = [
    "AAPL", "GOOG", "MSFT", "TSLA", "NVDA", "AMZN",
    "FB", "BABA", "CRM", "INTC", "AMD", "PYPL", "ATVI",
    "EA", "TTD", "MTCH", "ZG", "YELP"
]
# For a quick single-ticker run, use instead: TOKENS = ["AAPL"]

# Bar size: TIME_INTERVAL units of TIME_UNITS (here 15-minute bars).
TIME_INTERVAL = 15
TIME_UNITS = "minute"

# Requested date range (both bounds are passed to the API as YYYY-MM-DD).
FROM_DATE = pd.Timestamp("2019-02-18").date()
TO_DATE = pd.Timestamp("2022-04-04").date()

DATASET_NAME = "tech_companies"

# Output CSV locations for the full dataset and its train / test parts.
FULL_RESULT_PATH = f"prices/{DATASET_NAME}_{TIME_INTERVAL}{TIME_UNITS}.csv"
TRAIN_RESULT_PATH = f"prices/{DATASET_NAME}_{TIME_INTERVAL}{TIME_UNITS}_train.csv"
TEST_RESULT_PATH = f"prices/{DATASET_NAME}_{TIME_INTERVAL}{TIME_UNITS}_test.csv"

In [5]:
def get_url(token: str, interval_len: int, interval: str, from_date: datetime.date, to_date: datetime.date,
            base_url: "str | None" = None):
    """
    Build the request URL for one ticker and date range.

    Fills the {TICKER}, {INTERVAL}, {RANGE}, {FROM_DATE} and {TO_DATE}
    placeholders of the URL template with the given values.

    Parameters
    ----------
    token : ticker symbol substituted for {TICKER}.
    interval_len : bar-size multiplier substituted for {INTERVAL}.
    interval : bar-size unit (e.g. "minute") substituted for {RANGE}.
    from_date, to_date : range bounds; inserted via str(), so datetime.date
        values appear as YYYY-MM-DD.
    base_url : optional template to use instead of the module-level
        BASE_GOOGLE_FINANCE_URL (which, despite its name, points at
        api.polygon.io).

    Returns
    -------
    str : the template with every placeholder replaced.
    """
    template = BASE_GOOGLE_FINANCE_URL if base_url is None else base_url
    substitutions = {
        "{TICKER}": token,
        "{INTERVAL}": interval_len,
        "{RANGE}": interval,
        "{FROM_DATE}": from_date,
        "{TO_DATE}": to_date,
    }
    url = template
    for placeholder, value in substitutions.items():
        # Literal replacement: unlike re.sub, neither the placeholder nor the
        # value is interpreted (a backslash in a value stays a backslash).
        url = url.replace(placeholder, str(value))
    return url

In [6]:
def get_data(token: str, interval_len: int, interval: str, from_date: datetime.date, to_date: datetime.date,
             max_retries: int = 15):
    """
    Download all bars for one ticker by paging through the date range.

    Each request covers [current_date, to_date]. After a successful page,
    current_date moves to the day after the latest bar received, and the loop
    ends once current_date passes to_date or a page comes back without results.

    Parameters
    ----------
    token, interval_len, interval : forwarded to get_url().
    from_date, to_date : inclusive date range to download.
    max_retries : number of consecutive failed requests (API status "ERROR",
        transport error, or non-JSON body) tolerated before giving up.

    Returns
    -------
    pandas.DataFrame : the concatenated "results" records of every page with
        exact duplicate rows dropped. Contains at least the "t" column (epoch
        milliseconds); empty (only a "t" column) when nothing was downloaded.

    Raises
    ------
    RuntimeError : when more than max_retries requests fail in a row.
    """
    RETRY_WAIT_SECONDS = 20       # pause after a failed or rate-limited request
    REQUEST_TIMEOUT_SECONDS = 60  # never let a single HTTP call hang forever

    current_date = from_date
    consecutive_failures = 0
    dfs = []
    while current_date <= to_date:
        request_url = get_url(token, interval_len, interval, current_date, to_date)
        try:
            # Parse the body once and reuse it below.
            payload = requests.get(url=request_url, timeout=REQUEST_TIMEOUT_SECONDS).json()
        except (requests.RequestException, ValueError) as error:
            # Treat transport problems and non-JSON bodies like an API error.
            payload = {"status": "ERROR", "error": str(error)}

        if payload.get("status") == "ERROR":
            consecutive_failures += 1
            if consecutive_failures > max_retries:
                raise RuntimeError(
                    f"Giving up on {token} after {max_retries} consecutive failed requests: "
                    f"{payload.get('error') or payload.get('message')}"
                )
            print("Waiting for", RETRY_WAIT_SECONDS, "seconds")
            for _ in tqdm(range(RETRY_WAIT_SECONDS)):
                sleep(1)
            continue

        consecutive_failures = 0
        results = payload.get("results")
        if not results:
            # Nothing came back for [current_date, to_date]. Stop instead of
            # re-using a stale page, which would never advance current_date.
            print("No results for", token, "from", current_date, "- status:", payload.get("status"))
            break

        current_df = pd.DataFrame(results)
        # NOTE(review): resuming on the day AFTER the last bar may skip later
        # bars of that day if the API cut the page mid-day - confirm against
        # the API's `limit` / pagination behaviour.
        current_date = (pd.Timestamp(current_df["t"].max(), unit="ms") + pd.DateOffset(days=1)).date()
        print("Current date:", current_date)
        print("Get DataFrame of shape", current_df.shape)
        dfs.append(current_df)

    if not dfs:
        # pd.concat([]) would raise; hand back an empty frame with the column
        # the code above relies on.
        return pd.DataFrame(columns=["t"])
    return pd.concat(dfs).drop_duplicates()

In [7]:
# Download every ticker in TOKENS; each frame gets a parsed timestamp column
# and a "ticker" column so the frames can be concatenated later.
dfs = []
for token in TOKENS:
    data = get_data(
        token=token,
        interval_len=TIME_INTERVAL,
        interval=TIME_UNITS,
        from_date=FROM_DATE,
        to_date=TO_DATE
    )

    # "t" holds epoch milliseconds; convert the whole column at once rather
    # than building one Timestamp per row.
    data = data.assign(
        t=pd.to_datetime(data["t"], unit="ms"),
        ticker=token,
    )
    dfs.append(data)

Current date: 2020-07-15
Get DataFrame of shape (4395, 8)
Current date: 2020-10-08
Get DataFrame of shape (3809, 8)
Current date: 2021-01-01
Get DataFrame of shape (3662, 8)
Current date: 2021-03-26
Get DataFrame of shape (3616, 8)
Current date: 2021-06-24
Get DataFrame of shape (3939, 8)
Waiting for 20 seconds


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:20<00:00,  1.01s/it]


Waiting for 20 seconds


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:20<00:00,  1.01s/it]


Current date: 2021-09-24
Get DataFrame of shape (4041, 8)
Current date: 2021-12-22
Get DataFrame of shape (3909, 8)
Current date: 2022-03-22
Get DataFrame of shape (3853, 8)
Current date: 2022-04-05
Get DataFrame of shape (639, 8)
Current date: 2020-09-30
Get DataFrame of shape (4582, 8)
Waiting for 20 seconds


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:20<00:00,  1.01s/it]


Waiting for 20 seconds


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:20<00:00,  1.01s/it]


Current date: 2021-04-02
Get DataFrame of shape (4563, 8)
Current date: 2021-10-14
Get DataFrame of shape (4712, 8)
Current date: 2022-04-05
Get DataFrame of shape (4312, 8)
Current date: 2020-07-21
Get DataFrame of shape (4589, 8)
Current date: 2020-10-31
Get DataFrame of shape (4535, 8)
Waiting for 20 seconds


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:20<00:00,  1.01s/it]


Waiting for 20 seconds


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:20<00:00,  1.01s/it]


Waiting for 20 seconds


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:20<00:00,  1.01s/it]


Current date: 2021-03-02
Get DataFrame of shape (4883, 8)
Current date: 2021-07-01
Get DataFrame of shape (5126, 8)
Current date: 2021-11-02
Get DataFrame of shape (5186, 8)
Current date: 2022-02-24
Get DataFrame of shape (4824, 8)
Current date: 2022-04-05
Get DataFrame of shape (1755, 8)
Waiting for 20 seconds


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:20<00:00,  1.01s/it]


Waiting for 20 seconds


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:20<00:00,  1.01s/it]


Waiting for 20 seconds


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:20<00:00,  1.01s/it]


Current date: 2020-07-23
Get DataFrame of shape (4650, 8)
Current date: 2020-10-23
Get DataFrame of shape (3994, 8)
Current date: 2021-01-26
Get DataFrame of shape (3945, 8)
Current date: 2021-04-29
Get DataFrame of shape (4076, 8)
Current date: 2021-08-10
Get DataFrame of shape (4452, 8)
Waiting for 20 seconds


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:20<00:00,  1.01s/it]


Waiting for 20 seconds


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:20<00:00,  1.01s/it]


Current date: 2021-11-19
Get DataFrame of shape (4493, 8)
Current date: 2022-02-26
Get DataFrame of shape (4210, 8)
Current date: 2022-04-05
Get DataFrame of shape (1660, 8)
Current date: 2020-08-07
Get DataFrame of shape (4980, 8)
Current date: 2020-12-09
Get DataFrame of shape (4975, 8)
Waiting for 20 seconds


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:20<00:00,  1.01s/it]


Waiting for 20 seconds


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:20<00:00,  1.01s/it]


Waiting for 20 seconds


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:20<00:00,  1.01s/it]


Current date: 2021-04-29
Get DataFrame of shape (5275, 8)
Current date: 2021-08-27
Get DataFrame of shape (4959, 8)
Current date: 2021-12-10
Get DataFrame of shape (4505, 8)
Current date: 2022-03-12
Get DataFrame of shape (4006, 8)
Current date: 2022-04-05
Get DataFrame of shape (1020, 8)
Waiting for 20 seconds


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:20<00:00,  1.01s/it]


Waiting for 20 seconds


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:20<00:00,  1.01s/it]


Waiting for 20 seconds


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:20<00:00,  1.01s/it]


Current date: 2020-08-27
Get DataFrame of shape (5302, 8)
Current date: 2021-01-23
Get DataFrame of shape (5225, 8)
Current date: 2021-06-25
Get DataFrame of shape (5290, 8)
Current date: 2021-12-01
Get DataFrame of shape (5197, 8)
Current date: 2022-04-05
Get DataFrame of shape (4125, 8)
Waiting for 20 seconds


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:20<00:00,  1.01s/it]


Waiting for 20 seconds


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:20<00:00,  1.01s/it]


Current date: 2020-07-31
Get DataFrame of shape (4959, 8)
Current date: 2020-11-25
Get DataFrame of shape (4918, 8)
Current date: 2021-04-01
Get DataFrame of shape (5098, 8)
Current date: 2021-08-12
Get DataFrame of shape (5267, 8)
Current date: 2021-12-16
Get DataFrame of shape (5118, 8)
Waiting for 20 seconds


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:20<00:00,  1.01s/it]


Waiting for 20 seconds


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:20<00:00,  1.01s/it]


Waiting for 20 seconds


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:20<00:00,  1.01s/it]


Current date: 2022-04-05
Get DataFrame of shape (4655, 8)
Current date: 2020-07-30
Get DataFrame of shape (5029, 8)
Current date: 2020-11-20
Get DataFrame of shape (4930, 8)
Current date: 2021-03-10
Get DataFrame of shape (4543, 8)
Current date: 2021-07-08
Get DataFrame of shape (5144, 8)
Waiting for 20 seconds


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:20<00:00,  1.01s/it]


Waiting for 20 seconds


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:20<00:00,  1.01s/it]


Waiting for 20 seconds


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:20<00:00,  1.01s/it]


Current date: 2021-10-19
Get DataFrame of shape (4528, 8)
Current date: 2022-02-02
Get DataFrame of shape (4565, 8)
Current date: 2022-04-05
Get DataFrame of shape (2742, 8)
Current date: 2020-09-15
Get DataFrame of shape (4978, 8)
Current date: 2021-02-13
Get DataFrame of shape (5184, 8)
Waiting for 20 seconds


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:20<00:00,  1.01s/it]


Waiting for 20 seconds


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:20<00:00,  1.01s/it]


Current date: 2021-07-24
Get DataFrame of shape (5107, 8)
Current date: 2022-01-05
Get DataFrame of shape (5022, 8)
Current date: 2022-04-05
Get DataFrame of shape (2900, 8)
Current date: 2020-08-12
Get DataFrame of shape (5189, 8)
Current date: 2020-12-05
Get DataFrame of shape (4958, 8)
Waiting for 20 seconds


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:20<00:00,  1.01s/it]


Waiting for 20 seconds


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:20<00:00,  1.01s/it]


Waiting for 20 seconds


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:20<00:00,  1.01s/it]


Current date: 2021-04-14
Get DataFrame of shape (5120, 8)
Current date: 2021-08-20
Get DataFrame of shape (5258, 8)
Current date: 2021-12-22
Get DataFrame of shape (5163, 8)
Current date: 2022-04-05
Get DataFrame of shape (4415, 8)
Current date: 2020-07-11
Get DataFrame of shape (4270, 8)
Waiting for 20 seconds


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:20<00:00,  1.01s/it]


Waiting for 20 seconds


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:20<00:00,  1.01s/it]


Waiting for 20 seconds


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:20<00:00,  1.01s/it]


Current date: 2020-10-20
Get DataFrame of shape (4342, 8)
Current date: 2021-02-02
Get DataFrame of shape (4356, 8)
Current date: 2021-05-15
Get DataFrame of shape (4457, 8)
Current date: 2021-08-28
Get DataFrame of shape (4507, 8)
Current date: 2021-12-11
Get DataFrame of shape (4518, 8)
Waiting for 20 seconds


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:20<00:00,  1.01s/it]


Waiting for 20 seconds


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:20<00:00,  1.01s/it]


Current date: 2022-03-16
Get DataFrame of shape (4072, 8)
Current date: 2022-04-05
Get DataFrame of shape (894, 8)
Current date: 2020-09-09
Get DataFrame of shape (5384, 8)
Current date: 2021-02-04
Get DataFrame of shape (5422, 8)
Current date: 2021-06-30
Get DataFrame of shape (5360, 8)
Waiting for 20 seconds


100%|███████████████████████████████████████████████████████████████████████████████| 20/20 [2:29:29<00:00, 448.49s/it]


Current date: 2021-11-20
Get DataFrame of shape (5189, 8)
Current date: 2022-03-26
Get DataFrame of shape (5135, 8)
Current date: 2022-04-05
Get DataFrame of shape (357, 8)
Current date: 2020-09-18
Get DataFrame of shape (4977, 8)
Current date: 2021-03-11
Get DataFrame of shape (4821, 8)
Waiting for 20 seconds


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:20<00:00,  1.01s/it]


Current date: 2021-08-31
Get DataFrame of shape (4587, 8)
Current date: 2022-02-08
Get DataFrame of shape (5164, 8)
Current date: 2022-04-05
Get DataFrame of shape (1631, 8)
Current date: 2020-09-30
Get DataFrame of shape (4244, 8)
Current date: 2021-03-31
Get DataFrame of shape (4125, 8)
Waiting for 20 seconds


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:20<00:00,  1.01s/it]


Waiting for 20 seconds


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:20<00:00,  1.01s/it]


Waiting for 20 seconds


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:20<00:00,  1.01s/it]


Current date: 2021-09-29
Get DataFrame of shape (3947, 8)
Current date: 2022-03-31
Get DataFrame of shape (3967, 8)
Current date: 2022-04-05
Get DataFrame of shape (85, 8)
Current date: 2020-09-26
Get DataFrame of shape (4797, 8)
Current date: 2021-04-17
Get DataFrame of shape (4956, 8)
Waiting for 20 seconds


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:20<00:00,  1.01s/it]


Waiting for 20 seconds


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:20<00:00,  1.01s/it]


Waiting for 20 seconds


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:20<00:00,  1.01s/it]


Current date: 2021-10-01
Get DataFrame of shape (5233, 8)
Current date: 2022-03-09
Get DataFrame of shape (5299, 8)
Current date: 2022-04-05
Get DataFrame of shape (917, 8)
Current date: 2020-10-06
Get DataFrame of shape (4023, 8)
Current date: 2021-04-09
Get DataFrame of shape (4062, 8)
Waiting for 20 seconds


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:20<00:00,  1.01s/it]


Waiting for 20 seconds


100%|█████████████████████████████████████████████████████████████████████████████| 20/20 [10:18:46<00:00, 1856.31s/it]


Current date: 2021-10-08
Get DataFrame of shape (3938, 8)
Current date: 2022-04-05
Get DataFrame of shape (3852, 8)
Current date: 2020-10-29
Get DataFrame of shape (4358, 8)
Current date: 2021-06-05
Get DataFrame of shape (4666, 8)
Current date: 2022-01-04
Get DataFrame of shape (4701, 8)
Waiting for 20 seconds


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:20<00:00,  1.01s/it]


Waiting for 20 seconds


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:20<00:00,  1.01s/it]


Current date: 2022-04-05
Get DataFrame of shape (2010, 8)
Current date: 2020-10-14
Get DataFrame of shape (4268, 8)
Current date: 2021-05-04
Get DataFrame of shape (4244, 8)
Current date: 2021-12-04
Get DataFrame of shape (4265, 8)
Current date: 2022-04-05
Get DataFrame of shape (2328, 8)


In [12]:
# Short API field names -> descriptive column names.
RENAME_COLUMNS = dict(
    v="volume",
    o="open",
    c="close",
    h="high",
    l="low",
    t="date",
)
# Columns kept in the final dataset: every renamed field plus the ticker tag.
KEEP_COLUMNS = [*RENAME_COLUMNS.values(), "ticker"]

# Stack the per-ticker frames, rename, keep the selected columns and rebuild
# a clean 0..n-1 index.
stacked = pd.concat(dfs)
stacked = stacked.rename(columns=RENAME_COLUMNS)
result_df = stacked[KEEP_COLUMNS].reset_index(drop=True)
print(result_df.shape)
result_df.sample(10)

(445787, 7)


Unnamed: 0,volume,open,close,high,low,date,ticker
66309,11548.0,257.76,257.81,258.0,257.76,2021-04-23 11:30:00,MSFT
431204,15967.0,20.82,20.755,20.82,20.755,2020-04-28 18:05:00,YELP
36119,20508.0,1535.35,1532.76,1535.84,1532.0,2020-09-16 17:15:00,GOOG
442225,13143.0,39.75,39.75,39.75,39.75,2021-10-01 20:00:00,YELP
250968,1306.0,210.01,209.95,210.01,209.95,2022-03-03 13:30:00,CRM
311728,4209779.0,112.08,112.6532,112.7,111.34,2022-02-24 19:00:00,AMD
266994,474621.0,66.2085,66.3899,66.47,66.175,2021-04-07 18:30:00,INTC
416161,30683.0,58.66,58.7,58.9,58.6,2020-06-02 16:04:00,ZG
43456,17493.0,2629.8,2626.25,2630.5,2625.52,2021-07-13 14:30:00,GOOG
160264,94052.0,3499.68,3500.415,3507.2099,3499.0701,2021-09-02 14:45:00,AMZN


In [14]:
# Fraction of rows (taken from the end of result_df) written to the test file.
TEST_RATIO = 0.2
test_size = int(result_df.shape[0] * TEST_RATIO)

# Split on an explicit index: with test_size == 0 the negative-slice form
# (iloc[:-0] / iloc[-0:]) would yield an empty train set and put every row in test.
# NOTE(review): result_df is the per-ticker frames stacked in TOKENS order, so
# this positional split puts the last tickers in the test set rather than the
# most recent dates - confirm that this is intended.
split_index = result_df.shape[0] - test_size
data_train = result_df.iloc[:split_index]
data_test = result_df.iloc[split_index:]

# Make sure the target directories exist; to_csv does not create them.
for _result_path in (FULL_RESULT_PATH, TRAIN_RESULT_PATH, TEST_RESULT_PATH):
    _result_dir = os.path.dirname(_result_path)
    if _result_dir:
        os.makedirs(_result_dir, exist_ok=True)

result_df.to_csv(FULL_RESULT_PATH)
data_train.to_csv(TRAIN_RESULT_PATH)
data_test.to_csv(TEST_RESULT_PATH)