In [1]:
import pyarrow as pa
from pyarrow import csv, parquet
from pyarrow import dataset as pads
from pyarrow import compute as pac
from pathlib import Path
import pandas as pd
import os

from config import PolygonConfig

In [19]:
environ = {
    # "POLYGON_DATA_DIR": "../data/polygon",
    "POLYGON_DATA_DIR": "/Volumes/Oahu/Mirror/files.polygon.io",
    # "POLYGON_AGG_TIME": "minute",
}

config = PolygonConfig(
    environ=environ,
    calendar_name="XNYS",
    # start_session="2020-10-07",
    # end_session="2020-10-15",
    # start_session="2003-10-01",
    start_session="2016-01-05",
    # start_session="2023-01-03",
    end_session="2022-12-30",
    # end_session="2023-12-28",
    # end_session="2020-12-31",
    # end_session="2024-06-30",
)

In [12]:
test_csv_path = Path(config.aggs_dir) / "2020/10/2020-10-08.csv.gz"
print(f"{test_csv_path.exists()=}")
test_csv_path

test_csv_path.exists()=True


PosixPath('/Volumes/Oahu/Mirror/files.polygon.io/flatfiles/us_stocks_sip/day_aggs_v1/2020/10/2020-10-08.csv.gz')

The Polygon historical flatfile docs say Minute Aggregates have millisecond timestamps (like the Day Aggregates) but they actually have nanosecond timestamps (like Trades).
https://polygon.io/flat-files/stocks-min-aggs?stocks-min-aggs=documentation

I reported this to them in June 2024 and it (the docs, I don't think the resolution should change) hasn't been corrected as of September (I did ping them again).

Actually just discovered the Day Aggs are nanosecond resolution also (I had been working only on minute bars).

In [5]:
# Polygon Aggregate flatfile timestamps are in nanoseconds (like trades), not milliseconds as the docs say.
# I make the timestamp timezone-aware because that's how Unix timestamps work and it may help avoid mistakes.
timestamp_type = pa.timestamp('ns', tz='UTC')

# But we can;t use the timestamp type in the schema here because it's not supported by the CSV reader.
# So we'll use int64 and cast it later.

# strptime(3) (used by CSV reader for timestamps in ConvertOptions.timestamp_parsers) supports Unix timestamps (%s) and milliseconds (%f) but not nanoseconds.
# https://www.geeksforgeeks.org/how-to-use-strptime-with-milliseconds-in-python/
# Actually that's the wrong strptime (it's Python's).  C++ strptime(3) doesn't even support %f.
# https://github.com/apache/arrow/issues/39839#issuecomment-1915981816
# Also I don't think you can use those in a format string without a separator.
# This seems weird given (Py)Arrow's high-performance focus so maybe I'm missing something.

# Polygon price scale is 4 decimal places (i.e. hundredths of a penny), but we'll use 10 because we have precision to spare.
price_type = pa.decimal128(precision=38, scale=10)
# price_type = pa.float64()

polygon_aggs_schema = pa.schema([
    pa.field('ticker', pa.string(), nullable=False),
    pa.field('volume', pa.int64(), nullable=False),
    pa.field('open', price_type, nullable=False),
    pa.field('close', price_type, nullable=False),
    pa.field('high', price_type, nullable=False),
    pa.field('low', price_type, nullable=False),
    pa.field('window_start', pa.int64(), nullable=False),
    pa.field('transactions', pa.int64(), nullable=False),
])
polygon_aggs_schema

ticker: string not null
volume: int64 not null
open: decimal128(38, 10) not null
close: decimal128(38, 10) not null
high: decimal128(38, 10) not null
low: decimal128(38, 10) not null
window_start: int64 not null
transactions: int64 not null

In [81]:
convert_options = csv.ConvertOptions(
    column_types=polygon_aggs_schema,
    strings_can_be_null=False,
    quoted_strings_can_be_null=False,
)
table = csv.read_csv(test_csv_path, convert_options=convert_options)

table

pyarrow.Table
ticker: string
volume: int64
open: decimal128(38, 10)
close: decimal128(38, 10)
high: decimal128(38, 10)
low: decimal128(38, 10)
window_start: int64
transactions: int64
----
ticker: [["A","AA","AAA","AAAU","AACG",...,"ZVZZT","ZXIET","ZYME","ZYNE","ZYXI"]]
volume: [[761745,4872674,1677,417339,5607,...,726399,4000,143093,298434,503380]]
open: [[104.2000000000,12.4500000000,24.9000000000,18.9000000000,1.0900000000,...,10.0400000000,100.0000000000,45.2700000000,3.5600000000,19.7100000000]]
close: [[104.1600000000,12.7300000000,24.9000000000,18.8700000000,1.0900000000,...,10.0500000000,100.0000000000,45.6100000000,3.6100000000,19.1000000000]]
high: [[104.3900000000,12.7400000000,24.9050000000,18.9300000000,1.0900000000,...,10.1300000000,100.0000000000,45.9700000000,3.6895000000,19.9900000000]]
low: [[103.1300000000,12.3800000000,24.9000000000,18.7550000000,1.0700000000,...,10.0000000000,100.0000000000,44.6000000000,3.5200000000,19.0900000000]]
window_start: [[16021296000000000

In [82]:
table = table.set_column(
        table.column_names.index('window_start'),
        'window_start',
        table.column("window_start").cast(timestamp_type)
)
table

pyarrow.Table
ticker: string
volume: int64
open: decimal128(38, 10)
close: decimal128(38, 10)
high: decimal128(38, 10)
low: decimal128(38, 10)
window_start: timestamp[ns, tz=UTC]
transactions: int64
----
ticker: [["A","AA","AAA","AAAU","AACG",...,"ZVZZT","ZXIET","ZYME","ZYNE","ZYXI"]]
volume: [[761745,4872674,1677,417339,5607,...,726399,4000,143093,298434,503380]]
open: [[104.2000000000,12.4500000000,24.9000000000,18.9000000000,1.0900000000,...,10.0400000000,100.0000000000,45.2700000000,3.5600000000,19.7100000000]]
close: [[104.1600000000,12.7300000000,24.9000000000,18.8700000000,1.0900000000,...,10.0500000000,100.0000000000,45.6100000000,3.6100000000,19.1000000000]]
high: [[104.3900000000,12.7400000000,24.9050000000,18.9300000000,1.0900000000,...,10.1300000000,100.0000000000,45.9700000000,3.6895000000,19.9900000000]]
low: [[103.1300000000,12.3800000000,24.9000000000,18.7550000000,1.0700000000,...,10.0000000000,100.0000000000,44.6000000000,3.5200000000,19.0900000000]]
window_start: [[2

In [83]:
table.column("window_start").value_counts().tolist()

[{'values': Timestamp('2020-10-08 04:00:00+0000', tz='UTC'), 'counts': 8957}]

In [84]:
# To work across all reasonable filesystems, we need to escape the characters in partition keys that are treated weirdly in filenames.

def partition_key_escape(c: str) -> str:
    return ("^" + c.upper()) if c.islower() else ("%" + "%02X" % ord(c))

def to_partition_key(s: str) -> str:
    if s.isalnum() and s.isupper():
        return s
    return "".join([f"{c if (c.isupper() or c.isdigit()) else partition_key_escape(c)}" for c in s])

to_partition_key("B-Cp,L.9")

'B%2DC^P%2CL%2E9'

In [85]:
[(s, to_partition_key(s)) for s in table.column('ticker').to_pylist() if s != to_partition_key(s)]

[('ABRpA', 'ABR^PA'),
 ('ABRpB', 'ABR^PB'),
 ('ABRpC', 'ABR^PC'),
 ('ACND.U', 'ACND%2EU'),
 ('ACND.WS', 'ACND%2EWS'),
 ('AELpA', 'AEL^PA'),
 ('AELpB', 'AEL^PB'),
 ('AGM.A', 'AGM%2EA'),
 ('AGMpC', 'AGM^PC'),
 ('AGMpD', 'AGM^PD'),
 ('AGMpE', 'AGM^PE'),
 ('AGMpF', 'AGM^PF'),
 ('AGOpB', 'AGO^PB'),
 ('AGOpE', 'AGO^PE'),
 ('AGOpF', 'AGO^PF'),
 ('AHHpA', 'AHH^PA'),
 ('AHLpC', 'AHL^PC'),
 ('AHLpD', 'AHL^PD'),
 ('AHLpE', 'AHL^PE'),
 ('AHTpD', 'AHT^PD'),
 ('AHTpF', 'AHT^PF'),
 ('AHTpG', 'AHT^PG'),
 ('AHTpH', 'AHT^PH'),
 ('AHTpI', 'AHT^PI'),
 ('AIpB', 'AI^PB'),
 ('AIpC', 'AI^PC'),
 ('AIGpA', 'AIG^PA'),
 ('AIG.WS', 'AIG%2EWS'),
 ('AKO.A', 'AKO%2EA'),
 ('AKO.B', 'AKO%2EB'),
 ('ALpA', 'AL^PA'),
 ('ALINpA', 'ALIN^PA'),
 ('ALINpB', 'ALIN^PB'),
 ('ALINpE', 'ALIN^PE'),
 ('ALLpB', 'ALL^PB'),
 ('ALLpG', 'ALL^PG'),
 ('ALLpH', 'ALL^PH'),
 ('ALLpI', 'ALL^PI'),
 ('ALLYpA', 'ALLY^PA'),
 ('ALPpQ', 'ALP^PQ'),
 ('ALUS.WS', 'ALUS%2EWS'),
 ('AMBC.WS', 'AMBC%2EWS'),
 ('AMHpD', 'AMH^PD'),
 ('AMHpE', 'AMH^PE'),
 ('AMH

In [86]:
table = table.append_column('part', pa.array([to_partition_key(ticker) for ticker in table.column('ticker').to_pylist()]))
table

pyarrow.Table
ticker: string
volume: int64
open: decimal128(38, 10)
close: decimal128(38, 10)
high: decimal128(38, 10)
low: decimal128(38, 10)
window_start: timestamp[ns, tz=UTC]
transactions: int64
part: string
----
ticker: [["A","AA","AAA","AAAU","AACG",...,"ZVZZT","ZXIET","ZYME","ZYNE","ZYXI"]]
volume: [[761745,4872674,1677,417339,5607,...,726399,4000,143093,298434,503380]]
open: [[104.2000000000,12.4500000000,24.9000000000,18.9000000000,1.0900000000,...,10.0400000000,100.0000000000,45.2700000000,3.5600000000,19.7100000000]]
close: [[104.1600000000,12.7300000000,24.9000000000,18.8700000000,1.0900000000,...,10.0500000000,100.0000000000,45.6100000000,3.6100000000,19.1000000000]]
high: [[104.3900000000,12.7400000000,24.9050000000,18.9300000000,1.0900000000,...,10.1300000000,100.0000000000,45.9700000000,3.6895000000,19.9900000000]]
low: [[103.1300000000,12.3800000000,24.9000000000,18.7550000000,1.0700000000,...,10.0000000000,100.0000000000,44.6000000000,3.5200000000,19.0900000000]]
wind

In [6]:
config.by_ticker_dir

'/Volumes/Oahu/Mirror/files.polygon.io/flatfiles/us_stocks_sip/day_by_ticker_v1'

In [234]:
import os

In [87]:
# ticker_count = len(table.column("ticker").unique())
len(table.column("ticker").unique()), len(table.column("part").unique())

(8957, 8957)

In [20]:
by_ticker_base_dir = os.path.join(
    config.by_ticker_dir,
    f"{config.agg_time}_{config.start_timestamp.date().isoformat()}_{config.end_timestamp.date().isoformat()}.hive",
)
by_ticker_base_dir

'/Volumes/Oahu/Mirror/files.polygon.io/flatfiles/us_stocks_sip/day_by_ticker_v1/day_2016-01-05_2022-12-30.hive'

In [88]:
partition_by_ticker = pads.partitioning(
    pa.schema([("part", pa.string())]), flavor="hive"
)
pads.write_dataset(
    table,
    base_dir=by_ticker_base_dir,
    format="parquet",
    partitioning=partition_by_ticker,
    existing_data_behavior="overwrite_or_ignore",
    max_partitions=len(table.column("ticker").unique()),
)

In [21]:
part_dataset = pads.dataset(by_ticker_base_dir, format="parquet", partitioning="hive")
part_dataset.schema

ticker: string not null
volume: int64 not null
open: decimal128(38, 10) not null
close: decimal128(38, 10) not null
high: decimal128(38, 10) not null
low: decimal128(38, 10) not null
window_start: timestamp[ns, tz=UTC] not null
transactions: int64 not null

In [22]:
part_dataset.count_rows()

15795459

In [51]:
sorted_ds = part_dataset.sort_by([("ticker", "ascending"), ("window_start", "ascending")])
sorted_table = sorted_ds.to_table()
sorted_table

pyarrow.Table
ticker: string
volume: int64
open: decimal128(38, 10)
close: decimal128(38, 10)
high: decimal128(38, 10)
low: decimal128(38, 10)
window_start: timestamp[ns, tz=UTC]
transactions: int64
----
ticker: [["A","A","A","A","A",...,"AAN","AAN","AAN","AAN","AAN"],["AAN","AAN","AAN","AAN","AAN",...,"ABB","ABB","ABB","ABB","ABB"],...,["ZTAX","ZTAX","ZTAX","ZTAX","ZTAX",...,"ZYNE","ZYNE","ZYNE","ZYNE","ZYNE"],["ZYNE","ZYNE","ZYNE","ZYNE","ZYNE",...,"ZZZ","ZZZ","ZZZ","ZZZ","ZZZ"]]
volume: [[1174134,1325110,1114549,3133146,2029378,...,124936,318086,333095,426054,358163],[271381,211140,282364,637709,210797,...,1551985,1691483,1822405,1675097,2760531],...,[13,0,0,1,1,...,4183569,2888980,2198566,1553160,4005385],[3172048,5124566,4307965,4284297,2375614,...,28,9,48,25,65]]
open: [[58.5700000000,58.2700000000,58.8600000000,59.2000000000,57.8400000000,...,7.2100000000,7.0000000000,7.3200000000,7.7200000000,8.6500000000],[8.1200000000,8.1100000000,8.4300000000,8.4000000000,7.9600000000,...,19

In [52]:
df = sorted_table.to_pandas()
df

Unnamed: 0,ticker,volume,open,close,high,low,window_start,transactions
0,A,1174134,58.5700000000,58.3200000000,58.8600000000,58.1500000000,2014-06-16 04:00:00+00:00,9235
1,A,1325110,58.2700000000,58.8700000000,59.2000000000,58.1901000000,2014-06-17 04:00:00+00:00,9430
2,A,1114549,58.8600000000,59.2700000000,59.3200000000,58.5300000000,2014-06-18 04:00:00+00:00,9856
3,A,3133146,59.2000000000,58.6200000000,59.5400000000,58.1500000000,2014-06-19 04:00:00+00:00,18652
4,A,2029378,57.8400000000,58.7700000000,59.0550000000,57.8400000000,2014-06-20 04:00:00+00:00,11623
...,...,...,...,...,...,...,...,...
23333527,ZZZ,28,24.4578000000,24.4578000000,24.4578000000,24.4578000000,2024-08-30 04:00:00+00:00,4
23333528,ZZZ,9,23.9687000000,23.9687000000,23.9687000000,23.9687000000,2024-09-03 04:00:00+00:00,3
23333529,ZZZ,48,23.9728000000,23.9728000000,23.9728000000,23.9728000000,2024-09-04 04:00:00+00:00,11
23333530,ZZZ,25,23.6200000000,23.6200000000,23.6200000000,23.6200000000,2024-09-05 04:00:00+00:00,8


A newly listed stock with "official" day prices:

ZZZ	100	0.0100000000	0.0100000000	0.0100000000	0.0100000000	2017-06-07 04:00:00+00:00	1
ZZZ	100	0.0100000000	0.0100000000	0.0100000000	0.0100000000	2017-06-05 04:00:00+00:00	1
ZZZ	0	25.0000000000	25.0000000000	25.0000000000	25.0000000000	2017-03-20 04:00:00+00:00	0
ZZZ	100	0.0100000000	0.0100000000	0.0100000000	0.0100000000	2016-08-26 04:00:00+00:00	1
ZZZ	500	1.0000000000	1.0000000000	1.0000000000	1.0000000000	2015-07-17 04:00:00+00:00	2

In [23]:
part_table = part_dataset.to_table()
part_table

pyarrow.Table
ticker: string not null
volume: int64 not null
open: decimal128(38, 10) not null
close: decimal128(38, 10) not null
high: decimal128(38, 10) not null
low: decimal128(38, 10) not null
window_start: timestamp[ns, tz=UTC] not null
transactions: int64 not null
----
ticker: [["A","AA","AApB","AAAP","AAC",...,"ZUMZ","ZVZZT","ZX","ZXZZT","ZYNE"],["A","AA","AApB","AAAP","AAC",...,"ZUMZ","ZVZZT","ZX","ZXZZT","ZYNE"],...,["A","AA","AAA","AAAU","AAC",...,"ZXIET","ZXZZT","ZYME","ZYNE","ZYXI"],["A","AA","AAA","AAAU","AAC",...,"ZXIET","ZXZZT","ZYME","ZYNE","ZYXI"]]
volume: [[2583939,39316674,98052,18270,104795,...,350089,1843000,200,1310,40915],[2102412,55199288,184287,11958,126818,...,448660,1784933,2108,15129,55898],...,[854050,4717272,105,270113,113934,...,2000,92220,457755,261901,159746],[693790,3594647,307,138431,213355,...,2000,1950,362492,470734,235576]]
open: [[40.7300000000,9.7200000000,33.1200000000,31.0000000000,18.5800000000,...,14.9800000000,8.5500000000,0.7700000000,93.37

In [24]:
tickers = part_table.column("ticker").to_pylist()
tickers

['A',
 'AA',
 'AApB',
 'AAAP',
 'AAC',
 'AADR',
 'AAL',
 'AAMC',
 'AAME',
 'AAN',
 'AAOI',
 'AAON',
 'AAP',
 'AAPL',
 'AAT',
 'AAU',
 'AAV',
 'AAVL',
 'AAWW',
 'AAXJ',
 'AB',
 'ABAC',
 'ABAX',
 'ABB',
 'ABBV',
 'ABC',
 'ABCB',
 'ABCD',
 'ABCO',
 'ABCW',
 'ABDC',
 'ABE',
 'ABEO',
 'ABEV',
 'ABG',
 'ABGB',
 'ABIL',
 'ABILW',
 'ABIO',
 'ABM',
 'ABMD',
 'ABR',
 'ABRpA',
 'ABRpB',
 'ABRpC',
 'ABRN',
 'ABT',
 'ABTL',
 'ABTX',
 'ABUS',
 'ABX',
 'ABY',
 'AC',
 'ACAD',
 'ACAS',
 'ACAT',
 'ACBI',
 'ACC',
 'ACCO',
 'ACE',
 'ACET',
 'ACFC',
 'ACG',
 'ACGL',
 'ACH',
 'ACHC',
 'ACHN',
 'ACI',
 'ACIM',
 'ACIW',
 'ACLS',
 'ACM',
 'ACN',
 'ACNB',
 'ACOR',
 'ACP',
 'ACPW',
 'ACRE',
 'ACRS',
 'ACRX',
 'ACSF',
 'ACST',
 'ACTA',
 'ACTG',
 'ACTS',
 'ACU',
 'ACUR',
 'ACV',
 'ACW',
 'ACWF',
 'ACWI',
 'ACWV',
 'ACWX',
 'ACXM',
 'ACY',
 'ADAP',
 'ADAT',
 'ADBE',
 'ADC',
 'ADGE',
 'ADHD',
 'ADI',
 'ADK',
 'ADKpA',
 'ADM',
 'ADMA',
 'ADMP',
 'ADMS',
 'ADP',
 'ADPT',
 'ADRD',
 'ADRE',
 'ADRO',
 'ADRU',
 'ADS',
 'A

In [25]:
tickers = sorted(set(tickers))

In [26]:
[ticker for ticker in tickers if ticker.upper().startswith("BCP")]

['BCPC', 'BCpA', 'BCpB', 'BCpC']

In [28]:
'BCPC'.isupper()

True

In [31]:
def symbol_to_upper(s: str) -> str:
    if s.isupper():
        return s
    return "".join(map(lambda c: ("^" + c.upper()) if c.islower() else c, s))

for ticker in [ticker for ticker in tickers if ticker.upper().startswith("BCP")]:
    print(f"{ticker=}, {symbol_to_upper(ticker)=}")

ticker='BCPC', symbol_to_upper(ticker)='BCPC'
ticker='BCpA', symbol_to_upper(ticker)='BC^PA'
ticker='BCpB', symbol_to_upper(ticker)='BC^PB'
ticker='BCpC', symbol_to_upper(ticker)='BC^PC'


: 

In [9]:
symbols = sorted(list(set(part_table.column("ticker").to_pylist())))
symbols

['A',
 'AA',
 'AAA',
 'AAAU',
 'AAC',
 'AAC.U',
 'AAC.WS',
 'AACG',
 'AACI',
 'AACIU',
 'AACIW',
 'AACT',
 'AACT.U',
 'AACT.WS',
 'AADI',
 'AADR',
 'AAGR',
 'AAGRW',
 'AAIC',
 'AAICpB',
 'AAICpC',
 'AAIN',
 'AAL',
 'AAMC',
 'AAME',
 'AAMpA',
 'AAMpB',
 'AAN',
 'AAOI',
 'AAON',
 'AAP',
 'AAPB',
 'AAPD',
 'AAPL',
 'AAPU',
 'AAPY',
 'AAT',
 'AAU',
 'AAWW',
 'AAXJ',
 'AB',
 'ABAT',
 'ABB',
 'ABBV',
 'ABC',
 'ABCB',
 'ABCL',
 'ABCM',
 'ABCS',
 'ABEO',
 'ABEQ',
 'ABEV',
 'ABG',
 'ABGI',
 'ABIO',
 'ABL',
 'ABLLL',
 'ABLLW',
 'ABLV',
 'ABLVW',
 'ABM',
 'ABNB',
 'ABOS',
 'ABR',
 'ABRpD',
 'ABRpE',
 'ABRpF',
 'ABSI',
 'ABST',
 'ABT',
 'ABTS',
 'ABUS',
 'ABVC',
 'ABVX',
 'AC',
 'ACA',
 'ACAB',
 'ACABU',
 'ACABW',
 'ACAC',
 'ACACU',
 'ACACW',
 'ACAD',
 'ACAH',
 'ACAHU',
 'ACAHW',
 'ACAQ',
 'ACAQ.U',
 'ACAQ.WS',
 'ACAX',
 'ACAXR',
 'ACAXU',
 'ACAXW',
 'ACB',
 'ACBA',
 'ACBAU',
 'ACBAW',
 'ACCD',
 'ACCO',
 'ACDC',
 'ACDCW',
 'ACDI',
 'ACDI.U',
 'ACEL',
 'ACER',
 'ACES',
 'ACET',
 'ACGL',
 'ACGLN',
 

In [93]:
# part_table.combine_chunks().equals(hive_table.combine_chunks())
part_table.sort_by("part").equals(table.sort_by("part"))

False

In [20]:
part_table.column("window_start").value_counts().tolist()

[{'values': Timestamp('2014-06-16 04:00:00+0000', tz='UTC'), 'counts': 7635},
 {'values': Timestamp('2014-06-17 04:00:00+0000', tz='UTC'), 'counts': 7664},
 {'values': Timestamp('2014-06-18 04:00:00+0000', tz='UTC'), 'counts': 7690},
 {'values': Timestamp('2014-06-19 04:00:00+0000', tz='UTC'), 'counts': 7662},
 {'values': Timestamp('2014-06-20 04:00:00+0000', tz='UTC'), 'counts': 7717},
 {'values': Timestamp('2014-06-23 04:00:00+0000', tz='UTC'), 'counts': 7734},
 {'values': Timestamp('2014-06-24 04:00:00+0000', tz='UTC'), 'counts': 7733},
 {'values': Timestamp('2014-06-25 04:00:00+0000', tz='UTC'), 'counts': 7714},
 {'values': Timestamp('2014-06-26 04:00:00+0000', tz='UTC'), 'counts': 7676},
 {'values': Timestamp('2014-06-27 04:00:00+0000', tz='UTC'), 'counts': 7691},
 {'values': Timestamp('2014-06-30 04:00:00+0000', tz='UTC'), 'counts': 7748},
 {'values': Timestamp('2014-07-01 04:00:00+0000', tz='UTC'), 'counts': 7736},
 {'values': Timestamp('2014-07-02 04:00:00+0000', tz='UTC'), 'co

In [38]:
from exchange_calendars.calendar_helpers import Date, parse_date
from zipline.utils.calendar_utils import get_calendar

In [46]:
end_timestamp = parse_date("2023-12-28", calendar="XYNS", raise_oob=False)
expr = (pac.field("window_start") >= pa.scalar(config.start_timestamp, type=timestamp_type)) & (pac.field("window_start") < pa.scalar(end_timestamp + pd.to_timedelta(1, unit="day"), type=timestamp_type))
expr

<pyarrow.compute.Expression ((window_start >= 2023-01-01 00:00:00.000000000Z) and (window_start < 2023-12-29 00:00:00.000000000Z))>

In [35]:
expr = (
    pac.field("window_start") >= pa.scalar(config.start_timestamp, type=timestamp_type)
) & (
    pac.field("window_start")
    < pa.scalar(
        config.end_timestamp + pd.to_timedelta(1, unit="day"), type=timestamp_type
    )
)
# expr = (pac.field("window_start") < pa.scalar(config.end_timestamp, type=timestamp_type))
expr

<pyarrow.compute.Expression ((window_start >= 2023-01-01 00:00:00.000000000Z) and (window_start < 2023-12-31 00:00:00.000000000Z))>

In [47]:
# filtered_table = part_table.filter(pac.field("window_start") >= pa.scalar(config.start_timestamp, type=timestamp_type)).filter(pac.field("window_start") < pa.scalar(config.end_timestamp, type=timestamp_type))
filtered_table = part_table.filter(expr)
filtered_table

pyarrow.Table
ticker: string not null
volume: int64 not null
open: decimal128(38, 10) not null
close: decimal128(38, 10) not null
high: decimal128(38, 10) not null
low: decimal128(38, 10) not null
window_start: timestamp[ns, tz=UTC] not null
transactions: int64 not null
----
ticker: [[],[],...,[],[]]
volume: [[],[],...,[],[]]
open: [[],[],...,[],[]]
close: [[],[],...,[],[]]
high: [[],[],...,[],[]]
low: [[],[],...,[],[]]
window_start: [[],[],...,[],[]]
transactions: [[],[],...,[],[]]

In [48]:
filtered_table.column("window_start").value_counts().tolist()

[{'values': Timestamp('2023-01-03 05:00:00+0000', tz='UTC'), 'counts': 10970},
 {'values': Timestamp('2023-01-04 05:00:00+0000', tz='UTC'), 'counts': 10927},
 {'values': Timestamp('2023-01-05 05:00:00+0000', tz='UTC'), 'counts': 10954},
 {'values': Timestamp('2023-01-06 05:00:00+0000', tz='UTC'), 'counts': 11065},
 {'values': Timestamp('2023-01-09 05:00:00+0000', tz='UTC'), 'counts': 10953},
 {'values': Timestamp('2023-01-10 05:00:00+0000', tz='UTC'), 'counts': 10892},
 {'values': Timestamp('2023-01-11 05:00:00+0000', tz='UTC'), 'counts': 10979},
 {'values': Timestamp('2023-01-12 05:00:00+0000', tz='UTC'), 'counts': 10919},
 {'values': Timestamp('2023-01-13 05:00:00+0000', tz='UTC'), 'counts': 10957},
 {'values': Timestamp('2023-01-17 05:00:00+0000', tz='UTC'), 'counts': 10992},
 {'values': Timestamp('2023-01-18 05:00:00+0000', tz='UTC'), 'counts': 10968},
 {'values': Timestamp('2023-01-19 05:00:00+0000', tz='UTC'), 'counts': 10903},
 {'values': Timestamp('2023-01-20 05:00:00+0000', tz

In [12]:
table = part_table.set_column(
        part_table.column_names.index('window_start'),
        'window_start',
        part_table.column("window_start").cast(timestamp_type)
)
table

pyarrow.Table
ticker: string not null
volume: int64 not null
open: decimal128(38, 10) not null
close: decimal128(38, 10) not null
high: decimal128(38, 10) not null
low: decimal128(38, 10) not null
window_start: timestamp[ns, tz=UTC]
transactions: int64 not null
----
ticker: [["A","AA","AAp","AAL","AAMC",...,"ZU","ZUMZ","ZVZZT","ZX","ZXZZT"],["A","AA","AAp","AADR","AAL",...,"ZU","ZUMZ","ZVZZT","ZX","ZXZZT"],...,["A","AA","AAA","AAAU","AACG",...,"ZXIET","ZXZZT","ZYME","ZYXI","ZZZ"],["A","AA","AAA","AAAU","AACG",...,"ZWS","ZXIET","ZYME","ZYXI","ZZZ"]]
volume: [[1174134,7463200,401,13634011,8130,...,874387,311355,2923014,7643,9039],[1325110,7292444,2359,1072,12527641,...,863136,391885,1256456,13108,15201],...,[1151440,3001485,5234,2377775,5003,...,2000,12267,356671,89566,25],[1269360,6625851,2266,2628083,8170,...,1317012,2000,339156,81055,65]]
open: [[58.5700000000,14.4900000000,87.0000000000,39.6800000000,1099.9800000000,...,38.5200000000,28.2300000000,18.0000000000,2.2600000000,3.0500000

In [13]:
table.column("window_start").value_counts().tolist()

[{'values': Timestamp('2014-06-16 04:00:00+0000', tz='UTC'), 'counts': 7635},
 {'values': Timestamp('2014-06-17 04:00:00+0000', tz='UTC'), 'counts': 7664},
 {'values': Timestamp('2014-06-18 04:00:00+0000', tz='UTC'), 'counts': 7690},
 {'values': Timestamp('2014-06-19 04:00:00+0000', tz='UTC'), 'counts': 7662},
 {'values': Timestamp('2014-06-20 04:00:00+0000', tz='UTC'), 'counts': 7717},
 {'values': Timestamp('2014-06-23 04:00:00+0000', tz='UTC'), 'counts': 7734},
 {'values': Timestamp('2014-06-24 04:00:00+0000', tz='UTC'), 'counts': 7733},
 {'values': Timestamp('2014-06-25 04:00:00+0000', tz='UTC'), 'counts': 7714},
 {'values': Timestamp('2014-06-26 04:00:00+0000', tz='UTC'), 'counts': 7676},
 {'values': Timestamp('2014-06-27 04:00:00+0000', tz='UTC'), 'counts': 7691},
 {'values': Timestamp('2014-06-30 04:00:00+0000', tz='UTC'), 'counts': 7748},
 {'values': Timestamp('2014-07-01 04:00:00+0000', tz='UTC'), 'counts': 7736},
 {'values': Timestamp('2014-07-02 04:00:00+0000', tz='UTC'), 'co

In [38]:
table = table.sort_by([("ticker", "ascending"), ("window_start", "descending")])
table

NameError: name 'table' is not defined

In [47]:
df = part_table.to_pandas()
df

Unnamed: 0,ticker,volume,open,close,high,low,window_start,transactions
0,A,1174134,58.5700000000,58.3200000000,58.8600000000,58.1500000000,2014-06-16 04:00:00+00:00,9235
1,AA,7463200,14.4900000000,14.3500000000,14.5200000000,14.3200000000,2014-06-16 04:00:00+00:00,25250
2,AAp,401,87.0000000000,87.0000000000,87.0000000000,87.0000000000,2014-06-16 04:00:00+00:00,5
3,AAL,13634011,39.6800000000,41.0600000000,41.2900000000,39.6700000000,2014-06-16 04:00:00+00:00,67792
4,AAMC,8130,1099.9800000000,1094.0000000000,1106.4750000000,1079.0500000000,2014-06-16 04:00:00+00:00,250
...,...,...,...,...,...,...,...,...
23333527,ZWS,1317012,30.4600000000,29.7100000000,30.6900000000,29.6300000000,2024-09-06 04:00:00+00:00,18083
23333528,ZXIET,2000,100.0000000000,100.0000000000,100.0000000000,100.0000000000,2024-09-06 04:00:00+00:00,1
23333529,ZYME,339156,11.7800000000,11.5600000000,11.9500000000,11.3500000000,2024-09-06 04:00:00+00:00,5711
23333530,ZYXI,81055,7.8700000000,7.8400000000,7.8900000000,7.6830000000,2024-09-06 04:00:00+00:00,1429


In [53]:
by_ticker_df = df.set_index("ticker")
# by_ticker_df.sort_values("window_start", inplace=True)
by_ticker_df.loc["AAPL"]

Unnamed: 0_level_0,volume,open,close,high,low,window_start,transactions
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AAPL,35561270,91.5100000000,92.2000000000,92.7500000000,91.4500000000,2014-06-16 04:00:00+00:00,190980
AAPL,29519647,92.3100000000,92.0800000000,92.7000000000,91.8000000000,2014-06-17 04:00:00+00:00,164357
AAPL,33514073,92.2700000000,92.1800000000,92.2900000000,91.3500000000,2014-06-18 04:00:00+00:00,179630
AAPL,35522851,92.2900000000,91.8600000000,92.3000000000,91.3389000000,2014-06-19 04:00:00+00:00,184265
AAPL,100881742,91.8500000000,90.9100000000,92.5500000000,90.9000000000,2014-06-20 04:00:00+00:00,214807
...,...,...,...,...,...,...,...
AAPL,50340308,230.1900000000,229.0000000000,230.4000000000,227.4800000000,2024-08-30 04:00:00+00:00,594026
AAPL,49286866,228.5500000000,222.7700000000,229.0000000000,221.1700000000,2024-09-03 04:00:00+00:00,813544
AAPL,42699700,221.6600000000,220.8500000000,221.7800000000,217.4800000000,2024-09-04 04:00:00+00:00,679903
AAPL,34772036,221.6250000000,222.3800000000,225.4800000000,221.5200000000,2024-09-05 04:00:00+00:00,587991


In [54]:
by_ticker_df.loc["ZZZ"]

Unnamed: 0_level_0,volume,open,close,high,low,window_start,transactions
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ZZZ,500,1.0000000000,1.0000000000,1.0000000000,1.0000000000,2015-07-17 04:00:00+00:00,2
ZZZ,100,0.0100000000,0.0100000000,0.0100000000,0.0100000000,2016-08-26 04:00:00+00:00,1
ZZZ,0,25.0000000000,25.0000000000,25.0000000000,25.0000000000,2017-03-20 04:00:00+00:00,0
ZZZ,100,0.0100000000,0.0100000000,0.0100000000,0.0100000000,2017-06-05 04:00:00+00:00,1
ZZZ,100,0.0100000000,0.0100000000,0.0100000000,0.0100000000,2017-06-07 04:00:00+00:00,1
...,...,...,...,...,...,...,...
ZZZ,28,24.4578000000,24.4578000000,24.4578000000,24.4578000000,2024-08-30 04:00:00+00:00,4
ZZZ,9,23.9687000000,23.9687000000,23.9687000000,23.9687000000,2024-09-03 04:00:00+00:00,3
ZZZ,48,23.9728000000,23.9728000000,23.9728000000,23.9728000000,2024-09-04 04:00:00+00:00,11
ZZZ,25,23.6200000000,23.6200000000,23.6200000000,23.6200000000,2024-09-05 04:00:00+00:00,8


In [37]:
by_ticker_df.loc["ZZZ"]

Unnamed: 0_level_0,volume,open,close,high,low,window_start,transactions
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ZZZ,500,1.0000000000,1.0000000000,1.0000000000,1.0000000000,2015-07-17 04:00:00+00:00,2
ZZZ,100,0.0100000000,0.0100000000,0.0100000000,0.0100000000,2016-08-26 04:00:00+00:00,1
ZZZ,0,25.0000000000,25.0000000000,25.0000000000,25.0000000000,2017-03-20 04:00:00+00:00,0
ZZZ,100,0.0100000000,0.0100000000,0.0100000000,0.0100000000,2017-06-05 04:00:00+00:00,1
ZZZ,100,0.0100000000,0.0100000000,0.0100000000,0.0100000000,2017-06-07 04:00:00+00:00,1
...,...,...,...,...,...,...,...
ZZZ,28,24.4578000000,24.4578000000,24.4578000000,24.4578000000,2024-08-30 04:00:00+00:00,4
ZZZ,9,23.9687000000,23.9687000000,23.9687000000,23.9687000000,2024-09-03 04:00:00+00:00,3
ZZZ,48,23.9728000000,23.9728000000,23.9728000000,23.9728000000,2024-09-04 04:00:00+00:00,11
ZZZ,25,23.6200000000,23.6200000000,23.6200000000,23.6200000000,2024-09-05 04:00:00+00:00,8


In [243]:
one_chunk_table = part_table.combine_chunks()
one_chunk_table

pyarrow.Table
volume: int64
open: double
close: double
high: double
low: double
window_start: timestamp[ns, tz=UTC]
transactions: int64
ticker: string
----
volume: [[1322141,3921229,3774,449029,6447,...,1100960,4000,225817,284131,469532]]
open: [[102.47,11.96,24.9017,18.8,1.115,...,10.05,100,43.99,3.51,19.21]]
close: [[103.88,12.24,24.9,18.8,1.1,...,10,100,45.06,3.57,19.38]]
high: [[104.75,12.3,24.91,18.82,1.115,...,10.05,100,45.68,3.6162,19.73]]
low: [[102.42,11.96,24.9,18.73,1.095,...,10,100,43.74,3.4866,19.02]]
window_start: [[2020-10-07 04:00:00.000000000Z,2020-10-07 04:00:00.000000000Z,2020-10-07 04:00:00.000000000Z,2020-10-07 04:00:00.000000000Z,2020-10-07 04:00:00.000000000Z,...,2020-10-07 04:00:00.000000000Z,2020-10-07 04:00:00.000000000Z,2020-10-07 04:00:00.000000000Z,2020-10-07 04:00:00.000000000Z,2020-10-07 04:00:00.000000000Z]]
transactions: [[14803,22909,28,596,89,...,10831,2,2475,1919,4861]]
ticker: [["A","AA","AAA","AAAU","AACG",...,"ZVZZT","ZXIET","ZYME","ZYNE","ZYXI"]]

In [244]:
table.combine_chunks()

pyarrow.Table
ticker: string
volume: int64
open: double
close: double
high: double
low: double
window_start: timestamp[ns, tz=UTC]
transactions: int64
----
ticker: [["A","AA","AAA","AAAU","AACG",...,"ZVZZT","ZXIET","ZYME","ZYNE","ZYXI"]]
volume: [[1322141,3921229,3774,449029,6447,...,1100960,4000,225817,284131,469532]]
open: [[102.47,11.96,24.9017,18.8,1.115,...,10.05,100,43.99,3.51,19.21]]
close: [[103.88,12.24,24.9,18.8,1.1,...,10,100,45.06,3.57,19.38]]
high: [[104.75,12.3,24.91,18.82,1.115,...,10.05,100,45.68,3.6162,19.73]]
low: [[102.42,11.96,24.9,18.73,1.095,...,10,100,43.74,3.4866,19.02]]
window_start: [[2020-10-07 04:00:00.000000000Z,2020-10-07 04:00:00.000000000Z,2020-10-07 04:00:00.000000000Z,2020-10-07 04:00:00.000000000Z,2020-10-07 04:00:00.000000000Z,...,2020-10-07 04:00:00.000000000Z,2020-10-07 04:00:00.000000000Z,2020-10-07 04:00:00.000000000Z,2020-10-07 04:00:00.000000000Z,2020-10-07 04:00:00.000000000Z]]
transactions: [[14803,22909,28,596,89,...,10831,2,2475,1919,4861]]

MacOS apparently doesn't use ASCII/UTF8 collation order for sorting file names, or maybe it is something in Arrow datasets?
Also MacOS filenames are case insensitive by default so we're missing one of pairs like 'BCPC' and 'BCpC'.

In any case, when we read a partitioned dataset the parts will not have the same order as we get by sorting part or ticker.

In [50]:
# l1 = sorted(hive_table.column("ticker").to_pylist())
# l2 = sorted(part_table.column("ticker").to_pylist())
# l1 = hive_table.column("ticker").to_pylist()
# l2 = part_table.column("ticker").to_pylist()
# l1 = table.sort_by("part").column("ticker").to_pylist()
# l2 = part_table.sort_by("part").column("ticker").to_pylist()
l1 = table.sort_by("ticker").column("ticker").to_pylist()
l2 = part_table.sort_by("ticker").column("ticker").to_pylist()
print(f"{len(l1)=} {len(l2)=}")
print(f"{len(l1)==len(l2)=}")
mismatches = [(i, l1[i], l2[i]) for i in range(min(len(l1), len(l2))) if l1[i] != l2[i]]
mismatches

len(l1)=8926 len(l2)=8926
len(l1)==len(l2)=True


[]

In [51]:
missing_tickers = set(l1) - set(l2)
missing_tickers

set()

In [53]:
df = table.to_pandas()
df[df['ticker'].isin(missing_tickers)]

Unnamed: 0,ticker,volume,open,close,high,low,window_start,transactions,part


In [54]:
'BCpC' in df['ticker'].unique(), 'BCPC' in df['ticker'].unique()

(True, True)

In [55]:
df[df['ticker'].apply(lambda t: t.lower in set(x.lower for x in missing_tickers))]

Unnamed: 0,ticker,volume,open,close,high,low,window_start,transactions,part


In [56]:
df2 = part_table.to_pandas()
df2[df2['ticker'].isin(missing_tickers)]

Unnamed: 0,ticker,volume,open,close,high,low,window_start,transactions,part


In [57]:
'BCpC' in df2['ticker'].unique(), 'BCPC' in df2['ticker'].unique()

(True, True)

In [58]:
sorted_indicies = pyarrow.compute.sort_indices(table, sort_keys=[("ticker", "ascending"), ("window_start", "ascending")])
sorted_indicies

NameError: name 'pyarrow' is not defined

In [207]:
deltas = pyarrow.compute.pairwise_diff(sorted_indicies)
deltas

<pyarrow.lib.UInt64Array object at 0x35fb67ca0>
[
  null,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  ...
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1
]

In [229]:
deltas.drop_null().value_counts().tolist()

[{'values': 1, 'counts': 1420550},
 {'values': 30, 'counts': 4},
 {'values': 18446744073709551571, 'counts': 1},
 {'values': 18, 'counts': 2},
 {'values': 3, 'counts': 1},
 {'values': 47, 'counts': 1},
 {'values': 18446744073709551474, 'counts': 1},
 {'values': 98, 'counts': 1},
 {'values': 18446744073709547632, 'counts': 1},
 {'values': 3984, 'counts': 1},
 {'values': 46, 'counts': 4},
 {'values': 277, 'counts': 1},
 {'values': 18446744073709549836, 'counts': 1},
 {'values': 1506, 'counts': 1},
 {'values': 29, 'counts': 1},
 {'values': 18446744073709551537, 'counts': 1},
 {'values': 53, 'counts': 3},
 {'values': 18446744073709542097, 'counts': 1},
 {'values': 9476, 'counts': 1},
 {'values': 90, 'counts': 1},
 {'values': 18446744073709551525, 'counts': 1},
 {'values': 4, 'counts': 8},
 {'values': 60, 'counts': 1},
 {'values': 18446744073709551280, 'counts': 1},
 {'values': 279, 'counts': 1},
 {'values': 181, 'counts': 1},
 {'values': 18446744073709550846, 'counts': 1},
 {'values': 592,

In [161]:
sorted_table

pyarrow.Table
ticker: string
volume: int64
open: double
close: double
high: double
low: double
window_start: timestamp[ns, tz=UTC]
transactions: int64
----
ticker: [["A","A","A","A","A",...,"ZYXI","ZYXI","ZYXI","ZYXI","ZYXI"]]
volume: [[714,9178,177,529,3206,...,4158,2192,4779,8271,6232]]
open: [[101.6537,102.47,102.49,102.925,102.88,...,19.44,19.445,19.45,19.4196,19.38]]
close: [[101.6537,102.44,102.49,102.98,103,...,19.445,19.45,19.42,19.41,19.38]]
high: [[101.6537,102.47,102.49,102.98,103,...,19.445,19.45,19.45,19.42,19.38]]
low: [[101.6537,102.42,102.49,102.92,102.84,...,19.43,19.445,19.42,19.38,19.38]]
window_start: [[2020-10-07 13:09:00.000000000Z,2020-10-07 13:30:00.000000000Z,2020-10-07 13:31:00.000000000Z,2020-10-07 13:32:00.000000000Z,2020-10-07 13:33:00.000000000Z,...,2020-10-07 19:56:00.000000000Z,2020-10-07 19:57:00.000000000Z,2020-10-07 19:58:00.000000000Z,2020-10-07 19:59:00.000000000Z,2020-10-07 20:00:00.000000000Z]]
transactions: [[1,16,8,12,67,...,55,48,114,104,12]]

In [149]:
sorted_table.shape

(1420803, 8)

In [193]:
unchunked_table.column("window_start")

<pyarrow.lib.ChunkedArray object at 0x35fa40860>
[
  [
    2020-10-07 13:09:00.000000000Z,
    2020-10-07 13:30:00.000000000Z,
    2020-10-07 13:31:00.000000000Z,
    2020-10-07 13:32:00.000000000Z,
    2020-10-07 13:33:00.000000000Z,
    ...
    2020-10-07 19:56:00.000000000Z,
    2020-10-07 19:57:00.000000000Z,
    2020-10-07 19:58:00.000000000Z,
    2020-10-07 19:59:00.000000000Z,
    2020-10-07 20:00:00.000000000Z
  ]
]

In [195]:
unchunked_table.column("window_start") == sorted_table.column("window_start")

True

In [200]:
unchunked_table.column("window_start").to_array().diff(sorted_table.column("window_start").to_array())

AttributeError: 'pyarrow.lib.ChunkedArray' object has no attribute 'to_array'

In [203]:
import pyarrow

In [61]:
pa.compute.pairwise_diff(table.column("window_start").to_pylist())

<pyarrow.lib.DurationArray object at 0x14ee0ebc0>
[
  null,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  ...
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0
]

In [63]:
df.reset_index(inplace=True)
df.index

RangeIndex(start=0, stop=8926, step=1)

In [64]:
df1 = df.sort_values(by=["ticker", "window_start"])

In [65]:
df.index = list(df.index)

In [66]:
df.index

Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
       ...
       8916, 8917, 8918, 8919, 8920, 8921, 8922, 8923, 8924, 8925],
      dtype='int64', length=8926)

In [67]:
diff_df = df.compare(df1)
print(f"{len(diff_df)=}")
diff_df

ValueError: Can only compare identically-labeled (both index and columns) DataFrame objects

In [68]:
df.index

Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
       ...
       8916, 8917, 8918, 8919, 8920, 8921, 8922, 8923, 8924, 8925],
      dtype='int64', length=8926)

In [69]:
df1.index

Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
       ...
       8916, 8917, 8918, 8919, 8920, 8921, 8922, 8923, 8924, 8925],
      dtype='int64', length=8926)

In [71]:
df['window_start']

0      2020-10-07 04:00:00+00:00
1      2020-10-07 04:00:00+00:00
2      2020-10-07 04:00:00+00:00
3      2020-10-07 04:00:00+00:00
4      2020-10-07 04:00:00+00:00
                  ...           
8921   2020-10-07 04:00:00+00:00
8922   2020-10-07 04:00:00+00:00
8923   2020-10-07 04:00:00+00:00
8924   2020-10-07 04:00:00+00:00
8925   2020-10-07 04:00:00+00:00
Name: window_start, Length: 8926, dtype: datetime64[ns, UTC]

In [72]:
df1['window_start']

0      2020-10-07 04:00:00+00:00
1      2020-10-07 04:00:00+00:00
2      2020-10-07 04:00:00+00:00
3      2020-10-07 04:00:00+00:00
4      2020-10-07 04:00:00+00:00
                  ...           
8921   2020-10-07 04:00:00+00:00
8922   2020-10-07 04:00:00+00:00
8923   2020-10-07 04:00:00+00:00
8924   2020-10-07 04:00:00+00:00
8925   2020-10-07 04:00:00+00:00
Name: window_start, Length: 8926, dtype: datetime64[ns, UTC]

In [73]:
df['window_start'].compare(df1['window_start'])

ValueError: Can only compare identically-labeled Series objects

In [74]:
df['ticker']

0           A
1          AA
2         AAA
3        AAAU
4        AACG
        ...  
8921    ZVZZT
8922    ZXIET
8923     ZYME
8924     ZYNE
8925     ZYXI
Name: ticker, Length: 8926, dtype: object

In [75]:
df1['ticker']

0           A
1          AA
2         AAA
3        AAAU
4        AACG
        ...  
8921    ZVZZT
8922    ZXIET
8923     ZYME
8924     ZYNE
8925     ZYXI
Name: ticker, Length: 8926, dtype: object

In [76]:
df1['ticker'].compare(df1['ticker'])

Unnamed: 0,self,other


In [77]:
df['ticker'].equals(df['ticker'])

True

In [78]:
df.info()
df

<class 'pandas.core.frame.DataFrame'>
Index: 8926 entries, 0 to 8925
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype              
---  ------        --------------  -----              
 0   index         8926 non-null   int64              
 1   ticker        8926 non-null   object             
 2   volume        8926 non-null   int64              
 3   open          8926 non-null   object             
 4   close         8926 non-null   object             
 5   high          8926 non-null   object             
 6   low           8926 non-null   object             
 7   window_start  8926 non-null   datetime64[ns, UTC]
 8   transactions  8926 non-null   int64              
 9   part          8926 non-null   object             
dtypes: datetime64[ns, UTC](1), int64(3), object(6)
memory usage: 767.1+ KB


Unnamed: 0,index,ticker,volume,open,close,high,low,window_start,transactions,part
0,0,A,1322141,102.4700000000,103.8800000000,104.7500000000,102.4200000000,2020-10-07 04:00:00+00:00,14803,A
1,1,AA,3921229,11.9600000000,12.2400000000,12.3000000000,11.9600000000,2020-10-07 04:00:00+00:00,22909,AA
2,2,AAA,3774,24.9017000000,24.9000000000,24.9100000000,24.9000000000,2020-10-07 04:00:00+00:00,28,AAA
3,3,AAAU,449029,18.8000000000,18.8000000000,18.8200000000,18.7300000000,2020-10-07 04:00:00+00:00,596,AAAU
4,4,AACG,6447,1.1150000000,1.1000000000,1.1150000000,1.0950000000,2020-10-07 04:00:00+00:00,89,AACG
...,...,...,...,...,...,...,...,...,...,...
8921,8921,ZVZZT,1100960,10.0500000000,10.0000000000,10.0500000000,10.0000000000,2020-10-07 04:00:00+00:00,10831,ZVZZT
8922,8922,ZXIET,4000,100.0000000000,100.0000000000,100.0000000000,100.0000000000,2020-10-07 04:00:00+00:00,2,ZXIET
8923,8923,ZYME,225817,43.9900000000,45.0600000000,45.6800000000,43.7400000000,2020-10-07 04:00:00+00:00,2475,ZYME
8924,8924,ZYNE,284131,3.5100000000,3.5700000000,3.6162000000,3.4866000000,2020-10-07 04:00:00+00:00,1919,ZYNE


In [79]:
df1.info()
df1

<class 'pandas.core.frame.DataFrame'>
Index: 8926 entries, 0 to 8925
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype              
---  ------        --------------  -----              
 0   index         8926 non-null   int64              
 1   ticker        8926 non-null   object             
 2   volume        8926 non-null   int64              
 3   open          8926 non-null   object             
 4   close         8926 non-null   object             
 5   high          8926 non-null   object             
 6   low           8926 non-null   object             
 7   window_start  8926 non-null   datetime64[ns, UTC]
 8   transactions  8926 non-null   int64              
 9   part          8926 non-null   object             
dtypes: datetime64[ns, UTC](1), int64(3), object(6)
memory usage: 767.1+ KB


Unnamed: 0,index,ticker,volume,open,close,high,low,window_start,transactions,part
0,0,A,1322141,102.4700000000,103.8800000000,104.7500000000,102.4200000000,2020-10-07 04:00:00+00:00,14803,A
1,1,AA,3921229,11.9600000000,12.2400000000,12.3000000000,11.9600000000,2020-10-07 04:00:00+00:00,22909,AA
2,2,AAA,3774,24.9017000000,24.9000000000,24.9100000000,24.9000000000,2020-10-07 04:00:00+00:00,28,AAA
3,3,AAAU,449029,18.8000000000,18.8000000000,18.8200000000,18.7300000000,2020-10-07 04:00:00+00:00,596,AAAU
4,4,AACG,6447,1.1150000000,1.1000000000,1.1150000000,1.0950000000,2020-10-07 04:00:00+00:00,89,AACG
...,...,...,...,...,...,...,...,...,...,...
8921,8921,ZVZZT,1100960,10.0500000000,10.0000000000,10.0500000000,10.0000000000,2020-10-07 04:00:00+00:00,10831,ZVZZT
8922,8922,ZXIET,4000,100.0000000000,100.0000000000,100.0000000000,100.0000000000,2020-10-07 04:00:00+00:00,2,ZXIET
8923,8923,ZYME,225817,43.9900000000,45.0600000000,45.6800000000,43.7400000000,2020-10-07 04:00:00+00:00,2475,ZYME
8924,8924,ZYNE,284131,3.5100000000,3.5700000000,3.6162000000,3.4866000000,2020-10-07 04:00:00+00:00,1919,ZYNE
