In [None]:
import csv
import time
from datetime import date, timedelta
from pathlib import Path

import pandas as pd
import requests
from requests.adapters import HTTPAdapter, Retry

In [None]:
# ----- CONFIG -----
# START_DATE..END_DATE is scanned inclusively, one calendar day at a time.
START_DATE = date(2017, 12, 29)
END_DATE   = date(2025, 7, 31)
SEGMENTS   = ["otcmarket"]         # URL path segment(s) to fetch; each also becomes a subfolder of OUTDIR
OUTDIR     = Path("finra_raw")               # base output folder
SLEEP_SECS = 0.2                             # polite throttle: pause after each attempted download (seconds)
TIMEOUT    = 45                              # value passed as `timeout=` to each GET (seconds)
# ----------------------------------------

def daterange(d0: date, d1: date):
    """Yield every day from ``d0`` through ``d1``, both ends included.

    Nothing is yielded when ``d0`` is later than ``d1``.
    """
    one_day = timedelta(days=1)
    current = d0
    while True:
        if current > d1:
            return
        yield current
        current = current + one_day

def url_for(segment: str, d: date) -> str:
    """Build the CDN URL of the short-interest file for ``segment`` on day ``d``.

    The file name embeds the date as ``YYYYMMDD``.
    """
    stamp = d.strftime("%Y%m%d")
    return f"https://cdn.finra.org/equity/{segment}/biweekly/shrt{stamp}.csv"

def make_session() -> requests.Session:
    """Create a ``requests.Session`` configured for the downloads in this notebook.

    The session carries a fixed User-Agent / Accept header pair and an HTTPS
    adapter whose retry policy re-issues GET requests (up to 5 times, with
    backoff) when the server answers 429, 500, 502, 503 or 504.
    """
    # Retry only transient failures; raise_on_status=False asks urllib3 not to
    # raise on a bad final status, so the caller can inspect the response.
    retry_policy = Retry(
        total=5,
        backoff_factor=0.4,
        status_forcelist=(429, 500, 502, 503, 504),
        allowed_methods=frozenset(["GET"]),
        raise_on_status=False,
    )
    default_headers = {
        "User-Agent": "finra-raw-downloader/1.0",
        "Accept": "text/csv, */*;q=0.1",
    }

    sess = requests.Session()
    sess.headers.update(default_headers)
    sess.mount("https://", HTTPAdapter(max_retries=retry_policy))
    return sess

def save_raw(url: str, dest: Path) -> bool:
    """Download ``url`` and store the response body at ``dest``.

    Relies on the notebook-level ``session`` (see ``make_session``) and
    ``TIMEOUT``; both must be defined before this is called.

    The body is written to a sibling ``<name>.part`` file first and then renamed
    onto ``dest``. The download loop treats an existing ``dest`` as "already
    downloaded", so an interrupted write must never leave a truncated file under
    the final name.

    Args:
        url: address to GET.
        dest: final path of the saved file; parent folders are created as needed.

    Returns:
        True when the response was HTTP 200 with a non-empty body and was saved;
        False on a request error or any other response (both are logged).
    """
    try:
        r = session.get(url, timeout=TIMEOUT)
    except requests.RequestException as e:
        print(f"[err ] {url} -> {e!r}")
        return False

    if r.status_code != 200 or not r.content:
        # In recorded runs the CDN answered 403 for most dates; log whatever we get.
        print(f"[skip] {url} -> HTTP {r.status_code}")
        return False

    dest.parent.mkdir(parents=True, exist_ok=True)
    tmp = dest.with_name(dest.name + ".part")
    try:
        tmp.write_bytes(r.content)
        tmp.replace(dest)  # rename over dest only once the body is fully written
    finally:
        # Only still present if the write or the rename failed / was interrupted.
        if tmp.exists():
            tmp.unlink()

    print(f"[save] {url} -> {dest}")
    return True

In [38]:
# One shared session for the whole run; save_raw() reads this global.
session = make_session()

# Try every calendar day in the configured range for every segment.
for d in daterange(START_DATE, END_DATE):
    for seg in SEGMENTS:
        out = OUTDIR / seg / f"shrt{d:%Y%m%d}.csv"
        if not out.exists():
            # Not downloaded yet: fetch it, then pause before the next request.
            url = url_for(seg, d)
            save_raw(url, out)
            time.sleep(SLEEP_SECS)

[save] https://cdn.finra.org/equity/otcmarket/biweekly/shrt20171229.csv -> finra_raw\otcmarket\shrt20171229.csv
[skip] https://cdn.finra.org/equity/otcmarket/biweekly/shrt20171230.csv -> HTTP 403
[skip] https://cdn.finra.org/equity/otcmarket/biweekly/shrt20171231.csv -> HTTP 403
[skip] https://cdn.finra.org/equity/otcmarket/biweekly/shrt20180101.csv -> HTTP 403
[skip] https://cdn.finra.org/equity/otcmarket/biweekly/shrt20180102.csv -> HTTP 403
[skip] https://cdn.finra.org/equity/otcmarket/biweekly/shrt20180103.csv -> HTTP 403
[skip] https://cdn.finra.org/equity/otcmarket/biweekly/shrt20180104.csv -> HTTP 403
[skip] https://cdn.finra.org/equity/otcmarket/biweekly/shrt20180105.csv -> HTTP 403
[skip] https://cdn.finra.org/equity/otcmarket/biweekly/shrt20180106.csv -> HTTP 403
[skip] https://cdn.finra.org/equity/otcmarket/biweekly/shrt20180107.csv -> HTTP 403
[skip] https://cdn.finra.org/equity/otcmarket/biweekly/shrt20180108.csv -> HTTP 403
[skip] https://cdn.finra.org/equity/otcmarket/bi

In [None]:
def convert_finra_raw_to_readable(src_path, dst_path=None, encoding="utf-8-sig"):
    """
    Convert a raw FINRA short-interest file (pipe-delimited) into a standard comma-CSV.

    - Streams line-by-line (memory-friendly).
    - Splits on "|" only and ignores quoting, to avoid "EOF inside string" errors.
    - Uses the first non-blank line as the header; each column name is stripped.
    - Skips blank and whitespace-only lines.
    - Skips repeated header lines that sometimes appear mid-file (compared
      field by field after stripping, so padded headers are recognised too).
    - If a row has too few fields: pads with "".
      If a row has too many fields: merges extras into the last column.

    Args:
        src_path (str|Path): e.g. "finra_raw/otcmarket/shrt20180228.csv"
        dst_path (str|Path|None): output path; defaults to "<src stem>_clean.csv"
            in the same folder as the source.
        encoding (str): default "utf-8-sig" to handle BOMs; undecodable bytes are
            replaced instead of raising.

    Returns:
        Path to the written CSV (an empty file if the source has no non-blank line).
    """
    src = Path(src_path)
    if dst_path is None:
        dst = src.with_name(src.stem + "_clean.csv")
    else:
        dst = Path(dst_path)
    dst.parent.mkdir(parents=True, exist_ok=True)

    # newline="" on both sides: line endings are stripped by hand on input, and
    # the csv writer controls them on output.
    with src.open("r", encoding=encoding, errors="replace", newline="") as fin, \
         dst.open("w", encoding="utf-8", newline="") as fout:

        writer = csv.writer(fout)

        # The first non-blank line is the header.
        header = None
        for line in fin:
            line = line.rstrip("\r\n")
            if not line.strip():
                continue
            header = [h.strip() for h in line.split("|")]
            writer.writerow(header)
            break
        if header is None:
            # Empty (or all-blank) file
            return dst

        expected_cols = len(header)

        # Process remaining lines (the file iterator continues after the header).
        for line in fin:
            line = line.rstrip("\r\n")
            if not line.strip():
                continue

            parts = line.split("|")
            n_parts = len(parts)

            if n_parts == expected_cols:
                # Skip repeated header rows appearing mid-file.
                if [p.strip() for p in parts] == header:
                    continue
                writer.writerow(parts)
            elif n_parts < expected_cols:
                # Pad missing columns
                writer.writerow(parts + [""] * (expected_cols - n_parts))
            else:
                # Too many columns -> merge extras into last field
                keep = expected_cols - 1
                writer.writerow(parts[:keep] + ["|".join(parts[keep:])])

    return dst


In [54]:

# Convert every downloaded raw file into finra_clean/<stem>_clean.csv.
# sorted() fixes the processing order: directory listing order is OS-dependent,
# and the next cell loads `out`, i.e. whichever file was converted last.
clean_dir = Path("finra_clean")
for file in sorted((OUTDIR / "otcmarket").glob("*.csv")):
    out = convert_finra_raw_to_readable(file, dst_path=clean_dir / (file.stem + "_clean.csv"))
    print("Wrote:", out)

Wrote: finra_clean\shrt20171229_clean.csv
Wrote: finra_clean\shrt20180112_clean.csv
Wrote: finra_clean\shrt20180131_clean.csv
Wrote: finra_clean\shrt20180215_clean.csv
Wrote: finra_clean\shrt20180228_clean.csv
Wrote: finra_clean\shrt20180315_clean.csv
Wrote: finra_clean\shrt20180329_clean.csv
Wrote: finra_clean\shrt20180413_clean.csv
Wrote: finra_clean\shrt20180430_clean.csv
Wrote: finra_clean\shrt20180515_clean.csv
Wrote: finra_clean\shrt20180531_clean.csv
Wrote: finra_clean\shrt20180615_clean.csv
Wrote: finra_clean\shrt20180629_clean.csv
Wrote: finra_clean\shrt20180713_clean.csv
Wrote: finra_clean\shrt20180731_clean.csv
Wrote: finra_clean\shrt20180815_clean.csv
Wrote: finra_clean\shrt20180831_clean.csv
Wrote: finra_clean\shrt20180914_clean.csv
Wrote: finra_clean\shrt20180928_clean.csv
Wrote: finra_clean\shrt20181015_clean.csv
Wrote: finra_clean\shrt20181031_clean.csv
Wrote: finra_clean\shrt20181115_clean.csv
Wrote: finra_clean\shrt20181130_clean.csv
Wrote: finra_clean\shrt20181214_cl

In [55]:
# `out` is left over from the conversion loop above: the path of the cleaned
# file written by its last iteration. That loop must have run first.
data = pd.read_csv(out)
data

Unnamed: 0,accountingYearMonthNumber,symbolCode,issueName,issuerServicesGroupExchangeCode,marketClassCode,currentShortPositionQuantity,previousShortPositionQuantity,stockSplitFlag,averageDailyVolumeQuantity,daysToCoverQuantity,revisionFlag,changePercent,changePreviousNumber,settlementDate
0,20250715,A,Agilent Technologies Inc.,A,NYSE,3514460,3354818,,1916368,1.83,,4.76,159642,2025-07-15
1,20250715,AA,Alcoa Corporation,A,NYSE,14593406,17156674,,5879706,2.48,,-14.94,-2563268,2025-07-15
2,20250715,AAA,Alternative Access First Prior,E,ARCA,199,5987,,15969,1.00,,-96.68,-5788,2025-07-15
3,20250715,AAAU,Goldman Sachs Physical Gold ET,H,BZX,1188985,966030,,2362004,1.00,,23.08,222955,2025-07-15
4,20250715,AABB,Asia Broadband Inc Common Stoc,S,OTC,1,366938,,17902730,1.00,,-100.00,-366937,2025-07-15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20606,20250715,ZYBT,Zhengye Biotechnology Holding,R,SC,246380,261314,,84310,2.92,,-5.71,-14934,2025-07-15
20607,20250715,ZYME,Zymeworks Inc. Common Stock,R,NNM,6868563,6925863,,378522,18.15,,-0.83,-57300,2025-07-15
20608,20250715,ZYXI,"Zynex, Inc. Common Stock",R,NNM,3506501,3579138,,148544,23.61,,-2.03,-72637,2025-07-15
20609,20250715,ZZHGF,Zhongan Online PC Ins Co Ltd.,S,OTC,224800,1760800,,0,999.99,,-87.23,-1536000,2025-07-15
