In [94]:
import datetime
import json
import os
import urllib.parse
import urllib.request

import pandas as pd

from typing import List, Dict, Union
from tqdm import tqdm


In [23]:
# The EDINET subscription key is a credential, so it is read from the
# environment rather than stored in the notebook. Set EDINET_API_KEY before
# starting the kernel (e.g. `export EDINET_API_KEY=...`).
# NOTE(review): the key that used to be hardcoded on this line has been
# committed with the notebook and should be treated as leaked and rotated.
EDINET_API_KEY = os.environ.get("EDINET_API_KEY", "")
if not EDINET_API_KEY:
    print("WARNING: EDINET_API_KEY is not set; API requests will be sent without a key.")

In [42]:
# Download formats this notebook knows about, keyed by a readable label.
# "type" is the value sent as the `type` query parameter when a document is
# fetched; "extension" is the suffix given to the saved file.
file_config = dict(
    PDF=dict(type=2, extension="pdf"),
    CSV=dict(type=5, extension="zip"),
)

# Format selected for this run, with the request code and suffix it implies.
FILE_TYPE = "PDF"
FILE_TYPE_CODE, FILE_EXT = (
    file_config[FILE_TYPE][field] for field in ("type", "extension")
)

In [85]:
def filter_by_codes(
    docs: List[Dict],
    edinet_codes: Union[List[str], str, None] = None,
    doc_type_codes: Union[List[str], str, None] = None,
) -> List[Dict]:
    """Filter documents by EDINET codes and document type codes.

    Args:
        docs: Document metadata dicts. Each needs an "edinetCode" key when
            ``edinet_codes`` is given and a "docTypeCode" key when
            ``doc_type_codes`` is given.
        edinet_codes: One code or a collection of codes to keep. ``None`` or
            an empty value disables filtering on this field.
        doc_type_codes: Same as ``edinet_codes``, applied to "docTypeCode".

    Returns:
        The documents, in input order, that satisfy every active filter.
    """

    def _as_code_set(codes):
        # None, "" and [] all mean "do not restrict on this field".
        if codes is None or len(codes) == 0:
            return None
        # A bare string is a single code, not an iterable of characters.
        if isinstance(codes, str):
            return {codes}
        return set(codes)

    wanted_edinet = _as_code_set(edinet_codes)
    wanted_types = _as_code_set(doc_type_codes)

    # Set membership keeps the scan linear in len(docs); an inactive filter is
    # skipped instead of being emulated with a list of every code present.
    return [
        doc
        for doc in docs
        if (wanted_edinet is None or doc["edinetCode"] in wanted_edinet)
        and (wanted_types is None or doc["docTypeCode"] in wanted_types)
    ]


def disclosure_documents(
    date: Union[str, datetime.date], type: int = 2, timeout: float = 30.0
) -> Dict:
    """Retrieve disclosure documents from EDINET API for a specified date.

    Args:
        date: Filing date as a 'YYYY-MM-DD' string or a ``datetime.date``.
        type: Sent as the ``type`` query parameter: 1 is metadata only,
            2 is metadata and results. (Shadows the builtin; the name is kept
            so existing keyword callers continue to work.)
        timeout: Seconds to wait on the connection before giving up.

    Returns:
        The decoded JSON body of the response.

    Raises:
        ValueError: ``date`` is a string that does not parse as 'YYYY-MM-DD'.
        TypeError: ``date`` is neither a string nor a ``datetime.date``.
    """
    if isinstance(date, str):
        try:
            parsed = datetime.datetime.strptime(date, "%Y-%m-%d")
        except ValueError as err:
            raise ValueError("Invalid date string. Use format 'YYYY-MM-DD'") from err
        # strptime tolerates unpadded input such as '2016-1-5'; re-format it so
        # the request always carries a zero-padded date.
        date_str = parsed.strftime("%Y-%m-%d")
    elif isinstance(date, datetime.date):
        date_str = date.strftime("%Y-%m-%d")
    else:
        raise TypeError("Date must be a string ('YYYY-MM-DD') or datetime.date")

    # NOTE(review): get_document() talks to api.edinet-fsa.go.jp for the same
    # v2 API; confirm this host is still the intended one.
    url = "https://disclosure.edinet-fsa.go.jp/api/v2/documents.json"
    params = {
        "date": date_str,
        "type": type,  # '1' is metadata only, '2' is metadata and results
        "Subscription-Key": EDINET_API_KEY,
    }
    full_url = f"{url}?{urllib.parse.urlencode(params)}"

    # The timeout keeps a stalled connection from hanging a multi-day scan.
    with urllib.request.urlopen(full_url, timeout=timeout) as response:
        return json.loads(response.read().decode("utf-8"))


def get_document(doc_id: str, timeout: float = 60.0) -> "http.client.HTTPResponse":
    """Retrieve a specific document from EDINET API.

    Args:
        doc_id: The document's ``docID`` value.
        timeout: Seconds to wait on the connection before giving up.

    Returns:
        The open HTTP response. The caller owns it and should close it, for
        example by using it as a context manager.
    """
    # Percent-encode the ID so it cannot alter the path or query of the URL.
    url = "https://api.edinet-fsa.go.jp/api/v2/documents/" + urllib.parse.quote(
        doc_id, safe=""
    )
    params = {
        "type": FILE_TYPE_CODE,  # '2' for PDF, '5' for CSV
        "Subscription-Key": EDINET_API_KEY,
    }
    full_url = f"{url}?{urllib.parse.urlencode(params)}"
    return urllib.request.urlopen(full_url, timeout=timeout)


def save_document(doc_res: "http.client.HTTPResponse", output_path: str) -> None:
    """Save the document content to file.

    Args:
        doc_res: Readable binary stream holding the document (anything with a
            ``read(size)`` method that returns ``b""`` at end of stream).
        output_path: Destination file. Missing parent directories are created
            and an existing file is overwritten.
    """
    parent_dir = os.path.dirname(output_path)
    if parent_dir:  # empty when output_path is a bare file name
        os.makedirs(parent_dir, exist_ok=True)

    chunk_size = 64 * 1024
    with open(output_path, "wb") as file_out:
        # Copy in fixed-size chunks so a large document is never held in
        # memory all at once.
        for chunk in iter(lambda: doc_res.read(chunk_size), b""):
            file_out.write(chunk)
    print(f"Saved: {output_path}")


def get_documents_for_date_range(
    start_date: datetime.date,
    end_date: datetime.date,
    edinet_codes: Union[List[str], str, None] = None,
    doc_type_codes: Union[List[str], str, None] = None,
) -> List[Dict]:
    """Retrieve and filter documents for a date range.

    One request is issued per calendar day from ``start_date`` through
    ``end_date`` (both included), and each day's results are passed through
    ``filter_by_codes``.

    Args:
        start_date: First day to query.
        end_date: Last day to query.
        edinet_codes: Code or codes to keep; ``None`` or empty keeps all.
        doc_type_codes: Document type code or codes to keep; ``None`` or
            empty keeps all.

    Returns:
        Matching document dicts, concatenated in date order.
    """
    # Build fresh lists per call instead of sharing mutable defaults. An empty
    # list is what filter_by_codes treats as "no restriction".
    if edinet_codes is None:
        edinet_codes = []
    if doc_type_codes is None:
        doc_type_codes = []

    total_days = (end_date - start_date).days + 1
    days = (start_date + datetime.timedelta(days=n) for n in range(total_days))

    matching_docs = []
    for single_date in tqdm(days, total=total_days, desc="Retrieving documents"):
        docs_res = disclosure_documents(date=single_date)
        # Days with no filings carry an empty or missing result list.
        if docs_res["results"]:
            matching_docs.extend(
                filter_by_codes(docs_res["results"], edinet_codes, doc_type_codes)
            )

    return matching_docs

In [86]:
# First and last filing dates to scan; both ends are included in the range.
start_date = datetime.date(year=2016, month=1, day=1)
end_date = datetime.date(year=2016, month=4, day=4)

In [87]:
# Document type codes to keep.
doc_type_codes = [
    "140",  # Quarterly reports
    "160",  # Semi-annual reports
]

# Filers of interest: EDINET code -> English company name.
megabanks = dict(
    E03614="Sumitomo Mitsui Financial Group, Inc.",
    E03615="Mizuho Financial Group, Inc.",
    E03606="Mitsubishi UFJ Financial Group, Inc.",
    E03530="SBI Shinsei Bank, Limited",
)

In [88]:
# Echo the request parameters so the recorded output documents the run.
print(f"Requesting documents of type {doc_type_codes}, filed by:")
for rank, company in enumerate(megabanks.values(), 1):
    print(f"{rank}. {company}")

Requesting documents of type ['140', '160'], filed by:
1. Sumitomo Mitsui Financial Group, Inc.
2. Mizuho Financial Group, Inc.
3. Mitsubishi UFJ Financial Group, Inc.
4. SBI Shinsei Bank, Limited


In [None]:
# Scan every day in the window and keep only the selected filers' documents
# of the requested types. This makes one API request per day, so it is slow.
docs = get_documents_for_date_range(
    start_date=start_date,
    end_date=end_date,
    edinet_codes=list(megabanks),
    doc_type_codes=doc_type_codes,
)

In [61]:
# Preview only the first few records; the full list is long and would bury
# the rest of the notebook in output.
docs[:3]

[{'seqNumber': 434,
  'docID': 'S100I07X',
  'edinetCode': 'E03530',
  'secCode': '83030',
  'JCN': '7010001016855',
  'filerName': '株式会社新生銀行',
  'fundCode': None,
  'ordinanceCode': '010',
  'formCode': '043000',
  'docTypeCode': '140',
  'periodStart': '2019-10-01',
  'periodEnd': '2019-12-31',
  'submitDateTime': '2020-02-13 09:52',
  'docDescription': '四半期報告書－第20期第3四半期(令和1年10月1日－令和1年12月31日)',
  'issuerEdinetCode': None,
  'subjectEdinetCode': None,
  'subsidiaryEdinetCode': None,
  'currentReportReason': None,
  'parentDocID': None,
  'opeDateTime': None,
  'withdrawalStatus': '0',
  'docInfoEditStatus': '0',
  'disclosureStatus': '0',
  'xbrlFlag': '1',
  'pdfFlag': '1',
  'attachDocFlag': '0',
  'englishDocFlag': '0',
  'csvFlag': '1',
  'legalStatus': '2'},
 {'seqNumber': 1029,
  'docID': 'S100I1X9',
  'edinetCode': 'E03615',
  'secCode': '84110',
  'JCN': '9010001081419',
  'filerName': '株式会社みずほフィナンシャルグループ',
  'fundCode': None,
  'ordinanceCode': '010',
  'formCode': '043000',


In [62]:
# Report how many filings matched before the download loop starts.
summary_line = f"Found {len(docs)} matching documents. Saving results:"
print(summary_line)

Found 48 matching documents. Saving results:


In [49]:
# Download every matched filing, one file per document.
output_dir = os.path.join("..", "documents")
# Create the target folder up front so a fresh checkout does not fail on the
# first write.
os.makedirs(output_dir, exist_ok=True)

for doc in docs:
    doc_id = doc["docID"]
    edinet_code = doc["edinetCode"]
    doc_type_code = doc["docTypeCode"]
    filer = doc["filerName"]
    save_name = f"{edinet_code}_{filer}_{doc_type_code}_{doc_id}.{FILE_EXT}"
    output_path = os.path.join(output_dir, save_name)
    # Close each HTTP response once it is written; otherwise every document
    # leaves its connection open until garbage collection.
    with get_document(doc_id) as doc_res:
        save_document(doc_res, output_path)

Saved: ../documents/E03615_株式会社みずほフィナンシャルグループ_140_S100SW62.pdf
Saved: ../documents/E03614_株式会社三井住友フィナンシャルグループ_140_S100SVMN.pdf
Saved: ../documents/E03606_株式会社三菱ＵＦＪフィナンシャル・グループ_140_S100SVF9.pdf


In [None]:
# Tasks

# Verify the doc type codes and include quarterly, semi-annual and annual reports
# doc_type_codes = ["140", "160"]  # Quarterly and Semi-Annual Reports

# Get the list of asset codes for the Nikkei 225, industrials in particular
# megabanks = {
#     "E03614": "Sumitomo Mitsui Financial Group, Inc.",
#     "E03615": "Mizuho Financial Group, Inc.",
#     "E03606": "Mitsubishi UFJ Financial Group, Inc.",
#     "E03530": "SBI Shinsei Bank, Limited",
# }

# Get and store matching document list for Nikkei 225 companies for each year from 2016 onwards

# Download each document

# For each quarter, get the latest submission date among the documents and
# use it as the base (rebalance) date.
# On every base date, rank the 225 (or K) stocks in the basket using an LLM

# For each stock in the basket, get daily returns
# Run a long/short (L/S) equity strategy

# MAIN BLOCKERS:

# Get list of stocks
# Map to Edinet ID
# Understand how the mapping works
# ChatGPT API
# Time and motivation




In [None]:
# Reference workbook of EDINET submitters, kept under ../refdata.
edinet_codes_file = "edinet_codes.xlsx"
edinet_codes_path = os.path.join("..", "refdata", edinet_codes_file)

# Load the workbook into a DataFrame for the exploration cells below.
df = pd.read_excel(io=edinet_codes_path)

In [98]:
# Summarise the column names, non-null counts and dtypes of the code list.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10832 entries, 0 to 10831
Data columns (total 13 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   EDINET Code                         10832 non-null  object 
 1   Type of Submitter                   10832 non-null  object 
 2   Listed company / Unlisted company   5077 non-null   object 
 3   Consolidated / NonConsolidated      5077 non-null   object 
 4   Capital stock                       8458 non-null   float64
 5   account closing date                5113 non-null   object 
 6   Submitter Name                      10832 non-null  object 
 7   Submitter Name（alphabetic）          4376 non-null   object 
 8   Submitter Name（phonetic）            10832 non-null  object 
 9   Province                            7722 non-null   object 
 10  Submitter's industry                10832 non-null  object 
 11  Securities Identification Code      3943 

In [99]:
# Distinct values of the listing-status column (NaN included), to see how
# listed submitters are labelled.
pd.unique(df["Listed company / Unlisted company"])

array(['Unlisted company', nan, 'Listed company'], dtype=object)

In [110]:
# Keep listed submitters that also have an alphabetic (English) name.
listed_df = (
    df.loc[df["Listed company / Unlisted company"] == "Listed company"]
    .dropna(subset=["Submitter Name（alphabetic）"])
)

In [None]:
# Get duplicates

Unnamed: 0,EDINET Code,Submitter Name（alphabetic）
23,E31748,"Japan Post Holdings Co., Ltd."
34,E03614,"Sumitomo Mitsui Financial Group, Inc."
35,E03615,"Mizuho Financial Group, Inc."
37,E03606,"Mitsubishi UFJ Financial Group, Inc."
51,E04498,"Tokyo Electric Power Company Holdings, Incorpo..."
...,...,...
6819,E39406,"Ishin Co., Ltd."
6901,E38450,"Bizmates, Inc."
6908,E03000,"Kawasaki&Co.,Ltd"
7012,E38949,AVILEN Inc.
