## Import libraries

In [24]:
from bs4 import BeautifulSoup
import re, os, logging, time, argparse, unicodedata, html5lib, requests
import pandas as pd
import numpy as np
from openpyxl import Workbook
from datetime import datetime

## Helper functions

1. **parse_and_trim()** -> Parses and trims HTML content by removing all attributes from HTML tags and removing line break tags from the content
2. **remove_multiple_spaces()** -> Replace multiple spaces with a single space in a string
3. **find_qtr_date()** -> Extract and format a quarterly date from text content

In [25]:
def parse_and_trim(content, content_type):
    if content_type == 'HTML':
        soup = BeautifulSoup(content, 'html.parser')
    else:
        soup = BeautifulSoup(content, 'html.parser')

    for tag in soup.recursiveChildGenerator():
        try:
            tag.attrs = None
        except AttributeError:
            pass

    for linebreak in soup.find_all('br'):
        linebreak.extract()

    return soup

In [26]:
def remove_multiple_spaces(string):
    pattern = r'\s+'
    replaced_string = re.sub(pattern, ' ', string)
    return replaced_string

In [52]:
def find_qrt_date(content):
    qtr_date = content.find_all(string=re.compile(
        r'for\s+(the\s+)?(fiscal\s+)?year\s+ended\s+|for\s+the\s+quarter\s+ended\s+|for\s+the\s+quarterly\s+period\s+ended\s+', re.IGNORECASE))
    qtr_match = re.search(
        r'([A-Za-z]+)\s+(\d{1,2}),\s+(\d{4})', qtr_date[0].replace('\n', ''))
    print(qtr_date[0])
    if qtr_match is None:
        qtr_match = re.search(
            r'([A-Za-z]+) (\d{1,2}), (\d{4})', qtr_date[1])
    if qtr_match:
        return remove_multiple_spaces(str(qtr_match.group()))
    else:
        return None

## Preprocess filing links

In [29]:
# Assign headers
headers = {
    'User-Agent': 'Blue Owl Capital Corp II'
}

# Extract to a dataframe
df = pd.read_csv("/kaggle/input/scraping-url/scraping_url.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,accessionNumber,filingDate,reportDate,acceptanceDateTime,act,form,fileNumber,filmNumber,items,size,isXBRL,isInlineXBRL,primaryDocument,primaryDocDescription,fileLink,txtFileLink
0,0,0001655887-24-000017,2024-05-13,2024-03-31,2024-05-10T17:40:30.000Z,34,10-Q,814-01219,24936212,,28055386,1,1,obdc-20240331.htm,10-Q,https://www.sec.gov/Archives/edgar/data/165588...,https://www.sec.gov/Archives/edgar/data/165588...
1,1,0001655887-24-000009,2024-03-07,2023-12-31,2024-03-06T19:19:00.000Z,34,10-K,814-01219,24727699,,35559661,1,1,obdc-20231231.htm,10-K,https://www.sec.gov/Archives/edgar/data/165588...,https://www.sec.gov/Archives/edgar/data/165588...
2,2,0001655887-23-000048,2023-11-09,2023-09-30,2023-11-09T16:58:33.000Z,34,10-Q,814-01219,231393518,,28991189,1,1,obdc-20230930.htm,10-Q,https://www.sec.gov/Archives/edgar/data/165588...,https://www.sec.gov/Archives/edgar/data/165588...
3,3,0001655887-23-000039,2023-08-10,2023-06-30,2023-08-10T17:14:29.000Z,34,10-Q,814-01219,231160654,,27739370,1,1,obdc-20230630.htm,10-Q,https://www.sec.gov/Archives/edgar/data/165588...,https://www.sec.gov/Archives/edgar/data/165588...
4,4,0001655887-23-000025,2023-05-11,2023-03-31,2023-05-11T16:06:55.000Z,34,10-Q,814-01219,23910979,,46704697,1,1,orcc-20230331.htm,10-Q,https://www.sec.gov/Archives/edgar/data/165588...,https://www.sec.gov/Archives/edgar/data/165588...


In [30]:
# Drops all the amendment filing
# Keep only Qs & Ks
df = df.drop(df[df['primaryDocDescription'].str.contains(
    'amendment', case=False)].index).reset_index(drop=True)
df['Reporting date'] = pd.to_datetime(df['reportDate'])
df.head()

Unnamed: 0.1,Unnamed: 0,accessionNumber,filingDate,reportDate,acceptanceDateTime,act,form,fileNumber,filmNumber,items,size,isXBRL,isInlineXBRL,primaryDocument,primaryDocDescription,fileLink,txtFileLink,Reporting date
0,0,0001655887-24-000017,2024-05-13,2024-03-31,2024-05-10T17:40:30.000Z,34,10-Q,814-01219,24936212,,28055386,1,1,obdc-20240331.htm,10-Q,https://www.sec.gov/Archives/edgar/data/165588...,https://www.sec.gov/Archives/edgar/data/165588...,2024-03-31
1,1,0001655887-24-000009,2024-03-07,2023-12-31,2024-03-06T19:19:00.000Z,34,10-K,814-01219,24727699,,35559661,1,1,obdc-20231231.htm,10-K,https://www.sec.gov/Archives/edgar/data/165588...,https://www.sec.gov/Archives/edgar/data/165588...,2023-12-31
2,2,0001655887-23-000048,2023-11-09,2023-09-30,2023-11-09T16:58:33.000Z,34,10-Q,814-01219,231393518,,28991189,1,1,obdc-20230930.htm,10-Q,https://www.sec.gov/Archives/edgar/data/165588...,https://www.sec.gov/Archives/edgar/data/165588...,2023-09-30
3,3,0001655887-23-000039,2023-08-10,2023-06-30,2023-08-10T17:14:29.000Z,34,10-Q,814-01219,231160654,,27739370,1,1,obdc-20230630.htm,10-Q,https://www.sec.gov/Archives/edgar/data/165588...,https://www.sec.gov/Archives/edgar/data/165588...,2023-06-30
4,4,0001655887-23-000025,2023-05-11,2023-03-31,2023-05-11T16:06:55.000Z,34,10-Q,814-01219,23910979,,46704697,1,1,orcc-20230331.htm,10-Q,https://www.sec.gov/Archives/edgar/data/165588...,https://www.sec.gov/Archives/edgar/data/165588...,2023-03-31


In [31]:
# Convert to datetime format
date_columns = ['filingDate', 'reportDate']
for col in date_columns:
    df[col] = pd.to_datetime(df[col], format='%Y-%m-%d')
    
for col in date_columns:
    df[col] = df[col].dt.strftime("%B %d, %Y")
    
df.head()

Unnamed: 0.1,Unnamed: 0,accessionNumber,filingDate,reportDate,acceptanceDateTime,act,form,fileNumber,filmNumber,items,size,isXBRL,isInlineXBRL,primaryDocument,primaryDocDescription,fileLink,txtFileLink,Reporting date
0,0,0001655887-24-000017,"May 13, 2024","March 31, 2024",2024-05-10T17:40:30.000Z,34,10-Q,814-01219,24936212,,28055386,1,1,obdc-20240331.htm,10-Q,https://www.sec.gov/Archives/edgar/data/165588...,https://www.sec.gov/Archives/edgar/data/165588...,2024-03-31
1,1,0001655887-24-000009,"March 07, 2024","December 31, 2023",2024-03-06T19:19:00.000Z,34,10-K,814-01219,24727699,,35559661,1,1,obdc-20231231.htm,10-K,https://www.sec.gov/Archives/edgar/data/165588...,https://www.sec.gov/Archives/edgar/data/165588...,2023-12-31
2,2,0001655887-23-000048,"November 09, 2023","September 30, 2023",2023-11-09T16:58:33.000Z,34,10-Q,814-01219,231393518,,28991189,1,1,obdc-20230930.htm,10-Q,https://www.sec.gov/Archives/edgar/data/165588...,https://www.sec.gov/Archives/edgar/data/165588...,2023-09-30
3,3,0001655887-23-000039,"August 10, 2023","June 30, 2023",2023-08-10T17:14:29.000Z,34,10-Q,814-01219,231160654,,27739370,1,1,obdc-20230630.htm,10-Q,https://www.sec.gov/Archives/edgar/data/165588...,https://www.sec.gov/Archives/edgar/data/165588...,2023-06-30
4,4,0001655887-23-000025,"May 11, 2023","March 31, 2023",2023-05-11T16:06:55.000Z,34,10-Q,814-01219,23910979,,46704697,1,1,orcc-20230331.htm,10-Q,https://www.sec.gov/Archives/edgar/data/165588...,https://www.sec.gov/Archives/edgar/data/165588...,2023-03-31


In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29 entries, 0 to 28
Data columns (total 18 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   Unnamed: 0             29 non-null     int64         
 1   accessionNumber        29 non-null     object        
 2   filingDate             29 non-null     object        
 3   reportDate             29 non-null     object        
 4   acceptanceDateTime     29 non-null     object        
 5   act                    29 non-null     int64         
 6   form                   29 non-null     object        
 7   fileNumber             29 non-null     object        
 8   filmNumber             29 non-null     int64         
 9   items                  0 non-null      float64       
 10  size                   29 non-null     int64         
 11  isXBRL                 29 non-null     int64         
 12  isInlineXBRL           29 non-null     int64         
 13  primary

In [33]:
# Drop extra columns
columns_to_drop = ['Unnamed: 0', 'acceptanceDateTime', 'act', 'fileNumber', 'filmNumber', 
                   'items', 'size', 'isXBRL', 'isInlineXBRL', 'Reporting date']
df.drop(columns_to_drop, axis=1, inplace=True)
df.head()

Unnamed: 0,accessionNumber,filingDate,reportDate,form,primaryDocument,primaryDocDescription,fileLink,txtFileLink
0,0001655887-24-000017,"May 13, 2024","March 31, 2024",10-Q,obdc-20240331.htm,10-Q,https://www.sec.gov/Archives/edgar/data/165588...,https://www.sec.gov/Archives/edgar/data/165588...
1,0001655887-24-000009,"March 07, 2024","December 31, 2023",10-K,obdc-20231231.htm,10-K,https://www.sec.gov/Archives/edgar/data/165588...,https://www.sec.gov/Archives/edgar/data/165588...
2,0001655887-23-000048,"November 09, 2023","September 30, 2023",10-Q,obdc-20230930.htm,10-Q,https://www.sec.gov/Archives/edgar/data/165588...,https://www.sec.gov/Archives/edgar/data/165588...
3,0001655887-23-000039,"August 10, 2023","June 30, 2023",10-Q,obdc-20230630.htm,10-Q,https://www.sec.gov/Archives/edgar/data/165588...,https://www.sec.gov/Archives/edgar/data/165588...
4,0001655887-23-000025,"May 11, 2023","March 31, 2023",10-Q,orcc-20230331.htm,10-Q,https://www.sec.gov/Archives/edgar/data/165588...,https://www.sec.gov/Archives/edgar/data/165588...


In [34]:
# Last check
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29 entries, 0 to 28
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   accessionNumber        29 non-null     object
 1   filingDate             29 non-null     object
 2   reportDate             29 non-null     object
 3   form                   29 non-null     object
 4   primaryDocument        29 non-null     object
 5   primaryDocDescription  29 non-null     object
 6   fileLink               29 non-null     object
 7   txtFileLink            29 non-null     object
dtypes: object(8)
memory usage: 1.9+ KB


## Tables extraction

In [53]:
qtr_dates = []
for index, url in enumerate(df['fileLink']):
    response = requests.get(url, headers=headers)
    content = parse_and_trim(response.content, 'HTML')
    qtr_date = find_qrt_date(content)
    print(f'{index} - {url}')
    print(f'{qtr_date}')
#     qtr_dates.append(qtr_date.replace(',', '').strip())

# Add quarter date for each filing
# df['qtr_date'] = qtr_dates

For the quarterly period ended 
0 - https://www.sec.gov/Archives/edgar/data/1655887/000165588724000017/obdc-20240331.htm
December 31, 2023
For the fiscal year ended 
1 - https://www.sec.gov/Archives/edgar/data/1655887/000165588724000009/obdc-20231231.htm
None
For the quarterly period ended 
2 - https://www.sec.gov/Archives/edgar/data/1655887/000165588723000048/obdc-20230930.htm
March 31, 2022
For the quarterly period ended 
3 - https://www.sec.gov/Archives/edgar/data/1655887/000165588723000039/obdc-20230630.htm
March 31, 2022
For the quarterly period ended 
4 - https://www.sec.gov/Archives/edgar/data/1655887/000165588723000025/orcc-20230331.htm
March 31, 2022
For the fiscal year ended 
5 - https://www.sec.gov/Archives/edgar/data/1655887/000165588723000009/orcc-20221231.htm
May 3, 2022
For the quarterly period ended September 30, 2022
6 - https://www.sec.gov/Archives/edgar/data/1655887/000165588722000008/a01orccii-2022930x10q.htm
September 30, 2022
For the quarterly period ended June 30