## Import libraries

In [42]:
from bs4 import BeautifulSoup
import re, os, logging, time, argparse, unicodedata, html5lib, requests, unicodedata
import pandas as pd
import numpy as np
from openpyxl import Workbook
from datetime import datetime

## Helper functions

1. **parse_and_trim()** -> Parses and trims HTML content by removing all attributes from HTML tags and removing line break tags from the content
2. **remove_multiple_spaces()** -> Replace multiple spaces with a single space in a string
3. **find_qtr_date()** -> Extract and format a quarterly date from text content
4. **extract_tables()** -> 
5. **process_table()** -> 

In [2]:
def parse_and_trim(content, content_type):
    if content_type == 'HTML':
        soup = BeautifulSoup(content, 'html.parser')
    else:
        soup = BeautifulSoup(content, 'html.parser')

    for tag in soup.recursiveChildGenerator():
        try:
            tag.attrs = None
        except AttributeError:
            pass

    for linebreak in soup.find_all('br'):
        linebreak.extract()

    return soup

In [3]:
def remove_multiple_spaces(string):
    pattern = r'\s+'
    replaced_string = re.sub(pattern, ' ', string)
    return replaced_string

In [4]:
def find_qrt_date(content):
    qtr_date = content.find_all(string=re.compile(
        r'for\s+(the\s+)?(fiscal\s+)?year\s+ended\s+|for\s+the\s+quarter\s+ended\s+|for\s+the\s+quarterly\s+period\s+ended\s+', re.IGNORECASE))
    qtr_match = re.search(
        r'([A-Za-z]+)\s+(\d{1,2}),\s+(\d{4})', qtr_date[0].replace('\n', ''))
    print(qtr_date[0])
    if qtr_match is None:
        qtr_match = re.search(
            r'([A-Za-z]+) (\d{1,2}), (\d{4})', qtr_date[1])
    if qtr_match:
        return remove_multiple_spaces(str(qtr_match.group()))
    else:
        return None

In [49]:
def extract_tables(soup_content, qtr_date):
    master_table = None
    all_tags = soup_content.find_all(True)
    print(type(all_tags))
    for tag in soup_content.find_all(string=re.compile('^.*consolidated\s+schedule\s+of\s+investments.*$', re.IGNORECASE)):
        date_str = re.search(r'([A-Za-z]+) (\d{1,2}), (\d{4})', tag)
        if date_str is None:
            next_line = tag.find_next(text=re.compile(
                r'([A-Za-z]+) (\d{1,2}), (\d{4})')).text
            date_str = re.search(r'([A-Za-z]+) (\d{1,2}), (\d{4})', next_line)
        if date_str is None:
            next_line = tag.next.next.next.next.next.next.text
            date_str = re.search(r'([A-Za-z]+) (\d{1,2}), (\d{4})', next_line)
        if date_str is not None:
            date_str = str(date_str.group())
            date_str = unicodedata.normalize('NFKD', date_str)
            if qtr_date.replace(',', '').strip().lower() in date_str.replace(',', '').strip().lower():
                print('Table found: ')
                html_table = tag.find_next('table')
                if master_table is None:
                    master_table = pd.read_html(
                        html_table.prettify(), skiprows=0, flavor='bs4')[0]
                    master_table = master_table.applymap(lambda x: unicodedata.normalize(
                        'NFKD', x.strip().strip(u'\u200b').replace('—', '-')) if type(x) == str else x)
                    master_table = master_table.replace(r'^\s*$', np.nan, regex=True).replace(r'^\s*\$\s*$', np.nan,
                                                                                              regex=True)
                    master_table = master_table.dropna(how='all', axis=0)
                else:
                    new_table = pd.read_html(
                        html_table.prettify(), skiprows=0, flavor='bs4')[0]
                    new_table = new_table.applymap(lambda x: unicodedata.normalize(
                        'NFKD', x.strip().strip(u'\u200b').replace('—', '-')) if type(x) == str else x)
                    new_table = new_table.replace(r'^\s*$', np.nan, regex=True).replace(r'^\s*\$\s*$', np.nan,
                                                                                        regex=True)
                    new_table = new_table.dropna(how='all', axis=0)
                    # print('head')
                    # print(new_table.head()) # text
                    master_table = master_table.append(
                        new_table.dropna(how='all', axis=0).reset_index(
                            drop=True).drop(index=0),
                        ignore_index=True)

#     master_table = master_table.applymap(
#         lambda x: x.strip().strip(u'\u200b') if type(x) == str else x)
    print(master_table)
    master_table = master_table.applymap(lambda x: x.strip().strip(u'\u200b') if isinstance(x, str) else x if x is not None else None)
    master_table = master_table.replace(r'^\s*$', np.nan, regex=True).replace(
        r'^\s*\$\s*$', np.nan, regex=True).replace(r'^\s*\)\s*$', np.nan, regex=True)
    return master_table

In [26]:
def process_table(soi_table_df, append_str):
    soi_table_df = soi_table_df.replace(r'^\s*\$\s*$', np.nan, regex=True)
    soi_table_df = soi_table_df.dropna(how='all', axis=1)
    soi_table_df = soi_table_df.dropna(
        how='all', axis=0).reset_index(drop=True)

    # Separate header and data
    soi_table_header = soi_table_df.iloc[1].dropna(how='any')
    print('header: ')
    print(soi_table_header)
    soi_table_data_df = soi_table_df.iloc[2:]
    print('1: ' + str(soi_table_data_df.shape))

    # Drop Full NnN rows
    soi_table_data_df = soi_table_data_df.dropna(how='all', axis=1)
    soi_table_data_df = soi_table_data_df.dropna(
        how='all', axis=0).reset_index(drop=True)
    print('2: ' + str(soi_table_data_df.shape))

    # Rename columns to integer range
    num_cols = soi_table_data_df.shape[1]
    data_col_mapper = dict(zip(soi_table_data_df.columns.to_list(), [
                           i for i in range(0, num_cols)]))
    soi_table_data_df = soi_table_data_df.rename(columns=data_col_mapper)
    print('3: ' + str(soi_table_data_df.shape))
    soi_table_data_df.to_csv('csv_file/3_'+append_str+'.csv')

    # Drop "Control Instruments" rows
    # soi_table_data_df = soi_table_data_df.dropna(subset=[i for i in range(1, num_cols)], how='all')
    # print('4: ' + str(soi_table_data_df.shape))

    # if num_cols > 7:
    #    # Drop labeled subtotal/total rows
    #    soi_table_data_df = soi_table_data_df.dropna(subset=[1, 3, 5, 6, 7], how='all')
    #    print('5: ' + str(soi_table_data_df.shape))

    # Drop labeled subtotal rows
    soi_table_data_df = soi_table_data_df.dropna(
        subset=[i for i in range(1, num_cols - 2)], how='all')
    print('5: ' + str(soi_table_data_df.shape))

    # Drop subtotal/total rows based on regex
    sub_total_filter_pattern = r'^([Ss]ubtotal)|([Tt]otal)'
    sub_total_filter = soi_table_data_df[0].str.contains(
        sub_total_filter_pattern).replace(np.NaN, False)
    print(sub_total_filter)
    soi_table_data_df = soi_table_data_df[~sub_total_filter]

    # Drop Full NnN rows/cols
    soi_table_data_df = soi_table_data_df.dropna(how='all', axis=1)
    soi_table_data_df = soi_table_data_df.dropna(
        how='all', axis=0).reset_index(drop=True)
    print('6: ' + str(soi_table_data_df.shape))

    # Rename columns to integer range
    num_cols = soi_table_data_df.shape[1]
    data_col_mapper = dict(zip(soi_table_data_df.columns.to_list(), [
                           i for i in range(0, num_cols)]))
    soi_table_data_df = soi_table_data_df.rename(columns=data_col_mapper)

    # Drop totals rows
    soi_table_data_df = soi_table_data_df.dropna(
        subset=[i for i in range(0, num_cols-2)], how='all')
    print('7: ' + str(soi_table_data_df.shape))

    # Forward Fill first 2 columns
    ffill_cols = [i for i in range(0, num_cols-4)]
    soi_table_data_df[ffill_cols] = soi_table_data_df[ffill_cols].fillna(
        method='ffill')
    print('8: ' + str(soi_table_data_df.shape))
    soi_table_data_df.to_csv('csv_file/8_'+append_str+'.csv')

    # Drop rows with only first 2/3 columns present
    soi_table_data_df = soi_table_data_df.dropna(
        subset=[i for i in range(num_cols-4, num_cols)], how='all')
    print('9: ' + str(soi_table_data_df.shape))

    # Fill data cols NaN with 0
    soi_table_data_df = soi_table_data_df.fillna(0)
    soi_table_data_df = soi_table_data_df.replace('—', 0)
    print('10: ' + str(soi_table_data_df.shape))
    soi_table_data_df.to_csv('csv_file/10_'+append_str+'.csv', index=False)

    # Replace hyphen with 0
    soi_table_data_df = soi_table_data_df.replace('-', 0, regex=False)
    soi_table_data_df = soi_table_data_df.replace('%', "", regex=False)

    # Typecast data cols to int
    for col_index in range(num_cols-3, num_cols):
        col_name = soi_table_data_df.columns[col_index]
        soi_table_data_df[col_name] = pd.to_numeric(
            soi_table_data_df[col_name], errors='coerce')

# Multiply numeric columns by 1000
    numeric_cols = [
        col_name for col_name in soi_table_data_df.columns[num_cols-3:num_cols]]
    soi_table_data_df[numeric_cols] = soi_table_data_df[numeric_cols].apply(
        lambda x: x * 1000)

    print('11: ' + str(soi_table_data_df.shape))

    # first_str = soi_table_data_df[3].iloc[0]
    # print(first_str)
    # ord_l = [ord(c) for c in first_str]

    # print(ord_l)
    # Rename columns to table headers
    header_col_mapper = dict(
        zip(soi_table_data_df.columns.to_list(), soi_table_header))
    soi_table_data_df = soi_table_data_df.rename(columns=header_col_mapper)
    print('12: ' + str(soi_table_data_df.shape))

    soi_table_data_df = soi_table_data_df.iloc[:, :soi_table_header.shape[0]]

    return soi_table_data_df

## Preprocess filing links

In [5]:
# Assign headers
headers = {
    'User-Agent': 'Blue Owl Capital Corp II'
}

# Extract to a dataframe
df = pd.read_csv("/kaggle/input/scraping-url/scraping_url.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,accessionNumber,filingDate,reportDate,acceptanceDateTime,act,form,fileNumber,filmNumber,items,size,isXBRL,isInlineXBRL,primaryDocument,primaryDocDescription,fileLink,txtFileLink
0,0,0001655887-24-000017,2024-05-13,2024-03-31,2024-05-10T17:40:30.000Z,34,10-Q,814-01219,24936212,,28055386,1,1,obdc-20240331.htm,10-Q,https://www.sec.gov/Archives/edgar/data/165588...,https://www.sec.gov/Archives/edgar/data/165588...
1,1,0001655887-24-000009,2024-03-07,2023-12-31,2024-03-06T19:19:00.000Z,34,10-K,814-01219,24727699,,35559661,1,1,obdc-20231231.htm,10-K,https://www.sec.gov/Archives/edgar/data/165588...,https://www.sec.gov/Archives/edgar/data/165588...
2,2,0001655887-23-000048,2023-11-09,2023-09-30,2023-11-09T16:58:33.000Z,34,10-Q,814-01219,231393518,,28991189,1,1,obdc-20230930.htm,10-Q,https://www.sec.gov/Archives/edgar/data/165588...,https://www.sec.gov/Archives/edgar/data/165588...
3,3,0001655887-23-000039,2023-08-10,2023-06-30,2023-08-10T17:14:29.000Z,34,10-Q,814-01219,231160654,,27739370,1,1,obdc-20230630.htm,10-Q,https://www.sec.gov/Archives/edgar/data/165588...,https://www.sec.gov/Archives/edgar/data/165588...
4,4,0001655887-23-000025,2023-05-11,2023-03-31,2023-05-11T16:06:55.000Z,34,10-Q,814-01219,23910979,,46704697,1,1,orcc-20230331.htm,10-Q,https://www.sec.gov/Archives/edgar/data/165588...,https://www.sec.gov/Archives/edgar/data/165588...


In [6]:
# Drops all the amendment filing
# Keep only Qs & Ks
df = df.drop(df[df['form'].str.contains(
    'amendment', case=False)].index).reset_index(drop=True)
df['Reporting date'] = pd.to_datetime(df['reportDate'])
df.head()

Unnamed: 0.1,Unnamed: 0,accessionNumber,filingDate,reportDate,acceptanceDateTime,act,form,fileNumber,filmNumber,items,size,isXBRL,isInlineXBRL,primaryDocument,primaryDocDescription,fileLink,txtFileLink,Reporting date
0,0,0001655887-24-000017,2024-05-13,2024-03-31,2024-05-10T17:40:30.000Z,34,10-Q,814-01219,24936212,,28055386,1,1,obdc-20240331.htm,10-Q,https://www.sec.gov/Archives/edgar/data/165588...,https://www.sec.gov/Archives/edgar/data/165588...,2024-03-31
1,1,0001655887-24-000009,2024-03-07,2023-12-31,2024-03-06T19:19:00.000Z,34,10-K,814-01219,24727699,,35559661,1,1,obdc-20231231.htm,10-K,https://www.sec.gov/Archives/edgar/data/165588...,https://www.sec.gov/Archives/edgar/data/165588...,2023-12-31
2,2,0001655887-23-000048,2023-11-09,2023-09-30,2023-11-09T16:58:33.000Z,34,10-Q,814-01219,231393518,,28991189,1,1,obdc-20230930.htm,10-Q,https://www.sec.gov/Archives/edgar/data/165588...,https://www.sec.gov/Archives/edgar/data/165588...,2023-09-30
3,3,0001655887-23-000039,2023-08-10,2023-06-30,2023-08-10T17:14:29.000Z,34,10-Q,814-01219,231160654,,27739370,1,1,obdc-20230630.htm,10-Q,https://www.sec.gov/Archives/edgar/data/165588...,https://www.sec.gov/Archives/edgar/data/165588...,2023-06-30
4,4,0001655887-23-000025,2023-05-11,2023-03-31,2023-05-11T16:06:55.000Z,34,10-Q,814-01219,23910979,,46704697,1,1,orcc-20230331.htm,10-Q,https://www.sec.gov/Archives/edgar/data/165588...,https://www.sec.gov/Archives/edgar/data/165588...,2023-03-31


In [7]:
# Convert to datetime format
date_columns = ['filingDate', 'reportDate']
for col in date_columns:
    df[col] = pd.to_datetime(df[col], format='%Y-%m-%d')
    
for col in date_columns:
    df[col] = df[col].dt.strftime("%B %d, %Y")
    
df.head()

Unnamed: 0.1,Unnamed: 0,accessionNumber,filingDate,reportDate,acceptanceDateTime,act,form,fileNumber,filmNumber,items,size,isXBRL,isInlineXBRL,primaryDocument,primaryDocDescription,fileLink,txtFileLink,Reporting date
0,0,0001655887-24-000017,"May 13, 2024","March 31, 2024",2024-05-10T17:40:30.000Z,34,10-Q,814-01219,24936212,,28055386,1,1,obdc-20240331.htm,10-Q,https://www.sec.gov/Archives/edgar/data/165588...,https://www.sec.gov/Archives/edgar/data/165588...,2024-03-31
1,1,0001655887-24-000009,"March 07, 2024","December 31, 2023",2024-03-06T19:19:00.000Z,34,10-K,814-01219,24727699,,35559661,1,1,obdc-20231231.htm,10-K,https://www.sec.gov/Archives/edgar/data/165588...,https://www.sec.gov/Archives/edgar/data/165588...,2023-12-31
2,2,0001655887-23-000048,"November 09, 2023","September 30, 2023",2023-11-09T16:58:33.000Z,34,10-Q,814-01219,231393518,,28991189,1,1,obdc-20230930.htm,10-Q,https://www.sec.gov/Archives/edgar/data/165588...,https://www.sec.gov/Archives/edgar/data/165588...,2023-09-30
3,3,0001655887-23-000039,"August 10, 2023","June 30, 2023",2023-08-10T17:14:29.000Z,34,10-Q,814-01219,231160654,,27739370,1,1,obdc-20230630.htm,10-Q,https://www.sec.gov/Archives/edgar/data/165588...,https://www.sec.gov/Archives/edgar/data/165588...,2023-06-30
4,4,0001655887-23-000025,"May 11, 2023","March 31, 2023",2023-05-11T16:06:55.000Z,34,10-Q,814-01219,23910979,,46704697,1,1,orcc-20230331.htm,10-Q,https://www.sec.gov/Archives/edgar/data/165588...,https://www.sec.gov/Archives/edgar/data/165588...,2023-03-31


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29 entries, 0 to 28
Data columns (total 18 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   Unnamed: 0             29 non-null     int64         
 1   accessionNumber        29 non-null     object        
 2   filingDate             29 non-null     object        
 3   reportDate             29 non-null     object        
 4   acceptanceDateTime     29 non-null     object        
 5   act                    29 non-null     int64         
 6   form                   29 non-null     object        
 7   fileNumber             29 non-null     object        
 8   filmNumber             29 non-null     int64         
 9   items                  0 non-null      float64       
 10  size                   29 non-null     int64         
 11  isXBRL                 29 non-null     int64         
 12  isInlineXBRL           29 non-null     int64         
 13  primary

In [9]:
# Drop extra columns
columns_to_drop = ['Unnamed: 0', 'acceptanceDateTime', 'act', 'fileNumber', 'filmNumber', 
                   'items', 'size', 'isXBRL', 'isInlineXBRL', 'Reporting date', 
                   'primaryDocDescription', 'primaryDocument']
df.drop(columns_to_drop, axis=1, inplace=True)
df.head()

Unnamed: 0,accessionNumber,filingDate,reportDate,form,fileLink,txtFileLink
0,0001655887-24-000017,"May 13, 2024","March 31, 2024",10-Q,https://www.sec.gov/Archives/edgar/data/165588...,https://www.sec.gov/Archives/edgar/data/165588...
1,0001655887-24-000009,"March 07, 2024","December 31, 2023",10-K,https://www.sec.gov/Archives/edgar/data/165588...,https://www.sec.gov/Archives/edgar/data/165588...
2,0001655887-23-000048,"November 09, 2023","September 30, 2023",10-Q,https://www.sec.gov/Archives/edgar/data/165588...,https://www.sec.gov/Archives/edgar/data/165588...
3,0001655887-23-000039,"August 10, 2023","June 30, 2023",10-Q,https://www.sec.gov/Archives/edgar/data/165588...,https://www.sec.gov/Archives/edgar/data/165588...
4,0001655887-23-000025,"May 11, 2023","March 31, 2023",10-Q,https://www.sec.gov/Archives/edgar/data/165588...,https://www.sec.gov/Archives/edgar/data/165588...


In [10]:
# Last check
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29 entries, 0 to 28
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   accessionNumber  29 non-null     object
 1   filingDate       29 non-null     object
 2   reportDate       29 non-null     object
 3   form             29 non-null     object
 4   fileLink         29 non-null     object
 5   txtFileLink      29 non-null     object
dtypes: object(6)
memory usage: 1.5+ KB


In [11]:
df.to_csv('final_scraping_url.csv')

## Tables extraction

In [12]:
qtr_dates = []
for index, url in enumerate(df['fileLink']):
    response = requests.get(url, headers=headers)
    content = parse_and_trim(response.content, 'HTML')
    qtr_date = find_qrt_date(content)
    print(f'{index} - {url}')
    print(f'{qtr_date}')
#     qtr_dates.append(qtr_date.replace(',', '').strip())

# Add quarter date for each filing
# df['qtr_date'] = qtr_dates

For the quarterly period ended 
0 - https://www.sec.gov/Archives/edgar/data/1655887/000165588724000017/obdc-20240331.htm
December 31, 2023
For the fiscal year ended 
1 - https://www.sec.gov/Archives/edgar/data/1655887/000165588724000009/obdc-20231231.htm
None
For the quarterly period ended 
2 - https://www.sec.gov/Archives/edgar/data/1655887/000165588723000048/obdc-20230930.htm
March 31, 2022
For the quarterly period ended 
3 - https://www.sec.gov/Archives/edgar/data/1655887/000165588723000039/obdc-20230630.htm
March 31, 2022
For the quarterly period ended 
4 - https://www.sec.gov/Archives/edgar/data/1655887/000165588723000025/orcc-20230331.htm
March 31, 2022
For the fiscal year ended 
5 - https://www.sec.gov/Archives/edgar/data/1655887/000165588723000009/orcc-20221231.htm
May 3, 2022
For the quarterly period ended September 30, 2022
6 - https://www.sec.gov/Archives/edgar/data/1655887/000165588722000008/a01orccii-2022930x10q.htm
September 30, 2022
For the quarterly period ended June 30

Due to headers error, only work with Qs and Ks until September 30, 2022.

In [13]:
df_partial = df[6:]
df_partial.head()

Unnamed: 0,accessionNumber,filingDate,reportDate,form,fileLink,txtFileLink
6,0001655887-22-000008,"November 09, 2022","September 30, 2022",10-Q,https://www.sec.gov/Archives/edgar/data/165588...,https://www.sec.gov/Archives/edgar/data/165588...
7,0001655887-22-000006,"August 10, 2022","June 30, 2022",10-Q,https://www.sec.gov/Archives/edgar/data/165588...,https://www.sec.gov/Archives/edgar/data/165588...
8,0000950170-22-009355,"May 11, 2022","March 31, 2022",10-Q,https://www.sec.gov/Archives/edgar/data/165588...,https://www.sec.gov/Archives/edgar/data/165588...
9,0000950170-22-002917,"March 04, 2022","December 31, 2021",10-K,https://www.sec.gov/Archives/edgar/data/165588...,https://www.sec.gov/Archives/edgar/data/165588...
10,0000950170-21-003834,"November 10, 2021","September 30, 2021",10-Q,https://www.sec.gov/Archives/edgar/data/165588...,https://www.sec.gov/Archives/edgar/data/165588...


In [20]:
qtr_dates = []
for index, url in enumerate(df_partial['fileLink']):
    response = requests.get(url, headers=headers)
    content = parse_and_trim(response.content, 'HTML')
    qtr_date = find_qrt_date(content)
    print(f'{index} - {url}')
    print(f'{qtr_date}')
    qtr_dates.append(qtr_date.replace(',', '').strip())

# Add quarter date for each filing
df_partial['qtr_date'] = qtr_dates

For the quarterly period ended September 30, 2022
0 - https://www.sec.gov/Archives/edgar/data/1655887/000165588722000008/a01orccii-2022930x10q.htm
September 30, 2022
For the quarterly period ended June 30, 2022
1 - https://www.sec.gov/Archives/edgar/data/1655887/000165588722000006/a01orccii-2022630x10q.htm
June 30, 2022
For the quarterly period ended March 31, 2022
2 - https://www.sec.gov/Archives/edgar/data/1655887/000095017022009355/orccii_10q_0331-2022.htm
March 31, 2022
For the fiscal year ended December 31, 2021
3 - https://www.sec.gov/Archives/edgar/data/1655887/000095017022002917/orccii_10k_1231_2021.htm
December 31, 2021
For the quarterly period ended September 30, 2021
4 - https://www.sec.gov/Archives/edgar/data/1655887/000095017021003834/orccii_10q_0930-2021.htm
September 30, 2021
For the quarterly period ended June 30, 2021
5 - https://www.sec.gov/Archives/edgar/data/1655887/000156459021043121/orccii-10q_20210630.htm
June 30, 2021
For the quarterly period ended March 31, 202

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_partial['qtr_date'] = qtr_dates


In [21]:
df_partial.shape

(23, 7)

In [22]:
df_partial.head()

Unnamed: 0,accessionNumber,filingDate,reportDate,form,fileLink,txtFileLink,qtr_date
6,0001655887-22-000008,"November 09, 2022","September 30, 2022",10-Q,https://www.sec.gov/Archives/edgar/data/165588...,https://www.sec.gov/Archives/edgar/data/165588...,September 30 2022
7,0001655887-22-000006,"August 10, 2022","June 30, 2022",10-Q,https://www.sec.gov/Archives/edgar/data/165588...,https://www.sec.gov/Archives/edgar/data/165588...,June 30 2022
8,0000950170-22-009355,"May 11, 2022","March 31, 2022",10-Q,https://www.sec.gov/Archives/edgar/data/165588...,https://www.sec.gov/Archives/edgar/data/165588...,March 31 2022
9,0000950170-22-002917,"March 04, 2022","December 31, 2021",10-K,https://www.sec.gov/Archives/edgar/data/165588...,https://www.sec.gov/Archives/edgar/data/165588...,December 31 2021
10,0000950170-21-003834,"November 10, 2021","September 30, 2021",10-Q,https://www.sec.gov/Archives/edgar/data/165588...,https://www.sec.gov/Archives/edgar/data/165588...,September 30 2021


In [23]:
df_partial['qtr_date'] = pd.to_datetime(df_partial['qtr_date'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_partial['qtr_date'] = pd.to_datetime(df_partial['qtr_date'])


In [24]:
df_partial.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23 entries, 6 to 28
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   accessionNumber  23 non-null     object        
 1   filingDate       23 non-null     object        
 2   reportDate       23 non-null     object        
 3   form             23 non-null     object        
 4   fileLink         23 non-null     object        
 5   txtFileLink      23 non-null     object        
 6   qtr_date         23 non-null     datetime64[ns]
dtypes: datetime64[ns](1), object(6)
memory usage: 1.4+ KB


Start extracting tables and storing to Excel sheets.

In [35]:
# Create directory to store output files
!mkdir /kaggle/working/clean_csv_file

In [36]:
path = '/kaggle/working/clean_csv_file/Clean_Investment.xlsx'
writer = pd.ExcelWriter(path, engine='openpyxl')

In [None]:
count = 0
for qtr_date, html_link in zip(df_partial['qtr_date'], df_partial['fileLink']):
    print('start')
    response = requests.get(html_link, headers=headers)
    content = parse_and_trim(response.content, 'HTML')
    print('content DONE')
    
    date = pd.to_datetime(qtr_date).date()
    formatted_date = date.strftime("%B %d, %Y")
    print(formatted_date)
    master_table = extract_tables(content, formatted_date)
    print(count, "master_table DONE")
    
    processed_table_ = process_table(master_table, formatted_date.replace(' ', '').replace(',', ''))
    print(processed_table_)
    processed_table_.to_excel(writer, sheet_name=formatted_date.replace(' ', '').replace(',', ''), index=False)
    print(count, "processed_table_ DONE")
    
    count += 1
    print("-------------------------------------------------------------------------------------------------------------------")

In [None]:
# Save Excel files
writer.save()
writer.close()

# Test
print(f'Excel file "{path}" has been created.')