# Imports

In [1]:
import os
import re
import pandas as pd
import MySQLdb

# Find valid folders

In [2]:
def find_quarter_folders(base_dir=None):
    """Return the quarterly Form 13F data folders found directly inside `base_dir`.

    An entry qualifies when its whole name has the form ``<year>q<1-4>_form13f``
    (e.g. ``2018q1_form13f``) and it is a directory.

    Parameters
    ----------
    base_dir : str, optional
        Directory to scan. Defaults to the current working directory.

    Returns
    -------
    list of str
        Full paths of the matching folders, sorted by folder name so that
        quarters come out in chronological order regardless of the order in
        which the operating system lists them.
    """
    if base_dir is None:
        base_dir = os.getcwd()

    matches = []
    for entry in sorted(os.listdir(base_dir)):
        # fullmatch (not match): names with trailing text such as
        # '2018q1_form13f_backup' must not be picked up.
        if re.fullmatch(r'\d{4}q[1-4]_form13f', entry):
            entry_path = os.path.join(base_dir, entry)
            if os.path.isdir(entry_path):
                matches.append(entry_path)
    return matches


folders_to_process = find_quarter_folders()

In [3]:
# Display the discovered folder paths as a sanity check
folders_to_process

['C:\\Users\\Jerry\\Projects\\tester_notebook\\2018q1_form13f',
 'C:\\Users\\Jerry\\Projects\\tester_notebook\\2018q2_form13f',
 'C:\\Users\\Jerry\\Projects\\tester_notebook\\2018q3_form13f',
 'C:\\Users\\Jerry\\Projects\\tester_notebook\\2018q4_form13f',
 'C:\\Users\\Jerry\\Projects\\tester_notebook\\2019q1_form13f',
 'C:\\Users\\Jerry\\Projects\\tester_notebook\\2019q2_form13f',
 'C:\\Users\\Jerry\\Projects\\tester_notebook\\2019q3_form13f',
 'C:\\Users\\Jerry\\Projects\\tester_notebook\\2019q4_form13f',
 'C:\\Users\\Jerry\\Projects\\tester_notebook\\2020q1_form13f',
 'C:\\Users\\Jerry\\Projects\\tester_notebook\\2020q2_form13f',
 'C:\\Users\\Jerry\\Projects\\tester_notebook\\2020q3_form13f',
 'C:\\Users\\Jerry\\Projects\\tester_notebook\\2020q4_form13f',
 'C:\\Users\\Jerry\\Projects\\tester_notebook\\2021q1_form13f',
 'C:\\Users\\Jerry\\Projects\\tester_notebook\\2021q2_form13f',
 'C:\\Users\\Jerry\\Projects\\tester_notebook\\2021q3_form13f',
 'C:\\Users\\Jerry\\Projects\\tester_not

# Assign valid dates

In [4]:
# Quarter-end report dates for 2018 through 2022, formatted DD-MON-YYYY
quarter_ends = ('31-MAR', '30-JUN', '30-SEP', '31-DEC')
valid_dates = {
    f'{day_month}-{year}'
    for year in range(2018, 2023)
    for day_month in quarter_ends
}

# Establish a connection

In [5]:
# Connecting to the MySQL server.
# Connection settings are read from environment variables so real credentials never
# have to be written into the notebook. Each setting falls back to the original
# local-development value when its variable is unset, so existing runs are unaffected.
connection = MySQLdb.connect(
    host=os.environ.get("EDGAR_DB_HOST", "localhost"),
    user=os.environ.get("EDGAR_DB_USER", "root"),
    password=os.environ.get("EDGAR_DB_PASSWORD", "password"),
    database=os.environ.get("EDGAR_DB_NAME", "edgar_db_test"),
)
cursor = connection.cursor()

# Create tables

In [6]:
# DDL for security_info: one row per security, keyed by its CUSIP
security_info_ddl = """
CREATE TABLE IF NOT EXISTS security_info (
    cusip CHAR(9) PRIMARY KEY,
    ticker VARCHAR(255),
    name VARCHAR(255),
    sector VARCHAR(255),
    asset_class VARCHAR(255),
    location VARCHAR(255),
    exchange VARCHAR(255)
)
"""
# No-op when the table already exists
cursor.execute(security_info_ddl)

0

In [7]:
# DDL for fund_info: one row per filing manager, keyed by its CIK
fund_info_ddl = """
CREATE TABLE IF NOT EXISTS fund_info (
    cik INTEGER PRIMARY KEY,
    manager_name VARCHAR(255),
    city VARCHAR(255)
)
"""
# No-op when the table already exists
cursor.execute(fund_info_ddl)

0

In [8]:
# Create table position_info
# `value` and `shares` are BIGINT rather than INTEGER: a signed MySQL INTEGER tops out at
# 2,147,483,647, and aggregated position values in this data already reach about
# 2.13 billion. An out-of-range value sent with INSERT IGNORE would be clamped to the
# column maximum instead of raising an error.
# NOTE: CREATE TABLE IF NOT EXISTS does not change a table that already exists. A
# database created with the old INTEGER columns needs a one-off
# `ALTER TABLE position_info MODIFY value BIGINT, MODIFY shares BIGINT`.
cursor.execute("""
CREATE TABLE IF NOT EXISTS position_info (
    infotable_sk INTEGER PRIMARY KEY,
    accession_number VARCHAR(255),
    cusip CHAR(9),
    value BIGINT,
    shares BIGINT,
    cik INTEGER,
    filing_period VARCHAR(255),
    FOREIGN KEY (cusip) REFERENCES security_info(cusip),
    FOREIGN KEY (cik) REFERENCES fund_info(cik)
)
""")

0

In [9]:
# Commit the current transaction on the open connection
connection.commit()

# Find valid CUSIPs (Russell 1000)

In [10]:
# Read the cusip column of every row in security_info
cursor.execute("SELECT cusip FROM security_info")
rows = cursor.fetchall()

# Each fetched row holds the single selected column; gather the values into a set
cusips = set()
for row in rows:
    cusips.add(row[0])

# Create database entries

In [11]:
infotable_columns = ['ACCESSION_NUMBER', 'INFOTABLE_SK', 'CUSIP', 'VALUE', 'SSHPRNAMT', 'SSHPRNAMTTYPE', 'PUTCALL']
submissions_columns = ['ACCESSION_NUMBER', 'SUBMISSIONTYPE', 'CIK', 'PERIODOFREPORT']
coverpage_columns = ['ACCESSION_NUMBER', 'FILINGMANAGER_NAME', 'FILINGMANAGER_CITY']
grouping_columns = ['ACCESSION_NUMBER', 'CUSIP', 'SSHPRNAMTTYPE', 'SUBMISSIONTYPE',
                    'CIK', 'PERIODOFREPORT', 'FILINGMANAGER_NAME', 'FILINGMANAGER_CITY']

# Identifier/text columns are read as strings instead of letting pandas guess a type per
# chunk. This keeps leading zeros in CUSIPs (they are compared against the string CUSIPs
# in `cusips`) and removes the per-chunk type guessing on these columns.
infotable_text_dtypes = {'ACCESSION_NUMBER': str, 'CUSIP': str, 'SSHPRNAMTTYPE': str, 'PUTCALL': str}
submissions_text_dtypes = {'ACCESSION_NUMBER': str, 'SUBMISSIONTYPE': str, 'PERIODOFREPORT': str}
coverpage_text_dtypes = {'ACCESSION_NUMBER': str, 'FILINGMANAGER_NAME': str, 'FILINGMANAGER_CITY': str}

# Rows sent to MySQL per executemany() call; also the progress-report interval.
insert_batch_size = 5000

fund_insert_sql = """
    INSERT IGNORE INTO fund_info (cik, manager_name, city)
    VALUES (%s, %s, %s)
"""
position_insert_sql = """
    INSERT IGNORE INTO position_info (infotable_sk, accession_number, cusip, value, shares, CIK, filing_period)
    VALUES (%s, %s, %s, %s, %s, %s, %s)
"""


def load_quarter_positions(quarter_dir):
    """Read one quarterly folder and return its aggregated share positions.

    Reads INFOTABLE.tsv, SUBMISSION.tsv and COVERPAGE.tsv from `quarter_dir`, keeps
    13F-HR filings whose PERIODOFREPORT is in `valid_dates`, keeps info-table rows whose
    CUSIP is in `cusips`, whose PUTCALL is missing and whose SSHPRNAMTTYPE is 'SH', and
    sums VALUE and SSHPRNAMT per combination of `grouping_columns` (taking the first
    INFOTABLE_SK of each group).

    Parameters
    ----------
    quarter_dir : str
        Path of the folder holding the three TSV files.

    Returns
    -------
    pandas.DataFrame
        One row per group, with the grouping columns plus VALUE, SSHPRNAMT, INFOTABLE_SK.
    """
    infotable = pd.read_csv(os.path.join(quarter_dir, 'INFOTABLE.tsv'), sep='\t',
                            usecols=infotable_columns, dtype=infotable_text_dtypes)
    submissions = pd.read_csv(os.path.join(quarter_dir, 'SUBMISSION.tsv'), sep='\t',
                              usecols=submissions_columns, dtype=submissions_text_dtypes)
    coverpage = pd.read_csv(os.path.join(quarter_dir, 'COVERPAGE.tsv'), sep='\t',
                            usecols=coverpage_columns, dtype=coverpage_text_dtypes)

    valid_filings = submissions[(submissions['SUBMISSIONTYPE'] == '13F-HR')
                                & submissions['PERIODOFREPORT'].isin(valid_dates)]
    filtered_infotable = infotable[infotable['CUSIP'].isin(cusips)
                                   & infotable['PUTCALL'].isna()
                                   & (infotable['SSHPRNAMTTYPE'] == 'SH')]
    valid_infotable = pd.merge(filtered_infotable, valid_filings, on='ACCESSION_NUMBER', how='inner')
    merged = pd.merge(valid_infotable, coverpage, on='ACCESSION_NUMBER', how='left')

    # dropna=False: the left merge above can leave the manager name/city missing, and
    # groupby would otherwise silently discard every position with a missing key.
    return merged.groupby(grouping_columns, as_index=False, dropna=False).agg({'VALUE': 'sum',
                                                                               'SSHPRNAMT': 'sum',
                                                                               'INFOTABLE_SK': 'first'})


def to_sql_params(frame, columns):
    """Return the rows of `frame[columns]` as a list of tuples of plain Python values.

    Missing values (NaN/NA) become None so the database driver sends them as NULL.
    """
    return [tuple(None if pd.isna(value) else value for value in record)
            for record in frame[columns].itertuples(index=False, name=None)]


def insert_in_batches(sql, params, table_name, show_progress=False):
    """Execute `sql` for every parameter tuple in `params`, `insert_batch_size` rows at a time.

    Uses the notebook's open `cursor`. If a batch fails, the error is printed and that
    batch is retried row by row, so one bad row does not discard the whole batch.
    Errors are reported and skipped, never raised. With `show_progress`, the starting
    row offset of each batch is printed.
    """
    for start in range(0, len(params), insert_batch_size):
        if show_progress:
            print(start)
        batch = params[start:start + insert_batch_size]
        try:
            cursor.executemany(sql, batch)
        except MySQLdb.Error as e:
            print(f"Error inserting into {table_name}: {e}")
            for single_row in batch:
                try:
                    cursor.execute(sql, single_row)
                except MySQLdb.Error as e:
                    print(f"Error inserting into {table_name}: {e}")


for quarter in folders_to_process:
    all_data = load_quarter_positions(quarter)

    print(f"{quarter} has length {len(all_data)}.")

    # Insert to fund_info table: one row per CIK, inserted before the positions that
    # reference it. Keeping the first row per CIK matches what INSERT IGNORE retained
    # when every position row was inserted individually.
    funds = all_data.drop_duplicates(subset='CIK')
    insert_in_batches(fund_insert_sql,
                      to_sql_params(funds, ['CIK', 'FILINGMANAGER_NAME', 'FILINGMANAGER_CITY']),
                      'fund_info')

    # Insert to position_info table
    insert_in_batches(position_insert_sql,
                      to_sql_params(all_data, ['INFOTABLE_SK', 'ACCESSION_NUMBER', 'CUSIP', 'VALUE',
                                               'SSHPRNAMT', 'CIK', 'PERIODOFREPORT']),
                      'position_info',
                      show_progress=True)

    # Committing changes once per quarter
    connection.commit()

    print(f"{quarter} completed.")

  infotable = pd.read_csv(os.path.join(quarter, 'INFOTABLE.tsv'), sep='\t', usecols=infotable_columns)


C:\Users\Jerry\Projects\tester_notebook\2018q1_form13f has length 0.
C:\Users\Jerry\Projects\tester_notebook\2018q1_form13f completed.
C:\Users\Jerry\Projects\tester_notebook\2018q2_form13f has length 471780.
0
5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
55000
60000
65000
70000
75000
80000
85000
90000
95000
100000
105000
110000
115000
120000
125000
130000
135000
140000
145000
150000
155000
160000
165000
170000
175000
180000
185000
190000
195000
200000
205000
210000
215000
220000
225000
230000
235000
240000
245000
250000
255000
260000
265000
270000
275000
280000
285000
290000
295000
300000
305000
310000
315000
320000
325000
330000
335000
340000
345000
350000
355000
360000
365000
370000
375000
380000
385000
390000
395000
400000
405000
410000
415000
420000
425000
430000
435000
440000
445000
450000
455000
460000
465000
470000
C:\Users\Jerry\Projects\tester_notebook\2018q2_form13f completed.
C:\Users\Jerry\Projects\tester_notebook\2018q3_form13f has length 482308.
0
5000
1000

  infotable = pd.read_csv(os.path.join(quarter, 'INFOTABLE.tsv'), sep='\t', usecols=infotable_columns)


C:\Users\Jerry\Projects\tester_notebook\2019q1_form13f has length 498672.
0
5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
55000
60000
65000
70000
75000
80000
85000
90000
95000
100000
105000
110000
115000
120000
125000
130000
135000
140000
145000
150000
155000
160000
165000
170000
175000
180000
185000
190000
195000
200000
205000
210000
215000
220000
225000
230000
235000
240000
245000
250000
255000
260000
265000
270000
275000
280000
285000
290000
295000
300000
305000
310000
315000
320000
325000
330000
335000
340000
345000
350000
355000
360000
365000
370000
375000
380000
385000
390000
395000
400000
405000
410000
415000
420000
425000
430000
435000
440000
445000
450000
455000
460000
465000
470000
475000
480000
485000
490000
495000
C:\Users\Jerry\Projects\tester_notebook\2019q1_form13f completed.
C:\Users\Jerry\Projects\tester_notebook\2019q2_form13f has length 518633.
0
5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
55000
60000
65000
70000
75000
80000
85000
90000
95

  infotable = pd.read_csv(os.path.join(quarter, 'INFOTABLE.tsv'), sep='\t', usecols=infotable_columns)


C:\Users\Jerry\Projects\tester_notebook\2020q3_form13f has length 560035.
0
5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
55000
60000
65000
70000
75000
80000
85000
90000
95000
100000
105000
110000
115000
120000
125000
130000
135000
140000
145000
150000
155000
160000
165000
170000
175000
180000
185000
190000
195000
200000
205000
210000
215000
220000
225000
230000
235000
240000
245000
250000
255000
260000
265000
270000
275000
280000
285000
290000
295000
300000
305000
310000
315000
320000
325000
330000
335000
340000
345000
350000
355000
360000
365000
370000
375000
380000
385000
390000
395000
400000
405000
410000
415000
420000
425000
430000
435000
440000
445000
450000
455000
460000
465000
470000
475000
480000
485000
490000
495000
500000
505000
510000
515000
520000
525000
530000
535000
540000
545000
550000
555000
560000
C:\Users\Jerry\Projects\tester_notebook\2020q3_form13f completed.
C:\Users\Jerry\Projects\tester_notebook\2020q4_form13f has length 570401.
0
5000
10000
15000
2

C:\Users\Jerry\Projects\tester_notebook\2022q3_form13f has length 713608.
0
5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
55000
60000
65000
70000
75000
80000
85000
90000
95000
100000
105000
110000
115000
120000
125000
130000
135000
140000
145000
150000
155000
160000
165000
170000
175000
180000
185000
190000
195000
200000
205000
210000
215000
220000
225000
230000
235000
240000
245000
250000
255000
260000
265000
270000
275000
280000
285000
290000
295000
300000
305000
310000
315000
320000
325000
330000
335000
340000
345000
350000
355000
360000
365000
370000
375000
380000
385000
390000
395000
400000
405000
410000
415000
420000
425000
430000
435000
440000
445000
450000
455000
460000
465000
470000
475000
480000
485000
490000
495000
500000
505000
510000
515000
520000
525000
530000
535000
540000
545000
550000
555000
560000
565000
570000
575000
580000
585000
590000
595000
600000
605000
610000
615000
620000
625000
630000
635000
640000
645000
650000
655000
660000
665000
670000
675000

  infotable = pd.read_csv(os.path.join(quarter, 'INFOTABLE.tsv'), sep='\t', usecols=infotable_columns)


C:\Users\Jerry\Projects\tester_notebook\2022q4_form13f has length 709778.
0
5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
55000
60000
65000
70000
75000
80000
85000
90000
95000
100000
105000
110000
115000
120000
125000
130000
135000
140000
145000
150000
155000
160000
165000
170000
175000
180000
185000
190000
195000
200000
205000
210000
215000
220000
225000
230000
235000
240000
245000
250000
255000
260000
265000
270000
275000
280000
285000
290000
295000
300000
305000
310000
315000
320000
325000
330000
335000
340000
345000
350000
355000
360000
365000
370000
375000
380000
385000
390000
395000
400000
405000
410000
415000
420000
425000
430000
435000
440000
445000
450000
455000
460000
465000
470000
475000
480000
485000
490000
495000
500000
505000
510000
515000
520000
525000
530000
535000
540000
545000
550000
555000
560000
565000
570000
575000
580000
585000
590000
595000
600000
605000
610000
615000
620000
625000
630000
635000
640000
645000
650000
655000
660000
665000
670000
675000

In [12]:
# Inspect all_data as left by the loading cell (the frame from the last quarter processed)
all_data

Unnamed: 0,ACCESSION_NUMBER,CUSIP,SSHPRNAMTTYPE,SUBMISSIONTYPE,CIK,PERIODOFREPORT,FILINGMANAGER_NAME,FILINGMANAGER_CITY,VALUE,SSHPRNAMT,INFOTABLE_SK
0,0000093751-23-000567,00090Q103,SH,13F-HR,93751,31-DEC-2022,STATE STREET CORP,BOSTON,44484847,4885514,79949291
1,0000093751-23-000567,001055102,SH,13F-HR,93751,31-DEC-2022,STATE STREET CORP,BOSTON,2131895283,29634352,79949326
2,0000093751-23-000567,001084102,SH,13F-HR,93751,31-DEC-2022,STATE STREET CORP,BOSTON,240541856,1734385,79949329
3,0000093751-23-000567,00123Q104,SH,13F-HR,93751,31-DEC-2022,STATE STREET CORP,BOSTON,164719567,15745160,79949339
4,0000093751-23-000567,00130H105,SH,13F-HR,93751,31-DEC-2022,STATE STREET CORP,BOSTON,976488338,33953002,79949315
...,...,...,...,...,...,...,...,...,...,...,...
10351,0001978011-23-000010,88160R101,SH,13F-HR,1978011,31-DEC-2022,"Abacus Wealth Partners, LLC",SANTA MONICA,253997,2062,80141776
10352,0001978011-23-000010,902252105,SH,13F-HR,1978011,31-DEC-2022,"Abacus Wealth Partners, LLC",SANTA MONICA,536168,1663,80141778
10353,0001978011-23-000010,907818108,SH,13F-HR,1978011,31-DEC-2022,"Abacus Wealth Partners, LLC",SANTA MONICA,662210,3198,80141779
10354,0001978011-23-000010,92826C839,SH,13F-HR,1978011,31-DEC-2022,"Abacus Wealth Partners, LLC",SANTA MONICA,325637,1567,80141786


# Close the connection

In [13]:
# Close the cursor first, then the connection it was created from
cursor.close()
connection.close()