In [1]:
import os
import pandas as pd
import numpy as np
import psycopg2
import psycopg2.extras as extras
import platform

# Let wide/long frames render in full when they are inspected interactively.
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)

# Connection settings default to the local development database. Each value
# can be overridden with the standard libpq environment variables, so the
# notebook can run on another machine without editing this cell.
conn = psycopg2.connect(
    database=os.environ.get("PGDATABASE", "graphyfinancials"),
    host=os.environ.get("PGHOST", "localhost"),
    user=os.environ.get("PGUSER", "derek"),
    password=os.environ.get("PGPASSWORD", ""),
    port=os.environ.get("PGPORT", "5432"),
)

# Module-level cursor for ad-hoc queries; insert_into_db opens its own cursor.
cursor = conn.cursor()

In [2]:
def insert_into_db(conn, df: pd.DataFrame, table: str) -> None:
    """Bulk-insert every row of ``df`` into ``table`` and commit.

    Args:
        conn: Open database connection. A cursor is created from it and
            closed again before this function returns.
        df: Frame whose column names are used verbatim as the target column
            list and whose rows become the inserted values.
        table: Name of the destination table.

    Raises:
        Exception: Any error raised while inserting or committing is
            re-raised after the transaction has been rolled back.
    """
    rows = [tuple(row) for row in df.to_numpy()]
    cols = ','.join(df.columns)
    # The table and column names are interpolated as plain text, so they must
    # come from trusted notebook code, never from external input.
    query = "INSERT INTO %s(%s) VALUES %%s " % (table, cols)
    cursor = conn.cursor()
    try:
        print('Starting insertion')
        extras.execute_values(cursor, query, rows)
        conn.commit()
        print(f'Columns inserted:{cols}')
        print("the dataframe is inserted")
    except Exception as error:
        print("error", error)
        conn.rollback()
        raise
    finally:
        # Closed exactly once, on both the success and the failure path.
        cursor.close()

In [3]:

# Root folder whose sub-folders are scanned for .tsv files. Defaults to the
# original local checkout; set GRAPHY_DATA_DIR to point somewhere else.
mac_data_directory = os.environ.get(
    'GRAPHY_DATA_DIR',
    '/Users/derek/Documents/Github_Repos/graphy_financials/data',
)
# win_data_directory = '\Users\derek\Documents\Github_Repos\graphy_financials\data'

tsv_disclosure = []
tsv_coissuer_info = []
tsv_issuer_info = []
tsv_issuer_jurisdiction = []
tsv_issuer_signature = []
tsv_signature = []
tsv_submission = []

# File-name suffix -> list that collects the matching paths. Every suffix is
# tested independently, so a file name matching several suffixes lands in
# several lists.
_TSV_TARGETS = (
    ('DISCLOSURE.tsv', tsv_disclosure),
    ('C_COISSUER_INFORMATION.tsv', tsv_coissuer_info),
    ('C_ISSUER_INFORMATION.tsv', tsv_issuer_info),
    ('ISSUER_JURISDICTIONS.tsv', tsv_issuer_jurisdiction),
    ('ISSUER_SIGNATURE.tsv', tsv_issuer_signature),
    ('C_SIGNATURE.tsv', tsv_signature),
    ('SUBMISSION.tsv', tsv_submission),
)


def _collect_tsv_paths(data_directory, targets):
    """Append every matching file path under ``data_directory`` to its list.

    Only sub-directories of ``data_directory`` are searched (recursively);
    files sitting directly in ``data_directory`` are ignored. Folders and
    files are visited in sorted order so the collected lists, and anything
    order-sensitive that is built from them, are the same on every run.
    """
    for folder_name in sorted(os.listdir(data_directory)):
        folder_path = os.path.join(data_directory, folder_name)
        if not os.path.isdir(folder_path):
            continue
        for root, dirs, files in os.walk(folder_path):
            # Sorting in place makes os.walk descend in a stable order.
            dirs.sort()
            for file_name in sorted(files):
                for suffix, bucket in targets:
                    if file_name.endswith(suffix):
                        bucket.append(os.path.join(root, file_name))


_collect_tsv_paths(mac_data_directory, _TSV_TARGETS)

In [4]:
def tsv_to_dataframe(tsv_list: object, multi_df=False):
    """Load tab-separated file(s) into a single DataFrame.

    Args:
        tsv_list: When ``multi_df`` is true, an iterable of file paths whose
            contents are stacked row-wise; otherwise a single file path.
        multi_df: Whether ``tsv_list`` holds several files to concatenate.

    Returns:
        The parsed frame. For several files the rows are concatenated in the
        given order and the index is renumbered from 0.

    Raises:
        ValueError: If ``multi_df`` is true and ``tsv_list`` is empty.
    """
    if multi_df:
        frames = [pd.read_csv(tsv_file, sep='\t') for tsv_file in tsv_list]
        if not frames:
            raise ValueError('tsv_list is empty: no TSV files to load')
        return pd.concat(frames, ignore_index=True)
    # Same tab separator as the multi-file branch.
    return pd.read_csv(tsv_list, sep='\t')

# Build one concatenated frame per file group, in the same order as the
# path lists they are read from.
(
    disclosure_dataframe,
    coissuer_info_dataframe,
    issuer_info_dataframe,
    issuer_jurisdiction_dataframe,
    issuer_signature_dataframe,
    signature_dataframe,
    submission_dataframes,
) = [
    tsv_to_dataframe(path_group, multi_df=True)
    for path_group in (
        tsv_disclosure,
        tsv_coissuer_info,
        tsv_issuer_info,
        tsv_issuer_jurisdiction,
        tsv_issuer_signature,
        tsv_signature,
        tsv_submission,
    )
]

In [5]:
# COMPANY INFORMATION
#
# Builds one row per CIK from the issuer and submission frames. This cell
# reads the original upper-case column names, so it has to run before the
# cells that rename issuer_info_dataframe / submission_dataframes.

df1 = issuer_info_dataframe
df2 = submission_dataframes
df = pd.merge(df1, df2, how='inner', on='ACCESSION_NUMBER')

columns_to_keep = ['NAMEOFISSUER', 'STREET1', 'STREET2', 'CITY', 'STATEORCOUNTRY', 'ZIPCODE', 'ISSUERWEBSITE', 'CIK', 'DATEINCORPORATION']

df = df[columns_to_keep]
# Dict form: an explicit None replacement is honoured on every pandas
# version, whereas replace(np.nan, None) ignored the None before 1.4.
df = df.replace({np.nan: None})
df = df.drop_duplicates(subset=['CIK'], keep='first')

df = df.rename(columns = {
    "NAMEOFISSUER": "company_title",
    "STREET1": "street_1",
    "STREET2": "street_2",
    "CITY": "city",
    "STATEORCOUNTRY": "state_or_country",
    "ZIPCODE": "zipcode",
    "ISSUERWEBSITE": "website",
    "CIK": "cik",
    "DATEINCORPORATION": "date_incorporation"
})
df.info()

# Render the CIK as text and strip any trailing ".0" left by a float column.
# Missing CIKs stay None instead of turning into the literal string 'None'.
cik_known = df['cik'].notna()
df['cik'] = (
    df['cik']
    .astype(str)
    .str.replace(r'\..*', '', regex=True)
    .where(cik_known, None)
)
# insert_into_db(conn, df,'companies')
company_df = df

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6215 entries, 0 to 23467
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   company_title       6214 non-null   object
 1   street_1            5795 non-null   object
 2   street_2            1716 non-null   object
 3   city                5795 non-null   object
 4   state_or_country    5795 non-null   object
 5   zipcode             5795 non-null   object
 6   website             5795 non-null   object
 7   cik                 6214 non-null   object
 8   date_incorporation  5795 non-null   object
dtypes: object(9)
memory usage: 485.5+ KB


In [10]:
# submission_dataframes.info()
# submission_dataframes.head()

# Rebind rather than rename in place, so other names that still reference the
# loaded frame are not changed behind their back.
submission_dataframes = submission_dataframes.rename(columns={
    'ACCESSION_NUMBER':'accession_number',
    'SUBMISSION_TYPE':'submission_type',
    'FILING_DATE':'filing_date',
    'CIK':'cik',
    'FILE_NUMBER':'file_number',
    'PERIOD':'period'
})

# Explicit copy: the column assignments below then write to an independent
# frame instead of a slice of submission_dataframes.
filtered_df = submission_dataframes[
    ['accession_number', 'submission_type', 'cik', 'file_number', 'filing_date', 'period']
].copy()

filtered_df['filing_date'] = pd.to_datetime(filtered_df['filing_date'], format='%Y%m%d')
# Missing periods get the 1900-01-01 sentinel; the result is assigned back
# rather than relying on an in-place fill through a column accessor.
filtered_df['period'] = (
    pd.to_datetime(filtered_df['period'], format='%Y%m%d')
    .fillna(pd.Timestamp('1900-01-01'))
)

df = filtered_df.astype({
    'cik':'string',
    'filing_date':'object',
    'period':'object'
})

# Strip any trailing ".0" from the CIK text and use '' for missing values.
df['cik'] = df['cik'].str.replace(r'\..*', '', regex=True).fillna('')


submission_df = df


In [19]:
# DISCLOSURES

# One row per disclosure column: (source name, target name, dtype). The
# rename map, the dtype map and the final column selection are all derived
# from this table so the three cannot drift apart.
# NOTE(review): CASHEQUI* / ACTRECEIVED* presumably mean cash & equivalents and
# accounts receivable; the target names are kept unchanged - confirm against
# the database schema before renaming anything.
_DISCLOSURE_FIELDS = [
    ('ACCESSION_NUMBER', 'accession_number', 'object'),
    ('COMPENSATIONAMOUNT', 'compensation_amount_description', 'object'),
    ('FINANCIALINTEREST', 'financial_interest', 'object'),
    ('SECURITYOFFEREDTYPE', 'security_offered_type', 'object'),
    ('SECURITYOFFEREDOTHERDESC', 'security_offered_other_desc', 'object'),
    ('NOOFSECURITYOFFERED', 'no_of_security_offered', 'float'),
    ('PRICE', 'price', 'float'),
    ('PRICEDETERMINATIONMETHOD', 'price_determination_method', 'object'),
    ('OFFERINGAMOUNT', 'offering_amount', 'float64'),
    ('OVERSUBSCRIPTIONACCEPTED', 'oversubscription_accepted', 'object'),
    ('OVERSUBSCRIPTIONALLOCATIONTYPE', 'oversubscription_allocation_type', 'object'),
    ('DESCOVERSUBSCRIPTION', 'desc_oversubscription', 'object'),
    ('MAXIMUMOFFERINGAMOUNT', 'maximum_offering_amount', 'object'),
    ('DEADLINEDATE', 'deadline_date', 'object'),
    ('CURRENTEMPLOYEES', 'current_employees', 'object'),
    ('TOTALASSETMOSTRECENTFISCALYEAR', 'total_assets_most_recent_fiscal_year', 'float'),
    ('TOTALASSETPRIORFISCALYEAR', 'total_assets_prior_fiscal_year', 'float'),
    ('CASHEQUIMOSTRECENTFISCALYEAR', 'cash_equity_most_recent_fiscal_year', 'float'),
    ('CASHEQUIPRIORFISCALYEAR', 'cash_equity_prior_fiscal_year', 'float'),
    ('ACTRECEIVEDRECENTFISCALYEAR', 'act_received_recent_fiscal_year', 'float'),
    ('ACTRECEIVEDPRIORFISCALYEAR', 'act_received_prior_fiscal_year', 'float'),
    ('SHORTTERMDEBTMRECENTFISCALYEAR', 'short_term_debt_recent_fiscal_year', 'float'),
    ('SHORTTERMDEBTPRIORFISCALYEAR', 'short_term_debt_prior_fiscal_year', 'float'),
    ('LONGTERMDEBTRECENTFISCALYEAR', 'long_term_debt_recent_fiscal_year', 'float'),
    ('LONGTERMDEBTPRIORFISCALYEAR', 'long_term_debt_prior_fiscal_year', 'float'),
    ('REVENUEMOSTRECENTFISCALYEAR', 'revenue_most_recent_fiscal_year', 'float'),
    ('REVENUEPRIORFISCALYEAR', 'revenue_prior_fiscal_year', 'float'),
    ('COSTGOODSSOLDRECENTFISCALYEAR', 'cost_goods_sold_recent_fiscal_year', 'float'),
    ('COSTGOODSSOLDPRIORFISCALYEAR', 'cost_goods_sold_prior_fiscal_year', 'float'),
    ('TAXPAIDMOSTRECENTFISCALYEAR', 'tax_paid_most_recent_fiscal_year', 'float'),
    ('TAXPAIDPRIORFISCALYEAR', 'tax_paid_prior_fiscal_year', 'float'),
    ('NETINCOMEMOSTRECENTFISCALYEAR', 'net_income_most_recent_fiscal_year', 'float'),
    ('NETINCOMEPRIORFISCALYEAR', 'net_income_prior_fiscal_year', 'float'),
]

# Rename to the target names, then apply the declared dtypes. The result is
# rebound (not modified in place); on a re-run the rename is simply a no-op.
disclosure_dataframe = (
    disclosure_dataframe
    .rename(columns={source: target for source, target, _dtype in _DISCLOSURE_FIELDS})
    .astype({target: dtype for _source, target, dtype in _DISCLOSURE_FIELDS})
)

# Turn NaN into None. The dict form honours the explicit None on every
# pandas version, whereas replace(np.nan, None) ignored it before 1.4.
df = disclosure_dataframe.replace({np.nan: None})

disclosure_df = df[[target for _source, target, _dtype in _DISCLOSURE_FIELDS]]

# Attach the submission columns to every disclosure row; disclosures without
# a matching submission are kept (left join).
financial_df = pd.merge(disclosure_df, submission_df, how='left', on='accession_number')
financial_df.info()
recent_fiscal_year = [
    'cik',
    'period',
    'revenue_most_recent_fiscal_year',
    'cost_goods_sold_recent_fiscal_year',
    'net_income_most_recent_fiscal_year',
    'tax_paid_most_recent_fiscal_year',
    'total_assets_most_recent_fiscal_year',
    'cash_equity_most_recent_fiscal_year',
    'act_received_recent_fiscal_year',
    'short_term_debt_recent_fiscal_year',
    'long_term_debt_recent_fiscal_year'
]

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23474 entries, 0 to 23473
Data columns (total 38 columns):
 #   Column                                Non-Null Count  Dtype 
---  ------                                --------------  ----- 
 0   accession_number                      23474 non-null  object
 1   compensation_amount_description       18671 non-null  object
 2   financial_interest                    15851 non-null  object
 3   security_offered_type                 18672 non-null  object
 4   security_offered_other_desc           8181 non-null   object
 5   no_of_security_offered                15020 non-null  object
 6   price                                 17692 non-null  object
 7   price_determination_method            10723 non-null  object
 8   offering_amount                       18672 non-null  object
 9   oversubscription_accepted             18672 non-null  object
 10  oversubscription_allocation_type      18447 non-null  object
 11  desc_oversubscription       

In [21]:
# Display per-column summary statistics for the merged disclosure/submission frame.
financial_df.describe()

Unnamed: 0,accession_number,compensation_amount_description,financial_interest,security_offered_type,security_offered_other_desc,no_of_security_offered,price,price_determination_method,offering_amount,oversubscription_accepted,oversubscription_allocation_type,desc_oversubscription,maximum_offering_amount,deadline_date,current_employees,total_assets_most_recent_fiscal_year,total_assets_prior_fiscal_year,cash_equity_most_recent_fiscal_year,cash_equity_prior_fiscal_year,act_received_recent_fiscal_year,act_received_prior_fiscal_year,short_term_debt_recent_fiscal_year,short_term_debt_prior_fiscal_year,long_term_debt_recent_fiscal_year,long_term_debt_prior_fiscal_year,revenue_most_recent_fiscal_year,revenue_prior_fiscal_year,cost_goods_sold_recent_fiscal_year,cost_goods_sold_prior_fiscal_year,tax_paid_most_recent_fiscal_year,tax_paid_prior_fiscal_year,net_income_most_recent_fiscal_year,net_income_prior_fiscal_year,submission_type,cik,file_number,filing_date,period
count,23474,18671,15851,18672,8181,15020.0,17692.0,10723,18672.0,18672,18447,10909,18447.0,18672,21948.0,21948.0,21948.0,21948.0,21948.0,21948.0,21948.0,21948.0,21948.0,21948.0,21948.0,21948.0,21948.0,21948.0,21948.0,21948.0,21948.0,21948.0,21948.0,23474,23474,23474,23474,23474
unique,23474,1343,502,4,604,1247.0,825.0,1307,806.0,2,3,162,1395.0,2092,133.0,8175.0,6414.0,7516.0,5949.0,3114.0,2399.0,6210.0,4865.0,4347.0,3374.0,5788.0,4535.0,4861.0,3903.0,1354.0,1034.0,8323.0,6582.0,11,6215,7319,1756,164
top,0001820003-20-000004,7.0 percent,No,Other,Simple Agreement for Future Equity (SAFE),50000.0,1.0,The Notes are being valued at their face value...,10000.0,Y,Other,"At issuer's discretion, with priority given to...",1070000.0,2022-04-30,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,C/A,1781955,020-24293,2022-05-02 00:00:00,1900-01-01 00:00:00
freq,1,1331,2941,8181,1634,1861.0,8800.0,1356,4227.0,18447,10909,3662,4172.0,883,3419.0,4304.0,8165.0,4939.0,8655.0,15037.0,16760.0,8710.0,11593.0,11839.0,14108.0,9647.0,12366.0,11783.0,13864.0,18772.0,19430.0,4368.0,7964.0,7863,47,36,195,20190


In [25]:
# ISSUER INFO

# Rebind rather than rename in place, so other names that still reference the
# loaded frame are not changed behind their back.
issuer_info_dataframe = issuer_info_dataframe.rename(columns={
    'ACCESSION_NUMBER': 'accession_number',
    'ISAMENDMENT': 'is_amendment', 
    'PROGRESSUPDATE': 'progress_update',
    'NATUREOFAMENDMENT': 'nature_of_amendment',
    'NAMEOFISSUER': 'name_of_issuer',
    'LEGALSTATUSFORM': 'legal_status_form',
    'LEGALSTATUSOTHERDESC': 'legal_status_other_desc',
    'JURISDICTIONORGANIZATION': 'jurisdiction_organization',
    'DATEINCORPORATION': 'date_incorporation',
    'STREET1': 'street1',
    'STREET2': 'street2',
    'CITY': 'city',
    'STATEORCOUNTRY': 'state_or_country',
    'ZIPCODE': 'zipcode',
    'ISSUERWEBSITE': 'issuer_website',
    'COMPANYNAME': 'intermediary_name',
    'COMMISSIONCIK': 'intermediary_cik',
    'COMMISSIONFILENUMBER': 'commission_file_number',
    'CRDNUMBER': 'crd_number',
    'ISCOISSUER': 'is_co_issuer'
})

# data cleaning
# Both cleaned columns are built as new Series and attached with assign(),
# instead of filling in place through a column accessor.
df = issuer_info_dataframe.assign(
    # Missing incorporation dates get the 1900-01-01 sentinel.
    date_incorporation=issuer_info_dataframe['date_incorporation'].fillna('1900-01-01'),
    # CIK as text without any trailing ".0"; '' for missing values.
    intermediary_cik=(
        issuer_info_dataframe['intermediary_cik']
        .astype('string')
        .str.replace(r'\..*', '', regex=True)
        .fillna('')
    ),
)

# Turn the remaining NaN values into None. where(notna, None) alone can leave
# NaN in numeric columns on recent pandas versions, so the explicit dict-form
# replace is used, which also casts such columns to object.
df = df.replace({np.nan: None})

# df.to_csv('/Users/derek/Downloads/issuer_info_df.csv')
# insert_into_db(conn, df, 'temp_issuer_info')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23474 entries, 0 to 23473
Data columns (total 20 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   accession_number           23474 non-null  object 
 1   is_amendment               23474 non-null  int64  
 2   progress_update            3575 non-null   object 
 3   nature_of_amendment        7862 non-null   object 
 4   name_of_issuer             23472 non-null  object 
 5   legal_status_form          22587 non-null  object 
 6   legal_status_other_desc    262 non-null    object 
 7   jurisdiction_organization  22587 non-null  object 
 8   date_incorporation         23474 non-null  object 
 9   street1                    22586 non-null  object 
 10  street2                    7121 non-null   object 
 11  city                       22587 non-null  object 
 12  state_or_country           22587 non-null  object 
 13  zipcode                    22587 non-null  obj

In [16]:
# COISSUER

# Rename and parse the incorporation date in one rebinding step, so the
# loaded frame is never modified in place. Re-running the cell is harmless:
# the rename becomes a no-op and the date column is already datetime.
coissuer_info_dataframe = coissuer_info_dataframe.rename(columns={
    'ACCESSION_NUMBER':'accession_number',
    'ID':'id',
    'ISEDGARFILER':'is_edgar_filer',
    'COISSUERCIK':'co_issuer_cik',
    'NAMEOFCOISSUER':'name_of_co_issuer',
    'LEGALSTATUSFORM':'legal_status_form',
    'LEGALSTATUSOTHERDESC':'legal_status_other_desc',
    'JURISDICTIONORGANIZATION':'jurisdiction_organization',
    'DATEINCORPORATION':'date_incorporation',
    'STREET1':'street_1',
    'STREET2':'street_2',
    'CITY':'city',
    'STATEORCOUNTRY':'state_or_country',
    'ZIPCODE':'zipcode',
    'COISSUERWEBSITE':'co_issuer_website'
}).assign(
    date_incorporation=lambda frame: pd.to_datetime(
        frame['date_incorporation'], format='%d-%b-%Y'
    )
)

df = (
    coissuer_info_dataframe
    .astype({
        'accession_number':'object',
        'id':'object',
        'date_incorporation':'datetime64[ns]',
        'co_issuer_cik': 'object'
    })
    .drop(columns=[
        'is_edgar_filer',
        'legal_status_form',
        'legal_status_other_desc',
        'jurisdiction_organization'
    ])
    # Dict form: the explicit None is honoured on every pandas version,
    # whereas replace(np.nan, None) ignored it before 1.4.
    .replace({np.nan: None})
)
# df.to_csv('/Users/derek/Downloads/coissuer_info_df.csv')
# df

# NOTE(review): the cleaned frame is `df`; the earlier draft of this call
# passed coissuer_info_dataframe, which still has the dropped columns and
# NaN values - confirm before enabling.
# insert_into_db(conn, df, 'temp_coissuer_info')