In [1]:
# general imports
from datetime import datetime
from datetime import timedelta
from multiprocessing import Pool


# data imports
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline

# # ML imports
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout, LSTM, GRU, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [22]:
# Get the stock data
#
# Reads every pickle under `folder_path`, keeps only the columns listed in
# `cols`, stacks them into `df_dta`, and derives two helper columns:
#   * 'quarter' - calendar quarter of 'date', placed right after 'date'
#   * 'ticker'  - the part of 'ric' before the first '.', placed right after 'permno'
# The raw 'ric' column is dropped once 'ticker' has been derived.

# Define the folder path
folder_path = '../Data/dsws_Data/2021/'

# Get all .pkl files within folder. os.listdir returns names in arbitrary
# order, so sort them to make the row order of df_dta reproducible.
pickle_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.pkl'))
if not pickle_files:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError(f"No .pkl files found in {folder_path}")

# Earlier, wider column selection (kept for reference):
#cols = [
#    'month_id', 'fiscal_date', 'year_ws', 'FullName', 'isin', 'ibes_ticker',
#    'country', 'permno', 'cusip_8', 'cusip_9', 'region', 'siccd_numerical',
#    'fama_french_48', 'fama_french_12', 'ret_usd', 'date', 'year_id', 'month',
#    'day', 'monthyear', 'yearmonth', 'daymonthyear', 'yearmonthday', 'size',
#    'price', 'size_local', 'size_local_in_mio', 'interim_report', 'year_ff',
#    'rank_size', 'price_avg', 'siccd', 'sic1', 'sic2', 'sic3', 'ff48', 'ff12',
#    'ret_12_13', 'ret_3_12', 'ret_3_9', 'ret_37_136', 'ret_82_136', 
#    'ret_49_70', 'ret_61_120', 'ret_121_180'
#]

cols = [
    'dscd','year_id','month_id', 'FullName',
    'country','ric','permno', 'siccd',
    'ret_usd', 'date', 'month',
    'size','size_local_in_mio', 'interim_report'
]

# Read each pickle and keep only the selected columns.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
df_list = [
    pd.read_pickle(os.path.join(folder_path, file))[cols]
    for file in pickle_files
]

# Concatenate all dataframes into a single dataframe
df_dta = pd.concat(df_list, ignore_index=True)

# Derive the helper columns. The vectorised .str accessor propagates missing
# 'ric' values as NaN instead of raising AttributeError like x.split() would.
quarter = df_dta['date'].dt.quarter
ticker = df_dta['ric'].str.split('.').str[0]
df_dta = df_dta.drop(columns=['ric'])

# Insert 'quarter' right after 'date' and 'ticker' right after 'permno'
date_index = df_dta.columns.get_loc('date')  # Position of 'date'
df_dta.insert(date_index + 1, 'quarter', quarter)
ticker_index = df_dta.columns.get_loc('permno')  # Position of 'permno'
df_dta.insert(ticker_index + 1, 'ticker', ticker)

In [23]:
df_dta.head(20)

Unnamed: 0,dscd,year_id,month_id,FullName,country,permno,ticker,siccd,ret_usd,date,quarter,month,size,size_local_in_mio,interim_report
0,912212,2021.0,734.0,Johnson and Johnson,usa,22111.0,JNJ,2834,0.03717,2021-03-31,1,3.0,432685.40625,432685408.0,Quarterly
1,500819,2021.0,734.0,Hallmark Financial Services,usa,75985.0,HALL,6399,0.05149,2021-03-31,1,3.0,70.389999,70390.0,Quarterly
2,906560,2021.0,734.0,Nordstrom,usa,57817.0,JWN,5651,0.038957,2021-03-31,1,3.0,5974.759766,5974760.0,Quarterly
3,992953,2021.0,734.0,Universal Health Services 'B',usa,79637.0,UHS,8062,0.065857,2021-03-31,1,3.0,10387.169922,10387170.0,Quarterly
4,912437,2021.0,734.0,Oxford Industries,usa,34948.0,OXM,5611,0.146642,2021-03-31,1,3.0,1480.079956,1480080.0,Quarterly
5,906931,2021.0,734.0,Viatris,usa,69550.0,VTRS,2834,-0.059259,2021-03-31,1,3.0,16862.949219,16862950.0,Quarterly
6,729665,2021.0,734.0,Harley-Davidson,usa,70033.0,HOG,3751,0.128861,2021-03-31,1,3.0,6145.359863,6145360.0,Quarterly
7,357144,2021.0,734.0,Century Casinos,usa,79791.0,CNTY,7011,0.274194,2021-03-31,1,3.0,303.75,303750.0,
8,50343V,2021.0,734.0,Limelight Networks,usa,92097.0,LLNW,7389,0.091745,2021-03-31,1,3.0,447.140015,447140.0,Quarterly
9,884770,2021.0,734.0,Silver Bull (Non-NASDAQ OTC) Resources,usa,91581.0,SVBL,1044,0.082887,2021-03-31,1,3.0,25.92,25920.0,Semi-Annually


In [16]:
len(df_dta.index)

5294

In [17]:
df_dta.tail(20)

Unnamed: 0,dscd,year_id,month_id,FullName,country,permno,siccd,ret_usd,date,quarter,month,size,size_local_in_mio,interim_report,ric,isin
5274,9801CJ,2021.0,737.0,Kraft Heinz,usa,15408.0,2035,-0.064464,2021-06-30,2,6.0,49879.898438,49879900.0,Semi-Annually,KHC.O,US5007541064
5275,944586,2021.0,737.0,Stifel Financial,usa,72996.0,6211,-0.063799,2021-06-30,2,6.0,6816.149902,6816150.0,Semi-Annually,SF,US8606301021
5276,951052,2021.0,737.0,State Street,usa,72726.0,6022,-0.048057,2021-06-30,2,6.0,28614.359375,28614360.0,Semi-Annually,STT,US8574771031
5277,8705DT,2021.0,737.0,Applied Genetic Technologies,usa,14528.0,8731,-0.024937,2021-06-30,2,6.0,167.169998,167170.0,Only Annually,AGTC.O,US03820J1007
5278,877896,2021.0,737.0,Silgan Holdings,usa,84563.0,3411,-0.014954,2021-06-30,2,6.0,4581.569824,4581570.0,Semi-Annually,SLGN.O,US8270481091
5279,98116Y,2021.0,737.0,Ollies Bargain Outlet Holding,usa,15535.0,5311,-0.026724,2021-06-30,2,6.0,5492.399902,5492400.0,Only Annually,OLLI.O,US6811161099
5280,35656C,2021.0,737.0,PTC Therapeutics,usa,13967.0,2834,0.076394,2021-06-30,2,6.0,2978.169922,2978170.0,Only Annually,PTCT.O,US69366J2006
5281,14698C,2021.0,737.0,Principal Financial Group,usa,89195.0,6321,-0.024639,2021-06-30,2,6.0,17188.449219,17188450.0,Semi-Annually,PFG.O,US74251V1026
5282,29796T,2021.0,737.0,Prestige Consumer Healthcare,usa,90564.0,2834,0.044717,2021-06-30,2,6.0,2608.129883,2608130.0,Only Annually,PBH,US74112D1019
5283,9813UY,2021.0,737.0,Sunrun,usa,15643.0,3674,0.247317,2021-06-30,2,6.0,11375.19043,11375190.0,Only Annually,RUN.O,US86771W1053


In [74]:
result = df_dta.dtypes
print(result)         

month_id                    float32
fiscal_date          datetime64[ns]
quarter                       int32
year_ws                     float32
FullName                     object
isin                         object
ibes_ticker                  object
country                      object
permno                      float64
cusip_8                      object
cusip_9                      object
region                       object
siccd_numerical             float64
fama_french_48              float32
fama_french_12              float32
ret_usd                     float32
date                 datetime64[ns]
year_id                     float32
month                       float32
day                         float32
monthyear                    object
yearmonth                    object
daymonthyear                 object
yearmonthday                 object
size                        float32
price                       float32
size_local                  float32
size_local_in_mio           

In [85]:
# Get the 10X data
#
# Stacks every pickled DataFrame found under `folder_path` into `df_filings`.

folder_path = '../Data/10-X_C_2021/2021/'

# os.listdir returns names in arbitrary order; sort so that the row order
# (and therefore the integer index) of df_filings is reproducible.
pickle_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.pkl'))

# Read each pickle file into a dataframe.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
df_list = [
    pd.read_pickle(os.path.join(folder_path, file))
    for file in pickle_files
]

# Concatenate all dataframes into a single dataframe
df_filings = pd.concat(df_list, ignore_index=True)

# Display only: rename() returns a renamed copy that is shown as the cell
# output. df_filings itself keeps the 'Quarter' / 'SIC Code' column names,
# which the merge cell further down relies on.
df_filings.rename(columns={"Quarter": "quarter","SIC Code": "siccd" })

Unnamed: 0,File Name,Company Name,siccd,Filing Type,Conformed Period of Report,Year,quarter,Formatted Date,Item 1A - Risk Factors,Item 1C - Cybersecurity,Item 7 - MD&A,Item 7A - Market Risk,Item 8 - Financial Statements
0,20210104_10-K_edgar_data_1041588_0001041588-21...,"Access-Power & Co., Inc.",4813,10-K,20201231,2020,4,2020-12-31,ITEM 1a. RISK FACTORS ACCR DOES NOT BELIEVE I...,,ITEM 7.\t MANAGEMENTS DISCUSSION AND ANALYSIS ...,ITEM 7a. QUANTITATIVE AND QUALITATIVE DISCLOSU...,ITEM 8.\t FINANCIAL STATEMENTS AND SUPPLEMENTA...
1,20210104_10-K_edgar_data_1604930_0001493152-21...,"Life Clips, Inc.",3861,10-K,20200630,2020,2,2020-06-30,Item 1A. Risk Factors 6,,Item 7. Management s Discussion and Analysis ...,Item 7A. Quantitative and Qualitative Disclos...,Item 8. Consolidated Financial Statements and...
2,20210104_10-Q_edgar_data_1098009_0001185185-20...,America Great Health,2834,10-Q,20190331,2019,1,2019-03-31,ITEM 1A Risk Factors 15,,ITEM 2 Management s Discussion and Analysis ...,ITEM 3 Quantitative and Qualitative Disclosu...,ITEM 1 Condensed Consolidated Financial Stat...
3,20210104_10-Q_edgar_data_1556179_0001104659-20...,"Rocky Mountain Industrials, Inc.",7380,10-Q,20190630,2019,2,2019-06-30,ITEM 1A. RISK FACTORS 10,,Item 2 - Management's Discussion and Analysis ...,ITEM 3. QUANTITATIVE AND QUALITATIVE DISCLOS...,ITEM 1. FINANCIAL STATEMENTS 5
4,20210104_10-Q_edgar_data_1556179_0001104659-20...,"Rocky Mountain Industrials, Inc.",7380,10-Q,20190930,2019,3,2019-09-30,ITEM 1A. RISK FACTORS 10,,Item 2 - Management's Discussion and Analysis ...,ITEM 3. QUANTITATIVE AND QUALITATIVE DISCLOS...,ITEM 1. FINANCIAL STATEMENTS 5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
29315,20211230_10-Q_edgar_data_1317839_0001477932-21...,Umatrin Holding Ltd,7389,10-Q,20210930,2021,3,2021-09-30,Item 1A. Risk Factors. 24 Item 2. Unregistered...,,Item 2. Management s Discussion and Analysis o...,Item 3. Quantitative and Qualitative Disclosur...,Item 1. Financial Statements. 3 Item 2. Manage...
29316,20211230_10-Q_edgar_data_1796160_0001096906-21...,QMIS TBS Capital Group Corp.,6211,10-Q,20210930,2021,3,2021-09-30,,,Item 2. Management s Discussion and Analysis o...,Item 3. Quantitative and Qualitative Disclosur...,Item 1. Financial Statements 4 Item 2. Managem...
29317,20211230_10-Q_edgar_data_1867956_0001683168-21...,Linktory Inc.,7370,10-Q,20211130,2021,4,2021-11-30,,,Item 2. Management s Discussion and Analysis...,Item 3. Quantitative and Qualitative Disclos...,Item 1. Financial Statements (Unaudited) 3...
29318,20211230_10-Q_edgar_data_1879373_0001493152-21...,Energem Corp,6770,10-Q,20210930,2021,3,2021-09-30,Item 1A. Risk Factors 18 Item 2. Unreg...,,Item 2. Management s Discussion and Analysis ...,Item 3. Quantitative and Qualitative Disclosu...,Item 1. Financial Statements 1 Balance s...


In [86]:
len(df_filings.index)

29320

In [77]:
df_filings.head()

Unnamed: 0,File Name,Company Name,SIC Code,Filing Type,Conformed Period of Report,Year,Quarter,Formatted Date,Item 1A - Risk Factors,Item 1C - Cybersecurity,Item 7 - MD&A,Item 7A - Market Risk,Item 8 - Financial Statements
0,20210104_10-K_edgar_data_1041588_0001041588-21...,"Access-Power & Co., Inc.",4813,10-K,20201231,2020,4,2020-12-31,ITEM 1a. RISK FACTORS ACCR DOES NOT BELIEVE I...,,ITEM 7.\t MANAGEMENTS DISCUSSION AND ANALYSIS ...,ITEM 7a. QUANTITATIVE AND QUALITATIVE DISCLOSU...,ITEM 8.\t FINANCIAL STATEMENTS AND SUPPLEMENTA...
1,20210104_10-K_edgar_data_1604930_0001493152-21...,"Life Clips, Inc.",3861,10-K,20200630,2020,2,2020-06-30,Item 1A. Risk Factors 6,,Item 7. Management s Discussion and Analysis ...,Item 7A. Quantitative and Qualitative Disclos...,Item 8. Consolidated Financial Statements and...
2,20210104_10-Q_edgar_data_1098009_0001185185-20...,America Great Health,2834,10-Q,20190331,2019,1,2019-03-31,ITEM 1A Risk Factors 15,,ITEM 2 Management s Discussion and Analysis ...,ITEM 3 Quantitative and Qualitative Disclosu...,ITEM 1 Condensed Consolidated Financial Stat...
3,20210104_10-Q_edgar_data_1556179_0001104659-20...,"Rocky Mountain Industrials, Inc.",7380,10-Q,20190630,2019,2,2019-06-30,ITEM 1A. RISK FACTORS 10,,Item 2 - Management's Discussion and Analysis ...,ITEM 3. QUANTITATIVE AND QUALITATIVE DISCLOS...,ITEM 1. FINANCIAL STATEMENTS 5
4,20210104_10-Q_edgar_data_1556179_0001104659-20...,"Rocky Mountain Industrials, Inc.",7380,10-Q,20190930,2019,3,2019-09-30,ITEM 1A. RISK FACTORS 10,,Item 2 - Management's Discussion and Analysis ...,ITEM 3. QUANTITATIVE AND QUALITATIVE DISCLOS...,ITEM 1. FINANCIAL STATEMENTS 5


In [82]:
# Merge the filings with the stock data (left join: every filing is kept).
# The key is company name + year + quarter. Without the year, a filing for
# e.g. 2019 Q1 would be paired with stock rows from any year's Q1.
# NOTE(review): names are compared by exact string equality, so differently
# formatted names on the two sides will not match - confirm this is acceptable
# or switch to a shared identifier.
merged_df = pd.merge(
    df_filings,
    df_dta,
    left_on=['Company Name', 'Year', 'Quarter'],
    right_on=['FullName', 'year_id', 'quarter'],
    how='left',
)

In [83]:
merged_df.head()

Unnamed: 0,File Name,Company Name,SIC Code,Filing Type,Conformed Period of Report,Year,Quarter,Formatted Date,Item 1A - Risk Factors,Item 1C - Cybersecurity,...,ff48,ff12,ret_12_13,ret_3_12,ret_3_9,ret_37_136,ret_82_136,ret_49_70,ret_61_120,ret_121_180
0,20210104_10-K_edgar_data_1041588_0001041588-21...,"Access-Power & Co., Inc.",4813,10-K,20201231,2020,4,2020-12-31,ITEM 1a. RISK FACTORS ACCR DOES NOT BELIEVE I...,,...,,,,,,,,,,
1,20210104_10-K_edgar_data_1604930_0001493152-21...,"Life Clips, Inc.",3861,10-K,20200630,2020,2,2020-06-30,Item 1A. Risk Factors 6,,...,,,,,,,,,,
2,20210104_10-Q_edgar_data_1098009_0001185185-20...,America Great Health,2834,10-Q,20190331,2019,1,2019-03-31,ITEM 1A Risk Factors 15,,...,,,,,,,,,,
3,20210104_10-Q_edgar_data_1556179_0001104659-20...,"Rocky Mountain Industrials, Inc.",7380,10-Q,20190630,2019,2,2019-06-30,ITEM 1A. RISK FACTORS 10,,...,,,,,,,,,,
4,20210104_10-Q_edgar_data_1556179_0001104659-20...,"Rocky Mountain Industrials, Inc.",7380,10-Q,20190930,2019,3,2019-09-30,ITEM 1A. RISK FACTORS 10,,...,,,,,,,,,,


In [59]:
print(merged_df.iloc[0])

month_id                                                                     734.0
fiscal_date                                                    2019-12-31 00:00:00
Quarter                                                                          1
year_ws                                                                     2019.0
FullName                                                    First Internet Bancorp
isin                                                                  US3205571017
ibes_ticker                                                                 @:FIBP
country                                                                        usa
permno                                                                     13797.0
cusip_8                                                                   32055710
cusip_9                                                                  320557101
region                                                                          NA
sicc

In [81]:
#merged_df.shape[0]
len(merged_df.index)

32

In [70]:
print(df_dta.loc[(df_dta['FullName'] == "Unknown")].head(1))
print(df_dta.loc[(df_dta['FullName'] == "Unknown")].head())

Empty DataFrame
Columns: [month_id, fiscal_date, quarter, year_ws, FullName, isin, ibes_ticker, country, permno, cusip_8, cusip_9, region, siccd_numerical, fama_french_48, fama_french_12, ret_usd, date, year_id, month, day, monthyear, yearmonth, daymonthyear, yearmonthday, size, price, size_local, size_local_in_mio, interim_report, year_ff, rank_size, price_avg, siccd, sic1, sic2, sic3, ff48, ff12, ret_12_13, ret_3_12, ret_3_9, ret_37_136, ret_82_136, ret_49_70, ret_61_120, ret_121_180]
Index: []

[0 rows x 46 columns]
Empty DataFrame
Columns: [month_id, fiscal_date, quarter, year_ws, FullName, isin, ibes_ticker, country, permno, cusip_8, cusip_9, region, siccd_numerical, fama_french_48, fama_french_12, ret_usd, date, year_id, month, day, monthyear, yearmonth, daymonthyear, yearmonthday, size, price, size_local, size_local_in_mio, interim_report, year_ff, rank_size, price_avg, siccd, sic1, sic2, sic3, ff48, ff12, ret_12_13, ret_3_12, ret_3_9, ret_37_136, ret_82_136, ret_49_70, ret_61_1

In [None]:
# Begin to create 

In [4]:
# Load FinBERT pre-trained model for sentiment analysis
tokenizer = BertTokenizer.from_pretrained("yiyanghkust/finbert-tone")
model = BertForSequenceClassification.from_pretrained("yiyanghkust/finbert-tone")

In [None]:
embeddings_path = '../Data/10-X_C_2021/2021/QTR1/'  
stock_data_path = '../Data/dsws_Data/2021/' 


In [None]:
def extract_company_and_sic(text_file_path):
    """Read a filing text file and pull the company name and SIC code from its header.

    Parameters
    ----------
    text_file_path : str
        Path to a filing text file containing a ``<SEC-Header>...</SEC-Header>`` block.

    Returns
    -------
    tuple
        ``(company_name, sic_code)`` as strings. Either element is ``"Unknown"``
        when its line is missing from the header. ``(None, None)`` is returned
        when the file has no ``<SEC-Header>`` block at all.

    The extracted values (or the missing-header notice) are also printed.
    """
    # Decode explicitly: the platform default encoding differs between machines,
    # and errors='replace' keeps a stray undecodable byte from aborting the read.
    with open(text_file_path, 'r', encoding='utf-8', errors='replace') as file:
        data = file.read()

    # Extract the content between <SEC-Header> and </SEC-Header>
    header_content = re.search(r'<SEC-Header>(.*?)</SEC-Header>', data, re.DOTALL)
    if not header_content:
        print("SEC-Header not found in the document.")
        return None, None

    header_data = header_content.group(1)

    # Company name: everything after the label up to the end of that line
    company_match = re.search(r'COMPANY CONFORMED NAME:\s+(.+)', header_data)
    company_name = company_match.group(1).strip() if company_match else "Unknown"

    # SIC code: the digits inside the square brackets on the classification line
    sic_match = re.search(r'STANDARD INDUSTRIAL CLASSIFICATION:.*\[(\d+)\]', header_data)
    sic_code = sic_match.group(1).strip() if sic_match else "Unknown"

    print(f"Company Name: {company_name}, SIC Code: {sic_code}")
    return company_name, sic_code

# Example usage with your text file
company_name, sic_code = extract_company_and_sic('../Data/2021/20210104_10-K_edgar_data_1041588_0001041588-21-000001.txt')

In [None]:
import os
import re
import pandas as pd
import time

# Function to extract COMPANY CONFORMED NAME, SIC code, and main text (excluding <SEC-Header>)
def process_filing_file(file_path):
    """Parse one filing text file into a flat record (dict).

    The ``<SEC-Header>`` block supplies the metadata (filing type, company
    name, SIC code, period of report); the text outside the header is passed
    to ``extract_item_section`` to pull the individual "Item" sections.

    Parameters
    ----------
    file_path : str
        Path to the filing text file.

    Returns
    -------
    dict or None
        ``None`` (after printing a notice) when the file has no
        ``<SEC-Header>`` block or its submission type does not start with
        10-Q / 10-K. Otherwise a dict with the keys 'File Name',
        'Company Name', 'SIC Code', 'Filing Type',
        'Conformed Period of Report', 'Year', 'Quarter', 'Formatted Date'
        plus the item columns. 10-Q items are stored under the 10-K column
        names so both filing types share columns:
        Item 1 -> 'Item 8 - Financial Statements', Item 2 -> 'Item 7 - MD&A',
        Item 3 -> 'Item 7A - Market Risk', Item 1A -> 'Item 1A - Risk Factors'.
        Only 10-K records carry 'Item 1C - Cybersecurity'.
    """
    # Decode explicitly: the platform default encoding differs between machines,
    # and errors='replace' keeps a stray undecodable byte from aborting the batch.
    with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
        data = file.read()

    # Extract the content between <SEC-Header> and </SEC-Header>
    header_content = re.search(r'<SEC-Header>(.*?)</SEC-Header>', data, re.DOTALL)
    if not header_content:
        print(f"SEC-Header not found in the document {file_path}.")
        return None
    header_data = header_content.group(1)

    # Ensure the filing type is either 10-Q or 10-K.
    # NOTE(review): this is a prefix match, so a type such as "10-K/A" is also
    # accepted and labelled '10-K' - confirm that amendments should be included.
    filing_type_match = re.search(r'CONFORMED SUBMISSION TYPE:\s+(10-Q|10-K)', header_data)
    if not filing_type_match:
        print(f"File {file_path} is neither a 10-Q nor 10-K filing.")
        return None
    filing_type = filing_type_match.group(1)

    # Extract company name
    company_match = re.search(r'COMPANY CONFORMED NAME:\s+(.+)', header_data)
    company_name = company_match.group(1).strip() if company_match else "Unknown"

    # Extract SIC code (Assuming the format: [xxxx])
    sic_match = re.search(r'STANDARD INDUSTRIAL CLASSIFICATION:.*\[(\d+)\]', header_data)
    sic_code = sic_match.group(1).strip() if sic_match else "Unknown"

    # Extract Conformed Period of Report (eight digits, YYYYMMDD)
    period_match = re.search(r'CONFORMED PERIOD OF REPORT:\s+(\d{8})', header_data)
    period = period_match.group(1).strip() if period_match else None

    # Derive year, quarter and an ISO-formatted date from the period.
    # Eight digits are not necessarily a valid calendar date; in that case the
    # raw period is kept and the derived fields stay None instead of raising.
    year = quarter = formatted_date = None
    if period:
        try:
            date = pd.to_datetime(period, format='%Y%m%d')
        except ValueError:
            date = None
        if date is not None:
            year = date.year
            quarter = date.quarter
            formatted_date = date.strftime('%Y-%m-%d')  # Format the date as YYYY-MM-DD

    # Remove <SEC-Header> content so item matching only sees the filing body
    cleaned_text = re.sub(r'<SEC-Header>.*?</SEC-Header>', '', data, flags=re.DOTALL).strip()

    # Metadata shared by both filing types
    record = {
        'File Name': os.path.basename(file_path),
        'Company Name': company_name,
        'SIC Code': sic_code,
        'Filing Type': filing_type,
        'Conformed Period of Report': period,
        'Year': year,
        'Quarter': quarter,
        'Formatted Date': formatted_date,
    }

    # Extract items based on filing type due to differences in 10K and 10Q filing structure
    if filing_type == '10-Q':
        record['Item 8 - Financial Statements'] = extract_item_section(cleaned_text, '1')  # Item 1 - Financial Statements
        record['Item 7 - MD&A'] = extract_item_section(cleaned_text, '2')  # Item 2 - MD&A
        record['Item 7A - Market Risk'] = extract_item_section(cleaned_text, '3')  # Item 3 - Market Risk
        record['Item 1A - Risk Factors'] = extract_item_section(cleaned_text, '1A')  # Item 1A - Risk Factors
    else:  # '10-K' - the only other value the filing-type regex can capture
        record['Item 1A - Risk Factors'] = extract_item_section(cleaned_text, '1A')
        record['Item 1C - Cybersecurity'] = extract_item_section(cleaned_text, '1C')
        record['Item 7 - MD&A'] = extract_item_section(cleaned_text, '7')
        record['Item 7A - Market Risk'] = extract_item_section(cleaned_text, '7A')
        record['Item 8 - Financial Statements'] = extract_item_section(cleaned_text, '8')

    return record

# Extract the sections based on item tag used in SEC filings using a matching method
def extract_item_section(text, item_number):
    """Return the text of one "Item <item_number>" section of a filing.

    A candidate section starts at "ITEM <item_number>" (case-insensitive,
    followed by a dot or whitespace) and runs until the next line that begins
    with "ITEM <digit>", or to the end of the text.

    Filings usually mention each item more than once - typically first as a
    short table-of-contents line and later as the actual section. Taking the
    first occurrence therefore tends to return only the table-of-contents
    entry, so the longest candidate is returned instead (the earliest one wins
    a tie). This is a heuristic: a late mention that runs to the end of the
    document could be picked over the real section.

    Parameters
    ----------
    text : str
        Filing body to search.
    item_number : str or int
        Item label such as '1A' or '7'. It is matched literally.

    Returns
    -------
    str or None
        The section with newlines and carriage returns replaced by spaces and
        surrounding whitespace stripped, or None when the item is not found.
    """
    # re.escape keeps a label containing regex metacharacters from changing the pattern
    label = re.escape(str(item_number))
    pattern = fr"ITEM\s*{label}[\.\s\r\n]+[\s\S]*?(?=\nITEM\s*\d|\Z)"

    candidates = [m.group(0) for m in re.finditer(pattern, text, re.IGNORECASE)]
    if not candidates:
        return None

    # max() returns the first of several equally long candidates
    section_text = max(candidates, key=len)
    return section_text.replace('\n', ' ').replace('\r', ' ').strip()
    
# Recursively look through all text files in folder and process them
def process_all_filings(folder_path):
    """Build one DataFrame from every ``.txt`` filing below ``folder_path``.

    The directory tree is walked recursively; each ``.txt`` file is handed to
    ``process_filing_file`` and every truthy result becomes one row.

    Parameters
    ----------
    folder_path : str
        Root directory to search.

    Returns
    -------
    pandas.DataFrame
        One row per successfully processed filing (empty if there were none).
    """
    # Lazily enumerate the paths of all .txt files in the tree
    txt_paths = (
        os.path.join(root, name)
        for root, _subdirs, names in os.walk(folder_path)
        for name in names
        if name.endswith('.txt')
    )

    # Parse each file and drop the ones that produced no record
    parsed = (process_filing_file(path) for path in txt_paths)
    records = [record for record in parsed if record]

    return pd.DataFrame(records)

folder_path = '../Data/10-X_C_2021/2021/'

# Time the run with perf_counter: unlike time.time() it is not affected by
# system clock adjustments, so the difference is a reliable duration.
start_time = time.perf_counter()

# Process all the filings
df_filings = process_all_filings(folder_path)

end_time = time.perf_counter()

# Elapsed time in seconds
execution_time = end_time - start_time
print(f"Time taken to process the filings: {execution_time} seconds")

# Display the resulting DataFrame
df_filings.head(20)

In [None]:
df_filings.tail(20)

In [15]:
# Persist the parsed filings. The notebook never defines `df_10X`; the frame
# holding the filings is `df_filings`.
df_filings.to_csv('10X_filings_data.csv', index=False)

In [13]:
# Create a sentiment analysis pipeline using the FinBERT model
nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

def tags_filter(text, start_tag, end_tag):
    """Return every piece of text enclosed by ``start_tag`` ... ``end_tag``.

    Both tags are treated as literal text (they are regex-escaped), so tags
    containing characters such as ``.``, ``(`` or ``[`` match exactly as
    written. Matching is non-greedy and spans newlines.

    Parameters
    ----------
    text : str
        Text to search.
    start_tag, end_tag : str
        Opening and closing markers.

    Returns
    -------
    list of str
        The enclosed contents in order of appearance; empty when nothing matches.
    """
    pattern = f'{re.escape(start_tag)}(.*?){re.escape(end_tag)}'
    return re.findall(pattern, text, re.DOTALL)  # re.DOTALL to match across newlines

# Extract the stock ticker 
def stock_symbols(text):
    """Find ticker-like tokens: standalone runs of 2 to 5 capital letters (e.g., AAPL, TSLA).

    Returns the matches as a list in order of appearance; repeats are kept.
    """
    ticker_like = re.compile(r'\b[A-Z]{2,5}\b')
    return ticker_like.findall(text)

# Function to clean and process document text
def clean_text(text):
    """Normalise whitespace: collapse every whitespace run to one space and trim both ends."""
    collapsed = re.compile(r'\s+').sub(' ', text)
    return collapsed.strip()

# Function to perform sentiment analysis on each section related to a company
def analyze_sentiment(text, stock_symbols):
    """Score the sentiment of every sentence that mentions one of the given symbols.

    The text is split into sentences after '.', '!' or '?' followed by spaces.
    A symbol counts as mentioned when it occurs as a substring of the sentence.
    Each mentioning sentence is passed to the module-level ``nlp`` pipeline
    exactly once, however many symbols it contains, because the model output
    depends only on the sentence.

    Parameters
    ----------
    text : str
        Document text to analyse.
    stock_symbols : iterable of str
        Symbols to look for.

    Returns
    -------
    list of dict
        One ``{"Company", "Text", "Sentiment"}`` entry per (sentence, symbol)
        pair, ordered by sentence and then by symbol. "Sentiment" is the first
        element of the pipeline's result for that sentence; rows of the same
        sentence share that object.
    """
    results = []
    # Split text into sentences or chunks for analysis
    sentences = re.split(r'(?<=[.!?]) +', text)

    for sentence in sentences:
        mentioned = [symbol for symbol in stock_symbols if symbol in sentence]
        if not mentioned:
            continue  # skip the (expensive) model call for irrelevant sentences

        # One model call per sentence, reused for every symbol it mentions
        sentiment = nlp(sentence)[0]
        results.extend(
            {"Company": symbol, "Text": sentence, "Sentiment": sentiment}
            for symbol in mentioned
        )
    return results

In [4]:
import dask.dataframe as dd
import pandas as pd


# Location of the Stata file to summarise
file_path = '../Data/2021/2021_Q1.dta'

# Rows per chunk when reading the .dta file (adjust based on available memory)
chunk_size = 100000

# Read the file chunk by chunk with pandas and collect every chunk
# (per-chunk processing could be added here)
chunk_list = list(pd.read_stata(file_path, chunksize=chunk_size))

# Stitch the chunks back together into one pandas DataFrame
df_pandas = pd.concat(chunk_list, ignore_index=True)

# Wrap the pandas DataFrame in a Dask DataFrame split into 10 partitions
df_dask = dd.from_pandas(df_pandas, npartitions=10)

# Summary statistics; .compute() triggers the actual computation
summary = df_dask.describe().compute()

print(summary)

  x = np.divide(x1, x2, out)
  x = np.divide(x1, x2, out)


           month_id          fiscal_date       year_ws  firm_loeschen  \
count  1.034538e+07             10345379  1.034538e+07   1.034538e+07   
min    2.460000e+02  1980-01-03 00:00:00  1.980000e+03   0.000000e+00   
25%    4.990000e+02  2000-12-31 00:00:00  2.000000e+03   0.000000e+00   
50%    5.980000e+02  2008-03-31 00:00:00  2.008000e+03   0.000000e+00   
75%    6.630000e+02  2014-03-31 00:00:00  2.014000e+03   0.000000e+00   
max    7.370000e+02  2020-12-31 00:00:00  2.021000e+03   1.000000e+00   
mean   5.811437e+02                  NaN  2.006784e+03   7.709123e-02   
std    1.082745e+02                  NaN  9.009677e+00   2.667361e-01   

             permno  dummy_in_crsp  siccd_numerical  fama_french_48  \
count  1.831076e+06      1831076.0     1.033028e+07    1.031494e+07   
min    1.000100e+04            1.0     1.000000e+02    1.000000e+00   
25%    5.671000e+04            1.0     2.911000e+03    1.800000e+01   
50%    8.046200e+04            1.0     3.999000e+03    3.4

In [14]:
# Load the 10-K report data from file
file_path = '../Data/10-X_C_2023/2023/QTR1/20230103_10-K_edgar_data_1487931_0001477932-23-000012.txt'
with open(file_path, 'r', encoding='utf-8') as f:
    document_text = f.read()


In [16]:
cleaned_text = clean_text(document_text)

# start_tag = "<ITEM 1A>"  
# end_tag = "</ITEM 1A>"  
# tagged_texts = tags_filter(cleaned_text, start_tag, end_tag)


In [17]:

# Step 2: Extract stock symbols from the text (you can hardcode the company symbols if needed).
# The helper defined above is `stock_symbols`; no `extract_stock_symbols` exists in this notebook.
# The result is stored under a different name so the helper is not overwritten
# (which would make this cell fail when re-run), and repeated symbols are dropped
# (order kept) so each sentence/symbol pair is reported once.
symbols = list(dict.fromkeys(stock_symbols(cleaned_text)))

# Step 3: Perform sentiment analysis for sentences containing stock symbols
sentiment_results = analyze_sentiment(cleaned_text, symbols)

# Step 4: Create a DataFrame for better visualization of results
df_sentiment = pd.DataFrame(sentiment_results)

# Display the first few rows of sentiment results
print(df_sentiment.head())

# Step 5: (Optional) Save the results to a CSV for later analysis
df_sentiment.to_csv('sentiment_analysis_results.csv', index=False)

KeyboardInterrupt: 