In [None]:
import os
import re
import pandas as pd
import requests
import pickle
from PIL import Image
from io import BytesIO
from tqdm import tqdm
tqdm.pandas()
from bs4 import BeautifulSoup
headers={'User-Agent': 'email@gmail.com'}

## Get Filings From WRDS (this example uses DEF 14A; change the form filter in the query to retrieve any other SEC filing type)

In [None]:
import psycopg2
from psycopg2.extras import json as psycop_json

def get_filing(user=None, password=None):
    """Download DEF 14A filings filed after 2019-01-01 from the WRDS SEC search tables.

    Parameters
    ----------
    user, password : str, optional
        WRDS credentials. When omitted, the ``WRDS_USERNAME`` / ``WRDS_PASSWORD``
        environment variables are used; if those are unset too, the original
        placeholder strings are passed (and the connection will be rejected).

    Returns
    -------
    pandas.DataFrame
        One row per row returned by the query; the columns are named after the
        selected fields (``form``, ``filing_date``, ``filing``, ``registrants``).
    """
    # NOTE(review): the join to `registrant` selects no registrant columns; if a
    # filing has several registrant rows it would presumably come back once per
    # registrant -- confirm the duplication is intended.
    sql_query = """
             SELECT
                filing_view.form,
                filing_view.filing_date,
                filing_view.filing,
                filing_view.registrants
            FROM
                wrds_sec_search.filing_view
            JOIN
                wrds_sec_search.registrant ON registrant.accession = filing_view.accession
            WHERE
                filing_view.form = 'DEF 14A'
            AND
                filing_date > '2019-01-01'
            """

    conn = psycopg2.connect(
        host="wrds-pgdata.wharton.upenn.edu",
        database='wrds',
        user=user or os.environ.get('WRDS_USERNAME', 'Provide your username'),
        password=password or os.environ.get('WRDS_PASSWORD', 'Provide your password'),
        port=9737
    )
    # psycopg2's `with connection:` only ends the transaction, it does not close
    # the connection, so close it explicitly once the rows have been fetched.
    try:
        conn.autocommit = True
        with conn.cursor() as cursor:
            cursor.execute(sql_query)
            rows = cursor.fetchall()
            column_names = [desc[0] for desc in cursor.description]
    finally:
        conn.close()

    return pd.DataFrame(rows, columns=column_names)


# Pull the filings from WRDS.
Data = get_filing()

# Every `registrants` entry is a sequence; keep only its first element.
Data['registrants'] = [entry[0] for entry in Data['registrants']]

# Expand that first registrant into its own columns and append them to the frame.
registrant_columns = Data['registrants'].apply(pd.Series)
Data = pd.concat([Data, registrant_columns], axis=1)

In [None]:
def keep_compustat_firms(Data, user=None, password=None):
    """Keep only the filings whose CIK appears in the WRDS CIK-GVKEY link table.

    Parameters
    ----------
    Data : pandas.DataFrame
        Filings frame with a ``cik`` column.
    user, password : str, optional
        WRDS credentials. When omitted, the ``WRDS_USERNAME`` / ``WRDS_PASSWORD``
        environment variables are used; if those are unset too, the original
        placeholder strings are passed (and the connection will be rejected).

    Returns
    -------
    pandas.DataFrame
        The rows of ``Data`` whose ``cik`` is found in ``wrdssec.wciklink_gvkey``
        (restricted to ``flag in (2,3)``), with the index reset.
    """
    sql_query = """
            SELECT
                *
            FROM
                wrdssec.wciklink_gvkey
            Where
                flag in (2,3)
            """

    conn = psycopg2.connect(
        host="wrds-pgdata.wharton.upenn.edu",
        database='wrds',
        user=user or os.environ.get('WRDS_USERNAME', 'Provide your username'),
        password=password or os.environ.get('WRDS_PASSWORD', 'Provide your password'),
        port=9737
    )
    # psycopg2's `with connection:` only ends the transaction, it does not close
    # the connection, so close it explicitly once the link table has been fetched.
    try:
        conn.autocommit = True
        with conn.cursor() as cursor:
            cursor.execute(sql_query)
            results = cursor.fetchall()
            column_names = [desc[0] for desc in cursor.description]
    finally:
        conn.close()

    link = pd.DataFrame(results, columns=column_names)
    link = link.drop_duplicates(subset=['gvkey', 'cik'])

    # NOTE(review): `isin` compares values as-is, so Data['cik'] and link['cik']
    # must share the same type/format for any row to match -- confirm.
    return Data[Data['cik'].isin(link['cik'].tolist())].reset_index(drop=True)

# Restrict the filings to firms whose CIK is present in the CIK-GVKEY link table.
Data=keep_compustat_firms(Data)

In [None]:
# Normalise the CIK to a plain digit string; the round-trip through int removes
# any zero padding (e.g. '0000320193' becomes '320193').
cik_as_text = Data['cik'].astype(int).astype(str)
Data['cik'] = cik_as_text

# Accession number with the dashes stripped out.
Data['accessionstr'] = Data['accession'].str.replace("-", "")

# URL of each filing's index page, built from the CIK and the dashed accession number.
Data['link'] = (
    'https://www.sec.gov/Archives/edgar/data/'
    + cik_as_text
    + '/'
    + Data['accession']
    + '-index.html'
)

In [None]:
def clean_text(text):
    """Replace noise in ``text`` with single spaces.

    Each of the following is replaced by one space: a run of whitespace, a
    token containing ``@`` (plus one trailing whitespace character), an
    apostrophe, a bullet character (``•`` / ``●``) and the characters
    ``\\x92``-``\\x98``.

    Parameters
    ----------
    text : str or None
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text, or ``''`` when ``text`` is empty or ``None``.
    """
    if not text:
        return ''
    # Raw string: `\s` / `\S` are regex escapes, not valid Python string escapes,
    # and newer Python versions warn about them in a normal string literal.
    return re.sub(r"(\s+|\S*@\S*\s?|[']|[•●\x92-\x98])", ' ', text)

def get_first_document(doc):
    """Return the cleaned text of the first document section that has a ``text`` child.

    Parameters
    ----------
    doc : str
        Markup of a filing, parsed with BeautifulSoup's ``html.parser``.

    Returns
    -------
    str or None
        ``clean_text`` applied to the text of the first matching section, or
        ``None`` when no section with a ``text`` child is found.
    """
    soup = BeautifulSoup(doc, 'html.parser')

    # The regex is matched against tag names, ignoring case.
    for document in soup.find_all(re.compile("DOCUMENT", re.IGNORECASE)):
        text_section = document.find('text')
        if text_section:
            # Only the first hit is needed, so stop here instead of cleaning
            # every remaining document.
            return clean_text(text_section.text)

    return None

# Clean every filing (with a progress bar) and store the result in a new `text` column.
Data['text'] = Data['filing'].progress_apply(clean_text)

In [None]:
# Preview the first rows only instead of rendering the whole frame.
Data.head()

# Llama Setup

# For this section, you will need



1.   Access to Llama-2 (https://ai.meta.com/resources/models-and-libraries/llama-downloads/)
2.   Access to a Hugging Face API key
3.   Access to an OpenAI API key (optional)





In [None]:
!pip install -qqq  transformers einops accelerate langchain bitsandbytes
!pip install -qqq transformers
!pip install -qqq llama-index
!pip install -qqq sentence_transformers
!pip install -qqq ipywidgets==7.7.1

In [None]:
import torch
import transformers

In [None]:
from huggingface_hub import notebook_login
# Prompt for a Hugging Face access token; presumably required to download the
# meta-llama checkpoint loaded further down -- confirm.
notebook_login()

In [None]:
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.llms import HuggingFaceLLM

In [None]:
from llama_index.prompts.prompts import SimpleInputPrompt

# System message handed to the model (passed as `system_prompt` when the LLM is built).
system_prompt = (
    "You are an Finance expert and Q&A assistant. "
    "Your goal is to answer questions as accurately as possible based on the instructions and context provided."
)

# Template wrapped around the default prompts that are internal to llama-index;
# `{query_str}` is the placeholder for the query text.
# NOTE(review): the <|USER|>/<|ASSISTANT|> markers do not look like the
# [INST] ... [/INST] chat format documented for Llama-2 chat models -- confirm
# this wrapper is the intended one for the checkpoint used below.
query_wrapper_prompt = SimpleInputPrompt("<|USER|>{query_str}<|ASSISTANT|>")

In [None]:
# Checkpoint used for both the tokenizer and the model weights.
llama_checkpoint = "meta-llama/Llama-2-7b-chat-hf"

llm = HuggingFaceLLM(
    model_name=llama_checkpoint,
    tokenizer_name=llama_checkpoint,
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    context_window=4096,
    max_new_tokens=512,
    # Sampling is switched off for generation.
    generate_kwargs={"temperature": 0.0, "do_sample": False},
    tokenizer_kwargs={"return_token_type_ids": False},
    # Half-precision dtype combined with 8-bit loading.
    model_kwargs={"torch_dtype": torch.float16, "load_in_8bit": True},
    device_map="auto",
)

In [None]:
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index import LangchainEmbedding, ServiceContext

# Sentence-transformers embedding model, wrapped in the llama-index adapter for
# LangChain embeddings.
hf_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
embed_model = LangchainEmbedding(hf_embeddings)

In [None]:
# Bundle the LLM, the embedding model and the chunk size into the context that is
# later passed to the index.
service_context = ServiceContext.from_defaults(llm=llm, embed_model=embed_model, chunk_size=1024)

# OpenAI Setup

In [None]:
!pip install openai

In [None]:
import os
import openai
# Read the key from the OPENAI_API_KEY environment variable so it is not stored in
# the notebook; "sk-" is the original placeholder and is used when the variable is unset.
openai.api_key = os.environ.get("OPENAI_API_KEY", "sk-")

# Creating Pipeline

In [None]:
def qa_llama(context, temp_dir='Your Working Folder/temp'):
    """Ask the Llama pipeline how many directors are classified as independent.

    The text is written to ``temp.txt`` inside ``temp_dir``, loaded with
    ``SimpleDirectoryReader``, indexed with ``VectorStoreIndex`` and queried.

    Parameters
    ----------
    context : str
        Filing text to query.
    temp_dir : str, optional
        Folder used for the temporary file. It is created if it does not exist.

    Returns
    -------
    str
        The model's answer, or the string ``'error'`` if any step fails.
    """
    temp_path = os.path.join(temp_dir, 'temp.txt')
    try:
        os.makedirs(temp_dir, exist_ok=True)
        try:
            # Explicit UTF-8 so non-ASCII filing text does not depend on the
            # platform's default encoding.
            with open(temp_path, 'w', encoding='utf-8') as f:
                f.write(context)

            # use SimpleDirectoryReader to read the documents
            documents = SimpleDirectoryReader(temp_dir).load_data()
        finally:
            # Remove the temporary file even when writing or loading fails, so a
            # stale file is never picked up by a later call.
            if os.path.exists(temp_path):
                os.remove(temp_path)

        index = VectorStoreIndex.from_documents(documents, service_context=service_context)

        query_engine = index.as_query_engine()
        response = query_engine.query('''As a finance expert, using following text please answer the following question. How many directors are classified as independent director?''')
        return response.response
    except Exception as exc:
        # Best effort: report the cause and return a sentinel so a bulk run keeps
        # going. KeyboardInterrupt is deliberately not caught.
        print(f'error: {exc!r}')
        return 'error'


In [None]:
def qa_openai(context):
    """Ask an OpenAI chat model how many directors are classified as independent.

    Parameters
    ----------
    context : str
        Filing text appended to the question.

    Returns
    -------
    str
        The content of the first returned choice, or a fixed error message if
        the request (or building it) fails.
    """
    # Same question as the Llama pipeline; the stray opening double quote that
    # preceded it has been removed.
    question = (
        'As a finance expert, using following text please answer the following question. '
        'How many directors are classified as independent director?'
    )
    try:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo-16k",
            temperature=0.0,
            messages=[
                {"role": "system", "content": "You are a helpful finance expert."},
                {"role": "user", "content": question + '\n \n' + context},
            ],
        )
        return response["choices"][0]["message"]['content']
    except Exception as exc:
        # Report the actual cause; the returned message is unchanged for callers.
        print(f'qa_openai failed: {exc!r}')
        return 'Error. Please check 1) API Key 2)Token Size'

In [5]:
# Example using Llama: ask the question about a single filing (the row labelled 10).
print(qa_llama(Data.text[10]))

Based on the provided context information, 5 directors are independent.


In [4]:
# Example using OpenAI: ask the same question about the same filing (the row labelled 10).
print(qa_openai(Data.text[10]))

Total 5 directors are classified as independent directors


In [None]:
# Run the Llama question-answering step over every filing (with a progress bar)
# and keep the answers in a new `output_llama` column.
Data['output_llama'] = Data['text'].progress_apply(qa_llama)

In [None]:
# Save the updated data to an Excel workbook in the current working directory.
# NOTE(review): Excel limits the number of characters per cell, so the long
# filing/text columns may not round-trip intact -- confirm, or export those
# columns in another format.
Data.to_excel('updated_data.xlsx')