# mutual_fund_prospectuses scrape

In [None]:
## mutual_fund_prospectuses data structure
### company_name : company name
### {'0' : 
###        { mutual_fund_name : fund name
###          investment_objective : investment objective
###          principal_investment_strategies : investment strategies
###          principal_investment_risks : investment risks
###         }
###  '1' : 
###       {
###          ....
###          ....

In [1]:
from sec_api import QueryApi

import os

# The sec-api key is read from the SEC_API_KEY environment variable so that no
# credential (or non-working placeholder) lives in the notebook itself.
# A KeyError here means the variable has not been set.
queryApi = QueryApi(api_key=os.environ["SEC_API_KEY"])

def mutual_fund_prospectuses_query(start_day, end_day):
    """Build the query payload for 485BPOS filings filed in a date window.

    ``start_day`` and ``end_day`` are inserted verbatim into the ``filedAt``
    range, which runs from 14:00 on the start day to 19:00 on the end day.
    Example: ``mutual_fund_prospectuses_query("2021-09-15", "2021-09-22")``.

    Returns:
        A dict with the query string, ``from`` "0", ``size`` "20" and a
        descending sort on ``filedAt``.
    """
    # The backslash continues the string literal, so the leading spaces of the
    # next line are part of the query text.
    search_expr = "formType:485BPOS AND filedAt: \
                          [{0}T14:00:00.000 TO {1}T19:00:00.000]".format(start_day, end_day)
    newest_first = [{"filedAt": {"order": "desc"}}]
    return {
        "query": {"query_string": {"query": search_expr}},
        "from": "0",
        "size": "20",
        "sort": newest_first,
    }


In [2]:
def mutual_fund_link(filings):
    """Return the filing records stored under ``filings['filings']`` as a new list."""
    return list(filings['filings'])

In [3]:
import requests_random_user_agent
import requests
from bs4 import BeautifulSoup

def beautiful_soup(links):
    """Download each filing's detail page and parse it with BeautifulSoup.

    Args:
        links: Sequence of filing records; each must provide the page URL
            under the ``'linkToFilingDetails'`` key.

    Returns:
        A list of ``BeautifulSoup`` documents, one per record, in input order.
    """
    soups = []
    # One session serves every request so connections can be reused, and the
    # ``with`` block closes it even if a download raises.
    with requests.Session() as session:
        for link in links:
            response = session.get(url=link['linkToFilingDetails'])
            soups.append(BeautifulSoup(response.content, 'html.parser'))
    return soups

In [4]:
import re
def clean_text(texts):
    """Scrub every element of *texts* and return the results concatenated.

    Each ``texts[i]`` is converted to ``str`` and run through the
    substitutions below, in order. Elements are joined without a separator.
    Because elements are taken by index, passing a plain string processes it
    one character at a time.
    """
    substitutions = (
        (r'[@%\\*=/~#&\+á?\xc3\xa1\-\|\:\;\!\-\,\_\~\$\'\"]', ' '),  # punctuation -> space
        (r'\s+', ' '),            # collapse whitespace runs
        (r'<[^>]+>', ' '),        # HTML tags -> space
        (r'\s+', ' '),            # collapse again after tag removal
        (r"^\s+", ' '),           # leading whitespace -> single space
        (r'\s+$', ' '),           # trailing whitespace -> single space
        (r'\n\n', ''),
        (r'\'', ''),
        (r'\.\s+[0-9]+', ''),     # a period followed by whitespace and digits
    )
    pieces = []
    for pos in range(len(texts)):
        piece = str(texts[pos])
        for pattern, replacement in substitutions:
            piece = re.sub(pattern, replacement, piece)
        pieces.append(piece)
    return ''.join(pieces)

In [5]:
from pymongo import MongoClient
from pymongo.cursor import CursorType

class DBHandler:
    """Small convenience wrapper around a ``pymongo.MongoClient``."""

    def __init__(self, host="localhost", port=27017):
        """Create a client for the MongoDB server at *host*:*port*.

        The defaults match the local server that used to be hard-coded.
        *port* may be an int or a numeric string.
        """
        self.client = MongoClient(host, int(port))

    def insert_item(self, data, db_name=None, collection_name=None):
        """Insert *data* into ``db_name.collection_name`` and return its ``inserted_id``."""
        return self.client[db_name][collection_name].insert_one(data).inserted_id

    def find_item(self, db_name=None, collection_name=None):
        """Return the result of ``find()`` over every document in ``db_name.collection_name``."""
        return self.client[db_name][collection_name].find()

    def drop_collection(self, db_name=None, collection_name=None):
        """Drop ``db_name.collection_name``.

        This used to lack ``self`` and its arguments and called ``insert_one``
        on undefined names, so it could never run; it now actually drops the
        collection.
        """
        return self.client[db_name][collection_name].drop()

In [6]:
def mutual_fund_name(soup):
    """Return the cleaned text of every ``ix:nonnumeric`` tag named ``rr:RiskReturnHeading``.

    Each tag's text goes through ``clean_text`` and then has its whitespace
    collapsed to single spaces; results are in document order.
    """
    heading_tags = soup.find_all('ix:nonnumeric', {'name': 'rr:RiskReturnHeading'})
    return [" ".join(clean_text(tag.get_text()).split()) for tag in heading_tags]

In [7]:
def investment_objective(soup):
    """Return the cleaned text of every ``ix:nonnumeric`` tag named ``rr:ObjectivePrimaryTextBlock``.

    Each tag's text goes through ``clean_text`` and then has its whitespace
    collapsed to single spaces; results are in document order.
    """
    objective_tags = soup.find_all('ix:nonnumeric', {'name': 'rr:ObjectivePrimaryTextBlock'})
    return [" ".join(clean_text(tag.get_text()).split()) for tag in objective_tags]

In [8]:
def principal_investment_strategies(soup):
    """Return the cleaned text of every ``ix:nonnumeric`` tag named ``rr:StrategyNarrativeTextBlock``.

    Each tag's text goes through ``clean_text`` and then has its whitespace
    collapsed to single spaces; results are in document order.
    """
    strategy_tags = soup.find_all('ix:nonnumeric', {'name': 'rr:StrategyNarrativeTextBlock'})
    return [" ".join(clean_text(tag.get_text()).split()) for tag in strategy_tags]

In [9]:
def principal_investment_risks(soup):
    """Return the cleaned text of every ``ix:nonnumeric`` tag named ``rr:RiskNarrativeTextBlock``.

    Each tag's text goes through ``clean_text`` and then has its whitespace
    collapsed to single spaces; results are in document order.
    """
    risk_tags = soup.find_all('ix:nonnumeric', {'name': 'rr:RiskNarrativeTextBlock'})
    return [" ".join(clean_text(tag.get_text()).split()) for tag in risk_tags]

In [23]:
def _clean_fund_names(raw_names):
    """Keep the headings that name a fund and strip summary decorations from them."""
    fund_names = []
    for name in raw_names:
        if 'Fund' not in name and 'FUND' not in name:
            continue
        # Filtering while building a new list (rather than removing from the
        # list being iterated) ensures no Summary/Overview heading is skipped.
        if 'Summary' in name or 'Overview' in name:
            continue
        name = name.replace('SUMMARY SECTION', '').replace('SUMMARY', '')
        fund_names.append(name.replace('—', '').strip())
    return fund_names


def mutual_fund_prospectuses_text_cleaner(soups, links):
    """Extract the per-fund sections from each prospectus and store them in MongoDB.

    For every page that yields at least one fund name, one document per fund
    is inserted into ``quant_project.mutual_fund_prospectuses`` with the keys
    ``company_name``, ``mutual_fund``, ``investment_objective``,
    ``principal_investment_strategies`` and ``principal_investment_risks``.

    Args:
        soups: Parsed prospectus pages.
        links: Filing records in the same order as *soups*; each must provide
            ``'companyName'``.

    Raises:
        IndexError: If a page has more fund names than extracted sections.
    """
    mongo = DBHandler()  # one client for the whole batch instead of one per row

    # Pairing each page with its own record keeps the company name correct
    # even when earlier pages are skipped.
    for soup, link in zip(soups, links):
        fund_names = _clean_fund_names(mutual_fund_name(soup))
        if not fund_names:
            continue

        objectives = investment_objective(soup)
        strategies = principal_investment_strategies(soup)
        risks = principal_investment_risks(soup)
        company_name = link['companyName']

        # NOTE(review): fund names are filtered but the section lists are not,
        # so position i may not refer to the same fund in all four lists —
        # confirm against real filings.
        for position, fund in enumerate(fund_names):
            data = {
                'company_name': company_name,
                'mutual_fund': fund,
                'investment_objective': objectives[position],
                'principal_investment_strategies': strategies[position],
                'principal_investment_risks': risks[position],
            }
            mongo.insert_item(data, 'quant_project', 'mutual_fund_prospectuses')
    
    
    

In [24]:
def mutual_fund_prospectuses_main(start_day='2022-01-01', end_day='2022-03-05'):
    """Run the prospectus pipeline for filings between *start_day* and *end_day*.

    Queries the filings, downloads and parses each one, then extracts the
    fund sections and stores them. The defaults reproduce the date window
    that used to be hard-coded.

    Args:
        start_day: First day of the window, formatted ``YYYY-MM-DD``.
        end_day: Last day of the window, formatted ``YYYY-MM-DD``.
    """
    filings = queryApi.get_filings(mutual_fund_prospectuses_query(start_day, end_day))
    links = mutual_fund_link(filings)
    soups = beautiful_soup(links)
    mutual_fund_prospectuses_text_cleaner(soups, links)

In [25]:
# Run the prospectus pipeline when this cell executes: query the filings,
# download them, and store the extracted sections in MongoDB.
mutual_fund_prospectuses_main()

In [68]:
# Load data from the DB (test)

def test():
    """Print the ``mutual_fund_prospectuses`` field of up to four stored documents.

    NOTE(review): this reads database 'quant' and the key
    'mutual_fund_prospectuses', while the prospectus insert code visible in
    this file writes flat documents to 'quant_project' — confirm which
    database and schema are intended.
    """
    handler = DBHandler()
    cursor = handler.find_item('quant', 'mutual_fund_prospectuses').limit(4)
    for document in cursor:
        print(document['mutual_fund_prospectuses'])

{0: {'mutual_fund_name': 'NASDAQ 100 INDEX FUND Ticker Symbols NASDX NDXKX and NQQQX', 'investment_objective': 'The Fund’s investment objective is to attempt to replicate the performance of the largest non financial companies as measured by the Nasdaq 100 Index®.', 'principal_investment_strategies': 'The Nasdaq 100 Index® includes 100 of the largest domestic and international non financial companies listed on The Nasdaq Stock Market based on market capitalization. Nasdaq which maintains the Index makes all determinations regarding the inclusion of stocks in the Index. Each stock is weighted in proportion to its total market value. The Fund is passively managed. It invests primarily in the stocks comprising the Index so that the weighting of each stock in the portfolio approximates the Index. Shelton Capital Management the investment advisor to the Fund seeks to maintain a return correlation of at least 0.95 to the Nasdaq 100 Index® (a return correlation of 1.00 is perfect). Under norma

# form 8-k scrape

In [258]:
from sec_api import QueryApi

import os

# The sec-api key is read from the SEC_API_KEY environment variable instead of
# being committed in the notebook. The key that used to be hard-coded on this
# line should be treated as exposed and rotated.
# A KeyError here means the variable has not been set.
queryApi = QueryApi(api_key=os.environ["SEC_API_KEY"])

def form_8k_query(start_day, end_day):
    """Build the query payload for 8-K filings filed in a date window.

    ``start_day`` and ``end_day`` are inserted verbatim into the ``filedAt``
    range, which runs from 14:00 on the start day to 19:00 on the end day.
    Example: ``form_8k_query("2021-09-15", "2021-09-22")``.

    Returns:
        A dict with the query string, ``from`` "0", ``size`` "20" and a
        descending sort on ``filedAt``.
    """
    # The backslash continues the string literal, so the leading spaces of the
    # next line are part of the query text.
    search_expr = "formType:\"8-K\" AND filedAt: \
                          [{0}T14:00:00.000 TO {1}T19:00:00.000]".format(start_day, end_day)
    newest_first = [{"filedAt": {"order": "desc"}}]
    return {
        "query": {"query_string": {"query": search_expr}},
        "from": "0",
        "size": "20",
        "sort": newest_first,
    }


In [259]:
def form_8k_link(filings):
    """Return the filing records stored under ``filings['filings']`` as a new list."""
    return list(filings['filings'])

In [260]:
import requests_random_user_agent
import requests
from bs4 import BeautifulSoup
from sec_api import QueryApi

def beautiful_soup(links):
    """Download each filing's detail page and parse it with BeautifulSoup.

    Args:
        links: Sequence of filing records; each must provide the page URL
            under the ``'linkToFilingDetails'`` key.

    Returns:
        A list of ``BeautifulSoup`` documents, one per record, in input order.
    """
    soups = []
    # One session serves every request so connections can be reused, and the
    # ``with`` block closes it even if a download raises.
    with requests.Session() as session:
        for link in links:
            response = session.get(url=link['linkToFilingDetails'])
            soups.append(BeautifulSoup(response.content, 'html.parser'))
    return soups

In [261]:
import re
def clean_text(texts):
    """Scrub every element of *texts* and return the results concatenated.

    Each ``texts[i]`` is converted to ``str`` and run through the
    substitutions below, in order. Elements are joined without a separator.
    Because elements are taken by index, passing a plain string processes it
    one character at a time.
    """
    substitutions = (
        (r'[@%\\*=/~#&\+á?\xc3\xa1\|\!\$]', ' '),  # selected symbols -> space
        (r'\s+', ' '),            # collapse whitespace runs
        (r'<[^>]+>', ' '),        # HTML tags -> space
        (r'\s+', ' '),            # collapse again after tag removal
        (r"^\s+", ' '),           # leading whitespace -> single space
        (r'\s+$', ' '),           # trailing whitespace -> single space
        (r'\n\n', ''),
    )
    pieces = []
    for pos in range(len(texts)):
        piece = str(texts[pos])
        for pattern, replacement in substitutions:
            piece = re.sub(pattern, replacement, piece)
        pieces.append(piece)
    return ''.join(pieces)

In [262]:
# Form 8-K item headings. form_8k_text_cleaner looks for these as substrings to
# find where the body of a filing starts and to strip the heading itself.
check_list = [
    'Entry into a Material Definitive Agreement',
    'Termination of a Material Definitive Agreement',
    'Bankruptcy or Receivership',
    'Mine Safety - Reporting of Shutdowns and Patterns of Violations',
    'Completion of Acquisition or Disposition of Assets',
    'Results of Operations and Financial Condition',
    'Creation of a Direct Financial Obligation or an Obligation under an Off-Balance Sheet Arrangement of a Registrant',
    'Triggering Events That Accelerate or Increase a Direct Financial Obligation or an Obligation under an Off-Balance Sheet Arrangement',
    'Costs Associated with Exit or Disposal Activities',
    'Material Impairments',
    'Notice of Delisting or Failure to Satisfy a Continued Listing Rule or Standard; Transfer of Listing',
    'Unregistered Sales of Equity Securities',
    'Material Modification to Rights of Security Holders',
    "Changes in Registrant's Certifying Accountant",
    'Non-Reliance on Previously Issued Financial Statements or a Related Audit Report or Completed Interim Review',
    'Change in Control of Registrant',
    'Departure of Directors or Certain Officers; Election of Directors; Appointment of Certain Officers; Compensatory Arrangements of Certain Officers',
    'Amendments to Articles of Incorporation or Bylaws; Change in Fiscal Year',
    "Temporary Suspension of Trading Under Registrant's Employee Benefit Plans",
    "Amendment to Registrant's Code of Ethics, or Waiver of a Provision of the Code of Ethics",
    'Change in Shell Company Status',
    'Submission of Matters to a Vote of Security Holders',
    'Shareholder Director Nominations',
    'ABS Informational and Computational Material',
    'Change of Servicer or Trustee',
    'Change in Credit Enhancement or Other External Support',
    'Failure to Make a Required Distribution',
    'Securities Act Updating Disclosure',
    'Regulation FD Disclosure',
    'Other Events',
    'Financial Statements and Exhibits',
]
   

In [263]:
def form_8k_text_cleaner(text):
    """Extract the body text of an 8-K filing from its raw page text.

    The text is split on '.', each fragment is cleaned with ``clean_text``
    and whitespace-normalised, and the fragments are then narrowed in stages:

    1. keep from the first fragment containing 'Item' up to (not including)
       a fragment that is exactly 'Item 9';
    2. within that, keep from the first fragment containing a ``check_list``
       heading;
    3. drop fragments that start with 'Item ';
    4. cut each fragment down to what follows any heading it contains;
    5. stop at the first fragment containing 'SIGNATURE'.

    Args:
        text: Raw text of the filing page.

    Returns:
        The surviving fragments joined with '.', without a leading '.'.
    """
    fragments = [" ".join(clean_text(part).split()) for part in text.split('.')]

    # Stage 1: the span between the first 'Item' mention and 'Item 9'.
    form_content = []
    in_items = False
    for fragment in fragments:
        if fragment == 'Item 9':
            break
        if in_items or 'Item' in fragment:
            in_items = True
            form_content.append(fragment)

    # Stage 2: start at the first known heading. The starting fragment is
    # added once, even when it matches several headings.
    clean_content = []
    in_body = False
    for fragment in form_content:
        if in_body:
            clean_content.append(fragment)
        elif any(heading in fragment for heading in check_list):
            in_body = True
            clean_content.append(fragment)

    # Stage 3: drop the bare 'Item N' fragments left over from the '.' split.
    clean_string = [fragment for fragment in clean_content if fragment[:5] != 'Item ']

    # Stage 4: strip headings. A fragment holding several headings contributes
    # one entry per heading, each cut from the previous remainder.
    # NOTE(review): that repeats text for multi-heading fragments — confirm
    # whether this is intended.
    c_text = []
    for fragment in clean_string:
        matched = False
        for heading in check_list:
            if heading in fragment:
                cut = fragment.find(heading) + len(heading)
                fragment = fragment[cut:].strip()
                c_text.append(fragment)
                matched = True
        if not matched:
            c_text.append(fragment.strip())

    # Stage 5: everything from the signature block onwards is discarded.
    body = []
    for fragment in c_text:
        if 'SIGNATURE' in fragment:
            break
        body.append(fragment.strip())

    joined = '.'.join(body)
    # A heading that ended with '.' leaves an empty first fragment, so the
    # joined text starts with '.'. Only that separator is removed; a real
    # first character is kept.
    return joined[1:] if joined.startswith('.') else joined


In [264]:
def form_8k_db_insert(soups, links=None):
    """Clean each 8-K page and store it with its company name in MongoDB.

    One document per page is inserted into ``quant_project.form_8k``.

    Args:
        soups: Parsed filing pages.
        links: Filing records in the same order as *soups*; each must provide
            ``'companyName'``. When omitted, the module-level ``links`` is
            used, which is what this function read before the parameter
            existed.

    Raises:
        NameError: If *links* is omitted and no module-level ``links`` exists.
    """
    if links is None:
        try:
            links = globals()['links']
        except KeyError:
            raise NameError(
                "name 'links' is not defined; pass the filing records as `links`"
            ) from None

    mongo = DBHandler()  # one client for the whole batch instead of one per page
    for soup, link in zip(soups, links):
        text = form_8k_text_cleaner(soup.get_text().strip())
        # NOTE(review): the 8-K text is stored under the key
        # 'mutual_fund_prospectuses', which looks like a leftover name —
        # confirm with readers of the collection before renaming it.
        data = {'company_name': link['companyName'], 'mutual_fund_prospectuses': text}
        mongo.insert_item(data, 'quant_project', 'form_8k')


def form8k_main(start_day='2022-01-01', end_day='2022-03-05'):
    """Run the 8-K pipeline for filings between *start_day* and *end_day*.

    Queries the filings, downloads and parses each one, then cleans and
    stores the text. The defaults reproduce the date window that used to be
    hard-coded.

    Args:
        start_day: First day of the window, formatted ``YYYY-MM-DD``.
        end_day: Last day of the window, formatted ``YYYY-MM-DD``.
    """
    filings = queryApi.get_filings(form_8k_query(start_day, end_day))
    links = form_8k_link(filings)
    soups = beautiful_soup(links)
    # The records are passed explicitly; the insert step previously depended
    # on a global ``links`` that this function never set.
    form_8k_db_insert(soups, links)

In [266]:
# Run the 8-K pipeline when this cell executes: query the filings, download
# them, and store the cleaned text in MongoDB.
form8k_main()