# Scraping financial data from Yahoo Finance

In [4]:
# importing required libraries 
import pandas as pd
import requests
from bs4 import BeautifulSoup
from collections import defaultdict

In [3]:
# Browser-like HTTP request headers passed to requests.get() when downloading pages.
# The referer is sent under 'Referer', the header name defined by HTTP
# ('Referrer' is not a standard request header).
headers = {'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;exchange;v=b3',
                   # NOTE(review): requests only decodes 'br' responses when a Brotli package is installed - confirm
                   'Accept-Encoding': 'gzip, deflate, br',
                   'Accept-Language': 'en-US,en;q=0.9',
                   'Cache-Control': 'max-age=0',
                   'Connection': 'close',
                   'DNT': '1', # Do Not Track Request Header
                   'Pragma': 'no-cache',
                   'Referer': 'https://google.com',
                   'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36(KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36'
                  }

## Specifying input file and output file
#### Mutual Funds
Either

In [5]:
# Manually specifying ticker symbols, or
mf_tickers = []

Or

In [11]:
# Alternatively, load the ticker symbols from the 'fund_symbol' column of the input CSV
mf_tickers = list(pd.read_csv('Input/MutualFunds_data.csv')['fund_symbol'])

In [13]:
len(mf_tickers)

23437

In [14]:
# Write one ticker per line to the text file used as input for the mapreduce jobs.
# The file is opened once (not once per ticker) and in 'w' mode, so re-running
# this cell regenerates the file instead of appending duplicate tickers.
with open('test_in_out/MF_tickers.txt', 'w') as input_file:
    input_file.writelines(value + '\n' for value in mf_tickers)

#### ETFs

In [16]:
# Load the ETF ticker symbols from the 'ETF_ticker' column of the input CSV
etf_tickers = list(pd.read_csv('Input/ETF_tickers.csv')['ETF_ticker'])

In [17]:
len(etf_tickers)

3035

In [18]:
# Write one ETF ticker per line to a text file.
# The file is opened once (not once per ticker) and in 'w' mode, so re-running
# this cell regenerates the file instead of appending duplicate tickers.
with open('test_in_out/ETF_tickers.txt', 'w') as input_file:
    input_file.writelines(value + '\n' for value in etf_tickers)

## Creating web scraper using Beautifulsoup as a mapreduce job

In [153]:
%%file mf_data_scraper.py
from mrjob.job import MRJob
from mrjob.protocol import JSONValueProtocol
from mrjob.step import MRStep
import requests
from bs4 import BeautifulSoup
import time
import json
from collections import defaultdict

class MRMutualFundDataScraper(MRJob):
    """Scrape Yahoo Finance mutual-fund attributes, one ticker per input line.

    The job runs two mapper-only steps: the first forwards each input line
    (a ticker symbol) unchanged, the second downloads the fund's summary,
    profile, risk, holdings and performance pages and yields one
    ``(ticker, {attribute: value})`` pair per scraped attribute.
    """

    # CSS class strings used to locate a section container on a page and the
    # attribute rows inside that section.
    _SECTION_CLASS = "Mb(25px)"
    _ROW_CLASS = "Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)"

    # Seconds to wait on a request before requests raises a timeout error;
    # without a timeout a stalled connection would block the task forever.
    _REQUEST_TIMEOUT = 30

    # Browser-like request headers sent with every page download. The referer
    # is sent under 'Referer', the header name defined by HTTP.
    _HEADERS = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;exchange;v=b3',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-US,en;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'close',
        'DNT': '1',  # Do Not Track Request Header
        'Pragma': 'no-cache',
        'Referer': 'https://google.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36(KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
    }

    # (tab name, URL path suffix) for every page downloaded per ticker,
    # in download order.
    _TAB_PAGES = (
        ('summary', ''),
        ('profile', '/profile'),
        ('risk', '/risk'),
        ('holdings', '/holdings'),
        ('performance', '/performance'),
    )

    # Each group is (tab name, section index, span index, ((key, row index), ...)).
    # The value of a key is the text of span[span index] in the given row of
    # the given section. Groups and keys are listed in output order.
    _GROUPS_BEFORE_HISTORY = (
        # profile tab - fund overview section (row 3 is not collected)
        ('profile', 0, -1, (
            ('category', 0),
            ('fund_family', 1),
            ('net_assets', 2),
            ('yield_income', 4),
            ('morningstar_rating', 5),
            ('inception_date', 6),
        )),
        # profile tab - fund operations section
        ('profile', 1, -1, (
            ('holdings_t/o', 2),
        )),
        # profile tab - fees & expenses section
        ('profile', 2, -2, (
            ('expense_ratio', 0),
        )),
        # holdings tab - portfolio composition section (row 3 is not collected)
        ('holdings', 0, -1, (
            ('cash_%', 0),
            ('stocks_%', 1),
            ('bonds_%', 2),
            ('preferred_%', 4),
            ('convertible_%', 5),
        )),
        # holdings tab - sector weightings section
        ('holdings', 1, -1, (
            ('basic_materials_%', 0),
            ('consumer_cyclical_%', 1),
            ('financial_services_%', 2),
            ('real_estate_%', 3),
            ('consumer_defensive_%', 4),
            ('healthcare_%', 5),
            ('utilities_%', 6),
            ('communication_services_%', 7),
            ('energy_%', 8),
            ('industries_%', 9),
            ('technology_%', 10),
        )),
        # performance tab - trailing returns section
        ('performance', 1, -2, (
            ('ytd', 0),
            ('1_month', 1),
            ('3_month', 2),
            ('1_year', 3),
            ('3_year', 4),
            ('5_year', 5),
            ('10_year', 6),
            ('last_bull_market', 7),
            ('last_bear_market', 8),
        )),
    )
    _GROUPS_AFTER_HISTORY = (
        # risk tab - risk overview section
        ('risk', 0, -1, (
            ('morningstar_risk_rating', 0),
            ('number_of_years_up', 1),
            ('number_of_years_down', 2),
        )),
    )

    # risk tab - risk statistics section: one row per statistic (in this
    # order), with the 3/5/10-year values at span indexes 1, 4 and 7.
    _RISK_STATISTICS = (
        'alpha',
        'BETA',
        'mean_annual_return',
        'r-squared',
        'std_deviation',
        'sharpe_ratio',
        'treynor_ratio',
    )
    _RISK_PERIODS = (('3_year', 1), ('5_year', 4), ('10_year', 7))

    def _fetch_tab(self, ticker, page=''):
        """Download one quote page for ``ticker`` and return it parsed.

        ``page`` is the URL path suffix selecting the tab: '' for the summary
        tab, or e.g. '/profile' for the profile tab.
        """
        url = 'https://finance.yahoo.com/quote/{0}{1}?p={0}'.format(ticker, page)
        response = requests.get(url, headers=self._HEADERS, timeout=self._REQUEST_TIMEOUT)
        return BeautifulSoup(response.text, 'html.parser')

    def _section_rows(self, tab, section_index):
        """Return the attribute rows of section ``section_index`` on ``tab``.

        Raises IndexError when the page has no section at that index.
        """
        sections = tab.find_all('div', self._SECTION_CLASS)
        return sections[section_index].find_all('div', self._ROW_CLASS)

    def _group_values(self, tabs, group):
        """Lazily yield ``(key, text)`` for every field of one field group."""
        tab_name, section_index, span_index, fields = group
        # Look the rows up once per group instead of once per field.
        rows = self._section_rows(tabs[tab_name], section_index)
        for key, row_index in fields:
            yield key, rows[row_index].find_all('span')[span_index].text

    def mapper_read_ticker(self, _, ticker):
        """Forward each input line (a ticker symbol) to the next step."""
        yield None, ticker

    def mapper_2(self, _, ticker):
        """Scrape all attributes of one fund.

        Yields ``(ticker, {name: value})`` pairs, one per attribute, followed
        by ``(ticker, {'time': seconds})`` where ``seconds`` is the time spent
        extracting values after the pages were downloaded.

        A missing section, row or span raises IndexError (and a non-numeric
        year raises ValueError) at the point the value would be yielded.
        """
        # Download every tab exactly once, in the order listed in _TAB_PAGES.
        tabs = {}
        for tab_name, page in self._TAB_PAGES:
            tabs[tab_name] = self._fetch_tab(ticker, page)

        # defining start time (downloads are deliberately not included)
        start = time.time()

        # summary tab
        yield ticker, {'mf_name': tabs['summary'].find_all('h1')[0].text}

        for group in self._GROUPS_BEFORE_HISTORY:
            for key, value in self._group_values(tabs, group):
                yield ticker, {key: value}

        # performance tab - annual total returns history section:
        # one row per year, visited in page order.
        annual_returns_hist = defaultdict(list)
        for history_row in self._section_rows(tabs['performance'], 2):
            row_spans = history_row.find_all('span')
            year = int(row_spans[0].text)
            annual_returns_hist[year].append(row_spans[-2].text)
        yield ticker, {'annual_returns_history': annual_returns_hist}

        for group in self._GROUPS_AFTER_HISTORY:
            for key, value in self._group_values(tabs, group):
                yield ticker, {key: value}

        # risk tab - risk statistics section
        statistic_rows = self._section_rows(tabs['risk'], 1)
        for row_index, statistic in enumerate(self._RISK_STATISTICS):
            row_spans = statistic_rows[row_index].find_all('span')
            for period, span_index in self._RISK_PERIODS:
                yield ticker, {'{}_{}'.format(period, statistic): row_spans[span_index].text}

        yield ticker, {'time': time.time() - start}

    def steps(self):
        """Run the pass-through mapper, then the scraping mapper."""
        return [
            MRStep(mapper=self.mapper_read_ticker),
            MRStep(mapper=self.mapper_2)
        ]

if __name__ == "__main__":
    MRMutualFundDataScraper.run()

Overwriting mf_data_scraper.py


# Running mapreduce job

In [154]:
! python mf_data_scraper.py -r local --jobconf mapreduce.job.maps=5 <test_in_out/MF_tickers_1to10.txt >test_in_out/MF_data_v8.txt

No configs found; falling back on auto-configuration
No configs specified for local runner
Creating temp directory /var/folders/jc/hcdncf153dddzghwy087kzww0000gn/T/mf_data_scraper.celinegeorgethokalath.20230225.094357.729473
Running step 1 of 2...
reading from STDIN
Running step 2 of 2...
job output is in /var/folders/jc/hcdncf153dddzghwy087kzww0000gn/T/mf_data_scraper.celinegeorgethokalath.20230225.094357.729473/output
Streaming final output from /var/folders/jc/hcdncf153dddzghwy087kzww0000gn/T/mf_data_scraper.celinegeorgethokalath.20230225.094357.729473/output...
Removing temp directory /var/folders/jc/hcdncf153dddzghwy087kzww0000gn/T/mf_data_scraper.celinegeorgethokalath.20230225.094357.729473...


# testing / decoding / debugging section

In [76]:
# Download and parse the five Yahoo Finance tabs for the first mutual-fund ticker.
# A timeout is passed so a stalled connection raises instead of blocking the
# notebook indefinitely.
ticker = mf_tickers[0]
summary_tab, profile_tab, risk_tab, holdings_tab, performance_tab = [
    BeautifulSoup(
        requests.get(
            'https://finance.yahoo.com/quote/{0}{1}?p={0}'.format(ticker, page),
            headers=headers,
            timeout=30,
        ).text,
        'html.parser',
    )
    for page in ('', '/profile', '/risk', '/holdings', '/performance')
]

In [77]:
summary_tab.find_all('h1')[0].text

'Lord Abbett Growth Leaders Fund (LGLRX)'

### Profile tab: Fund Overview section - 6 attributes (all except attribute with index value [3])

In [18]:
# category
profile_tab.find_all('div',"Mb(25px)")[0].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[0].find_all('span')[-1].text

'World Allocation'

In [78]:
# Fund Family
profile_tab.find_all('div',"Mb(25px)")[0].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[1].find_all('span')[-1].text

'Lord Abbett'

In [79]:
# Net Assets
profile_tab.find_all('div',"Mb(25px)")[0].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[2].find_all('span')[-1].text

'5.77B'

In [80]:
# (income) Yield
profile_tab.find_all('div',"Mb(25px)")[0].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[4].find_all('span')[-1].text

'0.00%'

In [81]:
# Morningstar Rating
profile_tab.find_all('div',"Mb(25px)")[0].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[5].find_all('span')[-1].text

'★★★★'

In [82]:
# Inception Date
profile_tab.find_all('div',"Mb(25px)")[0].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[6].find_all('span')[-1].text

'Jun 29, 2011'

### Profile Tab: Fund Operations section - attribute with index [2]

In [83]:
# Holdings Turnover
profile_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[2].find_all('span')[-1].text

'93.00%'

### Profile tab: Fees & Expenses section - attribute with index [0]

In [84]:
# Annual Report Expense Ratio (net)
profile_tab.find_all('div',"Mb(25px)")[2].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[0].find_all('span')[-2].text

'0.64%'

### Holdings tab: Portfolio composition % : 5 attributes (all except attribute with index value [3])

In [85]:
# cash %
holdings_tab.find_all('div',"Mb(25px)")[0].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[0].find_all('span')[-1].text

'0.00%'

In [86]:
# stocks %
holdings_tab.find_all('div',"Mb(25px)")[0].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[1].find_all('span')[-1].text

'100.00%'

In [87]:
# Bonds %
holdings_tab.find_all('div',"Mb(25px)")[0].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[2].find_all('span')[-1].text

'0.00%'

In [88]:
# Preferred %
holdings_tab.find_all('div',"Mb(25px)")[0].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[4].find_all('span')[-1].text

'0.00%'

In [89]:
# Convertible %
holdings_tab.find_all('div',"Mb(25px)")[0].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[5].find_all('span')[-1].text

'0.00%'

### Holdings tab: Sector Weightings % : all 11 attributes

In [90]:
# Basic Materials %
holdings_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[0].find_all('span')[-1].text

'0.00%'

In [91]:
# Consumer Cyclical %
holdings_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[1].find_all('span')[-1].text

'16.53%'

In [92]:
# Financial Services %
holdings_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[2].find_all('span')[-1].text

'5.95%'

In [93]:
# Real Estate %
holdings_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[3].find_all('span')[-1].text

'1.10%'

In [94]:
# Consumer Defensive %
holdings_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[4].find_all('span')[-1].text

'0.00%'

In [95]:
# Healthcare %
holdings_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[5].find_all('span')[-1].text

'21.00%'

In [96]:
# Utilities %
holdings_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[6].find_all('span')[-1].text

'0.00%'

In [97]:
# Communication Services
holdings_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[7].find_all('span')[-1].text

'5.37%'

In [98]:
# Energy %
holdings_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[8].find_all('span')[-1].text

'0.50%'

In [99]:
# Industries %
holdings_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[9].find_all('span')[-1].text

'9.11%'

In [100]:
# Technology %
holdings_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[10].find_all('span')[-1].text

'40.43%'

### Performance tab: Trailing Returns - 9 attributes

In [101]:
# YTD
performance_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[0].find_all('span')[-2].text

'7.60%'

In [102]:
# 1-Month
performance_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[1].find_all('span')[-2].text

'7.73%'

In [103]:
# 3-Month
performance_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[2].find_all('span')[-2].text

'6.21%'

In [104]:
# 1-Year
performance_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[3].find_all('span')[-2].text

'-25.38%'

In [105]:
# 3-Year
performance_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[4].find_all('span')[-2].text

'5.28%'

In [106]:
# 5-Year
performance_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[5].find_all('span')[-2].text

'13.13%'

In [107]:
# 10-Year
performance_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[6].find_all('span')[-2].text

'13.38%'

In [108]:
# Last Bull Market
performance_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[7].find_all('span')[-2].text

'23.30%'

In [109]:
# Last Bear Market
performance_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[8].find_all('span')[-2].text

'-15.04%'

### Performance tab: Annual Total Returns History section - Values for each of the years starting 2001 till 2020, if available

In [114]:
len(performance_tab.find_all('div',"Mb(25px)")[2].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)"))

11

In [121]:
performance_tab.find_all('div',"Mb(25px)")[2].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[11-1].find_all('span')[0].text

'2011'

In [120]:
performance_tab.find_all('div',"Mb(25px)")[2].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[11-1].find_all('span')[-2].text

'N/A'

In [124]:
# Collect the annual total return of every year row in the third section of
# the performance tab, keyed by year.
annual_returns_hist = defaultdict(list)
year = 0
history_rows = performance_tab.find_all('div',"Mb(25px)")[2].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")
# Visit the last row first and then rows 0..n-2 (the same order as indexing
# with i-1 for i = 0..n-1).
for history_row in history_rows[-1:] + history_rows[:-1]:
    row_spans = history_row.find_all('span')
    year = int(row_spans[0].text)
    val = row_spans[-2].text
    annual_returns_hist[year].append(val)

In [125]:
annual_returns_hist

defaultdict(list,
            {2011: ['N/A'],
             2021: ['N/A'],
             2020: ['76.57%'],
             2019: ['34.37%'],
             2018: ['-1.18%'],
             2017: ['30.91%'],
             2016: ['0.31%'],
             2015: ['6.38%'],
             2014: ['9.89%'],
             2013: ['46.44%'],
             2012: ['10.10%']})

### Risk tab: Risk Overview section - 3 attributes

In [None]:
# Morningstar Risk Rating
risk_tab.find_all('div',"Mb(25px)")[0].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[0].find_all('span')[-1].text

In [None]:
# Number of Years Up
risk_tab.find_all('div',"Mb(25px)")[0].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[1].find_all('span')[-1].text

In [None]:
# Number of Years Down
risk_tab.find_all('div',"Mb(25px)")[0].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[2].find_all('span')[-1].text

### Risk tab: Risk Statistics section - 7 attributes, each with 3-year, 5-year and 10-year values at span index values [1], [4], and [7]

#### Alpha

In [None]:
# 3 Years Alpha
risk_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[0].find_all('span')[1].text

In [None]:
# 5 Years Alpha
risk_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[0].find_all('span')[4].text

In [None]:
# 10 Years Alpha
risk_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[0].find_all('span')[7].text

#### BETA

In [None]:
# 3 Years BETA
risk_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[1].find_all('span')[1].text

In [None]:
# 5 Years BETA
risk_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[1].find_all('span')[4].text

In [None]:
# 10 Years BETA
risk_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[1].find_all('span')[7].text

#### Mean Annual Return

In [None]:
# 3 Years Mean Annual Return
risk_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[2].find_all('span')[1].text

In [None]:
# 5 Years Mean Annual Return
risk_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[2].find_all('span')[4].text

In [None]:
# 10 Years Mean Annual Return
risk_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[2].find_all('span')[7].text

#### R-squared

In [None]:
# 3 Years R-squared
risk_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[3].find_all('span')[1].text

In [None]:
# 5 Years R-squared
risk_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[3].find_all('span')[4].text

In [None]:
# 10 Years R-squared
risk_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[3].find_all('span')[7].text

#### Standard Deviation

In [None]:
# 3 Years Standard Deviation
risk_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[4].find_all('span')[1].text

In [None]:
# 5 Years Standard Deviation
risk_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[4].find_all('span')[4].text

In [None]:
# 10 Years Standard Deviation
risk_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[4].find_all('span')[7].text

#### Sharpe Ratio

In [None]:
# 3 Years Sharpe Ratio
risk_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[5].find_all('span')[1].text

In [None]:
# 5 Years Sharpe Ratio
risk_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[5].find_all('span')[4].text

In [None]:
# 10 Years Sharpe Ratio
risk_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[5].find_all('span')[7].text

#### Treynor Ratio

In [None]:
# 3 Years Treynor Ratio
risk_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[6].find_all('span')[1].text

In [None]:
# 5 Years Treynor Ratio
risk_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[6].find_all('span')[4].text

In [None]:
# 10 Years Treynor Ratio
risk_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[6].find_all('span')[7].text