# Specifying input file and output file

In [7]:
# importing required libraries 
import pandas as pd
import requests
from bs4 import BeautifulSoup
from collections import defaultdict
# Setting header values sent with every request so Yahoo Finance serves the
# same HTML a browser would get; that HTML is then parsed with BeautifulSoup.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;exchange;v=b3',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'close',
    'DNT': '1',  # Do Not Track Request Header
    'Pragma': 'no-cache',
    # The HTTP header is spelled "Referer" (single "r"); the previous
    # "Referrer" key is not a recognised header, so it had no effect.
    'Referer': 'https://google.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36(KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
}

In [None]:
# Manually specifying ticker symbols, or
mf_tickers_test = [
    'SPRVX',
    'VTSAX',
]

In [None]:
# Alternatively, extracting ticker symbols from input file
# Only the first ten values of the 'fund_symbol' column are used, so read just
# that column and just those rows instead of parsing the whole CSV.
mf_tickers = list(
    pd.read_csv(
        'Input/MutualFunds_data.csv',
        usecols=['fund_symbol'],
        nrows=10,
    )['fund_symbol']
)

In [20]:
# writing ticker values to txt file to be used as input for mapreduce jobs
# The file is opened once for the whole batch rather than once per ticker.
# NOTE: append mode is kept, so re-running this cell adds the tickers again.
with open('test_in_out/MF_tickers_1to10.txt', 'a') as input_file:
    # one ticker per line
    input_file.writelines(value + '\n' for value in mf_tickers)

# Creating web scraper using Beautifulsoup

In [28]:
%%file mf_data_scraper.py
from mrjob.job import MRJob
from mrjob.protocol import JSONValueProtocol
from mrjob.step import MRStep
import requests
from bs4 import BeautifulSoup
import time
import json
from collections import defaultdict

class MRMutualFundDataScraper(MRJob):
    """Two-step mrjob job that scrapes Yahoo Finance pages for mutual funds.

    Input is one ticker symbol per line. For every ticker the job downloads
    the summary, profile, holdings, performance and risk tabs and emits
    ``(ticker, (label, value, label, value, ...))``. The labels and their
    order are part of the output format and are kept exactly as they were,
    including their original spelling.
    """

    # Seconds to wait on a request before giving up, so a stalled connection
    # cannot hang a mapper forever.
    _REQUEST_TIMEOUT = 30

    # CSS class strings that identify a page section and a table row in it.
    _SECTION_CLASS = "Mb(25px)"
    _ROW_CLASS = "Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)"

    # Browser-like request headers sent with every page download.
    _HEADERS = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;exchange;v=b3',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-US,en;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'close',
        'DNT': '1',  # Do Not Track Request Header
        'Pragma': 'no-cache',
        # The HTTP header is spelled "Referer"; "Referrer" is not a header.
        'Referer': 'https://google.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36(KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
    }

    # (output label, row index) pairs; rows whose value sits in the last span.
    _FUND_OVERVIEW = (
        ("category_name", 0),
        ("family", 1),
        ("net assets value", 2),
        ("yield (income)", 4),
        ("morning star rating", 5),
        ("inception date", 6),
    )
    _PORTFOLIO_COMPOSITION = (
        ("cash%", 0),
        ("stock%", 1),
        ("bond%", 2),
        ("preferred%", 4),
        ("convertible%", 5),
    )
    # Output labels for rows 0..N-1 of a section, in row order.
    _SECTOR_WEIGHTINGS = (
        "basic materials",
        "consumer cyclical",
        "financial services",
        "real estate",
        "consumer defensive",
        "healthcare",
        "utilities",
        "cummunication services",
        "energy",
        "industries",
        "technology",
    )
    _TRAILING_RETURNS = (
        "ytd",
        "1m ret",
        "3m ret",
        "1y ret",
        "3y ret",
        "5y ret",
        "10y ret",
    )
    _RISK_OVERVIEW = (
        "morning star risk rating",
        "number of years up",
        "number of years down",
    )
    # One tuple of labels per Risk Statistics row; within a row the values
    # are read from the spans at the indexes in _RISK_STAT_SPANS.
    _RISK_STATISTICS = (
        ("alpha 3 year", "alpha 5 year", "alpha 10 year"),
        ("beat 3 year", "beta 5 year", "beta 10 year"),  # "beat" kept as emitted before
        ("mean annual ret 3 year", "mean annual ret 5 year", "mean annual ret 10 year"),
        ("r-sq 3 year", "r-sq 5 year", "r-sq 10 year"),
        ("std dev 3 year", "std dev 5 year", "std dev 10 year"),
        ("sharpe 3 year", "sharpe 5 year", "sharpe 10 year"),
        ("treynor 3 year", "treynor 5 year", "treynor 10 year"),
    )
    _RISK_STAT_SPANS = (1, 4, 7)

    def mapper1(self, _, ticker):
        """Pass each non-blank input line on as a ticker symbol."""
        ticker = ticker.strip()
        # A blank line would otherwise be scraped as an empty ticker.
        if ticker:
            yield None, ticker

    def _fetch_tab(self, ticker, tab=''):
        """Download one Yahoo Finance tab for `ticker` and return it parsed.

        `tab` is the URL path suffix ('' for the summary tab, '/profile',
        '/holdings', ...). Raises `requests.HTTPError` on an error status so
        a failed download is reported as such instead of surfacing later as
        an IndexError while parsing.
        """
        url = 'https://finance.yahoo.com/quote/{}{}?p={}'.format(ticker, tab, ticker)
        response = requests.get(url, headers=self._HEADERS, timeout=self._REQUEST_TIMEOUT)
        response.raise_for_status()
        return BeautifulSoup(response.text, 'html.parser')

    def _section_rows(self, soup, section_index):
        """Return, for one page section, the list of <span> tags of each row."""
        section = soup.find_all('div', self._SECTION_CLASS)[section_index]
        return [row.find_all('span') for row in section.find_all('div', self._ROW_CLASS)]

    def mapper2(self, _, ticker):
        """Scrape all attributes for `ticker` and emit them as one record.

        Yields `(ticker, record)` where `record` is a flat tuple of
        alternating labels and values. Raises IndexError if a page does not
        contain an expected section, row or span.
        """
        # Obtaining html page of required webpages of Yahoo Finance
        summary_tab = self._fetch_tab(ticker)
        profile_tab = self._fetch_tab(ticker, '/profile')
        holdings_tab = self._fetch_tab(ticker, '/holdings')
        performance_tab = self._fetch_tab(ticker, '/performance')
        risk_tab = self._fetch_tab(ticker, '/risk')
        # The clock starts after the downloads, so the emitted "time" covers
        # the parsing below only.
        start = time.time()

        # Query every section once instead of once per attribute.
        fund_overview = self._section_rows(profile_tab, 0)
        fund_operations = self._section_rows(profile_tab, 1)
        fees_expenses = self._section_rows(profile_tab, 2)
        composition = self._section_rows(holdings_tab, 0)
        sectors = self._section_rows(holdings_tab, 1)
        trailing = self._section_rows(performance_tab, 1)
        annual_returns = self._section_rows(performance_tab, 2)
        risk_overview = self._section_rows(risk_tab, 0)
        risk_stats = self._section_rows(risk_tab, 1)

        record = ["mf name", summary_tab.find_all('h1')[0].text]

        # Attributes on profile tab's Funds Overview section
        for label, row in self._FUND_OVERVIEW:
            record += [label, fund_overview[row][-1].text]
        # Attributes on profile tab's Fund Operations section
        record += ["fund holdings turnover", fund_operations[2][-1].text]
        # Attributes in profile tab's Fees & Expenses section
        record += ["fund expense ratio", fees_expenses[0][-2].text]

        # Attributes on Holdings tab's Portfolio Composition section
        for label, row in self._PORTFOLIO_COMPOSITION:
            record += [label, composition[row][-1].text]
        # Attributes on Holdings tab's Sector Weightings section
        for row, label in enumerate(self._SECTOR_WEIGHTINGS):
            record += [label, sectors[row][-1].text]

        # Attributes on Performance tab's Trailing Returns section
        for row, label in enumerate(self._TRAILING_RETURNS):
            record += [label, trailing[row][-2].text]

        # Attributes on Performance tab's Annual Total Returns History
        # section: one entry per table row, keyed by the year in that row.
        # Rows are visited in page order (the old [i-1] index started on the
        # last row).
        annual_returns_hist = defaultdict(list)
        for spans in annual_returns:
            annual_returns_hist[int(spans[0].text)].append(spans[-2].text)
        record += ["annual_returns_hist", annual_returns_hist]

        record += ["last bull mkt ret", trailing[7][-2].text]
        record += ["last bear mkt ret", trailing[8][-2].text]

        # Attributes on Risk tab's Risk Overview section
        for row, label in enumerate(self._RISK_OVERVIEW):
            record += [label, risk_overview[row][-1].text]
        # Attributes on Risk tab's Risk Statistics section
        for row, labels in enumerate(self._RISK_STATISTICS):
            for label, span in zip(labels, self._RISK_STAT_SPANS):
                record += [label, risk_stats[row][span].text]

        record += ["time", time.time() - start]
        yield ticker, tuple(record)

    def steps(self):
        """Run `mapper1` then `mapper2` as two map-only steps."""
        return [
            MRStep(mapper=self.mapper1),
            MRStep(mapper=self.mapper2)
        ]

# Start the job only when this file is executed as a script, not on import.
if __name__ == "__main__":
    MRMutualFundDataScraper.run()

Overwriting mf_data_scraper.py


# Running mapreduce job

In [29]:
! python mf_data_scraper.py -r local --jobconf mapreduce.job.maps=5 <test_in_out/MF_tickers_1to10.txt >test_in_out/MF_data_v7.txt

No configs found; falling back on auto-configuration
No configs specified for local runner
Creating temp directory /var/folders/jc/hcdncf153dddzghwy087kzww0000gn/T/mf_data_scraper.celinegeorgethokalath.20230225.094754.568061
Running step 1 of 2...
reading from STDIN
Running step 2 of 2...
job output is in /var/folders/jc/hcdncf153dddzghwy087kzww0000gn/T/mf_data_scraper.celinegeorgethokalath.20230225.094754.568061/output
Streaming final output from /var/folders/jc/hcdncf153dddzghwy087kzww0000gn/T/mf_data_scraper.celinegeorgethokalath.20230225.094754.568061/output...
Removing temp directory /var/folders/jc/hcdncf153dddzghwy087kzww0000gn/T/mf_data_scraper.celinegeorgethokalath.20230225.094754.568061...


# testing / decoding / debugging section

In [3]:
ticker = "AAAAX"


def fetch_tab(symbol, tab=''):
    """Download one Yahoo Finance tab for `symbol` and return it parsed.

    `tab` is the URL path suffix: '' for the summary tab, '/profile', etc.
    """
    url = 'https://finance.yahoo.com/quote/{}{}?p={}'.format(symbol, tab, symbol)
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url, headers=headers, timeout=30)
    return BeautifulSoup(response.text, 'html.parser')


summary_tab = fetch_tab(ticker)
profile_tab = fetch_tab(ticker, '/profile')
risk_tab = fetch_tab(ticker, '/risk')
holdings_tab = fetch_tab(ticker, '/holdings')
performance_tab = fetch_tab(ticker, '/performance')
documents_set = [summary_tab, profile_tab, risk_tab, holdings_tab, performance_tab]

In [4]:
a = summary_tab.find_all('h1')[0].text

In [5]:
a

'DWS RREEF Real Assets Fund - Class A (AAAAX)'

### Profile tab: Fund Overview section - 6 attributes (all except the attribute with index value [3])

In [18]:
# category
profile_tab.find_all('div',"Mb(25px)")[0].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[0].find_all('span')[-1].text

'World Allocation'

In [None]:
# Fund Family
profile_tab.find_all('div',"Mb(25px)")[0].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[1].find_all('span')[-1].text

In [None]:
# Net Assets
profile_tab.find_all('div',"Mb(25px)")[0].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[2].find_all('span')[-1].text

In [None]:
# (income) Yield
profile_tab.find_all('div',"Mb(25px)")[0].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[4].find_all('span')[-1].text

In [None]:
# Morningstar Rating
profile_tab.find_all('div',"Mb(25px)")[0].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[5].find_all('span')[-1].text

In [None]:
# Inception Date
profile_tab.find_all('div',"Mb(25px)")[0].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[6].find_all('span')[-1].text

### Profile Tab: Fund Operations section - attribute with index [2]

In [None]:
# Holdings Turnover
profile_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[2].find_all('span')[-1].text

### Profile tab: Fees & Expenses section - attribute with index [0]

In [None]:
# Annual Report Expense Ratio (net)
profile_tab.find_all('div',"Mb(25px)")[2].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[0].find_all('span')[-2].text

### Holdings tab: Portfolio composition % : 5 attributes (all except attribute with index value [3])

In [None]:
# cash %
holdings_tab.find_all('div',"Mb(25px)")[0].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[0].find_all('span')[-1].text

In [None]:
# stocks %
holdings_tab.find_all('div',"Mb(25px)")[0].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[1].find_all('span')[-1].text

In [None]:
# Bonds %
holdings_tab.find_all('div',"Mb(25px)")[0].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[2].find_all('span')[-1].text

In [None]:
# Preferred %
holdings_tab.find_all('div',"Mb(25px)")[0].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[4].find_all('span')[-1].text

In [None]:
# Convertible %
holdings_tab.find_all('div',"Mb(25px)")[0].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[5].find_all('span')[-1].text

### Holdings tab: Sector Weightings % : all 11 attributes

In [None]:
# Basic Materials %
holdings_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[0].find_all('span')[-1].text

In [None]:
# Consumer Cyclical %
holdings_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[1].find_all('span')[-1].text

In [None]:
# Financial Services %
holdings_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[2].find_all('span')[-1].text

In [None]:
# Real Estate %
holdings_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[3].find_all('span')[-1].text

In [None]:
# Consumer Defensive %
holdings_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[4].find_all('span')[-1].text

In [None]:
# Healthcare %
holdings_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[5].find_all('span')[-1].text

In [None]:
# Utilities %
holdings_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[6].find_all('span')[-1].text

In [None]:
# Communication Services
holdings_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[7].find_all('span')[-1].text

In [None]:
# Energy %
holdings_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[8].find_all('span')[-1].text

In [None]:
# Industries %
holdings_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[9].find_all('span')[-1].text

In [None]:
# Technology %
holdings_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[10].find_all('span')[-1].text

### Performance tab: Trailing Returns - 9 attributes

In [None]:
# YTD
performance_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[0].find_all('span')[-2].text

In [None]:
# 1-Month
performance_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[1].find_all('span')[-2].text

In [None]:
# 3-Month
performance_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[2].find_all('span')[-2].text

In [None]:
# 1-Year
performance_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[3].find_all('span')[-2].text

In [None]:
# 3-Year
performance_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[4].find_all('span')[-2].text

In [None]:
# 5-Year
performance_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[5].find_all('span')[-2].text

In [None]:
# 10-Year
performance_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[6].find_all('span')[-2].text

In [None]:
# Last Bull Market
performance_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[7].find_all('span')[-2].text

In [None]:
# Last Bear Market
performance_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[8].find_all('span')[-2].text

### Performance tab: Annual Total Returns History section - Values for each of the years starting 2001 till 2020, if available

In [None]:
len(performance_tab.find_all('div',"Mb(25px)")[2].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)"))

In [None]:
year = performance_tab.find_all('div',"Mb(25px)")[2].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[0].find_all('span')[0].text

In [None]:
performance_tab.find_all('div',"Mb(25px)")[2].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[1].find_all('span')[-2].text

In [None]:
annual_returns_hist = defaultdict(list)
# Look the table rows up once instead of re-querying the page per iteration.
annual_rows = performance_tab.find_all('div',"Mb(25px)")[2].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")
for row in annual_rows:
    spans = row.find_all('span')
    # Read the year from the row itself rather than counting down from the
    # first row, so a missing year cannot shift the labels and an empty table
    # simply produces no entries.
    year = int(spans[0].text)
    val = spans[-2].text
    annual_returns_hist[year].append(val)

In [None]:
annual_returns_hist

### Risk tab: Risk Overview section - 3 attributes

In [None]:
# Morningstar Risk Rating
risk_tab.find_all('div',"Mb(25px)")[0].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[0].find_all('span')[-1].text

In [None]:
# Number of Years Up
risk_tab.find_all('div',"Mb(25px)")[0].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[1].find_all('span')[-1].text

In [None]:
# Number of Years Down
risk_tab.find_all('div',"Mb(25px)")[0].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[2].find_all('span')[-1].text

### Risk tab: Risk Statistics section - 7 attributes, each with 3-year, 5-year and 10-year values at index values [1], [4], and [7]

#### Alpha

In [None]:
# 3 Years Alpha
risk_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[0].find_all('span')[1].text

In [None]:
# 5 Years Alpha
risk_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[0].find_all('span')[4].text

In [None]:
# 10 Years Alpha
risk_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[0].find_all('span')[7].text

#### BETA

In [None]:
# 3 Years BETA
risk_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[1].find_all('span')[1].text

In [None]:
# 5 Years BETA
risk_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[1].find_all('span')[4].text

In [None]:
# 10 Years BETA
risk_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[1].find_all('span')[7].text

#### Mean Annual Return

In [None]:
# 3 Years Mean Annual Return
risk_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[2].find_all('span')[1].text

In [None]:
# 5 Years Mean Annual Return
risk_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[2].find_all('span')[4].text

In [None]:
# 10 Years Mean Annual Return
risk_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[2].find_all('span')[7].text

#### R-squared

In [None]:
# 3 Years R-squared
risk_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[3].find_all('span')[1].text

In [None]:
# 5 Years R-squared
risk_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[3].find_all('span')[4].text

In [None]:
# 10 Years R-squared
risk_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[3].find_all('span')[7].text

#### Standard Deviation

In [None]:
# 3 Years Standard Deviation
risk_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[4].find_all('span')[1].text

In [None]:
# 5 Years Standard Deviation
risk_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[4].find_all('span')[4].text

In [None]:
# 10 Years Standard Deviation
risk_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[4].find_all('span')[7].text

#### Sharpe Ratio

In [None]:
# 3 Years Sharpe Ratio
risk_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[5].find_all('span')[1].text

In [None]:
# 5 Years Sharpe Ratio
risk_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[5].find_all('span')[4].text

In [None]:
# 10 Years Sharpe Ratio
risk_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[5].find_all('span')[7].text

#### Treynor Ratio

In [None]:
# 3 Years Treynor Ratio
risk_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[6].find_all('span')[1].text

In [None]:
# 5 Years Treynor Ratio
risk_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[6].find_all('span')[4].text

In [None]:
# 10 Years Treynor Ratio
risk_tab.find_all('div',"Mb(25px)")[1].find_all('div',"Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)")[6].find_all('span')[7].text