# Data Wrangling

In [1]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import numpy as np
import requests

## Gather

### Scrape BMO mutual funds from [the site](https://www.bmo.com/home/personal/banking/investments/mutual-funds/navigator/funds/mutual-funds-list/funds-overview)

The website lists all BMO mutual funds on the market. Since the data is laid out as an HTML table (`tr` and `td` tags), it is easy to parse the HTML and extract the data. Here is a quick summary of the process:

1. Create a Chrome driver using `Selenium`, and go to the specified URL.
2. Parse the HTML, and find all `tr` tags with the `valign` attribute set to `center`.
3. Extract data from each `tr` into a list. 
4. Convert the list into a Pandas DataFrame.

In [2]:
def extractData():
    """Scrape every fund row currently shown on the BMO listing page.

    Waits up to 10 seconds for the ``<tr valign="center">`` rows to be present
    in the page loaded by the notebook-level ``driver``, then reads each row.

    Element look-ups use ``find_element(By...)`` / ``find_elements(By...)``
    because the ``find_element_by_*`` shortcuts were removed in Selenium 4.3;
    the ``By``-based calls work on both Selenium 3 and Selenium 4.

    Returns:
        A list of dictionaries, one per row, with the keys ``Fund ID``,
        ``Fund Name``, ``Price``, ``Asset Class``, ``Date Started`` and
        ``Fund Profile``. Values are the raw text / attribute values read
        from the page (no type conversion is done here).

    Raises:
        selenium.common.exceptions.TimeoutException: if no matching row
            appears within 10 seconds.
        ValueError: if a row does not contain exactly six
            ``listCellWithBorders`` cells.
    """

    # wait (at most 10 s) until the fund rows are present in the DOM
    rows = WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.XPATH, '//tr[@valign="center"]'))
    )

    # one dictionary per table row
    results = []

    for row in rows:

        # the BMO fund ID: keep only the text after the last underscore
        # of the row's id attribute
        fundID = row.get_attribute('id').rsplit("_")[-1]

        # locate the first <a> in the row and extract its href
        profileURL = row.find_element(By.TAG_NAME, 'a').get_attribute('href')

        # the attributes are stored in elements with the listCellWithBorders
        # class; six cells are expected, of which the 2nd and the 5th
        # are read but not kept in the result
        cells = row.find_elements(By.CLASS_NAME, 'listCellWithBorders')
        fundName, _, price, assetClass, _assets, dateCreated = [cell.text for cell in cells]

        results.append({
            'Fund ID': fundID,
            'Fund Name': fundName,
            'Price': price,
            'Asset Class': assetClass,
            'Date Started': dateCreated,
            'Fund Profile': profileURL
        })

    return results

In [3]:
# Overview page that lists the BMO mutual funds; this is the page the scraper opens.
url = (
    'https://www.bmo.com/home/personal/banking/investments/mutual-funds/navigator/funds/mutual-funds-list/funds-overview'
)

In [4]:
# Start a Chrome session using the chromedriver binary at ./chromedriver
# (the path is relative to the current working directory), then open the
# fund listing page stored in `url`.
# NOTE(review): passing the driver path as a positional argument is not
# accepted by recent Selenium 4 releases — confirm the Selenium version
# this notebook is expected to run with.
driver = webdriver.Chrome('./chromedriver')
driver.get(url)

In [5]:
# Scrape the fund rows of the page currently loaded in `driver`;
# `bmo` starts out as a list of row dictionaries.
bmo = extractData()

###  Scrape BMO portfolios 

The portfolio data is available through the drop-down at the top right of the page. The code below performs the following steps:

1. Locate the drop-down.
2. Go to each option in the drop-down, and extract the data.
3. Append the data to the original list.

In [6]:
# Index of the first drop-down option to scrape; the loop in the next cell
# starts here, so the options at indices 0-2 are never selected.
# NOTE(review): the previous comment said "skip the first two options", but
# starting at index 3 skips three options (0, 1, 2) if indices are
# zero-based — confirm which one is intended.
page_index = 3 

In [7]:
# Select the drop-down options from `page_index` up to index 9 one at a time
# and scrape the table shown after each selection.
# `page_index` is advanced in place, so once the loop has finished,
# re-running this cell does nothing until `page_index` is reset.
while page_index < 10:

    # locate the drop-down again on every pass instead of reusing the element
    # from the previous pass (presumably because selecting an option
    # re-renders the page — TODO confirm).
    # find_element(By.ID, ...) replaces find_element_by_id, which was removed
    # in Selenium 4.3; the By-based call also works on Selenium 3.
    select = Select(driver.find_element(By.ID, 'portfolio'))
    select.select_by_index(page_index)

    # increment the page counter
    page_index += 1

    # append the rows of the newly selected option to the original list
    bmo += extractData()

In [8]:
# Scraping is finished: close the browser and end the WebDriver session.
driver.quit()

In [9]:
# convert the list of row dictionaries into a pandas DataFrame
# (note: `bmo` is rebound from a list to a DataFrame here)
bmo = pd.DataFrame(bmo)

## Assess

In [10]:
# preview the first five rows of the scraped table
bmo.head()

Unnamed: 0,Fund ID,Fund Name,Price,Asset Class,Date Started,Fund Profile
0,17705,BMO Money Market Fund,$1.0000,Canadian Money Market,05/1988,https://www.bmo.com/home/personal/banking/inve...
1,88977,BMO Money Market Fund Series M (formerly BMO M...,$1.0000,Canadian Money Market,04/2012,https://www.bmo.com/home/personal/banking/inve...
2,17695,BMO Bond Fund,$13.8541,Canadian Fixed Income,05/1988,https://www.bmo.com/home/personal/banking/inve...
3,63123,BMO Diversified Income Portfolio,$6.4430,Global Neutral Balanced,06/2006,https://www.bmo.com/home/personal/banking/inve...
4,93718,BMO Diversified Income Portfolio Series R,$4.0128,Global Neutral Balanced,03/2013,https://www.bmo.com/home/personal/banking/inve...


## Clean

#### Define

#### Code

#### Test