In [11]:
# This notebook calculates the average of the top 125 salaries
# of MLB players for the 2016 season. The first step is to
# retrieve the data from the web.

import requests
from bs4 import BeautifulSoup

# Page the salary data is scraped from.
URL = "https://questionnaire-148920.appspot.com/swe/data.html"

# A timeout keeps the cell from hanging forever on an unresponsive server,
# and raise_for_status() stops us from parsing an HTTP error page as data.
page = requests.get(URL, timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.content, "html.parser")

In [12]:
# Now we parse the raw data into a usable format.

from collections import namedtuple


def validate_salary(row):
    """
    Checks whether the salary row data is in proper format.

    A row is valid only when it has exactly four non-empty fields, the
    salary consists solely of decimal digits once "$" and "," are removed,
    the year is "2016", and the level is "MLB".

    Parameters:
        row (list): List containing player name, salary, year, and level,
            each as a string

    Returns:
        (boolean): Whether the row was successfully validated.
    """
    # We expect every row to have the form [player, salary, year, level]
    if len(row) != 4:
        return False

    # The player name only needs to be non-empty, which is checked below.
    _, salary, year, level = row

    # We should not have any empty data
    if any(field == "" for field in row):
        return False

    # isdecimal() rather than isdigit(): isdigit() also accepts characters
    # such as superscript digits, which int() cannot convert, so a row could
    # pass validation and still fail when the salary is parsed.
    amount = salary.replace("$", "").replace(",", "")
    if not amount.isdecimal():
        return False

    if year != "2016" or level != "MLB":
        return False

    return True


# Lightweight record for one validated salary row.
Salary = namedtuple("Salary", ["player", "salary", "year", "level"])

salaries_table = soup.find(id="salaries-table")
if salaries_table is None:
    # find() returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on the find_all() call below.
    raise ValueError('No element with id="salaries-table" found in the page')

salaries_rows = salaries_table.find_all("tr")
salaries = []       # validated rows, as Salary tuples with an int salary
invalid_data = []   # raw cell text of every row that failed validation

for tr in salaries_rows:
    # Rows without <td> cells yield an empty list here, which fails
    # validation and is therefore recorded in invalid_data.
    row = [cell.text for cell in tr.find_all("td")]

    if not validate_salary(row):
        invalid_data.append(row)
        continue

    player, salary, year, level = row
    # Strip the currency formatting so salaries can be compared numerically.
    salary = int(salary.replace("$", "").replace(",", ""))
    salaries.append(Salary(player, salary, year, level))

# Highest salary first, so the top earners are at the front of the list.
salaries.sort(reverse=True, key=lambda x: x.salary)

In [13]:
# Generate the average of the top 125 salaries
TOP_COUNT = 125

if len(salaries) < TOP_COUNT:
    # With fewer than TOP_COUNT rows, dividing the partial sum by TOP_COUNT
    # would report a misleadingly low figure, so no average is produced.
    average = None
    print("Insufficient data")
else:
    # salaries is sorted highest-first, so the leading slice is the top earners.
    top_salaries = salaries[:TOP_COUNT]
    average = sum(x.salary for x in top_salaries) / TOP_COUNT
    print(average)
    
    
    
    
    
    
    
    
    

16672222.712
