In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import re

In [2]:
# This code will get the url and check if it ok to scrape the data from the website. If we get an r.status_code of 200, then we are good to go!
url = 'https://www.baseball-reference.com/leaders/earned_run_avg_season.shtml'
r = requests.get(url)
r.status_code

200

In [3]:
# the pd.read_html(url) is a builtiin function that scrapes the tables from a specific website. I will then save it as a list of tables.
dfs = pd.read_html(url)

In [4]:
# Since I am only worried about the data in the first table, I will create a table/dataframe will the first table which is received by calling the table at the 0 index.
era = dfs[0]

In [5]:
# This step focuses on cleaning the data so that it can be used in an analysis later on.
# If a player was a hall of famer, they got a '+' sign next to their name, and I want to remove that so I can split the data later on.
era['Player (age that year)'] = era['Player (age that year)'].str.replace('+','')
# This command will filter out all rows that do not have numerical data inside them due to the repetition of the column names throughout the dataframe.
era = era[(era['Year'] != 'Year')]
# This updates the rank of the players so that it lines up accurately with the athletes 1-500
lst = list(np.arange(1,500+1))
era['Rank'] = lst
# These three command change the type of the columns to numbers. This makes it possible to perform exploratory data analyses later on.
era['Year'] = pd.to_numeric(era['Year'])
era['Earned Run Average'] = pd.to_numeric(era['Earned Run Average'])
era['IP'] = pd.to_numeric(era['IP'])

  era['Player (age that year)'] = era['Player (age that year)'].str.replace('+','')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  era['Rank'] = lst
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  era['Year'] = pd.to_numeric(era['Year'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  era['Earned Run Average'] = pd.to_numeric(era['Earned Run Aver

In [6]:
# Since this next step will focus on cleaning the data on a more detailed level, I want to make a copy of the data frame so that if a mistake is made, it won't be too big of a problem!
p = era.copy()

In [7]:
# Almost every player has the age of the player during the year that they had the Earned Run Average. I am going to create a separate column with the ages of the players
# This command creates a new column with the values in the player column split before and after the first parenthesis 
p['age1'] = p['Player (age that year)'].apply(lambda x: x.split("("))
# This command creates a new column with all the values that only contain the ages of the players not their names.
p['Age'] = p['age1'].apply(lambda x:x[-1])
# Some players did not have an age, so I assumed that the average age would be around 27 and added this value in place of their ages.
p.iloc[0,7] = '27)'
p.iloc[30,7] = '27)'
p.iloc[150,7] = '27)'
p.iloc[306,7] = '27)'
p.iloc[320,7] = '27)'
p.iloc[323,7] = '27)'
p.iloc[346,7] = '27)'
p.iloc[470,7] = '27)'
# This command finishes cleaning the column so that only the number is in the column.
p['Age'] = p['Age'].str.replace(')','')
# This changes the type of the column to numeric, so we can perform analyses with this column.
p['Age'] = pd.to_numeric(p['Age'])


  p['Age'] = p['Age'].str.replace(')','')


In [8]:
# This drops the intermediate column I created to isolate the ages of the players. 
p = p.drop(columns=['age1'])
# This saves all the changes from the previous steps back to the original era dataframe.
era = p
# This saves the dataframe as a csv file
era.to_csv("era.csv", index=False)