# Web Scraper for Baseball Stats - WMT websites

### Import Beautiful Soup Library and Parser

In [1]:
from bs4 import BeautifulSoup

from lxml import html

import unicodedata

import requests
import re

import pandas as pd 

from pandas import Series, DataFrame

### Read in URL, parse the data and check that it's reading the type properly

In [13]:
url = 'https://www.liberty.edu/wwwadmin/globals/templates/1912/docs/stats/baseteamcume31120.htm'

# Download the stats page. The timeout keeps this cell from hanging indefinitely on an
# unresponsive server, and raise_for_status() stops here on a 4xx/5xx response instead
# of letting an error page be parsed as if it were the stats table.
page = requests.get(url, timeout=30)
page.raise_for_status()

# Parse the returned HTML with Python's built-in HTML parser
soup = BeautifulSoup(page.text, 'html.parser')

# Confirm the parsed document is a BeautifulSoup object
type(soup)

bs4.BeautifulSoup



### Parse Data from the 'td' tags into a variable

In [15]:
# Collect every 'td' tag found in the parsed page
all_stats = soup.find_all('td')

# Display the collected tags to inspect what was matched
all_stats

[<td><center> 
 <h3><font face="verdana">Overall Statistics</font></h3>
  </center></td>,
 <td align="left" bgcolor="#00008b"><font color="#ffffff" face="verdana" size="1"><b>Player               </b></font></td>,
 <td align="right" bgcolor="#00008b"><font color="#ffffff" face="verdana" size="1"><b>avg </b></font></td>,
 <td align="center" bgcolor="#00008b"><font color="#ffffff" face="verdana" size="1"><b>gp-gs </b></font></td>,
 <td align="right" bgcolor="#00008b"><font color="#ffffff" face="verdana" size="1"><b>ab </b></font></td>,
 <td align="right" bgcolor="#00008b"><font color="#ffffff" face="verdana" size="1"><b>r </b></font></td>,
 <td align="right" bgcolor="#00008b"><font color="#ffffff" face="verdana" size="1"><b>h </b></font></td>,
 <td align="right" bgcolor="#00008b"><font color="#ffffff" face="verdana" size="1"><b>2b </b></font></td>,
 <td align="right" bgcolor="#00008b"><font color="#ffffff" face="verdana" size="1"><b>3b </b></font></td>,
 <td align="right" bgcolor="#00008

In [16]:
# Confirm that find_all() returned a bs4 ResultSet for 'all_stats'
type(all_stats)

bs4.element.ResultSet

### Get the text from inside all of the 'td' tags

In [17]:
# Pull the text out of each 'td' tag: newlines are replaced with spaces and
# surrounding whitespace is trimmed before the value is stored in 'stats'
stats = [cell.text.replace('\n', ' ').strip() for cell in all_stats]

# Drop the leading 'Overall Statistics' entry so the remaining values
# can eventually be broken up into even rows
del stats[0]

# Show the extracted text
print(stats)

['Player', 'avg', 'gp-gs', 'ab', 'r', 'h', '2b', '3b', 'hr', 'rbi', 'tb', 'slg%', 'bb', 'hbp', 'so', 'gdp', 'ob%', 'sf', 'sh', 'sb-att', 'po', 'a', 'e', 'fld%', 'HIGHFILL, Ben', '.317', '17-17', '60', '10', '19', '6', '1', '3', '12', '36', '.600', '10', '1', '9', '1', '.405', '3', '0', '0-0', '20', '47', '2', '.971', 'LOCKLEAR, Cam', '.316', '17-17', '57', '7', '18', '4', '0', '1', '9', '25', '.439', '7', '9', '15', '0', '.466', '0', '0', '3-8', '24', '51', '4', '.949', 'MCDYRE, Trey', '.288', '16-15', '52', '11', '15', '5', '0', '0', '6', '20', '.385', '9', '1', '15', '0', '.397', '1', '1', '1-1', '29', '35', '3', '.955', 'GULAKOWSKI, Brady', '.267', '16-16', '60', '8', '16', '6', '0', '2', '9', '28', '.467', '5', '0', '21', '0', '.313', '2', '1', '0-1', '103', '13', '2', '.983', 'GUY, Jaylen', '.245', '17-17', '53', '6', '13', '4', '0', '1', '5', '20', '.377', '3', '5', '14', '0', '.344', '0', '2', '1-3', '33', '1', '3', '.919', 'WILKINSON, Garrett', '.244', '16-15', '45', '5', '11',

In [18]:
# Verify that 'stats' is a plain Python list
type(stats)

list

In [19]:
# Build a new list that keeps every value from 'stats' except the
# dash dividing-line entries
DIVIDER = '----------'
clean_stats = [value for value in stats if value != DIVIDER]

print(clean_stats)

['Player', 'avg', 'gp-gs', 'ab', 'r', 'h', '2b', '3b', 'hr', 'rbi', 'tb', 'slg%', 'bb', 'hbp', 'so', 'gdp', 'ob%', 'sf', 'sh', 'sb-att', 'po', 'a', 'e', 'fld%', 'HIGHFILL, Ben', '.317', '17-17', '60', '10', '19', '6', '1', '3', '12', '36', '.600', '10', '1', '9', '1', '.405', '3', '0', '0-0', '20', '47', '2', '.971', 'LOCKLEAR, Cam', '.316', '17-17', '57', '7', '18', '4', '0', '1', '9', '25', '.439', '7', '9', '15', '0', '.466', '0', '0', '3-8', '24', '51', '4', '.949', 'MCDYRE, Trey', '.288', '16-15', '52', '11', '15', '5', '0', '0', '6', '20', '.385', '9', '1', '15', '0', '.397', '1', '1', '1-1', '29', '35', '3', '.955', 'GULAKOWSKI, Brady', '.267', '16-16', '60', '8', '16', '6', '0', '2', '9', '28', '.467', '5', '0', '21', '0', '.313', '2', '1', '0-1', '103', '13', '2', '.983', 'GUY, Jaylen', '.245', '17-17', '53', '6', '13', '4', '0', '1', '5', '20', '.377', '3', '5', '14', '0', '.344', '0', '2', '1-3', '33', '1', '3', '.919', 'WILKINSON, Garrett', '.244', '16-15', '45', '5', '11',

### Split stats list into a sublist containing player offensive stats only

In [21]:
# Every row of the offensive table is 24 entries wide (the player's name plus 23 stats)
OFFENSE_COLUMNS = 24

# Cut 'clean_stats' into consecutive 24-item lists, starting from the first entry
row_starts = range(0, len(clean_stats), OFFENSE_COLUMNS)
chunks = [clean_stats[start:start + OFFENSE_COLUMNS] for start in row_starts]

# The first list holds the column titles (used later as the DataFrame header);
# the lists at positions 1 through 16 are the players with offensive stats.
# Adjust the end of the slice if a player is missing from the output below.
offense_header, offensive_stats = chunks[0], chunks[1:17]

# Optional checks -- uncomment to inspect the intermediate lists
#print(chunks)
#print(offense_header)

# Show the per-player lists that will feed the DataFrame
print(offensive_stats)

[['HIGHFILL, Ben', '.317', '17-17', '60', '10', '19', '6', '1', '3', '12', '36', '.600', '10', '1', '9', '1', '.405', '3', '0', '0-0', '20', '47', '2', '.971'], ['LOCKLEAR, Cam', '.316', '17-17', '57', '7', '18', '4', '0', '1', '9', '25', '.439', '7', '9', '15', '0', '.466', '0', '0', '3-8', '24', '51', '4', '.949'], ['MCDYRE, Trey', '.288', '16-15', '52', '11', '15', '5', '0', '0', '6', '20', '.385', '9', '1', '15', '0', '.397', '1', '1', '1-1', '29', '35', '3', '.955'], ['GULAKOWSKI, Brady', '.267', '16-16', '60', '8', '16', '6', '0', '2', '9', '28', '.467', '5', '0', '21', '0', '.313', '2', '1', '0-1', '103', '13', '2', '.983'], ['GUY, Jaylen', '.245', '17-17', '53', '6', '13', '4', '0', '1', '5', '20', '.377', '3', '5', '14', '0', '.344', '0', '2', '1-3', '33', '1', '3', '.919'], ['WILKINSON, Garrett', '.244', '16-15', '45', '5', '11', '0', '0', '1', '7', '14', '.311', '12', '1', '16', '0', '.407', '1', '0', '0-0', '78', '3', '2', '.976'], ['ROHRER, Brandon', '.230', '17-17', '61',

In [22]:
# Confirm that 'offensive_stats' is a list (one inner list per player)
type(offensive_stats)

list

In [32]:
# Double checking that 'offense_header' is a list
type(offense_header)

list

### Find the index of 'era' in order to know what index number to use to start splitting up the pitching stats

In [23]:
# Get the index value of the first 'era' entry in 'clean_stats'.
# Subtract 1 from the index value to find the start of the Pitching Stats table,
# so the entry in front of 'era' is included as the table's first column.
clean_stats.index('era')

458

### Split stats list into a sublist containing player pitching stats only

In [26]:
# The pitching table starts one entry before 'era' (the entry in front of it is the
# table's first column). Compute that position from the data instead of hard-coding
# an index copied from an earlier cell's output, so the split keeps working when the
# number of entries ahead of the pitching table changes.
# list.index raises ValueError if 'era' is not present.
pitching_start = clean_stats.index('era') - 1

# Split everything from the start of the pitching table onward into lists of 23 items:
# each pitching row holds the player's name plus 22 pitching stats
pchunks = [clean_stats[x:x+23] for x in range(pitching_start, len(clean_stats), 23)]

# The first list is the pitching column header row
pitching_header = pchunks[0]

# The lists at positions 1 through 13 are the individual pitchers.
# Adjust the end of this slice if a pitcher is missing from the output below.
pitching_stats = pchunks[1:14]

#print(pitching_header)

#print(pchunks)

print(pitching_stats)

[['ADAMETZ III, Joe', '1.59', '1-1', '4-4', '0', '0/1', '0', '22.2', '24', '6', '4', '4', '21', '6', '0', '0', '90', '.267', '2', '0', '0', '0', '0'], ['SKIRROW, Noah', '1.96', '1-1', '4-4', '0', '0/0', '0', '23.0', '14', '8', '5', '13', '20', '3', '0', '1', '78', '.179', '0', '4', '0', '0', '3'], ['HAND, Mason', '2.18', '3-1', '5-4', '0', '0/1', '0', '20.2', '17', '6', '5', '9', '15', '2', '0', '0', '78', '.218', '2', '1', '0', '1', '0'], ['MEYER, Mason', '3.38', '1-3', '4-4', '0', '0/0', '0', '21.1', '25', '10', '8', '4', '20', '4', '0', '2', '87', '.287', '1', '1', '0', '0', '2'], ['BARKER, Logan', '0.00', '0-0', '2-0', '0', '0/0', '0', '2.2', '1', '0', '0', '0', '1', '1', '0', '0', '9', '.111', '0', '0', '0', '0', '0'], ['RILEY, Landon', '1.46', '2-0', '7-0', '0', '0/1', '4', '12.1', '6', '2', '2', '3', '15', '2', '0', '0', '44', '.136', '0', '0', '0', '0', '0'], ['BRITTS, Troy', '1.54', '0-0', '9-0', '0', '0/1', '1', '11.2', '6', '2', '2', '5', '4', '1', '0', '1', '37', '.162', '0

## Create a DataFrame with the columns

In [27]:
# Build the offensive table: one row per player list, with the header list as column names
df = pd.DataFrame(data=offensive_stats, columns=offense_header)

df

Unnamed: 0,Player,avg,gp-gs,ab,r,h,2b,3b,hr,rbi,...,so,gdp,ob%,sf,sh,sb-att,po,a,e,fld%
0,"HIGHFILL, Ben",0.317,17-17,60,10,19,6,1,3,12,...,9,1,0.405,3,0,0-0,20,47,2,0.971
1,"LOCKLEAR, Cam",0.316,17-17,57,7,18,4,0,1,9,...,15,0,0.466,0,0,3-8,24,51,4,0.949
2,"MCDYRE, Trey",0.288,16-15,52,11,15,5,0,0,6,...,15,0,0.397,1,1,1-1,29,35,3,0.955
3,"GULAKOWSKI, Brady",0.267,16-16,60,8,16,6,0,2,9,...,21,0,0.313,2,1,0-1,103,13,2,0.983
4,"GUY, Jaylen",0.245,17-17,53,6,13,4,0,1,5,...,14,0,0.344,0,2,1-3,33,1,3,0.919
5,"WILKINSON, Garrett",0.244,16-15,45,5,11,0,0,1,7,...,16,0,0.407,1,0,0-0,78,3,2,0.976
6,"ROHRER, Brandon",0.23,17-17,61,9,14,4,0,1,7,...,13,1,0.273,1,4,0-0,28,2,0,1.0
7,"BETTS, Gray",0.172,17-17,64,6,11,2,0,0,4,...,10,1,0.312,0,2,0-1,46,4,1,0.98
8,"SHILLING, Matt",0.176,8-6,17,4,3,0,0,1,2,...,8,0,0.417,0,1,0-0,4,0,0,1.0
9,"WILSON, Owen",0.143,9-5,14,3,2,0,0,0,1,...,6,0,0.278,1,1,1-1,0,0,0,0.0


### Create new DataFrames to split columns containing 2 stats into 2 columns with 1 stat each

In [28]:
# "gp-gs" holds two values joined by a dash; split on the first dash and
# store the two parts as separate "gp" and "gs" columns
games_split = df["gp-gs"].str.split("-", n=1, expand=True)
df["gp"] = games_split[0]
df["gs"] = games_split[1]

# Same treatment for "sb-att", which becomes the "sb" and "att" columns
steals_split = df["sb-att"].str.split("-", expand=True)
df["sb"] = steals_split[0]
df["att"] = steals_split[1]

# Display the DataFrame with the four new columns appended
df

Unnamed: 0,Player,avg,gp-gs,ab,r,h,2b,3b,hr,rbi,...,sh,sb-att,po,a,e,fld%,gp,gs,sb,att
0,"HIGHFILL, Ben",0.317,17-17,60,10,19,6,1,3,12,...,0,0-0,20,47,2,0.971,17,17,0,0
1,"LOCKLEAR, Cam",0.316,17-17,57,7,18,4,0,1,9,...,0,3-8,24,51,4,0.949,17,17,3,8
2,"MCDYRE, Trey",0.288,16-15,52,11,15,5,0,0,6,...,1,1-1,29,35,3,0.955,16,15,1,1
3,"GULAKOWSKI, Brady",0.267,16-16,60,8,16,6,0,2,9,...,1,0-1,103,13,2,0.983,16,16,0,1
4,"GUY, Jaylen",0.245,17-17,53,6,13,4,0,1,5,...,2,1-3,33,1,3,0.919,17,17,1,3
5,"WILKINSON, Garrett",0.244,16-15,45,5,11,0,0,1,7,...,0,0-0,78,3,2,0.976,16,15,0,0
6,"ROHRER, Brandon",0.23,17-17,61,9,14,4,0,1,7,...,4,0-0,28,2,0,1.0,17,17,0,0
7,"BETTS, Gray",0.172,17-17,64,6,11,2,0,0,4,...,2,0-1,46,4,1,0.98,17,17,0,1
8,"SHILLING, Matt",0.176,8-6,17,4,3,0,0,1,2,...,1,0-0,4,0,0,1.0,8,6,0,0
9,"WILSON, Owen",0.143,9-5,14,3,2,0,0,0,1,...,1,1-1,0,0,0,0.0,9,5,1,1


### Delete any unneeded columns and rearrange the column order in a new DataFrame for exporting

In [34]:
# Columns that are not wanted in the exported file
unneeded_columns = ["gp-gs", "sb-att", "a", "e", "fld%", "po", "tb", "slg%", "avg", "ob%"]

# Column order for the exported file
export_columns = [
    'Player', 'gp', 'gs', 'ab', 'r', 'h', '2b', '3b', 'hr', 'rbi',
    'bb', 'hbp', 'so', 'gdp', 'sf', 'sh', 'sb', 'att',
]

# Remove the unneeded columns, then arrange what is left in the export order
new_df = df.drop(columns=unneeded_columns)
new_df = new_df[export_columns]

# Display new Dataframe
new_df

Unnamed: 0,Player,gp,gs,ab,r,h,2b,3b,hr,rbi,bb,hbp,so,gdp,sf,sh,sb,att
0,"HIGHFILL, Ben",17,17,60,10,19,6,1,3,12,10,1,9,1,3,0,0,0
1,"LOCKLEAR, Cam",17,17,57,7,18,4,0,1,9,7,9,15,0,0,0,3,8
2,"MCDYRE, Trey",16,15,52,11,15,5,0,0,6,9,1,15,0,1,1,1,1
3,"GULAKOWSKI, Brady",16,16,60,8,16,6,0,2,9,5,0,21,0,2,1,0,1
4,"GUY, Jaylen",17,17,53,6,13,4,0,1,5,3,5,14,0,0,2,1,3
5,"WILKINSON, Garrett",16,15,45,5,11,0,0,1,7,12,1,16,0,1,0,0,0
6,"ROHRER, Brandon",17,17,61,9,14,4,0,1,7,4,0,13,1,1,4,0,0
7,"BETTS, Gray",17,17,64,6,11,2,0,0,4,11,2,10,1,0,2,0,1
8,"SHILLING, Matt",8,6,17,4,3,0,0,1,2,6,1,8,0,0,1,0,0
9,"WILSON, Owen",9,5,14,3,2,0,0,0,1,2,1,6,0,1,1,1,1


# Export out the new DataFrame as a csv file
## Don't forget to double check the path and name

In [30]:
# Write the export DataFrame to a CSV file; missing values are written as 'Unknown'.
# The path is a raw string so the backslashes in the Windows path are kept literally
# instead of being read as (invalid) escape sequences.
new_df.to_csv(r'D:\From_HOME\Liberty_Baseball_Stats.csv', na_rep='Unknown')