<h1><center>Data Analytics on Player Performance in Major League Baseball</center></h1>
<h2><center>Chris Emm</center></h2>

In [None]:
import sqlite3
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import matplotlib.ticker as mticker

### Introduction

Baseball.

### Part I: Scraping Batting Statistics

In [27]:
# Create dataframe of player information like name and age
player_table = pd.read_csv('tables/People.csv')
shortened_player = player_table[['playerID','nameFirst', 'nameLast', 'birthYear']]
shortened_player = shortened_player.assign(Name = shortened_player.nameFirst.str.cat(shortened_player.nameLast,sep=' '))
shortened_player = shortened_player[['playerID', 'Name', 'birthYear']]

# Create dataframe of batting statistics
batting_table = pd.read_csv('tables/Batting.csv')
batting_table = batting_table[batting_table.yearID > 1995]

# Create dataframe of pitching statistics
pitching_table = pd.read_csv('tables/Pitching.csv')
pitching_table = pitching_table[pitching_table.yearID > 1995]

# Shorten the pitching dataframe to only have playerID, yearID, teamID, and BFP (batter's faced)
shortened_pitching = pitching_table[['playerID', 'yearID', 'teamID','BFP']]

# Combine the shortened pitching table with the batting table and only include rows where
# the player faced less than 10 batters (likely means that they are primarily a batter
both = pd.merge(batting_table, shortened_pitching, on=['playerID', 'yearID', 'teamID'])

# Combine the pitching and batting stats
batting_shortened_pitching_table = pd.merge(both, shortened_player, on='playerID')

# Shohei Ohtani is the game's only true 2-way player. Since pitchers normally are terrible at hitting,
# we want to focus only on players who don't pitch (and just bat). To do this, we are going to remove all rows
# that have over 10 batters faced for the year. I made a limit at 10 because sometimes position players do pitch 
# if the game is a blow-out or goes into extra innings. Since Shohei is excellent at batting and pitching, and does both,
# we needed to store his rows so we can add them back to the table after removing pitchers from the table.
shohei = batting_shortened_pitching_table[batting_shortened_pitching_table.Name == 'Shohei Ohtani']
full_batting = batting_shortened_pitching_table[batting_shortened_pitching_table.BFP < 10]

# Add the data fro Shohei onto the full batting table
full_batting = pd.concat([full_batting, shohei], axis=0)

# Rename columns
full_batting = full_batting.rename(columns={'teamID':'Team', 'lgID':'Lg', 'yearID':'Year'})

# Calculate the age of a player for the given season and add it to a new column
full_batting = full_batting.assign(Age = full_batting.Year - full_batting.birthYear)

# Grabbing only needed columns
full_batting = full_batting[['Name', 'Age', 'Year','Team', 'Lg', 'G', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'SB', \
                             'CS', 'BB', 'SO', 'IBB', 'HBP', 'SH', 'SF', 'GIDP']]

# Resetting the index
full_batting = full_batting.reset_index(drop=True)

full_batting.head()

Unnamed: 0,Name,Age,Year,Team,Lg,G,AB,R,H,2B,...,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
0,Mike Aldrete,35.0,1996,NYA,AL,32,68,11,17,5,...,12.0,0.0,1.0,9,15.0,0.0,0.0,0.0,0.0,1.0
1,Manny Alexander,25.0,1996,BAL,AL,54,68,6,7,0,...,4.0,3.0,3.0,3,27.0,0.0,0.0,2.0,0.0,2.0
2,Andy Ashby,37.0,2004,SDN,NL,2,0,0,0,0,...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,Robbie Beckett,25.0,1997,COL,NL,2,0,0,0,0,...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
4,Alan Benes,27.0,1999,SLN,NL,2,0,0,0,0,...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
