# Fantasy Baseball Optimization
*Allyson Tom, Drew Pearson, Jacob Adams*

In [194]:
import pandas as pd
import re
import numpy as np
from __future__ import division
import os
from matplotlib import pyplot as plt
%matplotlib inline

We need two subsets of data for this portion of our dataset: hitter data and pitcher data.

### Hitter Data
We will first create a pandas DataFrame that holds the following hitting statistics for players for the 2010 through 2016 seasons:
- Player Name
- Player Age
- Doubles (2B)
- Triples (3B)
- Runs (R)
- Homeruns (HR)
- Walks (BB)
- Strikeouts (SO)
- Runs Batted In (RBI)
- Stolen Bases (SB)
- Batting Average (AVG)
- On-Base Percentage (OBP)
- Slugging Percentage (SLG)
- Season/Year
- At Bats (AB)
- Position (Pos Summary)

In [195]:
# ================================== CREATE DATAFRAME OF ALL HITTER INFORMATION =====================================

# empty list to append to for each year of information
hittersDF = []
# list of years to loop through
year_list = [2016, 2015, 2014, 2013, 2012, 2011, 2010, 2009]
for year in year_list:
    # open the file PlayerData/2016Hitters (or whatever year)
    with open('CSV_files/PlayerData/' + str(year)+'Hitters') as inFile:
        # convert csv file to pandas DataFrame, specifying datatypes
        DF2016 = pd.read_csv(inFile)
        # create year column
        DF2016["Year"] = year
        # append to master list
        hittersDF.append(DF2016)
#concatenate master list into pandas DataFrame
hittersDF = pd.concat(hittersDF)

# remove all except columns of interest for our particular project (files have more stats than we are interested in)
hittersDF = hittersDF[["Name", "Age", "2B", "3B", "BB", "SO", "OBP", "SLG", "Tm", "R", "HR", "RBI", "SB", "BA", \
                       "AB", "Pos Summary", "Year"]]

Now that we have all the hitting data in a Pandas DataFrame, we need to clean it. Right now, we have 10,766 rows and 10 columns in our hitters DataFrame. Before we get into the cleaning, here is a sample of what the hittersDF looks like, and the null values that we have at this point.

In [196]:
hittersDF.head()

Unnamed: 0,Name,Age,2B,3B,BB,SO,OBP,SLG,Tm,R,HR,RBI,SB,BA,AB,Pos Summary,Year
0,Fernando Abad*\abadfe01,30,0.0,0.0,0.0,1.0,0.0,0.0,TOT,0.0,0.0,0.0,0.0,0.0,1.0,1,2016
1,Fernando Abad*\abadfe01,30,0.0,0.0,0.0,1.0,0.0,0.0,MIN,0.0,0.0,0.0,0.0,0.0,1.0,1,2016
2,Fernando Abad*\abadfe01,30,0.0,0.0,0.0,0.0,,,BOS,0.0,0.0,0.0,0.0,,0.0,1,2016
3,Jose Abreu\abreujo02,29,32.0,1.0,47.0,125.0,0.353,0.468,CHW,67.0,25.0,100.0,0.0,0.293,624.0,*3/D,2016
4,A.J. Achter\achteaj01,27,0.0,0.0,0.0,0.0,,,LAA,0.0,0.0,0.0,0.0,,0.0,1,2016


In [197]:
print hittersDF.isnull().sum()

Name              0
Age               0
2B              139
3B              139
BB              139
SO              139
OBP            3472
SLG            3507
Tm                0
R               139
HR              139
RBI             139
SB              139
BA             3507
AB              139
Pos Summary      37
Year              0
dtype: int64


As seen above, there are a few issues with our DataFrame as it currently stands. We decided that we would record names in all lowercase letters for uniformity; there are also symbols and letters appearing in the Name column that need to be removed. There are several players who have more than one row because they were traded mid-way through a season. For that reason, we drop all rows that are not the player's total for the given year. We also wanted to create a name column that has first initial and last name for later use. Finally, we remove all pitchers from the hitters dataframe and we limit the hitters dataframe to only players with more than 200 at bats in a given season. We will address these issues in the code that follows.

Note: We keep player position for later use. The symbols denote the following:
- \* indicates the player played 2/3 or more of the season there
- positions after \ indicate that a player played less than 10 games there

In [198]:
# functions that will be used to separate Names into 'First' and 'Last' name columms
def lower_names(string):
    split_string = string.split(',')
    return str.lower(split_string[0])
def split_names_first(string):
    split_string = string.split(' ')
    first = str.lower(split_string[0])
    return first
def split_names_first_initial(string):
    split_string = string.split(' ')
    first = str.lower(list(split_string[0])[0]) + '.'
    return first
def split_names_last(string):
    split_string = string.split(' ')
    last = str.lower(' '.join(split_string[1:]))
    return last

In [199]:
# replace names with corrected version by removing the unnecessary portions that appear in the DF above
hittersDF['Name'] = hittersDF['Name'].str.replace(r'[*|\\|#|\+].*', '')
# change all letters in names to lower case
hittersDF['Name'] = hittersDF['Name'].apply(lower_names)
# create columns separating out pieces of player names
hittersDF['First'], hittersDF['First_initial'], hittersDF['Last'] = \
        hittersDF['Name'].apply(split_names_first),hittersDF['Name'].apply(split_names_first_initial),\
        hittersDF['Name'].apply(split_names_last)
hittersDF['Abbr_Name'] = hittersDF['First_initial'] + ' ' + hittersDF['Last']
del hittersDF['First'], hittersDF['First_initial'], hittersDF['Last']

# drop duplicates - keep first occurence, which is the one we want (total)
for yr in year_list:
    hittersDF[hittersDF["Year"] == yr] = hittersDF[hittersDF["Year"] == yr].drop_duplicates('Name')

# drop rows with null values in the Name column
hittersDF = hittersDF.dropna(subset = ["Name"], axis=0)
# drop Team column
hittersDF = hittersDF.drop("Tm", axis=1)

In [200]:
# drop players with position = '1' (pitchers)
hittersDF = hittersDF[(hittersDF["Pos Summary"] != "1") & (hittersDF["Pos Summary"] != "/1")]

# drop player with fewer than 200 at-bats
hittersDF = hittersDF[hittersDF["AB"] > 200]
# Reset the index values
hittersDF = hittersDF.reset_index(drop=True)

In [201]:
# assign columns the correct datatypes

hittersDF["Name"] = hittersDF["Name"].astype(str)
hittersDF["R"] = hittersDF["R"].astype(int)
hittersDF["HR"] = hittersDF["HR"].astype(int)
hittersDF["RBI"] = hittersDF["RBI"].astype(int)
hittersDF["SB"] = hittersDF["SB"].astype(int)
hittersDF["AB"] = hittersDF["AB"].astype(int)
hittersDF["Pos Summary"] = hittersDF["Pos Summary"].astype(str)
hittersDF["Year"] = hittersDF["Year"].astype(int)
hittersDF["Abbr_Name"] = hittersDF["Abbr_Name"].astype(str)

With all of these corrections, our hitters dataframe is ready to use. We now have 2,313 rows and 9 columns. Here is a cleaned sample:

In [202]:
print hittersDF.shape
hittersDF.head()

(2647, 17)


Unnamed: 0,Name,Age,2B,3B,BB,SO,OBP,SLG,R,HR,RBI,SB,BA,AB,Pos Summary,Year,Abbr_Name
0,jose abreu,29.0,32.0,1.0,47.0,125.0,0.353,0.468,67,25,100,0,0.293,624,*3/D,2016,j. abreu
1,cristhian adames,24.0,7.0,3.0,24.0,47.0,0.304,0.302,25,2,17,2,0.218,225,645,2016,c. adames
2,matt adams,27.0,18.0,0.0,25.0,81.0,0.309,0.471,37,16,54,0,0.249,297,3,2016,m. adams
3,nick ahmed,26.0,9.0,1.0,15.0,58.0,0.265,0.299,26,4,20,5,0.218,284,6,2016,n. ahmed
4,yonder alonso,29.0,34.0,0.0,45.0,74.0,0.316,0.367,52,7,56,3,0.253,482,*3/5D,2016,y. alonso


Here is a table of some basic summary statistics for the dataset of actual hitter statistics as well, followed by a sum of null values to demonstrate that they have all been handled. It can also be seen from the summary statistics that nothing seems out of range, and the values reported make sense given the context.

In [203]:
hittersDF.describe()

Unnamed: 0,Age,2B,3B,BB,SO,OBP,SLG,R,HR,RBI,SB,BA,AB,Year
count,2647.0,2647.0,2647.0,2647.0,2647.0,2647.0,2647.0,2647.0,2647.0,2647.0,2647.0,2647.0,2647.0,2647.0
mean,28.687193,21.826974,2.306385,39.010956,86.077824,0.326362,0.413285,54.970533,13.042312,52.760484,7.652437,0.260409,416.784662,2012.499056
std,4.00373,9.590062,2.405314,20.582179,34.609177,0.03579,0.069297,23.763302,9.401381,25.433269,9.767483,0.031363,130.942264,2.292463
min,19.0,2.0,0.0,4.0,21.0,0.174,0.187,4.0,0.0,8.0,0.0,0.146,201.0,2009.0
25%,26.0,14.0,1.0,24.0,60.0,0.303,0.365,36.0,6.0,32.0,1.0,0.24,298.0,2010.0
50%,28.0,21.0,2.0,35.0,81.0,0.325,0.409,52.0,11.0,49.0,4.0,0.26,417.0,2013.0
75%,31.0,28.0,3.0,51.0,107.0,0.349,0.457,72.0,18.0,70.0,11.0,0.282,532.0,2014.0
max,43.0,56.0,16.0,143.0,223.0,0.474,0.658,136.0,54.0,141.0,70.0,0.365,684.0,2016.0


In [204]:
print hittersDF.isnull().sum()

Name           0
Age            0
2B             0
3B             0
BB             0
SO             0
OBP            0
SLG            0
R              0
HR             0
RBI            0
SB             0
BA             0
AB             0
Pos Summary    0
Year           0
Abbr_Name      0
dtype: int64


We see that after cleaning our hitters data we do not have any null values. We did not deliberately drop na values, but we did drop pitchers and players that will not add value in the scope of this project. As a result we have a nice clean dataset. 

### Pitcher Data

We now create a pandas DataFrame that holds the following pitching statistics for players (pitchers only) for the 2010 through 2016 seasons:
- Player Name
- Strikeouts (K)
- Wins (W)
- Saves (SV)
- Earned Run Average (ERA)
- Walks plus Hits per Inning Pitched (WHIP)
- Season/Year
- Innings Pitched (IP)

In [205]:
# ================================== CREATE DATAFRAME OF ALL PITCHER INFORMATION =====================================

# empty list to append to for each year of information
pitchersDF = []
# loop through years of interest
for year in year_list:
    # open the file PlayerData/2016Pitchers (or whatever year)
    with open('CSV_files/PlayerData/'+str(year)+'Pitchers') as inFile:
        # convert csv file to pandas DataFrame
        DF2016 = pd.read_csv(inFile)
        # create year column
        DF2016["Year"] = year
        # append to master list
        pitchersDF.append(DF2016)
#concatenate master list into pandas DataFrame
pitchersDF = pd.concat(pitchersDF)
# remove all except columns of interest for our particular project (files have more stats than we are interested in)
pitchersDF = pitchersDF[["Name", "Age", "Tm", "Year", "SO", "W", "SV", "ERA", "WHIP", "IP", "H", "BB", "FIP", "BF"]]

Now that we have all of the pitching data in a Pandas dataframe, we need to clean it. Right now, we have 5,625 rows and 9 columns in our pitchers dataframe. Before we get into the cleaning, here is a sample of what the pitchersDF looks like, and the null values that it contains at this point.

In [206]:
pitchersDF.head()

Unnamed: 0,Name,Age,Tm,Year,SO,W,SV,ERA,WHIP,IP,H,BB,FIP,BF
0,Fernando Abad*\abadfe01,30,TOT,2016,41,1,1,3.66,1.329,46.2,40,22,3.98,198
1,Fernando Abad*\abadfe01,30,MIN,2016,29,1,1,2.65,1.206,34.0,27,14,3.44,138
2,Fernando Abad*\abadfe01,30,BOS,2016,12,0,0,6.39,1.658,12.2,13,8,5.44,60
3,A.J. Achter\achteaj01,27,LAA,2016,14,1,0,3.11,1.46,37.2,43,12,5.85,160
4,Austin Adams\adamsau01,29,CLE,2016,17,0,0,9.82,1.855,18.1,27,7,5.98,88


In [207]:
print pitchersDF.isnull().sum()

Name    0
Age     0
Tm      0
Year    0
SO      0
W       0
SV      0
ERA     2
WHIP    6
IP      0
H       0
BB      0
FIP     6
BF      0
dtype: int64


Again, we have several of the same issues as we saw above in the hitters dataframe. We will make all names lowercase, fix their formatting, and add a column for first initial and last name only. We again drop rows representing trades and keep only player totals across any given season. We will address these issues in the code that follows.

In [208]:
# replace names with corrected version by removing the unnecessary portions that appear in the DF above
pitchersDF['Name'] = pitchersDF['Name'].str.replace(r'[*|\\|#|\+].*', '')
# convert all names to lower case
pitchersDF['Name'] = pitchersDF['Name'].apply(lower_names)
# create columns separating out pieces of player names
pitchersDF['First'], pitchersDF['First_initial'], pitchersDF['Last'] = \
        pitchersDF['Name'].apply(split_names_first),pitchersDF['Name'].apply(split_names_first_initial),\
        pitchersDF['Name'].apply(split_names_last)
pitchersDF['Abbr_Name'] = pitchersDF['First_initial'] + ' ' + pitchersDF['Last']
del pitchersDF['First'], pitchersDF['First_initial'], pitchersDF['Last']

# drop duplicates - default is to keep first occurence
for yr in year_list:
    pitchersDF[pitchersDF["Year"] == yr] = pitchersDF[pitchersDF["Year"] == yr].drop_duplicates('Name')
# drop rows with null values in the Name column
pitchersDF = pitchersDF.dropna(subset = ["Name"], axis=0)
# drop Team column
pitchersDF = pitchersDF.drop("Tm", axis=1)

We had a few missing data points in the pitchers dataframe. In every case, the pitchers for which this occurred had no statistics for the given season. Thus, we dropped those pitchers from the dataframe for the respective years, especially because this only occured for 6 observations out of about 5,600. Lastly, we drop pitchers with fewer than 25 innings pitched in a given season, as these are not going to be significant players in the given season.

In [209]:
# Drop rows where pitcher had no stats recorded for the year
pitchersDF = pitchersDF[pitchersDF.ERA.notnull()]
pitchersDF = pitchersDF[pitchersDF.WHIP.notnull()]
# Drop pitchers with fewer than 25 innings pitched in a given season
pitchersDF = pitchersDF[pitchersDF["IP"] > 25]
# Reset the index values
pitchersDF = pitchersDF.reset_index(drop=True)

In [210]:
# Assign columns to the correct datatypes
pitchersDF["Name"] = pitchersDF["Name"].astype(str)
pitchersDF["Year"] = pitchersDF["Year"].astype(int)
pitchersDF["SO"] = pitchersDF["SO"].astype(int)
pitchersDF["W"] = pitchersDF["W"].astype(int)
pitchersDF["SV"] = pitchersDF["SV"].astype(int)
pitchersDF["Abbr_Name"] = pitchersDF["Abbr_Name"].astype(str)

With all of these corrections, our pitchers dataframe is ready to use. We now have 3,157 rows and 8 columns. Here is a cleaned sample:

In [211]:
pitchersDF.head()

Unnamed: 0,Name,Age,Year,SO,W,SV,ERA,WHIP,IP,H,BB,FIP,BF,Abbr_Name
0,fernando abad,30.0,2016,41,1,1,3.66,1.329,46.2,40.0,22.0,3.98,198.0,f. abad
1,a.j. achter,27.0,2016,14,1,0,3.11,1.46,37.2,43.0,12.0,5.85,160.0,a. achter
2,tim adleman,28.0,2016,47,4,0,4.0,1.206,69.2,64.0,20.0,5.3,287.0,t. adleman
3,matt albers,33.0,2016,30,2,0,6.31,1.675,51.1,67.0,19.0,5.8,237.0,m. albers
4,cody allen,27.0,2016,87,3,32,2.51,1.0,68.0,41.0,27.0,3.31,264.0,c. allen


Here is a table of some basic summary statistics for the dataset of actual pitcher statistics as well. It can be seen from the summary statistics that nothing seems out of range, and the values reported make sense given the context.

In [212]:
pitchersDF.describe()

Unnamed: 0,Age,Year,SO,W,SV,ERA,WHIP,IP,H,BB,FIP,BF
count,3607.0,3607.0,3607.0,3607.0,3607.0,3607.0,3607.0,3607.0,3607.0,3607.0,3607.0,3607.0
mean,28.252564,2012.553368,75.792903,5.204879,2.727474,3.995761,1.32687,90.604464,87.65567,30.487386,4.017444,384.876074
std,4.104058,2.297571,51.230825,4.437449,8.327682,1.324327,0.229476,58.628642,58.182512,18.69538,0.968475,244.190186
min,19.0,2009.0,7.0,0.0,0.0,0.45,0.565,25.1,11.0,2.0,0.78,90.0
25%,25.0,2011.0,38.0,2.0,0.0,3.09,1.1765,47.2,44.0,17.0,3.37,205.0
50%,27.0,2013.0,59.0,4.0,0.0,3.86,1.315,66.2,62.0,25.0,3.97,279.0
75%,31.0,2015.0,101.0,7.0,1.0,4.75,1.463,130.2,131.0,41.0,4.62,561.0
max,49.0,2016.0,301.0,24.0,51.0,10.69,2.374,251.0,262.0,105.0,9.23,1009.0


As shown below, we now have a nice clean pitchers dataset, free of null values.

In [213]:
print pitchersDF.isnull().sum()

Name         0
Age          0
Year         0
SO           0
W            0
SV           0
ERA          0
WHIP         0
IP           0
H            0
BB           0
FIP          0
BF           0
Abbr_Name    0
dtype: int64


## Statistical Projection Data

We will scrape statistical projections for each of the seasons 2010-2015 from FanGraphs, Steamer, Guru, Marcel, CBS, and ESPN. We follow the same pattern for all 6 of the prediction methods. So, rather than printing out an example of each dataframe, its size, its null values, etc., we will give the summary of each dataframe here and explain any issues that arose. Then, we will go through the cleaning process for the Marcel dataframe in detail, which outlines the process we followed for the other 5 prediction methods. The rest will simply follow in code.

- Marcel: 
    - This dataset was the hardest to clean. The dataset had a lot of features that are irrelevant to our problem, but this dataset was also missing two integral features: AVG and WHIP. We were able to engineer these features from the other features provided from this dataset. 
    - The pitchers dataframe was missing Saves for about 350 rows of the 2,000 rows of data. 
    - After cleaning, the hitters dataframe has about 1,800 rows and the pitchers has about 2,300 rows, which is 300/year and 380/year, respectively.
- Fangraphs:
    - Fangraphs did not predict saves for 2010 and 2011.
    - After cleaning, the pitchers dataframe has about 1,500 rows and the hitters dataframe has about 1,200 rows. 
- Steamer:
    - Steamer was the largest dataset at the outset. This is because it predicted statistics for every possible player. 
    - Steamer did not predict Saves.
    - After cleaning, the hitters dataframe has about 1,900 rows and the pitchers dataframe has about 2,500 rows.
- Guru:
    - For non-closers, the Guru data had "-" for saves instead of 0's. So, we replaced those.
    - Guru had about 300 rows of missing data. However, on inspection of these players we found that these players are minimal playing time players and bench players, thus they will not be important for fantasy baseball as only the top 300-400 players are drafted. So, we dropped those players and were left with no missing values.
    - After cleaning, the hitters dataframe has about 1,600 rows and the pitchers dataframe has about 2,300 rows.
- ESPN: 
    - ESPN is missing Saves for 1 year.
    - ESPN, like Guru, included some players of minor importance and instead of being Nan values they had "--". We treated this like Nan values and again dropped them because they are not of importance for fantasy baseball purposes. 
    - After cleaning, the hitters dataframe has about 1,800 rows and the pitchers dataframe has about 1,800 rows.
- CBS:
    - CBS is missing Runs for 1 year.
    - After cleaning, the hitters dataframe has about 1,800 rows and the pitchers dataframe has about 1,800 rows.

As mentioned above, a few of the projection methods did not predict Saves or are missing Saves for a season or two. This will not be a serious issue for us, as we plan to run models on each statistic seperately. So, in the case of Steamer (no Saves predictions), we simply won't include this method when we try to determine who the best is at predicting Saves. As for the methods that are missing Saves for a couple of seasons, or CBS is missing runs for a season, this will not be a showstopper. There is no way to engineer these values as they are created from a specific model that we do not have access to, doing so would skew our data greatly, and in the case of Saves they are completely independent of the other statistics. So, we won't include that year when analyzing that particular statistic, i.e. Fangraphs is missing saves for 2010, so we won't include Fangraphs 2010 when measuring Fangraphs ability to predict saves. We do not drop these years or rows entirely as they still contain the predictions of the other statistics and provide extreme value. 

Another note about the data is the size of our dataframes. While we would ideally like to have more datapoints, prediction data is hard to obtain. Plus, most predictions are only concerned with the top 300 players, so for 6 years that is only 1,800 rows of data. We were able to obtain predictions for the years 2010-2015, which results in about 1,400-2,100 rows of data for each dataframe. These are not as large as we like, but we will still move forward. In addition, we were unable to acquire projections for 2016. This data was not available anywhere. While unfortunate, this does not restrict our methods or models we plan on using, but it is important to note. We have not obtained the 2017 predictions yet as they have not all been published, because spring training is currently happening and that affects some of the prediction methods. We will obtain this data in a couple of weeks when spring training is completed and the predictions are completed. Also, while we are watching for these to appear, we will keep an eye open for the 2016 data.

While our dataset is smaller than what is ideal, and we do not have 2016 projections, we can be sure that this data is very reliable. First off, our official statistics dataframes come from baseball-reference.com, which is the leading site in baseball statistics. Every baseball site, blog, and analyst including the leaders such as ESPN, CBS sports, Fox Sports, etc. use baseball-reference.com. The actual projections come from the various websites and many articles on Bleacher Report, Razzball, Fantasy Pros, etc. reference these different projection methods. So, we assume that these are reliable data. A potential bias could occur, if, say ESPN went back and changed their projections for previous years to appear more accurate. However, we believe this will not be an issue as we obtained the data from a 3rd party that does not have immediate ties to any of the 6 projection methods. This 3rd party is reliable as it is referenced in many articles from the leading sports pages.

In [214]:
# list for looping through seasons in cleaning
season = np.arange(2010,2016)

We realized that we did not have need for predictions for every single player in the MLB because our league will only draft a given number of players.  As such, we chose to use the hitters and pitchers with at bats and pitches above a given threshold.  Using this list of players for every given year we created a mask and selected these players from each of our 6 sites.  Due to naming conventions we had to use an extra function to check matching names on both a complete name basis as well as using a first initial and last name on the subset of names leftover from the first check.

In [215]:
#Rename the statistics in the hittersDF so we can differentiate the actual and predicted statistics when we merge
hittersDF = hittersDF.rename(columns = {'Year':'Season', 'R' : 'actual_R', 'HR' : "actual_HR", \
                                        'RBI': 'actual_RBI', 'SB': 'actual_SB', 'BA': 'actual_AVG', 'AB':'actual_AB',\
                                       '2B': 'actual_2B', '3B': 'actual_3B', 'Age': 'actual_age', 'BB': 'actual_BB',\
                                       'OBP': 'actual_OBP', 'SLG': 'actual_SLG', 'SO': 'actual_SO'})

#Rename the statistics in the pitchersDF so we can differentiate the actual and predicted statistics when we merge
pitchersDF = pitchersDF.rename(columns = {'Year':'Season', 'SO' : 'actual_K', 'W' : "actual_W", 'ERA': 'actual_ERA',\
                                          'WHIP': 'actual_WHIP', 'IP': 'actual_IP', 'SV':'actual_SV', \
                                          'FIP':'actual_FIP', 'BB': 'actual_BB', 'BF': 'actual_BF', 'H': 'actual_H'})

In [216]:
def find_names(df, site_df, season=season):
    
    # we create blank data frames that we will be using to store the necessary data, we might need to work on naming
    #     conventions...
    
    # dataframe containing Name hits from both the site DFs and the PlayerData DFs
    new_site_df_1 = pd.DataFrame()
    
    # dataframe containing Name hits NOT IN PlayerData DF
    not_site_df_1 = pd.DataFrame()
    
    # dataframe containing names in PlayerData DF and not the site DF
    allyson_not = pd.DataFrame()
    
    # Observe these next three dataframes are to see the same information as before but for Abbr_Name and within the
    #     intersection of the previous dataframes
    new_site_df_2 = pd.DataFrame()
    not_site_df_2 = pd.DataFrame()
    allyson_not_site = pd.DataFrame()

    
    # loop through every season.  Observe a couple sites are missing data for different seasons which is why we allow
    #     the user to pass in the appropriate seasons they wish to use.
    for i in season:
        
        # checking for names that exist in both pitchersDF and espn_pitchers
        dfcheck = df[df['Season']==i]
        site_df_check = site_df[site_df['Season']==i]

        # we were using more complex 'isin' checks but opted to just use a pd.merge because we eventually needed to 
        #     combine the site data with the actual player DF, same applies to code below
        df_subset_1_TEST = pd.merge(site_df_check, dfcheck, on=['Name', 'Season'])
        new_site_df_1 = new_site_df_1.append(df_subset_1_TEST)
        ####df_subset_1 = site_df[(site_df['Name'].isin(dfcheck['Name'])) & (site_df['Season']==i)]
        ####new_site_df_1 = new_site_df_1.append(df_subset_1)        
        
        not_df_subset_1 = site_df[(site_df['Name'].isin(dfcheck['Name'])==False) & (site_df['Season']==i)]
        not_site_df_1 = not_site_df_1.append(not_df_subset_1)

        not_allyson_subset = df[(df['Name'].isin(site_df_check['Name'])==False)& (df['Season']==i)]
        allyson_not = allyson_not.append(not_allyson_subset)

        # checking for names that did not exist in both pitchersDF and source, but do match first initials
        allysoncheck = allyson_not[allyson_not['Season']==i]
        sitecheck = not_site_df_1[not_site_df_1['Season']==i]

        site_df_subset_TEST = pd.merge(not_site_df_1, allysoncheck, on=['Abbr_Name', 'Season'])
        new_site_df_2 = new_site_df_2.append(site_df_subset_TEST)
        ####site_df_subset = not_site_df_1[(not_site_df_1['Abbr_Name'].isin(allysoncheck['Abbr_Name'])) & \
        ####                               (not_site_df_1['Season']==i)]
        ####new_site_df_2 = new_site_df_2.append(site_df_subset)

        not_df_subset_2 = not_site_df_1[(not_site_df_1['Abbr_Name'].isin(allysoncheck['Abbr_Name'])==False) & \
                                        (not_site_df_1['Season']==i)]
        not_site_df_2 = not_site_df_2.append(not_df_subset_2)

        not_allyson_subset_2 = allyson_not[(allyson_not['Abbr_Name'].isin(sitecheck['Abbr_Name'])==False)& \
                                           (allyson_not['Season']==i)]
        allyson_not_site = allyson_not_site.append(not_allyson_subset_2)

        
    # observe that because we did not merge on all columns with a common name we ended up with several columns
    #     with a naming convention such as '_y' so we use some regex to clean these up for applicable dataframes
    
    for col in new_site_df_1.columns:
        if '_x' in col:
            new_site_df_1[col[:-2]] = new_site_df_1[col]
    for col in new_site_df_2.columns:
        if '_x' in col:
            new_site_df_2[col[:-2]] = new_site_df_2[col]
    
    new_site_df_1 = new_site_df_1.select(lambda x: not re.search(r'.*\_y', x), axis=1)
    new_site_df_2 = new_site_df_2.select(lambda x: not re.search(r'.*\_y', x), axis=1)
    
    new_site_df_1 = new_site_df_1.select(lambda x: not re.search(r'.*\_x', x), axis=1)
    new_site_df_2 = new_site_df_2.select(lambda x: not re.search(r'.*\_x', x), axis=1)
    
    # observe we are returning 6 components but only use 2 for our final dataframes.  The reason for these extra 
    #     dataframes was for checking to ensure our function was properly working
    return new_site_df_1, not_site_df_1, allyson_not, new_site_df_2, not_site_df_2, allyson_not_site

#### signifies code we are not currently using but might need in the future

In [217]:
def find_names2(df, site_df, season=season):
    site_df_replace = site_df.copy()
    df['Prediction_Season'] = df['Season']+1
    site_df_replace['Prediction_Season'] = site_df['Season']
    
    for col in site_df_replace.columns:
        if 'actual_' in col:
            del site_df_replace[col]
    
    # this is the the same as find_names but is combining the current year's predictions with the previous year's 
    #     actual statistics
    
    #print df[df['Abbr_Name']=='a. pujols'][['Abbr_Name', 'actual_HR', 'Prediction_Season']]
    
    # we create blank data frames that we will be using to store the necessary data, we might need to work on naming
    #     conventions...
    
    # dataframe containing Name hits from both the site DFs and the PlayerData DFs
    new_site_df_1 = pd.DataFrame()
    new_site_df_new_stats_1 = pd.DataFrame()
    
    # dataframe containing Name hits NOT IN PlayerData DF
    not_site_df_1 = pd.DataFrame()
    
    # dataframe containing names in PlayerData DF and not the site DF
    allyson_not = pd.DataFrame()
    
    # Observe these next three dataframes are to see the same information as before but for Abbr_Name and within the
    #     intersection of the previous dataframes
    new_site_df_2 = pd.DataFrame()
    
    new_site_df_new_stats_2 = pd.DataFrame()
    
    not_site_df_2 = pd.DataFrame()
    allyson_not_site = pd.DataFrame()

    
    # loop through every season.  Observe a couple sites are missing data for different seasons which is why we allow
    #     the user to pass in the appropriate seasons they wish to use.
    for i in season:
        
        # checking for names that exist in both pitchersDF and espn_pitchers
        dfcheck = df[df['Prediction_Season']==i]
        site_df_check = site_df_replace[site_df_replace['Prediction_Season']==i]
        
        #dfcheck_new_stats = df[df['Season']==i-1]
        #site_df_check_new_stats = site_df[site_df['Season']==i]

        # we were using more complex 'isin' checks but opted to just use a pd.merge because we eventually needed to 
        #     combine the site data with the actual player DF, same applies to code below
        df_subset_1_TEST = pd.merge(site_df_check, dfcheck, on=['Name', 'Prediction_Season'])
        new_site_df_1 = new_site_df_1.append(df_subset_1_TEST)
        
        #df_subset_1_TEST_new_stats = pd.merge(site_df_check, dfcheck_new_stats, on=['Name', 'Prediction_season'])
        #new_site_df_new_stats_1 = new_site_df_new_stats_1.append(df_subset_1_TEST_new_stats)
        ####df_subset_1 = site_df[(site_df['Name'].isin(dfcheck['Name'])) & (site_df['Season']==i)]
        ####new_site_df_1 = new_site_df_1.append(df_subset_1)        
        
        not_df_subset_1 = site_df_replace[(site_df_replace['Name'].isin(dfcheck['Name'])==False) & (site_df_replace['Prediction_Season']==i)]
        not_site_df_1 = not_site_df_1.append(not_df_subset_1)

        not_allyson_subset = df[(df['Name'].isin(site_df_check['Name'])==False) & (df['Prediction_Season']==i)]
        allyson_not = allyson_not.append(not_allyson_subset)
        
        ####not_ally_subset_new = df[(df['Name'].isin(site_df_check_new_stats['Name'])==False) & df[]]
        
        
        # checking for names that did not exist in both pitchersDF and source, but do match first initials
        allysoncheck = allyson_not[allyson_not['Prediction_Season']==i]
        sitecheck = not_site_df_1[not_site_df_1['Prediction_Season']==i]
        

        site_df_subset_TEST = pd.merge(not_site_df_1, allysoncheck, on=['Abbr_Name', 'Prediction_Season'])
        new_site_df_2 = new_site_df_2.append(site_df_subset_TEST)
        
        #site_df_subset_TEST_new_stats = pd.merge(not_site_df_1, allysoncheck, on=['Abbr_Name', 'Prediction_season'])
        #new_site_df_new_stats_2 = new_site_df_2.append(site_df_subset_TEST)
        ####site_df_subset = not_site_df_1[(not_site_df_1['Abbr_Name'].isin(allysoncheck['Abbr_Name'])) & \
        ####                               (not_site_df_1['Season']==i)]
        ####new_site_df_2 = new_site_df_2.append(site_df_subset)

        not_df_subset_2 = not_site_df_1[(not_site_df_1['Abbr_Name'].isin(allysoncheck['Abbr_Name'])==False) & \
                                        (not_site_df_1['Prediction_Season']==i)]
        not_site_df_2 = not_site_df_2.append(not_df_subset_2)

        not_allyson_subset_2 = allyson_not[(allyson_not['Abbr_Name'].isin(sitecheck['Abbr_Name'])==False)& \
                                           (allyson_not['Prediction_Season']==i)]
        allyson_not_site = allyson_not_site.append(not_allyson_subset_2)

        
    # observe that because we did not merge on all columns with a common name we ended up with several columns
    #     with a naming convention such as '_y' so we use some regex to clean these up for applicable dataframes
    
    for col in new_site_df_1.columns:
        if '_x' in col:
            new_site_df_1[col[:-2]] = new_site_df_1[col]
    for col in new_site_df_2.columns:
        if '_x' in col:
            new_site_df_2[col[:-2]] = new_site_df_2[col]

    new_site_df_1 = new_site_df_1.select(lambda x: not re.search(r'.*\_y', x), axis=1)
    new_site_df_1 = new_site_df_1.select(lambda x: not re.search(r'.*\_x', x), axis=1)
    
    new_site_df_2 = new_site_df_2.select(lambda x: not re.search(r'.*\_y', x), axis=1)
    new_site_df_2 = new_site_df_2.select(lambda x: not re.search(r'.*\_x', x), axis=1)
    
    for col in new_site_df_1.columns:
        if 'actual_' in col:
            new_site_df_1 = new_site_df_1.rename(columns={col:'previous_'+col[7:]})
    for col in new_site_df_2.columns:
        if 'actual_' in col:
            new_site_df_2 = new_site_df_2.rename(columns={col:'previous_'+col[7:]})
    
    # observe we are returning 6 components but only use 2 for our final dataframes.  The reason for these extra 
    #     dataframes was for checking to ensure our function was properly working
    return new_site_df_1, not_site_df_1, allyson_not, new_site_df_2, not_site_df_2, allyson_not_site

#### signifies code we are not currently using but might need in the future

In [218]:
def add_correct_column(df,statistics):
    """This function adds columns to our dataframes that classify whether a prediction is correct or not. We pass
    in a dataframe and a dictionary of statistics, where the keys are stats and the values are the epsilon balls.
    Then we make a column and add the value 1 if the projection for a statistic is within the epsilon ball of the 
    actual stat and a 0 if it is outside the epsilon ball.
    Parameters
    df: dataframe we want to adjust
    statistic: dict where keys are stats and values are epsilon ball values
    returns: Nothing, but it adds columns to the dataframe that are binary with 1's = a correct prediction"""
    for i in statistics:
        difference = statistics.get(i)
        df['correct_'+i] = ((df['actual_'+i]<df[i]+difference)&(df['actual_'+i]>df[i]-difference)).astype('int')
    return None
        

We now detail the process of cleaning the Marcel data.

### Marcel
Marcel is a baseball statistic projection system that claims to be very basic in its methods. It uses the past 3 years to predict each current year, weighting recent data more heavily. We now read in the Marcel data and put it into pandas dataframes.

In [219]:
# ============================ CREATE DATAFRAMES OF MARCEL PITCHER & HITTER PROJECTIONS ===============================

# empty lists to append to for each year of information
marcel_hitters = []
marcel_pitchers = []

# append to pitchers dataframe
for ID in os.listdir('CSV_files/marcel_pitchers/'):
    print ID
    if ID not in '.listing':
        with open(os.path.join('./CSV_files/marcel_pitchers/', ID)) as inFile:
            marcel_pitchers.append(pd.read_csv(inFile))
            
# append to hitters dataframe
for ID in os.listdir('CSV_files/marcel_hitters/'):
    if ID not in '.listing':
        with open(os.path.join('./CSV_files/marcel_hitters/', ID)) as inFile:
            marcel_hitters.append(pd.read_csv(inFile))

# concatenate master lists into dataframes
marcel_hitters = pd.concat(marcel_hitters)
marcel_pitchers = pd.concat(marcel_pitchers)

# check the sizes of our dataframes
print "marcel hitters", marcel_hitters.shape
print "marcel pitchers", marcel_pitchers.shape

marcel_pitchers_2013.csv
marcel_pitchers_2014.csv
marcel_pitchers_2015.csv
marcel_pitchers_2010.csv
marcel_pitchers_2012.csv
marcel_pitchers_2011.csv
marcel hitters (6792, 26)
marcel pitchers (5716, 28)


Now that we have all the Marcel projection data in two Pandas dataframes, we need to clean them. Right now, we have 6,158 rows and 27 columns in our hitters dataframe and we have 5,716 rows and 29 columns in our pitchers dataframe.
Before we get into the cleaning, here is a sample of what the Marcel hitters dataframe looks like, and its null values.

In [220]:
marcel_hitters.head()

Unnamed: 0,AB,AVG,Age,HR,Name,R,RBI,SB,Season,m2B,...,mIBB,mPA,mSF,mSH,mSO,nameFirst,nameLast,playerID,reliability,wOBA
0,180,0.261,,6,Brent Clevlen,24,22,4,2013,,...,,,,,,,,,,
1,216,0.245,,7,Steven Tolleson,24,24,4,2013,,...,,,,,,,,,,
2,190,0.258,,5,Andrew Romine,24,21,5,2013,,...,,,,,,,,,,
3,303,0.234,,4,Lou Marson,38,28,7,2013,,...,,,,,,,,,,
4,181,0.254,,6,Joe Thurston,23,22,4,2013,,...,,,,,,,,,,


In [221]:
print marcel_hitters.isnull().sum()

AB                0
AVG            2560
Age            4232
HR                0
Name           2560
R                 0
RBI               0
SB                0
Season            0
m2B            4232
m3B            4232
mBB            4232
mCS            4232
mGIDP          4232
mH             4232
mHBP           4232
mIBB           4232
mPA            4232
mSF            4232
mSH            4232
mSO            4232
nameFirst      4232
nameLast       4232
playerID       4232
reliability    4232
wOBA           4232
dtype: int64


Here is a sample of the Marcel pitcher data, and its null values.

In [222]:
marcel_pitchers.head()

Unnamed: 0,Age,ERA,IP,K,Name,SV,Season,W,WHIP,bsrER,...,mHR,mIBB,mL,mR,mRepl,mWP,nameFirst,nameLast,playerID,reliability
0,,4.32,25.0,21,Sean Gallagher,1.0,2013,1,1.36,,...,,,,,,,,,,
1,,4.74,38.0,34,Yoshinori Tateyama,1.0,2013,2,1.315789,,...,,,,,,,,,,
2,,4.15,26.0,21,Danny Herrera,1.0,2013,1,1.346154,,...,,,,,,,,,,
3,,3.72,58.0,50,Jose Mijares,1.0,2013,3,1.362069,,...,,,,,,,,,,
4,,4.29,161.7,137,Philip Hughes,1.0,2013,13,1.280148,,...,,,,,,,,,,


In [223]:
print marcel_pitchers.isnull().sum()

Age            2864
ERA               0
IP                0
K                 0
Name           2852
SV              950
Season            0
W                 0
WHIP           2852
bsrER          2864
lgID           2864
mBB            2864
mBK            2864
mER            2864
mG             3828
mGS            3828
mH             2864
mHBP           2864
mHR            2864
mIBB           2864
mL             2864
mR             2864
mRepl          4764
mWP            2864
nameFirst      2864
nameLast       2864
playerID       2864
reliability    2864
dtype: int64


We will now clean the Marcel data. First of all, this dataset is missing WHIP for pitchers and AVG for hitters, which we will derive from the other features that are available. We also create a Name column that includes both first and last names together, and make that all lower case. Secondly, the dataset as is contains much more information than we need, so we will drop all columns except those of particular interest to us. Thirdly, we change player names to all lowercase for the sake of uniformity within the dataset and among our other datasets, and create a column holding the first initial and last name of each player.

In [224]:
# engineering of WHIP and AVG, engineering of single name column
marcel_pitchers['Name'] = np.where(marcel_pitchers['Name'].isnull(), \
                        marcel_pitchers['nameFirst'] + ' ' + marcel_pitchers['nameLast'], marcel_pitchers['Name'])
marcel_pitchers['WHIP'] = np.where(marcel_pitchers['WHIP'].isnull(), \
                        (marcel_pitchers['mBB']+marcel_pitchers['mH'])/marcel_pitchers['IP'], marcel_pitchers['WHIP'])
marcel_hitters['Name'] = np.where(marcel_hitters['Name'].isnull(), \
                        marcel_hitters['nameFirst'] + ' ' + marcel_hitters['nameLast'], marcel_hitters['Name'])
marcel_hitters['AVG'] = np.where(marcel_hitters['AVG'].isnull(), \
                        (marcel_hitters['mH'])/marcel_hitters['AB'], marcel_hitters['AVG'])

# keep only columns includings stats we care about
marcel_pitchers = marcel_pitchers[['Name', 'K', 'W', 'IP', 'ERA', 'WHIP', 'Season']]
marcel_pitchers.columns = ['Name', 'K', 'W', 'IP', 'ERA', 'WHIP', 'Season']
marcel_hitters = marcel_hitters[['Name', 'AB', 'RBI', 'R','HR', 'SB', 'AVG', 'Season']]
marcel_hitters.columns = ['Name', 'AB', 'RBI', 'R','HR', 'SB', 'AVG', 'Season']

# convert names to all lower case letters for uniformity
marcel_pitchers['Name'] = marcel_pitchers['Name'].apply(lower_names)
marcel_hitters['Name'] = marcel_hitters['Name'].apply(lower_names)

# create a column for first initial and last name for each player
marcel_pitchers['First'], marcel_pitchers['First_initial'], marcel_pitchers['Last'] = \
        marcel_pitchers['Name'].apply(split_names_first),marcel_pitchers['Name'].apply(split_names_first_initial),\
        marcel_pitchers['Name'].apply(split_names_last)
marcel_pitchers['Abbr_Name'] = marcel_pitchers['First_initial'] + ' ' + marcel_pitchers['Last']
marcel_hitters['First'], marcel_hitters['First_initial'], marcel_hitters['Last'] = \
        marcel_hitters['Name'].apply(split_names_first),marcel_hitters['Name'].apply(split_names_first_initial),\
        marcel_hitters['Name'].apply(split_names_last)
marcel_hitters['Abbr_Name'] = marcel_hitters['First_initial'] + ' ' + marcel_hitters['Last']

# delete the extra columns we made to get the abbr_name column
del marcel_hitters['First'], marcel_hitters['First_initial'], marcel_hitters['Last']
del marcel_pitchers['First'], marcel_pitchers['First_initial'], marcel_pitchers['Last']

In [225]:
# assign columns to the correct datatypes
marcel_hitters["AB"] = pd.to_numeric(marcel_hitters.AB)
marcel_hitters["AB"] = marcel_hitters["AB"].astype(int)
marcel_hitters["RBI"] = marcel_hitters["RBI"].astype(int)
marcel_hitters["R"] = marcel_hitters["R"].astype(int)
marcel_hitters["HR"] = marcel_hitters["HR"].astype(int)
marcel_hitters["SB"] = marcel_hitters["SB"].astype(int)
marcel_hitters["Season"] = marcel_hitters["Season"].astype(int)

marcel_pitchers["K"] = marcel_pitchers["K"].astype(int)
marcel_pitchers["W"] = marcel_pitchers["W"].astype(int)
marcel_pitchers["Season"] = marcel_pitchers["Season"].astype(int)

In [226]:
marcel_hitters_2017 = marcel_hitters[marcel_hitters['Season']==2017]
new_df_hit_predict = find_names2(hittersDF, marcel_hitters_2017, [2017])
marcel_hitters_2017 = new_df_hit_predict[0].append(new_df_hit_predict[3])

new_df_pitch = find_names(pitchersDF, marcel_pitchers, season)
new_df_hit = find_names(hittersDF, marcel_hitters, season)

marcel_pitchers = new_df_pitch[0].append(new_df_pitch[3])
marcel_hitters = new_df_hit[0].append(new_df_hit[3])

Now, the Marcel data is cleaned and ready to use. The pitchers dataframe is 2,354 rows and 8 columns and the hitters dataframe is 1,876 rows and 9 columns. Below are samples of each (hitters, then pitchers), which will show the cleaned dataframes and the fact that we have taken care of all null values.

In [227]:
marcel_hitters.head()

Unnamed: 0,AB,AVG,Abbr_Name,HR,Name,Pos Summary,Prediction_Season,R,RBI,SB,...,actual_AVG,actual_BB,actual_HR,actual_OBP,actual_R,actual_RBI,actual_SB,actual_SLG,actual_SO,actual_age
0,508,0.322835,a. pujols,34,albert pujols,*3,2011,96,104,9,...,0.312,103.0,42,0.414,115,118,14,0.596,76.0,30.0
1,528,0.289773,p. fielder,37,prince fielder,*3/D,2011,88,106,3,...,0.261,114.0,32,0.401,94,83,1,0.471,138.0,26.0
2,521,0.320537,h. ramirez,24,hanley ramirez,*6,2011,97,75,30,...,0.3,64.0,21,0.378,92,76,32,0.475,93.0,26.0
3,447,0.286353,a. rodriguez,31,alex rodriguez,*5D,2011,85,93,15,...,0.27,59.0,30,0.341,74,125,4,0.506,98.0,34.0
4,522,0.312261,m. holliday,24,matt holliday,*7/D,2011,90,93,16,...,0.312,69.0,28,0.39,95,103,9,0.532,93.0,30.0


In [228]:
print marcel_hitters.isnull().sum()

AB                   0
AVG                  0
Abbr_Name            0
HR                   0
Name                 0
Pos Summary          0
Prediction_Season    0
R                    0
RBI                  0
SB                   0
Season               0
actual_2B            0
actual_3B            0
actual_AB            0
actual_AVG           0
actual_BB            0
actual_HR            0
actual_OBP           0
actual_R             0
actual_RBI           0
actual_SB            0
actual_SLG           0
actual_SO            0
actual_age           0
dtype: int64


In [229]:
marcel_pitchers.head()

Unnamed: 0,Abbr_Name,Age,ERA,IP,K,Name,Season,W,WHIP,actual_BB,actual_BF,actual_ERA,actual_FIP,actual_H,actual_IP,actual_K,actual_SV,actual_W,actual_WHIP
0,t. lincecum,26.0,2.88,195.0,218,tim lincecum,2010,13,1.138462,76.0,897.0,3.43,3.15,194.0,212.1,231,0,16,1.272
1,c. carpenter,35.0,3.01,157.0,115,chris carpenter,2010,12,1.152866,63.0,969.0,3.22,3.69,214.0,235.0,179,0,16,1.179
2,a. bailey,26.0,3.09,67.0,63,andrew bailey,2010,4,1.119403,13.0,189.0,1.47,2.96,34.0,49.0,42,25,1,0.959
3,j. broxton,26.0,3.09,70.0,83,jonathan broxton,2010,5,1.157143,28.0,271.0,4.04,3.01,64.0,62.1,73,22,5,1.476
4,m. adams,31.0,3.15,50.0,48,mike adams,2010,2,1.14,23.0,268.0,1.76,2.31,48.0,66.2,73,0,4,1.065


In [230]:
print marcel_pitchers.isnull().sum()

Abbr_Name      0
Age            0
ERA            0
IP             0
K              0
Name           0
Season         0
W              0
WHIP           0
actual_BB      0
actual_BF      0
actual_ERA     0
actual_FIP     0
actual_H       0
actual_IP      0
actual_K       0
actual_SV      0
actual_W       0
actual_WHIP    0
dtype: int64


Here is a table of some basic summary statistics for the Marcel hitters data, followed by the same for the Marcel pitchers data. It can also be seen from the summary statistics that nothing seems out of range, and the values reported make sense given the context.

In [231]:
marcel_hitters.describe()

Unnamed: 0,AB,AVG,HR,Prediction_Season,R,RBI,SB,Season,actual_2B,actual_3B,...,actual_AVG,actual_BB,actual_HR,actual_OBP,actual_R,actual_RBI,actual_SB,actual_SLG,actual_SO,actual_age
count,1871.0,1871.0,1871.0,1871.0,1871.0,1871.0,1871.0,1871.0,1871.0,1871.0,...,1871.0,1871.0,1871.0,1871.0,1871.0,1871.0,1871.0,1871.0,1871.0,1871.0
mean,400.636024,0.264954,12.459647,2013.490647,53.72047,51.006948,7.979156,2012.490647,21.86852,2.265099,...,0.259387,38.706574,12.680385,0.324669,54.342598,52.334046,7.714057,0.408976,85.894174,28.95938
std,105.615495,0.019227,6.645199,1.702717,17.546833,18.837681,7.421539,1.702717,9.543081,2.390323,...,0.031349,20.161645,9.13139,0.035828,23.408388,24.950885,9.838591,0.069251,34.362779,3.884352
min,177.0,0.204698,2.0,2011.0,19.0,17.0,1.0,2010.0,2.0,0.0,...,0.146,4.0,0.0,0.174,4.0,8.0,0.0,0.187,21.0,20.0
25%,320.0,0.251424,7.0,2012.0,40.0,36.0,3.0,2011.0,14.0,0.0,...,0.239,24.0,6.0,0.301,35.5,32.0,1.0,0.361,60.0,26.0
50%,418.0,0.264916,11.0,2014.0,54.0,50.0,5.0,2013.0,21.0,2.0,...,0.259,35.0,11.0,0.323,52.0,48.0,4.0,0.403,82.0,29.0
75%,491.0,0.277039,17.0,2015.0,67.0,65.0,10.0,2014.0,28.0,3.0,...,0.281,50.0,18.0,0.347,72.0,69.0,11.0,0.453,106.0,32.0
max,593.0,0.329218,39.0,2016.0,104.0,119.0,51.0,2015.0,55.0,16.0,...,0.359,143.0,54.0,0.474,136.0,139.0,68.0,0.649,222.0,43.0


In [232]:
marcel_pitchers.describe()

Unnamed: 0,Age,ERA,IP,K,Season,W,WHIP,actual_BB,actual_BF,actual_ERA,actual_FIP,actual_H,actual_IP,actual_K,actual_SV,actual_W,actual_WHIP
count,2354.0,2354.0,2354.0,2354.0,2354.0,2354.0,2354.0,2354.0,2354.0,2354.0,2354.0,2354.0,2354.0,2354.0,2354.0,2354.0,2354.0
mean,28.781223,3.893755,91.398343,75.630416,2012.525064,5.287596,1.310539,31.328802,405.76763,3.891619,3.908008,92.197961,95.849618,80.121071,3.07859,5.53441,1.30951
std,3.998424,0.514722,50.613519,42.257817,1.700606,3.420166,0.094917,19.083233,253.188384,1.293373,0.941681,60.344611,60.833606,52.789,8.898122,4.590207,0.225391
min,20.0,2.21,25.0,18.0,2010.0,1.0,0.943683,3.0,90.0,0.6,0.78,11.0,25.1,7.0,0.0,0.0,0.565
25%,26.0,3.52,55.0,45.0,2011.0,3.0,1.251694,17.0,217.0,3.0025,3.28,45.0,51.0,41.0,0.0,2.0,1.16325
50%,28.0,3.9,68.0,61.0,2013.0,4.0,1.313433,26.0,285.0,3.75,3.85,64.0,68.1,63.0,0.0,4.0,1.298
75%,31.0,4.24,137.0,103.0,2014.0,8.0,1.37219,43.0,634.0,4.61,4.47,148.75,147.075,109.0,1.0,8.0,1.44
max,49.0,5.58,209.0,218.0,2015.0,17.0,1.655914,105.0,1009.0,10.69,8.57,262.0,251.0,301.0,51.0,24.0,2.357


As mentioned above, we will now go through similar processes for the remaining five sources of projection data, but in considerably less detail.

### FanGraphs
FanGraphs is a company-run website that provides historical major and minor league baseball statistics, analysis, and projections. Below, we scrape their projections from each season of interest.

In [233]:
# ========================== CREATE DATAFRAMES OF FANGRAPHS PITCHER & HITTER PROJECTIONS ==============================

# Set up our hitter and pitcher pandas DataFrames for FanGraphs method
fangraphs_hitters = pd.DataFrame()
fangraphs_pitchers = pd.DataFrame()

# for each year of projections read in the csv file and append it to the appropriate df
ID=[2010, 2011, 2012, 2013, 2014, 2015]
for i in ID:
    df = pd.read_csv('CSV_files/fangraphs/fans_hitters_{}.csv'.format(i))
    fangraphs_hitters = fangraphs_hitters.append(df, ignore_index = True)
    df2 = pd.read_csv('CSV_files/fangraphs/fans_pitchers_{}.csv'.format(i))
    fangraphs_pitchers = fangraphs_pitchers.append(df2, ignore_index = True)

# only keep statistics we are interested in 
fangraphs_hitters = fangraphs_hitters[['Name', 'AB', 'HR', 'R', 'RBI', 'SB', 'AVG', 'season']]
fangraphs_pitchers = fangraphs_pitchers[['Name', 'W', 'ERA', 'IP', 'K','SV', 'WHIP', 'season']]
fangraphs_hitters = fangraphs_hitters.rename(columns = {'season' : 'Season'})
fangraphs_pitchers = fangraphs_pitchers.rename(columns = {'season' : 'Season'})

fangraphs_pitchers['Name'] = fangraphs_pitchers['Name'].astype('str')

In [234]:
# using a function defined previously, change names to lower case for uniformity

fangraphs_pitchers['Name'] = fangraphs_pitchers['Name'].apply(lower_names)
fangraphs_hitters['Name'] = fangraphs_hitters['Name'].apply(lower_names)

fangraphs_pitchers['First'], fangraphs_pitchers['First_initial'], fangraphs_pitchers['Last'] = \
    fangraphs_pitchers['Name'].apply(split_names_first),fangraphs_pitchers['Name'].apply(split_names_first_initial),\
    fangraphs_pitchers['Name'].apply(split_names_last)
fangraphs_pitchers['Abbr_Name'] = fangraphs_pitchers['First_initial'] + ' ' + fangraphs_pitchers['Last']
fangraphs_hitters['First'], fangraphs_hitters['First_initial'], fangraphs_hitters['Last'] = \
    fangraphs_hitters['Name'].apply(split_names_first),fangraphs_hitters['Name'].apply(split_names_first_initial),\
    fangraphs_hitters['Name'].apply(split_names_last)
fangraphs_hitters['Abbr_Name'] = fangraphs_hitters['First_initial'] + ' ' + fangraphs_hitters['Last']

del fangraphs_hitters['First'], fangraphs_hitters['First_initial'], fangraphs_hitters['Last']
del fangraphs_pitchers['First'], fangraphs_pitchers['First_initial'], fangraphs_pitchers['Last']

In [235]:
# assign columns to the correct datatypes
fangraphs_hitters["AB"] = fangraphs_hitters["AB"].astype(str)
fangraphs_hitters["RBI"] = fangraphs_hitters["RBI"].astype(int)
fangraphs_hitters["R"] = fangraphs_hitters["R"].astype(int)
fangraphs_hitters["HR"] = fangraphs_hitters["HR"].astype(int)
fangraphs_hitters["SB"] = fangraphs_hitters["SB"].astype(int)
fangraphs_hitters["Season"] = fangraphs_hitters["Season"].astype(int)

fangraphs_pitchers["K"] = fangraphs_pitchers["K"].astype(int)
fangraphs_pitchers["W"] = fangraphs_pitchers["W"].astype(int)
fangraphs_pitchers["Season"] = fangraphs_pitchers["Season"].astype(int)

In [236]:
#Take a subset of the fangraphs dataframe to only include players in the actual statistics dataframes
new_df_pitch = find_names(pitchersDF, fangraphs_pitchers, season)
new_df_hit = find_names(hittersDF, fangraphs_hitters, season)

fangraphs_pitchers = new_df_pitch[0].append(new_df_pitch[3])
fangraphs_hitters = new_df_hit[0].append(new_df_hit[3])

In [237]:
fangraphs_hitters["AB"] = pd.to_numeric(fangraphs_hitters.AB)
fangraphs_hitters["AB"] = fangraphs_hitters["AB"].astype(int)

### Steamer
Steamer Projections provides statistical projections for major league baseball players. Below, we will obtain their results for each of our seasons of interest.

In [238]:
# =========================== CREATE DATAFRAMES OF STEAMER PITCHER & HITTER PROJECTIONS ==============================

# Set up our hitter and pitcher Pandas DataFrames for Steamer method
steamer_hitters = pd.DataFrame()
steamer_pitchers = pd.DataFrame()

#for each year of projections read in the csv file and append it to the apprpriate df

ID=[2010, 2011, 2012, 2013, 2014, 2015]
for i in ID:
    df = pd.read_csv('CSV_files/steamer/steamer_hitters_{}.csv'.format(i))
    steamer_hitters = steamer_hitters.append(df, ignore_index = True)
    df2 = pd.read_csv('CSV_files/steamer/steamer_pitchers_{}.csv'.format(i))
    steamer_pitchers = steamer_pitchers.append(df2, ignore_index = True)
    
# only keep statistics we are interested in 
steamer_hitters = steamer_hitters[['Name', 'AB', 'HR', 'R', 'RBI', 'SB', 'AVG', 'season']]
steamer_pitchers = steamer_pitchers[['Name', 'W', 'ERA', 'IP', 'K','SV', 'WHIP', 'season']]
steamer_hitters = steamer_hitters.rename(columns = {'season' : 'Season'})
steamer_pitchers = steamer_pitchers.rename(columns = {'season' : 'Season'})

In [239]:
# using a function defined previously, change names to lower case for uniformity
steamer_pitchers['Name'] = steamer_pitchers['Name'].apply(lower_names)
steamer_hitters['Name'] = steamer_hitters['Name'].apply(lower_names)

steamer_pitchers['First'], steamer_pitchers['First_initial'], steamer_pitchers['Last'] = \
    steamer_pitchers['Name'].apply(split_names_first),steamer_pitchers['Name'].apply(split_names_first_initial),\
    steamer_pitchers['Name'].apply(split_names_last)
steamer_pitchers['Abbr_Name'] = steamer_pitchers['First_initial'] + ' ' + steamer_pitchers['Last']
steamer_hitters['First'], steamer_hitters['First_initial'], steamer_hitters['Last'] = \
    steamer_hitters['Name'].apply(split_names_first),steamer_hitters['Name'].apply(split_names_first_initial),\
    steamer_hitters['Name'].apply(split_names_last)
steamer_hitters['Abbr_Name'] = steamer_hitters['First_initial'] + ' ' + steamer_hitters['Last']
del steamer_hitters['First'], steamer_hitters['First_initial'], steamer_hitters['Last']
del steamer_pitchers['First'], steamer_pitchers['First_initial'], steamer_pitchers['Last']

In [240]:
# assign columns to the correct datatypes
#steamer_hitters["AB"] = pd.to_numeric(steamer_hitters.AB)
steamer_hitters["AB"] = steamer_hitters["AB"].astype(str)
steamer_hitters["RBI"] = steamer_hitters["RBI"].astype(int)
steamer_hitters["R"] = steamer_hitters["R"].astype(int)
steamer_hitters["HR"] = steamer_hitters["HR"].astype(int)
steamer_hitters["SB"] = steamer_hitters["SB"].astype(int)
steamer_hitters["Season"] = steamer_hitters["Season"].astype(int)

steamer_pitchers["K"] = steamer_pitchers["K"].astype(int)
steamer_pitchers["W"] = steamer_pitchers["W"].astype(int)
steamer_pitchers["Season"] = steamer_pitchers["Season"].astype(int)

In [241]:
#Take a subset of the steamer dataframes to only include players in the actual statistics dataframes
new_df_pitch = find_names(pitchersDF, steamer_pitchers, season)
new_df_hit = find_names(hittersDF, steamer_hitters, season)

steamer_pitchers = new_df_pitch[0].append(new_df_pitch[3])
steamer_hitters = new_df_hit[0].append(new_df_hit[3])

### Guru
The Baseball Guru is another provider of major league baseball statistics, rankings, and forecasts. We will now scrape their projections from 2010-2016.

In [242]:
# ============================= CREATE DATAFRAMES OF GURU PITCHER & HITTER PROJECTIONS ================================

# Set up our hitter and pitcher Pandas dataframes for Guru method
guru_hitters = pd.DataFrame()
guru_pitchers = pd.DataFrame()

# for each year of projections read in the csv file and append it to the apprpriate df
ID=[2010, 2011, 2012, 2013, 2014, 2015, 2016]
for i in ID:
    df = pd.read_csv('CSV_files/guru/guru_hitters_{}.csv'.format(i))
    guru_hitters = guru_hitters.append(df, ignore_index = True)
    
    if i != 2015: # Special exception because Guru doesn't have projections for pitchers in 2015
        df2 = pd.read_csv('CSV_files/guru/guru_pitchers_{}.csv'.format(i))
        guru_pitchers = guru_pitchers.append(df2, ignore_index = True)
    else:
        pass

# only keep statistics we are interested in 
guru_hitters = guru_hitters[['Name', 'AB', 'HR', 'R', 'RBI', 'SB', 'AVG', 'season']]
guru_pitchers = guru_pitchers[['Name', 'W', 'ERA', 'IP', 'K','SV', 'WHIP', 'season']]
guru_hitters = guru_hitters.rename(columns = {'season' : 'Season'})
guru_pitchers = guru_pitchers.rename(columns = {'season' : 'Season'})

In [243]:
# drop null values
guru_hitters.dropna(inplace=True)
guru_pitchers.dropna(inplace=True)

In [244]:
# replace -'s values with 0's
guru_pitchers['SV'] = guru_pitchers['SV'].replace(".*[\-].* ","0", regex=True).astype(int)
guru_hitters['HR'] = guru_hitters['HR'].replace(".*[\-].*", "0", regex=True).astype(int)
guru_hitters['SB'] = guru_hitters['SB'].replace(".*[\-]*", "0", regex=True).astype(int)
guru_pitchers['W'] = guru_pitchers['W'].replace(".*[\-]*", "0", regex=True).astype(int)

# using a function defined previously, change names to lower case for uniformity

guru_pitchers['Name'] = guru_pitchers['Name'].apply(lower_names)
guru_hitters['Name'] = guru_hitters['Name'].apply(lower_names)

guru_pitchers['First'], guru_pitchers['First_initial'], guru_pitchers['Last'] = \
    guru_pitchers['Name'].apply(split_names_first),guru_pitchers['Name'].apply(split_names_first_initial),\
    guru_pitchers['Name'].apply(split_names_last)
guru_pitchers['Abbr_Name'] = guru_pitchers['First_initial'] + ' ' + guru_pitchers['Last']
guru_hitters['First'], guru_hitters['First_initial'], guru_hitters['Last'] = \
    guru_hitters['Name'].apply(split_names_first),guru_hitters['Name'].apply(split_names_first_initial),\
    guru_hitters['Name'].apply(split_names_last)
guru_hitters['Abbr_Name'] = guru_hitters['First_initial'] + ' ' + guru_hitters['Last']
del guru_hitters['First'], guru_hitters['First_initial'], guru_hitters['Last']
del guru_pitchers['First'], guru_pitchers['First_initial'], guru_pitchers['Last']

In [245]:
# assign columns to the correct datatypes
guru_hitters["AB"] = pd.to_numeric(guru_hitters.AB)
guru_hitters["AB"] = guru_hitters["AB"].astype(int)
guru_hitters["RBI"] = guru_hitters["RBI"].astype(int)
guru_hitters["R"] = guru_hitters["R"].astype(int)
guru_hitters["HR"] = guru_hitters["HR"].astype(int)
guru_hitters["SB"] = guru_hitters["SB"].astype(int)
guru_hitters["Season"] = guru_hitters["Season"].astype(int)

guru_pitchers["K"] = guru_pitchers["K"].astype(int)
guru_pitchers["W"] = guru_pitchers["W"].astype(int)
guru_pitchers["Season"] = guru_pitchers["Season"].astype(int)

In [246]:
#Take a subset of the fangraphs dataframe to only include players in the actual statistics dataframes
seasons = [2010, 2011, 2012, 2013, 2014, 2016]

#use the find names function defined earlier
new_df_pitch = find_names(pitchersDF, guru_pitchers, seasons)
new_df_hit = find_names(hittersDF, guru_hitters, seasons)

guru_pitchers = new_df_pitch[0].append(new_df_pitch[3])
guru_hitters = new_df_hit[0].append(new_df_hit[3])

### ESPN
ESPN is a sports television network and entertainment company that also provides analysis, projections, and rankings for collegiate and professional sports.

In [247]:
# ============================= CREATE DATAFRAMES OF ESPN PITCHER & HITTER PROJECTIONS ================================

# empty lists to append to for each year of information
espn_hitters = []
espn_pitchers = []

# append to pitchers dataframe
for ID in os.listdir('CSV_files/espn_pitchers/'):
    if ID not in '.listing':
        with open(os.path.join('./CSV_files/espn_pitchers/', ID)) as inFile:
            espn_pitchers.append(pd.read_csv(inFile))
            
# append to hitters dataframe
for ID in os.listdir('CSV_files/espn_hitters/'):
    if ID not in '.listing':
        with open(os.path.join('./CSV_files/espn_hitters/', ID)) as inFile:
            espn_hitters.append(pd.read_csv(inFile))

# concatenate master lists into dataframes            
espn_pitchers = pd.concat(espn_pitchers)
espn_hitters = pd.concat(espn_hitters)

# keep only the stats we are interested in and rename columns to match our other dataframes
espn_pitchers = espn_pitchers[['name', '    K', '    W', '   IP', '   SV', '  ERA', ' WHIP', 'season']]
espn_pitchers.columns = ['Name', 'K', 'W', 'IP', 'SV', 'ERA', 'WHIP', 'Season']
espn_hitters = espn_hitters[['Player', '   AB', 'RBI', 'R','HR', 'SB', 'AVG', 'Season']]
espn_hitters.columns = ['Name', 'AB', 'RBI', 'R','HR', 'SB', 'AVG', 'Season']
espn_hitters['Name'] = espn_hitters['Name'].astype(str)
espn_pitchers['Name'] = espn_pitchers['Name'].astype(str)

In [248]:
# convert names to all lower case letters for uniformity
espn_pitchers['Name'] = espn_pitchers['Name'].apply(lower_names)
espn_hitters['Name'] = espn_hitters['Name'].apply(lower_names)

espn_pitchers['First'], espn_pitchers['First_initial'], espn_pitchers['Last'] = \
    espn_pitchers['Name'].apply(split_names_first),espn_pitchers['Name'].apply(split_names_first_initial),\
    espn_pitchers['Name'].apply(split_names_last)
espn_pitchers['Abbr_Name'] = espn_pitchers['First_initial'] + ' ' + espn_pitchers['Last']
espn_hitters['First'], espn_hitters['First_initial'], espn_hitters['Last'] = \
    espn_hitters['Name'].apply(split_names_first),espn_hitters['Name'].apply(split_names_first_initial),\
    espn_hitters['Name'].apply(split_names_last)
espn_hitters['Abbr_Name'] = espn_hitters['First_initial'] + ' ' + espn_hitters['Last']
del espn_hitters['First'], espn_hitters['First_initial'], espn_hitters['Last']
del espn_pitchers['First'], espn_pitchers['First_initial'], espn_pitchers['Last']

In [249]:
#Take a subset of the espn dataframes to only include players in the actual statistics dataframes
new_df_pitch = find_names(pitchersDF, espn_pitchers, season)
new_df_hit = find_names(hittersDF, espn_hitters, season)

espn_pitchers = new_df_pitch[0].append(new_df_pitch[3])
espn_hitters = new_df_hit[0].append(new_df_hit[3])

In [250]:
# drop players without statistics

# we want to use IP for the 2015 season, which is the only one that included this data
espn_pitchers = espn_pitchers.replace(np.nan, -100)

#convert invalid data and drop it
espn_pitchers = espn_pitchers.replace(r'--', np.nan)
espn_pitchers = espn_pitchers.dropna()
espn_pitchers = espn_pitchers.replace(-100, np.nan)

# want to use AB for the 2015 season, which is the only one that included this data
espn_hitters = espn_hitters.replace(np.nan, -100)

# convert invalid data and drop it
espn_hitters = espn_hitters.replace(r'--', np.nan)
espn_hitters = espn_hitters.dropna()
espn_hitters = espn_hitters.replace(-100, np.nan)

In [251]:
print espn_hitters.isnull().sum()

AB                   1581
AVG                     0
Abbr_Name               0
HR                      0
Name                    0
Pos Summary             0
Prediction_Season       0
R                       0
RBI                     0
SB                      0
Season                  0
actual_2B               0
actual_3B               0
actual_AB               0
actual_AVG              0
actual_BB               0
actual_HR               0
actual_OBP              0
actual_R                0
actual_RBI              0
actual_SB               0
actual_SLG              0
actual_SO               0
actual_age              0
dtype: int64


In [252]:
# assign columns to the correct datatypes
#espn_hitters["AB"] = pd.to_numeric(espn_hitters.AB)
#espn_hitters["AB"] = espn_hitters["AB"].astype(int)
espn_hitters["AB"] = espn_hitters["AB"].astype(str)
espn_hitters["RBI"] = espn_hitters["RBI"].astype(int)
espn_hitters["R"] = espn_hitters["R"].astype(int)
espn_hitters["HR"] = espn_hitters["HR"].astype(int)
espn_hitters["SB"] = espn_hitters["SB"].astype(int)
espn_hitters["Season"] = espn_hitters["Season"].astype(int)
espn_hitters["AVG"] = espn_hitters["AVG"].astype(float)

espn_pitchers["ERA"] = espn_pitchers["ERA"].astype(float)
espn_pitchers["WHIP"] = espn_pitchers["WHIP"].astype(float)
espn_pitchers["K"] = espn_pitchers["K"].astype(int)
espn_pitchers["W"] = espn_pitchers["W"].astype(int)
espn_pitchers["Season"] = espn_pitchers["Season"].astype(int)

### CBS
CBS Sports is the sports division of the commercial broadcast television network, CBS.

In [253]:
# ============================= CREATE DATAFRAMES OF CBS PITCHER & HITTER PROJECTIONS ================================

# empty lists to append to for each year of information
cbs_hitters = []
cbs_pitchers = []

# append to pitchers dataframe
for ID in os.listdir('CSV_files/cbs_pitchers/'):
    if ID not in '.listing':
        with open(os.path.join('./CSV_files/cbs_pitchers/', ID)) as inFile:
            cbs_pitchers.append(pd.read_csv(inFile))

# append to hitters dataframe
for ID in os.listdir('CSV_files/cbs_hitters/'):
    if ID not in '.listing':
        with open(os.path.join('./CSV_files/cbs_hitters/', ID)) as inFile:
            cbs_hitters.append(pd.read_csv(inFile))
            
# concatenate master lists into dataframes            
cbs_pitchers = pd.concat(cbs_pitchers)
cbs_hitters = pd.concat(cbs_hitters)

# keep only the columns with stats we care about
cbs_pitchers = cbs_pitchers[['Name', 'SO', 'W', 'IP', 'ERA', 'WHIP', 'Season']]
cbs_pitchers.columns = ['Name', 'K', 'W', 'IP', 'ERA', 'WHIP', 'Season']
cbs_hitters = cbs_hitters[['Name', 'AB', 'RBI', 'R','HR', 'SB', 'AVG', 'Season']]
cbs_hitters.columns = ['Name', 'AB', 'RBI', 'R','HR', 'SB', 'AVG', 'Season']

cbs_hitters["Name"] = cbs_hitters["Name"].astype(str)

In [254]:
# convert names to all lower case letters
cbs_pitchers['Name'] = cbs_pitchers['Name'].apply(lower_names)
cbs_hitters['Name'] = cbs_hitters['Name'].apply(lower_names)

cbs_pitchers['First'], cbs_pitchers['First_initial'], cbs_pitchers['Last'] = \
    cbs_pitchers['Name'].apply(split_names_first),cbs_pitchers['Name'].apply(split_names_first_initial),\
    cbs_pitchers['Name'].apply(split_names_last)
cbs_pitchers['Abbr_Name'] = cbs_pitchers['First_initial'] + ' ' + cbs_pitchers['Last']
cbs_hitters['First'], cbs_hitters['First_initial'], cbs_hitters['Last'] = \
    cbs_hitters['Name'].apply(split_names_first),cbs_hitters['Name'].apply(split_names_first_initial),\
    cbs_hitters['Name'].apply(split_names_last)
cbs_hitters['Abbr_Name'] = cbs_hitters['First_initial'] + ' ' + cbs_hitters['Last']
del cbs_hitters['First'], cbs_hitters['First_initial'], cbs_hitters['Last']
del cbs_pitchers['First'], cbs_pitchers['First_initial'], cbs_pitchers['Last']

In [255]:
# assign columns to the correct datatype
cbs_hitters['AB'] = pd.to_numeric(cbs_hitters.AB)
cbs_hitters["AB"] = cbs_hitters["AB"].astype(int)
cbs_hitters["RBI"] = cbs_hitters["RBI"].astype(int)
#cbs_hitters["R"] = cbs_hitters["R"].astype(int)
cbs_hitters["HR"] = cbs_hitters["HR"].astype(int)
cbs_hitters["SB"] = cbs_hitters["SB"].astype(int)
cbs_hitters["Season"] = cbs_hitters["Season"].astype(int)

cbs_pitchers["K"] = cbs_pitchers["K"].astype(int)
cbs_pitchers["W"] = cbs_pitchers["W"].astype(int)
cbs_pitchers["Season"] = cbs_pitchers["Season"].astype(int)

In [256]:
cbs_hitters_2017 = cbs_hitters[cbs_hitters['Season']==2017]
new_df_hit_predict = find_names2(hittersDF, cbs_hitters_2017, [2017])
cbs_hitters_2017 = new_df_hit_predict[0].append(new_df_hit_predict[3])

cbs_pitchers_2017 = cbs_pitchers[cbs_pitchers['Season']==2017]
new_df_pitch_predict = find_names2(pitchersDF, cbs_pitchers_2017, [2017])
cbs_pitchers_2017 = new_df_pitch_predict[0].append(new_df_pitch_predict[3])

#Take a subset of the CBS dataframes to only include players in the actual statistics dataframes
new_df_pitch = find_names(pitchersDF, cbs_pitchers, season)
new_df_hit = find_names(hittersDF, cbs_hitters, season)

cbs_pitchers = new_df_pitch[0].append(new_df_pitch[3])
cbs_hitters = new_df_hit[0].append(new_df_hit[3])

Marcel didn't have projections for stolen bases, so we fill the missing values here.

Below we will use our function defined earlier to add columns to each of our dataframes. These columns are binary columns that have a 1 for a correct prediction and a 0 for an incorrect prediction. In the next section we will discuss "correct" predictions and "incorrect" predictions in more detail.

In [257]:
hitters_dict = {'RBI':7,'AVG':.01,'R':7, 'HR':5, 'SB':3}
pitchers_dict = {'W':2,'K':15,'ERA':.2, 'WHIP':.05}
#add the correct columns for our hitters dataframe
add_correct_column(steamer_hitters, hitters_dict)
add_correct_column(guru_hitters, hitters_dict)
add_correct_column(marcel_hitters, hitters_dict)
add_correct_column(espn_hitters, hitters_dict)
add_correct_column(cbs_hitters, hitters_dict)
add_correct_column(fangraphs_hitters, hitters_dict)

#add the correct columns for our pitchers dataframe
add_correct_column(steamer_pitchers, pitchers_dict)
add_correct_column(guru_pitchers, pitchers_dict)
add_correct_column(marcel_pitchers, pitchers_dict)
add_correct_column(espn_pitchers, pitchers_dict)
add_correct_column(cbs_pitchers, pitchers_dict)
add_correct_column(fangraphs_pitchers, pitchers_dict)

Now, we create new dataframes that we will use in our machine learning portion to try and classify correct and incorrect predictions. So, we join the previous years actual statistics to the projection statistics and then we also have the columns of binary classifications from the chunk of code above. 

In [258]:
new_df_pitch_predict = find_names2(pitchersDF, cbs_pitchers, season)
new_df_hit_predict = find_names2(hittersDF, cbs_hitters, season)
cbs_pitchers_predictions = new_df_pitch_predict[0].append(new_df_pitch_predict[3])
cbs_hitters_predictions = new_df_hit_predict[0].append(new_df_hit_predict[3])

new_df_pitch_predict = find_names2(pitchersDF, espn_pitchers, season)
new_df_hit_predict = find_names2(hittersDF, espn_hitters, season)
espn_pitchers_predictions = new_df_pitch_predict[0].append(new_df_pitch_predict[3])
espn_hitters_predictions = new_df_hit_predict[0].append(new_df_hit_predict[3])

new_df_pitch_predict = find_names2(pitchersDF, guru_pitchers, season)
new_df_hit_predict = find_names2(hittersDF, guru_hitters, season)
guru_pitchers_predictions = new_df_pitch_predict[0].append(new_df_pitch_predict[3])
guru_hitters_predictions = new_df_hit_predict[0].append(new_df_hit_predict[3])

new_df_pitch_predict = find_names2(pitchersDF, steamer_pitchers, season)
new_df_hit_predict = find_names2(hittersDF, steamer_hitters, season)
steamer_pitchers_predictions = new_df_pitch_predict[0].append(new_df_pitch_predict[3])
steamer_hitters_predictions = new_df_hit_predict[0].append(new_df_hit_predict[3])

new_df_pitch_predict = find_names2(pitchersDF, fangraphs_pitchers, season)
new_df_hit_predict = find_names2(hittersDF, fangraphs_hitters, season)
fangraphs_pitchers_predictions = new_df_pitch_predict[0].append(new_df_pitch_predict[3])
fangraphs_hitters_predictions = new_df_hit_predict[0].append(new_df_hit_predict[3])

new_df_pitch_predict = find_names2(pitchersDF, marcel_pitchers, season)
new_df_hit_predict = find_names2(hittersDF, marcel_hitters, season)
marcel_pitchers_predictions = new_df_pitch_predict[0].append(new_df_pitch_predict[3])
marcel_hitters_predictions = new_df_hit_predict[0].append(new_df_hit_predict[3])


We are going to now create a master dataframe that will posses all the data we will potentially need when it comes time to train our machine learning algorithms.

In [259]:
# ================================= CREATE MASTER DATAFRAME OF ALL PLAYER PROJECTIONS ================================


# create empty lists to append to
hitter_predictions = []
pitcher_predictions = []

# create lists of methods to loop through, one for hitters & one for pitchers
hitters_method_list = [cbs_hitters_predictions, espn_hitters_predictions, fangraphs_hitters_predictions, \
                       guru_hitters_predictions, marcel_hitters_predictions, steamer_hitters_predictions]
pitchers_method_list = [cbs_pitchers_predictions, espn_pitchers_predictions, fangraphs_pitchers_predictions,\
                        guru_pitchers_predictions, marcel_pitchers_predictions, steamer_pitchers_predictions]

# create a method column in each dataframe
for i in xrange(len(hitters_method_list)):
    hitters_method_list[i]["method"] = float(i) 
for i in xrange(len(pitchers_method_list)):
    pitchers_method_list[i]["method"] = float(i)

# loop through hitters projection dataframes
for methodDF in hitters_method_list:
    hitter_predictions.append(methodDF)
# concatenate master hitter projections list into pandas DataFrame
hitter_predictions = pd.concat(hitter_predictions)

# loop through pitchers projection dataframes
for methodDF in pitchers_method_list:
    pitcher_predictions.append(methodDF)
# concatenate master pitcher projections list into pandas DataFrame
pitcher_predictions = pd.concat(pitcher_predictions)

In [260]:
# hitters

hitter_predictions = hitter_predictions.dropna()

x_hitters = hitter_predictions[hitter_predictions.columns.difference(['Abbr_Name', 'Name', 'Pos Summary', 'Season', \
                        'correct_AVG', 'correct_HR', 'correct_R', 'correct_RBI', 'correct_SB', 'AB', 'Prediction_Season'])]

to_predict_hitters = ['correct_AVG', 'correct_HR', 'correct_R', 'correct_RBI', 'correct_SB']


# pitchers

pitcher_predictions = pitcher_predictions.dropna()

x_pitchers = pitcher_predictions[pitcher_predictions.columns.difference(['Abbr_Name', 'Name', 'Season',\
                'Prediction_Season', 'correct_ERA', 'correct_K', 'correct_W', 'correct_WHIP', 'IP', 'SV'])]

to_predict_pitchers = ['correct_ERA', 'correct_K', 'correct_W', 'correct_WHIP']

We are going to now create a 2017 dataframe that will possess all the data we will potentially need when it comes time to run our machine learning algorithms and get classifications for this season.

In [261]:
# ================================= CREATE MASTER DATAFRAME OF ALL PLAYER PROJECTIONS ================================


# create empty lists to append to
hitter_predictions = []
pitcher_predictions = []

# create lists of methods to loop through, one for hitters & one for pitchers
hitters_method_list = [cbs_hitters_predictions, espn_hitters_predictions, fangraphs_hitters_predictions, \
                       guru_hitters_predictions, marcel_hitters_predictions, steamer_hitters_predictions]
pitchers_method_list = [cbs_pitchers_predictions, espn_pitchers_predictions, fangraphs_pitchers_predictions,\
                        guru_pitchers_predictions, marcel_pitchers_predictions, steamer_pitchers_predictions]

# create a method column in each dataframe
for i in xrange(len(hitters_method_list)):
    hitters_method_list[i]["method"] = float(i) 
for i in xrange(len(pitchers_method_list)):
    pitchers_method_list[i]["method"] = float(i)

# loop through hitters projection dataframes
for methodDF in hitters_method_list:
    hitter_predictions.append(methodDF)
# concatenate master hitter projections list into pandas DataFrame
hitter_predictions = pd.concat(hitter_predictions)

# loop through pitchers projection dataframes
for methodDF in pitchers_method_list:
    pitcher_predictions.append(methodDF)
# concatenate master pitcher projections list into pandas DataFrame
pitcher_predictions = pd.concat(pitcher_predictions)

In [262]:
# hitters

hitter_predictions = hitter_predictions.dropna()

x_hitters = hitter_predictions[hitter_predictions.columns.difference(['Abbr_Name', 'Name', 'Pos Summary', 'Season', \
                        'correct_AVG', 'correct_HR', 'correct_R', 'correct_RBI', 'correct_SB', 'AB', 'Prediction_Season'])]

to_predict_hitters = ['correct_AVG', 'correct_HR', 'correct_R', 'correct_RBI', 'correct_SB']


# pitchers

pitcher_predictions = pitcher_predictions.dropna()

x_pitchers = pitcher_predictions[pitcher_predictions.columns.difference(['Abbr_Name', 'Name', 'Season',\
                'Prediction_Season', 'correct_ERA', 'correct_K', 'correct_W', 'correct_WHIP', 'IP', 'SV'])]

to_predict_pitchers = ['correct_ERA', 'correct_K', 'correct_W', 'correct_WHIP']

## Read in & clean 2017 projections

In [263]:
# =========================== CREATE DATAFRAME OF 2017 STEAMER PITCHER & HITTER PROJECTIONS ===========================

# Set up our hitter and pitcher Pandas DataFrames for 2017 Steamer method
steamer_hitters_2017 = pd.DataFrame()
steamer_pitchers_2017 = pd.DataFrame()

# read in the csv file for 2017 and append it to the appropriate df

steamer_hitters_2017 = pd.read_csv('CSV_files/steamer/steamer_hitters_{}.csv'.format(2017))
steamer_pitchers_2017 = pd.read_csv('CSV_files/steamer/steamer_pitchers_{}.csv'.format(2017))
    
# only keep statistics we are interested in 
steamer_hitters_2017 = steamer_hitters_2017[['Name', 'AB', 'HR', 'R', 'RBI', 'SB', 'AVG', 'season']]
steamer_pitchers_2017 = steamer_pitchers_2017[['Name', 'W', 'ERA', 'IP', 'K','SV', 'WHIP', 'season']]

steamer_hitters_2017 = steamer_hitters_2017.rename(columns = {'season' : 'Season'})
steamer_pitchers_2017 = steamer_pitchers_2017.rename(columns = {'season' : 'Season'})

In [264]:
# using a function defined previously, change names to lower case for uniformity
steamer_pitchers_2017['Name'] = steamer_pitchers_2017['Name'].apply(lower_names)
steamer_hitters_2017['Name'] = steamer_hitters_2017['Name'].apply(lower_names)

steamer_pitchers_2017['First'], steamer_pitchers_2017['First_initial'], steamer_pitchers_2017['Last'] = \
    steamer_pitchers_2017['Name'].apply(split_names_first),steamer_pitchers_2017['Name'].apply\
        (split_names_first_initial),steamer_pitchers_2017['Name'].apply(split_names_last)
steamer_pitchers_2017['Abbr_Name'] = steamer_pitchers_2017['First_initial'] + ' ' + steamer_pitchers_2017['Last']
steamer_hitters_2017['First'], steamer_hitters_2017['First_initial'], steamer_hitters_2017['Last'] = \
    steamer_hitters_2017['Name'].apply(split_names_first),steamer_hitters_2017['Name'].apply\
        (split_names_first_initial),steamer_hitters_2017['Name'].apply(split_names_last)
steamer_hitters_2017['Abbr_Name'] = steamer_hitters_2017['First_initial'] + ' ' + steamer_hitters_2017['Last']
del steamer_hitters_2017['First'], steamer_hitters_2017['First_initial'], steamer_hitters_2017['Last']
del steamer_pitchers_2017['First'], steamer_pitchers_2017['First_initial'], steamer_pitchers_2017['Last']

In [265]:
# assign columns to the correct datatypes
#steamer_hitters["AB"] = pd.to_numeric(steamer_hitters.AB)
steamer_hitters_2017["AB"] = steamer_hitters_2017["AB"].astype(str)
steamer_hitters_2017["RBI"] = steamer_hitters_2017["RBI"].astype(int)
steamer_hitters_2017["R"] = steamer_hitters_2017["R"].astype(int)
steamer_hitters_2017["HR"] = steamer_hitters_2017["HR"].astype(int)
steamer_hitters_2017["SB"] = steamer_hitters_2017["SB"].astype(int)
steamer_hitters_2017["Season"] = steamer_hitters_2017["Season"].astype(int)

steamer_pitchers_2017["K"] = steamer_pitchers_2017["K"].astype(int)
steamer_pitchers_2017["W"] = steamer_pitchers_2017["W"].astype(int)
steamer_pitchers_2017["Season"] = steamer_pitchers_2017["Season"].astype(int)

In [266]:
#Take a subset of the steamer dataframes to only include players in the actual statistics dataframes

new_df_pitch_predict = find_names2(pitchersDF, steamer_pitchers_2017, [2017])
new_df_hit_predict = find_names2(hittersDF, steamer_hitters_2017, [2017])
steamer_pitchers_2017 = new_df_pitch_predict[0].append(new_df_pitch_predict[3])
steamer_hitters_2017 = new_df_hit_predict[0].append(new_df_hit_predict[3])

In [267]:
# ============================= CREATE DATAFRAMES OF ESPN PITCHER & HITTER PROJECTIONS ================================

# Set up our hitter and pitcher Pandas DataFrames for 2017 ESPN method
espn_hitters_2017 = pd.DataFrame()
espn_pitchers_2017 = pd.DataFrame()

# read in the csv file for 2017 and append it to the appropriate df

espn_hitters_2017 = pd.read_csv('CSV_files/espn_hitters/espn_hitters_{}.csv'.format(2017))
espn_pitchers_2017 = pd.read_csv('CSV_files/espn_pitchers/espn_pitchers_{}.csv'.format(2017))

# keep only the stats we are interested in and rename columns to match our other dataframes
espn_pitchers_2017 = espn_pitchers_2017[['name', '    K', '    W', '   IP', '   SV', '  ERA', ' WHIP', 'season']]
espn_pitchers_2017.columns = ['Name', 'K', 'W', 'IP', 'SV', 'ERA', 'WHIP', 'Season']
espn_hitters_2017 = espn_hitters_2017[['Player', '   AB', 'RBI', 'R','HR', 'SB', 'AVG', 'Season']]
espn_hitters_2017.columns = ['Name', 'AB', 'RBI', 'R','HR', 'SB', 'AVG', 'Season']

In [268]:
# convert names to all lower case letters for uniformity
espn_pitchers_2017['Name'] = espn_pitchers_2017['Name'].apply(lower_names)
espn_hitters_2017['Name'] = espn_hitters_2017['Name'].apply(lower_names)

espn_pitchers_2017['First'], espn_pitchers_2017['First_initial'], espn_pitchers_2017['Last'] = \
    espn_pitchers_2017['Name'].apply(split_names_first),espn_pitchers_2017['Name'].apply(split_names_first_initial),\
    espn_pitchers_2017['Name'].apply(split_names_last)
espn_pitchers_2017['Abbr_Name'] = espn_pitchers_2017['First_initial'] + ' ' + espn_pitchers_2017['Last']
espn_hitters_2017['First'], espn_hitters_2017['First_initial'], espn_hitters_2017['Last'] = \
    espn_hitters_2017['Name'].apply(split_names_first),espn_hitters_2017['Name'].apply(split_names_first_initial),\
    espn_hitters_2017['Name'].apply(split_names_last)
espn_hitters_2017['Abbr_Name'] = espn_hitters_2017['First_initial'] + ' ' + espn_hitters_2017['Last']
del espn_hitters_2017['First'], espn_hitters_2017['First_initial'], espn_hitters_2017['Last']
del espn_pitchers_2017['First'], espn_pitchers_2017['First_initial'], espn_pitchers_2017['Last']

In [269]:
#Take a subset of the espn dataframes to only include players in the actual statistics dataframes
new_df_pitch_predict = find_names2(pitchersDF, espn_pitchers_2017, [2017])
new_df_hit_predict = find_names2(hittersDF, espn_hitters_2017, [2017])

espn_pitchers_2017 = new_df_pitch_predict[0].append(new_df_pitch_predict[3])
espn_hitters_2017 = new_df_hit_predict[0].append(new_df_hit_predict[3])

In [270]:
# drop players without statistics

# we want to use IP for the 2015 season, which is the only one that included this data
espn_pitchers_2017 = espn_pitchers_2017.replace(np.nan, -100)

#convert invalid data and drop it
espn_pitchers_2017 = espn_pitchers_2017.replace(r'--', np.nan)
espn_pitchers_2017 = espn_pitchers_2017.dropna()
espn_pitchers_2017 = espn_pitchers_2017.replace(-100, np.nan)

espn_hitters_2017 = espn_hitters_2017.replace(np.nan, -100)

# convert invalid data and drop it
espn_hitters_2017 = espn_hitters_2017.replace(r'--', np.nan)
espn_hitters_2017 = espn_hitters_2017.dropna()
espn_hitters_2017 = espn_hitters_2017.replace(-100, np.nan)

In [271]:
# assign columns to the correct datatypes
#espn_hitters["AB"] = pd.to_numeric(espn_hitters.AB)
#espn_hitters["AB"] = espn_hitters["AB"].astype(int)
espn_hitters_2017["AB"] = espn_hitters_2017["AB"].astype(str)
espn_hitters_2017["RBI"] = espn_hitters_2017["RBI"].astype(int)
espn_hitters_2017["R"] = espn_hitters_2017["R"].astype(int)
espn_hitters_2017["HR"] = espn_hitters_2017["HR"].astype(int)
espn_hitters_2017["SB"] = espn_hitters_2017["SB"].astype(int)
espn_hitters_2017["Season"] = espn_hitters_2017["Season"].astype(int)
espn_hitters_2017["AVG"] = espn_hitters_2017["AVG"].astype(float)

espn_pitchers_2017["IP"] = espn_pitchers_2017["IP"].astype(str)
espn_pitchers_2017["ERA"] = espn_pitchers_2017["ERA"].astype(float)
espn_pitchers_2017["WHIP"] = espn_pitchers_2017["WHIP"].astype(float)
espn_pitchers_2017["K"] = espn_pitchers_2017["K"].astype(int)
espn_pitchers_2017["W"] = espn_pitchers_2017["W"].astype(int)
espn_pitchers_2017["Season"] = espn_pitchers_2017["Season"].astype(int)

In [272]:
#### Fangraphs 2017
# ========================== CREATE DATAFRAMES OF FANGRAPHS PITCHER & HITTER PROJECTIONS ==============================

# Set up our hitter and pitcher pandas DataFrames for FanGraphs method
fangraphs_hitters_2017 = pd.DataFrame()
fangraphs_pitchers_2017 = pd.DataFrame()

# for each year of projections read in the csv file and append it to the appropriate df
df = pd.read_csv('CSV_files/fangraphs/fans_hitters_{}.csv'.format(2017))
fangraphs_hitters_2017 = fangraphs_hitters_2017.append(df, ignore_index = True)
df2 = pd.read_csv('CSV_files/fangraphs/fans_pitchers_{}.csv'.format(2017))
fangraphs_pitchers_2017 = fangraphs_pitchers_2017.append(df2, ignore_index = True)

# only keep statistics we are interested in 
fangraphs_hitters_2017 = fangraphs_hitters_2017[['Name', 'AB', 'HR', 'R', 'RBI', 'SB', 'AVG', 'season']]
fangraphs_pitchers_2017 = fangraphs_pitchers_2017[['Name', 'W', 'ERA', 'IP', 'K','SV', 'WHIP', 'season']]
fangraphs_hitters_2017 = fangraphs_hitters_2017.rename(columns = {'season' : 'Season'})
fangraphs_pitchers_2017 = fangraphs_pitchers_2017.rename(columns = {'season' : 'Season'})


fangraphs_hitters_2017.shape


# using a function defined previously, change names to lower case for uniformity

fangraphs_pitchers_2017['Name'] = fangraphs_pitchers_2017['Name'].apply(lower_names)
fangraphs_hitters_2017['Name'] = fangraphs_hitters_2017['Name'].apply(lower_names)

fangraphs_pitchers_2017['First'], fangraphs_pitchers_2017['First_initial'], fangraphs_pitchers_2017['Last'] = \
    fangraphs_pitchers_2017['Name'].apply(split_names_first),fangraphs_pitchers_2017['Name'].apply(split_names_first_initial),\
    fangraphs_pitchers_2017['Name'].apply(split_names_last)
fangraphs_pitchers_2017['Abbr_Name'] = fangraphs_pitchers_2017['First_initial'] + ' ' + fangraphs_pitchers_2017['Last']
fangraphs_hitters_2017['First'], fangraphs_hitters_2017['First_initial'], fangraphs_hitters_2017['Last'] = \
    fangraphs_hitters_2017['Name'].apply(split_names_first),fangraphs_hitters_2017['Name'].apply(split_names_first_initial),\
    fangraphs_hitters_2017['Name'].apply(split_names_last)
fangraphs_hitters_2017['Abbr_Name'] = fangraphs_hitters_2017['First_initial'] + ' ' + fangraphs_hitters_2017['Last']

del fangraphs_hitters_2017['First'], fangraphs_hitters_2017['First_initial'], fangraphs_hitters_2017['Last']
del fangraphs_pitchers_2017['First'], fangraphs_pitchers_2017['First_initial'], fangraphs_pitchers_2017['Last']


fangraphs_hitters_2017.shape


# assign columns to the correct datatypes
fangraphs_hitters_2017["AB"] = fangraphs_hitters_2017["AB"].astype(str)
fangraphs_hitters_2017["RBI"] = fangraphs_hitters_2017["RBI"].astype(int)
fangraphs_hitters_2017["R"] = fangraphs_hitters_2017["R"].astype(int)
fangraphs_hitters_2017["HR"] = fangraphs_hitters_2017["HR"].astype(int)
fangraphs_hitters_2017["SB"] = fangraphs_hitters_2017["SB"].astype(int)
fangraphs_hitters_2017["Season"] = fangraphs_hitters_2017["Season"].astype(int)

fangraphs_pitchers_2017["K"] = fangraphs_pitchers_2017["K"].astype(int)
fangraphs_pitchers_2017["W"] = fangraphs_pitchers_2017["W"].astype(int)
fangraphs_pitchers_2017["Season"] = fangraphs_pitchers_2017["Season"].astype(int)

#Take a subset of the fangraphs dataframe to only include players in the actual statistics dataframes
new_df_pitch_predict_2017 = find_names2(pitchersDF, fangraphs_pitchers_2017, [2017])
new_df_hit_predict_2017 = find_names2(hittersDF, fangraphs_hitters_2017, [2017])
fangraphs_pitchers_2017 = new_df_pitch_predict_2017[0].append(new_df_pitch_predict_2017[3])
fangraphs_hitters_2017 = new_df_hit_predict_2017[0].append(new_df_hit_predict_2017[3])

fangraphs_hitters_2017["AB"] = pd.to_numeric(fangraphs_hitters_2017.AB)
fangraphs_hitters_2017["AB"] = fangraphs_hitters_2017["AB"].astype(int)

In [273]:
# ============================= CREATE DATAFRAMES OF GURU PITCHER & HITTER PROJECTIONS ================================

# Set up our hitter and pitcher Pandas dataframes for Guru method
guru_hitters_2017 = pd.DataFrame()
guru_pitchers_2017 = pd.DataFrame()

# for each year of projections read in the csv file and append it to the apprpriate df
ID=[2017]
for i in ID:
    df = pd.read_csv('CSV_files/guru/guru_hitters_{}.csv'.format(i))
    guru_hitters_2017 = guru_hitters_2017.append(df, ignore_index = True)
    
    if i != 2015: # Special exception because Guru doesn't have projections for pitchers in 2015
        df2 = pd.read_csv('CSV_files/guru/guru_pitchers_{}.csv'.format(i))
        guru_pitchers_2017 = guru_pitchers_2017.append(df2, ignore_index = True)
    else:
        pass

# only keep statistics we are interested in 
guru_hitters_2017 = guru_hitters_2017[['Name', 'AB', 'HR', 'R', 'RBI', 'SB', 'AVG', 'season']]
guru_pitchers_2017 = guru_pitchers_2017[['Name', 'W', 'ERA', 'IP', 'K','SV', 'WHIP', 'season']]
guru_hitters_2017 = guru_hitters_2017.rename(columns = {'season' : 'Season'})
guru_pitchers_2017 = guru_pitchers_2017.rename(columns = {'season' : 'Season'})

# drop null values
guru_hitters_2017.dropna(inplace=True)
guru_pitchers_2017.dropna(inplace=True)

# replace -'s values with 0's
guru_pitchers_2017['SV'] = guru_pitchers_2017['SV'].replace(".*[\-].* ","0", regex=True).astype(int)
guru_hitters_2017['HR'] = guru_hitters_2017['HR'].replace(".*[\-].*", "0", regex=True).astype(int)
guru_hitters_2017['SB'] = guru_hitters_2017['SB'].replace(".*[\-]*", "0", regex=True).astype(int)
guru_pitchers_2017['W'] = guru_pitchers_2017['W'].replace(".*[\-]*", "0", regex=True).astype(int)

# using a function defined previously, change names to lower case for uniformity

guru_pitchers_2017['Name'] = guru_pitchers_2017['Name'].apply(lower_names)
guru_hitters_2017['Name'] = guru_hitters_2017['Name'].apply(lower_names)

guru_pitchers_2017['First'], guru_pitchers_2017['First_initial'], guru_pitchers_2017['Last'] = \
    guru_pitchers_2017['Name'].apply(split_names_first),guru_pitchers_2017['Name'].apply(split_names_first_initial),\
    guru_pitchers_2017['Name'].apply(split_names_last)
guru_pitchers_2017['Abbr_Name'] = guru_pitchers_2017['First_initial'] + ' ' + guru_pitchers_2017['Last']
guru_hitters_2017['First'], guru_hitters_2017['First_initial'], guru_hitters_2017['Last'] = \
    guru_hitters_2017['Name'].apply(split_names_first),guru_hitters_2017['Name'].apply(split_names_first_initial),\
    guru_hitters_2017['Name'].apply(split_names_last)
guru_hitters_2017['Abbr_Name'] = guru_hitters_2017['First_initial'] + ' ' + guru_hitters_2017['Last']
del guru_hitters_2017['First'], guru_hitters_2017['First_initial'], guru_hitters_2017['Last']
del guru_pitchers_2017['First'], guru_pitchers_2017['First_initial'], guru_pitchers_2017['Last']

# assign columns to the correct datatypes
guru_hitters_2017["AB"] = pd.to_numeric(guru_hitters_2017.AB)
guru_hitters_2017["AB"] = guru_hitters_2017["AB"].astype(int)
guru_hitters_2017["RBI"] = guru_hitters_2017["RBI"].astype(int)
guru_hitters_2017["R"] = guru_hitters_2017["R"].astype(int)
guru_hitters_2017["HR"] = guru_hitters_2017["HR"].astype(int)
guru_hitters_2017["SB"] = guru_hitters_2017["SB"].astype(int)
guru_hitters_2017["Season"] = guru_hitters_2017["Season"].astype(int)

guru_pitchers_2017["K"] = guru_pitchers_2017["K"].astype(int)
guru_pitchers_2017["W"] = guru_pitchers_2017["W"].astype(int)
guru_pitchers_2017["Season"] = guru_pitchers_2017["Season"].astype(int)

#Take a subset of the guru dataframe to only include players in the actual statistics dataframes
seasons = [2017]

#use the find names function defined earlier
new_df_pitch = find_names2(pitchersDF, guru_pitchers_2017, seasons)
new_df_hit = find_names2(hittersDF, guru_hitters_2017, seasons)

guru_pitchers_2017 = new_df_pitch[0].append(new_df_pitch[3])
guru_hitters_2017 = new_df_hit[0].append(new_df_hit[3])

In [274]:
list_sites = [espn_hitters_2017, fangraphs_hitters_2017, guru_hitters_2017, cbs_hitters_2017, steamer_hitters_2017]
j = 0

for i in marcel_hitters_2017['Name']:

    rand = np.random.randint(0,5)
    if i in list_sites[rand]['Name'].values:
        new_sb = list_sites[rand][list_sites[rand]['Name'] == i]['SB'].values[0]
        marcel_hitters_2017.set_value(j, 'SB', new_sb)
        j+=1
        j=j%307
    elif i in list_sites[(rand+1)%len(list_sites)]['Name'].values:
        rand += 1
        rand = rand%len(list_sites)
        new_sb = list_sites[rand][list_sites[rand]['Name'] == i]['SB'].values[0]
        marcel_hitters_2017.set_value(j, 'SB', new_sb)
        j+=1
        j=j%307
    else:
        marcel_hitters_2017.set_value(j, 'SB', \
                        hittersDF[(hittersDF['Name']=='jose abreu') & (hittersDF['Season']==2016)]['actual_SB'].values[0])
        j+=1
        j=j%307

After cleaning all of our dataframes we double checked that all the appropriate data was included, no unintended missing values or wrong values, and no duplicates. We now have a clean hitters and pitchers dataframe for the 6 prediction methods for 2010-2015, and a hitters and pitchers dataframe for the actual statistics of the years 2010-2016. 

In [275]:
# ================================= CREATE MASTER DATAFRAME OF 2017 PLAYER PROJECTIONS ================================

# create empty lists to append to
hitter_predictions_2017 = []
pitcher_predictions_2017 = []

# create lists of methods to loop through, one for hitters & one for pitchers
hitters2017_method_list = [cbs_hitters_2017, espn_hitters_2017, fangraphs_hitters_2017, \
                       guru_hitters_2017, marcel_hitters_2017, steamer_hitters_2017]
pitchers2017_method_list = [cbs_pitchers_2017, espn_pitchers_2017, fangraphs_pitchers_2017,\
                        guru_pitchers_2017, steamer_pitchers_2017]

# create a method column in each dataframe
for i in xrange(len(hitters2017_method_list)):
    hitters2017_method_list[i]["method"] = float(i+1) 
for i in xrange(len(pitchers2017_method_list)):
    if i <=4:
        pitchers2017_method_list[i]["method"] = float(i)
    else:
        pitchers2017_method_list[i]["method"] = float(i+1)

# loop through hitters projection dataframes
for methodDF in hitters2017_method_list:
    hitter_predictions_2017.append(methodDF)
# concatenate master hitter projections list into pandas DataFrame
hitter_predictions_2017 = pd.concat(hitter_predictions_2017)

# loop through pitchers projection dataframes
for methodDF in pitchers2017_method_list:
    pitcher_predictions_2017.append(methodDF)
# concatenate master pitcher projections list into pandas DataFrame
pitcher_predictions_2017 = pd.concat(pitcher_predictions_2017)

In [276]:
# hitters

hitter_predictions_2017 = hitter_predictions_2017.dropna()

x_hitters2017 = hitter_predictions_2017[hitter_predictions_2017.columns.difference(['Abbr_Name', 'Name', 'Pos Summary',\
                        'Season', 'correct_AVG', 'correct_HR', 'correct_R', 'correct_RBI', 'correct_SB', 'AB', \
                        'Prediction_Season'])]

to_predict_hitters_2017 = ['correct_AVG', 'correct_HR', 'correct_R', 'correct_RBI', 'correct_SB']


# pitchers

pitcher_predictions_2017 = pitcher_predictions_2017.dropna()

x_pitchers2017 = pitcher_predictions_2017[pitcher_predictions_2017.columns.difference(['Abbr_Name', 'Name', 'Season',\
                'Prediction_Season', 'correct_ERA', 'correct_K', 'correct_W', 'correct_WHIP', 'IP', 'SV'])]

to_predict_pitchers_2017 = ['correct_ERA', 'correct_K', 'correct_W', 'correct_WHIP']

# Write cleaned data to CSVs

In [277]:
# this will write the data frames below to csv files with the same name as the dataframes
pd.DataFrame.to_csv(hittersDF, 'CSV_files/cleanCSV/hittersDF.csv')
pd.DataFrame.to_csv(pitchersDF, 'CSV_files/cleanCSV/pitchersDF.csv')
pd.DataFrame.to_csv(espn_hitters, 'CSV_files/cleanCSV/espn_hitters.csv')
pd.DataFrame.to_csv(espn_pitchers, 'CSV_files/cleanCSV/espn_pitchers.csv')
pd.DataFrame.to_csv(fangraphs_hitters, 'CSV_files/cleanCSV/fangraphs_hitters.csv')
pd.DataFrame.to_csv(fangraphs_pitchers, 'CSV_files/cleanCSV/fangraphs_pitchers.csv')
pd.DataFrame.to_csv(guru_hitters, 'CSV_files/cleanCSV/guru_hitters.csv')
pd.DataFrame.to_csv(guru_pitchers, 'CSV_files/cleanCSV/guru_pitchers.csv')
pd.DataFrame.to_csv(marcel_hitters, 'CSV_files/cleanCSV/marcel_hitters.csv')
pd.DataFrame.to_csv(marcel_pitchers, 'CSV_files/cleanCSV/marcel_pitchers.csv')
pd.DataFrame.to_csv(cbs_hitters, 'CSV_files/cleanCSV/cbs_hitters.csv')
pd.DataFrame.to_csv(cbs_pitchers, 'CSV_files/cleanCSV/cbs_pitchers.csv')
pd.DataFrame.to_csv(steamer_hitters, 'CSV_files/cleanCSV/steamer_hitters.csv')
pd.DataFrame.to_csv(steamer_pitchers, 'CSV_files/cleanCSV/steamer_pitchers.csv')
pd.DataFrame.to_csv(hitter_predictions, 'CSV_files/cleanCSV/hitter_predictions.csv')
pd.DataFrame.to_csv(pitcher_predictions, 'CSV_files/cleanCSV/pitcher_predictions.csv')
pd.DataFrame.to_csv(hitter_predictions_2017, 'CSV_files/cleanCSV/hitter_predictions_2017.csv')
pd.DataFrame.to_csv(pitcher_predictions_2017, 'CSV_files/cleanCSV/pitcher_predictions_2017.csv')
pd.DataFrame.to_csv(x_hitters, 'CSV_files/cleanCSV/x_hitters.csv')
pd.DataFrame.to_csv(x_pitchers, 'CSV_files/cleanCSV/x_pitchers.csv')
pd.DataFrame.to_csv(x_hitters2017, 'CSV_files/cleanCSV/x_hitters_2017.csv')
pd.DataFrame.to_csv(x_pitchers2017, 'CSV_files/cleanCSV/x_pitchers_2017.csv')

In [278]:
pd.DataFrame.to_csv(steamer_pitchers_2017, 'CSV_files/cleanCSV/steamer_pitchers_2017.csv')
pd.DataFrame.to_csv(steamer_hitters_2017, 'CSV_files/cleanCSV/steamer_hitters_2017.csv')
pd.DataFrame.to_csv(espn_hitters_2017, 'CSV_files/cleanCSV/espn_hitters_2017.csv')
pd.DataFrame.to_csv(espn_pitchers_2017, 'CSV_files/cleanCSV/espn_pitchers_2017.csv')

pd.DataFrame.to_csv(fangraphs_hitters_2017, 'CSV_files/cleanCSV/fangraphs_hitters_2017.csv')
pd.DataFrame.to_csv(fangraphs_pitchers_2017, 'CSV_files/cleanCSV/fangraphs_pitchers_2017.csv')

pd.DataFrame.to_csv(marcel_hitters_2017, 'CSV_files/cleanCSV/marcel_hitters_2017.csv')
#note: we don't have marcel pitchers

pd.DataFrame.to_csv(guru_hitters_2017, 'CSV_files/cleanCSV/guru_hitters_2017.csv')
pd.DataFrame.to_csv(guru_pitchers_2017, 'CSV_files/cleanCSV/guru_pitchers_2017.csv')

pd.DataFrame.to_csv(cbs_hitters_2017, 'CSV_files/cleanCSV/cbs_hitters_2017.csv')
pd.DataFrame.to_csv(cbs_pitchers_2017, 'CSV_files/cleanCSV/cbs_pitchers_2017.csv')