In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from statsmodels.formula import api as sm
import pickle
%matplotlib inline

In [2]:
# Read json file of pitcher stats into a DF.
k_df = pd.read_json('pitcher_stats.json')

# The raw data has 14,576 rows and 14 columns.
k_df.shape

# The columns are ['age', 'era', 'ip', 'losses', 'player_name', 'position', 'salary', 'so', 'so9', 'team', 'war', 'win_loss_perc', 'wins', 'year']
k_df.columns

Index(['age', 'era', 'ip', 'losses', 'player_name', 'position', 'salary', 'so',
       'so9', 'team', 'war', 'win_loss_perc', 'wins', 'year'],
      dtype='object')

In [3]:
# Salary is object data type.  Need to convert that into an integer value.
# Strip '$', get rid of commas, and convert to integer.

k_df.salary = k_df.salary.apply(lambda x: int(''.join(x.strip('$ ').split(','))) if x else None)

In [4]:
# Some players from the early 20th century got scraped by mistake.
# Get rid of any player-years that are not from 2000 or later.

k_df.drop(k_df[k_df.year < 2000].index, inplace=True)

In [5]:
# 43% of the player-years don't have salary information.
# I will drop those years, though I may want to come back to this later
# and potentially make some assumptions.  E.g. that a salary from a previous
# player-year can be extrapolated forward to another player-year without a salary record.

k_df = k_df.dropna(subset=['salary'])

In [6]:
# Since I am hypothesizing that salary follows a lognormal distribution,
# similar to the distribution of income in the general economy,
# I want to include a log_salary column.

k_df['log_salary'] = np.log(k_df.salary)

In [7]:
# Due to team changes, some team name values need to be fixed.
# E.g. FLA needs to be changed to MIA.

replacements = {
                "FLA": "MIA",
                "MON": "WSN",
                "ANA": "LAA",
                "TBD": "TBR",
                }

k_df = k_df.replace({'team': replacements})

In [8]:
# Reset index to prepare for split-apply-combine operation.

k_df = k_df.reset_index(drop=True)

In [9]:
# Group by player, and create next_year_salary column with a shift.

k_df[['next_year_salary', 'next_year_log_salary']] = k_df.groupby('player_name')[['salary', 'log_salary']].apply(lambda x: x.shift(1))

In [10]:
# Data on league minimum by year, in dictionary format.
# These actually correspond to NEXT years league min in the given year.

lm_dict = {
            2000: 300000,
            2001: 300000,
            2002: 300000,
            2003: 316000,
            2004: 327000,
            2005: 380000,
            2006: 390000,
            2007: 400000,
            2008: 400000,
            2009: 414000,
            2010: 480000,
            2011: 480000,
            2012: 480000,
            2013: 507500,
            2014: 507500,
            2015: 535000,
            2016: 545000,
            2017: 555000
            }

In [11]:
k_df['league_min'] = np.zeros(k_df.shape[0])

for year_, salary_ in lm_dict.items():
    k_df.league_min.loc[k_df.year == year_] = salary_

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [12]:
# Get amount over minimum.

k_df['salary_over_minimum'] = k_df['next_year_salary'] - k_df['league_min']

In [13]:
# Create indicator variable for players within threshold amount of league minimum.

threshold = 10000

k_df['player_at_min'] = np.zeros(k_df.shape[0])
k_df.player_at_min.loc[k_df.salary_over_minimum <= threshold] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [14]:
# Drop empty values.
k_df = k_df.dropna()

In [15]:
k_df = k_df.reset_index(drop=True)

In [19]:
### Cleaned data ready to be pickled.
### Created picklefile 03 Oct in the afternoon.

#pickle_filename = '03_oct_cleaned_pickled_pitcher_data.pkl'
#with open(pickle_filename, 'wb') as f_obj:
#    pickle.dump(k_df, f_obj)