In [22]:
import pickle
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from statsmodels.formula import api as sm
%matplotlib inline

In [23]:
# Read json data on batting stats into a DataFrame.

b_df = pd.read_json('pos_stats.json')

# The DataFrame has 15,133 rows and 13 columns.

b_df.shape

# The columns are ['age', 'avg', 'hr', 'obp', 'ops', 'player_name', 'position', 'rbi', 'salary', 'slg', 'team', 'war', 'year']

b_df.columns

Index(['age', 'avg', 'hr', 'obp', 'ops', 'player_name', 'position', 'rbi',
       'salary', 'slg', 'team', 'war', 'year'],
      dtype='object')

In [24]:
# Salary is object data type.  Need to convert that into an integer value.
# Strip '$', get rid of commas, and convert to integer.

b_df.salary = b_df.salary.apply(lambda x: int(''.join(x.strip('$ ').split(','))) if x else None)

In [25]:
# Some players from the early 20th century got scraped by mistake.
# Get rid of any player-years that are not from 2000 or later.

b_df.drop(b_df[b_df.year < 2000].index, inplace=True)

In [26]:
# 43% of the player-years don't have salary information.
# I will drop those years, though I may want to come back to this later
# and potentially make some assumptions.  E.g. that a salary from a previous
# player-year can be extrapolated forward to another player-year without a salary record.

b_df = b_df.dropna(subset=['salary'])

In [27]:
# Since I am hypothesizing that salary follows a lognormal distribution,
# similar to the distribution of income in the general economy,
# I want to include a log_salary column.

b_df['log_salary'] = np.log(b_df.salary)

In [28]:
# Due to team changes, some team name values need to be fixed.
# E.g. FLA needs to be changed to MIA.

replacements = {
                "FLA": "MIA",
                "MON": "WSN",
                "ANA": "LAA",
                "TBD": "TBR",
                }

b_df = b_df.replace({'team': replacements})

In [29]:
# Next step is to create new columns corresponding to next year's salary and log salary.
# Reset index to prepare for split-apply-combine operation.

b_df = b_df.reset_index(drop=True)

In [30]:
# Group by player, and create next_year_salary column with a shift.

b_df[['next_year_salary', 'next_year_log_salary']] = b_df.groupby('player_name')[['salary', 'log_salary']].apply(lambda x: x.shift(1))

In [31]:
# Drop the resulting null values.

b_df = b_df.dropna(subset=[['next_year_salary', 'next_year_log_salary']])

In [32]:
# There are about 30 rows with missing batting stats.  Drop them.

b_df = b_df.dropna()

In [33]:
# Data on league minimum by year, in dictionary format.
# These actually correspond to NEXT years league min in the given year.

lm_dict = {
            2000: 300000,
            2001: 300000,
            2002: 300000,
            2003: 316000,
            2004: 327000,
            2005: 380000,
            2006: 390000,
            2007: 400000,
            2008: 400000,
            2009: 414000,
            2010: 480000,
            2011: 480000,
            2012: 480000,
            2013: 507500,
            2014: 507500,
            2015: 535000,
            2016: 545000,
            2017: 555000
            }

In [34]:
b_df['league_min'] = np.zeros(b_df.shape[0])

for year_, salary_ in lm_dict.items():
    b_df.league_min.loc[b_df.year == year_] = salary_

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [35]:
# Get amount over minimum.

b_df['salary_over_minimum'] = b_df['next_year_salary'] - b_df['league_min']

In [36]:
# Create indicator variable for players within threshold amount of league minimum.

threshold = 10000

b_df['player_at_min'] = np.zeros(b_df.shape[0])
b_df.player_at_min.loc[b_df.salary_over_minimum <= threshold] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [37]:
b_df = b_df.dropna()

In [38]:
b_df = b_df.reset_index(drop=True)

In [39]:
b_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4914 entries, 0 to 4913
Data columns (total 19 columns):
age                     4914 non-null int64
avg                     4914 non-null float64
hr                      4914 non-null float64
obp                     4914 non-null float64
ops                     4914 non-null float64
player_name             4914 non-null object
position                4914 non-null object
rbi                     4914 non-null float64
salary                  4914 non-null float64
slg                     4914 non-null float64
team                    4914 non-null object
war                     4914 non-null float64
year                    4914 non-null int64
log_salary              4914 non-null float64
next_year_salary        4914 non-null float64
next_year_log_salary    4914 non-null float64
league_min              4914 non-null float64
salary_over_minimum     4914 non-null float64
player_at_min           4914 non-null float64
dtypes: float64(14), int64

In [40]:
### Cleaned data ready to be pickled.
### Created picklefile 03 Oct in the afternoon.

pickle_filename = '03_oct_cleaned_pickled_batter_data.pkl'
with open(pickle_filename, 'wb') as f_obj:
    pickle.dump(k_df, f_obj)

NameError: name 'pickle' is not defined