In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
sns.set()
import scipy.stats as st
import matplotlib.pyplot as plt

# Render every float in displayed tables with two decimal places.
pd.options.display.float_format = lambda value: '%.2f' % value

# Raw input tables, read relative to the notebook's working directory.
players = pd.read_csv('core/Master.csv')
salary = pd.read_csv('core/Salaries.csv')
teams = pd.read_csv('core/Teams.csv')

# The CPI table is parsed with pandas' round-trip float converter.
cpi_u_values = pd.read_csv('cpi_u_values.csv', float_precision="round_trip")

def percentile(num):
    """Build an aggregation callable that computes the ``num``-th percentile.

    The returned function hands its argument to ``np.percentile`` together with
    ``num``. Its ``__name__`` is set to ``percentile_<num>``, so two callables
    built with different ``num`` values carry distinct names (pandas uses a
    callable's name to label the column it produces in ``agg``).
    """
    def percentile_(values):
        # ``num`` is captured from the enclosing call.
        result = np.percentile(values, num)
        return result

    label = 'percentile_%s' % num
    percentile_.__name__ = label
    return percentile_

## Questions to Answer

1. How have baseball salaries changed over the history of MLB? In total? On average?
2. What's the correlation between performance and salary?
3. What positions garner the highest salaries?
4. How large is the wage disparity among baseball players? Is it top-heavy or even?
5. Do teams with high salaries win championships?

## What Needs to be Done

1. Need to convert the salaries DataFrame to a time-indexed DataFrame. This will let us plot salary changes over time.
2. ~~Adjust salaries to inflation.~~
3. Change plot labels to best match what we want to communicate.

In [None]:
# Per-year summary of the salary column: extremes, mean, median and quartiles.
# The reductions are given as string names rather than the NumPy callables
# (np.max, np.mean, ...): pandas maps both to the same groupby reductions, but
# recent pandas versions emit a FutureWarning for the callables.
salary_by_year = salary.groupby('yearID')['salary'].agg(
    ['max', 'mean', 'median', 'min', percentile(25), percentile(75)]
)

# One line per statistic, labelled so the figure can be read on its own.
ax = salary_by_year.plot(figsize=(19, 19))
ax.set(title='Salary statistics by year', xlabel='Year', ylabel='Salary')
plt.tight_layout()

### Question 1:
Baseball salaries seem to be top-heavy.

In [None]:
# The ten largest salary rows of every year; the result is indexed by yearID
# first, so a single year can be pulled out with .loc.
salary_by_player_year = salary.groupby(['yearID'])
top_ten_by_year = salary_by_player_year.apply(lambda season: season.nlargest(10, 'salary'))

# Gap between each year's highest salary and its median salary. After the
# descending sort the first row of every group is that year's largest salary.
# The result is bound to a name and displayed explicitly: a bare expression in
# the middle of a cell is evaluated and then silently thrown away.
sorted_salary = salary.sort_values('salary', ascending=False)
salary_gap_by_year = (
    sorted_salary
    .groupby('yearID')['salary']
    .agg(lambda season_salaries: season_salaries.iloc[0] - season_salaries.median())
)
display(salary_gap_by_year)  # `display` is provided by the IPython/Jupyter kernel

top_ten_by_year.loc[2013]

Why do some baseball players have a salary of 0?

In [None]:
# Histogram of the salary values whose yearID is 2016.
is_2016 = salary['yearID'] == 2016
salary.loc[is_2016, 'salary'].hist()

In [None]:
# Total of all salaries per year. The sum must be taken over the raw salary rows
# grouped by year: salary_by_year holds per-year summary statistics, and calling
# .sum() on it adds each statistic up across all years, which yields one value
# per statistic (not per year) and makes the "Year" axis meaningless.
total_salary_by_year = salary.groupby('yearID')['salary'].sum()

plot_salary_sum = total_salary_by_year.plot(figsize=(12, 6))
plot_salary_sum.set(xlabel='Year', ylabel="Sum of Salaries")
plt.tight_layout()

### Salary By Teams

In [None]:
# Salary pivoted to one row per yearID and one column per teamID; pivot_table
# aggregates with the mean when no aggfunc is given.
team_year_mean_salary = salary.pivot_table(
    index='yearID',
    columns='teamID',
    values='salary',
)
team_year_mean_salary

In [None]:
# Total salary per team and year. Only the salary column is summed: calling
# .sum() on the whole grouped frame also reduces any non-numeric columns under
# pandas >= 2.0 (numeric_only defaults to False there) and carries those
# meaningless columns into the merge with `teams`.
team_payroll = salary.groupby(['teamID', 'yearID'], as_index=False)['salary'].sum()
team_salary = teams.merge(team_payroll, on=['teamID', 'yearID'])

# One row per (yearID, teamID, W) with the summed salary.
# NOTE(review): W is presumably the team's win count -- confirm against Teams.csv.
team_salary_by_year = (
    team_salary
    .groupby(['yearID', 'teamID', 'W'], as_index=False)['salary']
    .sum()
)

# Salary rank inside each year. rank() is ascending by default, so rank 1 is the
# team with the smallest salary total that year.
team_salary_by_year['rank'] = team_salary_by_year.groupby('yearID')['salary'].rank()

ax = team_salary_by_year.plot(kind='scatter', x='rank', y='W', figsize=(20, 20))
ax.set(
    title='Team salary rank vs. W',
    xlabel='Salary rank within year (1 = lowest total)',
    ylabel='W',
)
team_salary_by_year.head(5)

In [None]:
# CPI-U level every salary total is rescaled to.
# NOTE(review): 240.007 appears to be the 2016 annual CPI-U average -- confirm
# against cpi_u_values.csv.
BASE_CPI_U = 240.007

# Attach the CPI-U value of each year, keep only the columns needed here, and
# compute the adjusted salary as salary * BASE_CPI_U / cpi_u. The arithmetic is
# done on whole columns instead of a row-wise apply(axis=1), and .assign builds
# a new frame rather than writing a column into a slice of the merged frame.
cpiu_with_salary = (
    team_salary
    .merge(cpi_u_values, on="yearID")
    .loc[:, ['yearID', 'teamID', 'W', 'salary', 'cpi_u']]
    .assign(salary_adj=lambda frame: frame['salary'] * BASE_CPI_U / frame['cpi_u'])
)

ax = cpiu_with_salary.plot(kind='scatter', x='salary_adj', y='W', figsize=(20, 20))
ax.set(title='CPI-adjusted team salary vs. W', xlabel='CPI-adjusted team salary', ylabel='W');

In [None]:
# Read the CPI table again with the 'high' float converter and show the
# 'Annual' entry of the row at position 31, rounded to three decimals.
# NOTE(review): this cell reads an 'Annual' column, while the merge cell reads
# 'yearID' and 'cpi_u' from the same file -- confirm the CSV's actual columns.
cpi_table = pd.read_csv('cpi_u_values.csv', float_precision='high')
round(cpi_table.iloc[31]['Annual'], 3)

In [None]:
# `salary_with_cpi` is never assigned anywhere in this notebook, so this cell
# raised NameError on a fresh kernel. The CPI-adjusted frame built in the
# earlier cell is named `cpiu_with_salary`; show its first rows instead.
cpiu_with_salary.head()