# Data Wrangling

In [1]:
from pybaseball import batting_stats
from pybaseball import playerid_lookup
from pybaseball import player_search_list
from pybaseball import playerid_reverse_lookup
import numpy as np
import pandas as pd
import csv

In [None]:
# Load the free-agency contracts, keep only the rows that report a total
# salary, then discard the row labelled 0.
salary_df = (
    pd.read_csv('mlb-free-agency.csv')
    .dropna(subset=['total_salary'])
    .drop(index=[0])
)

In [None]:
# Drop the first column and renumber the rows 0..n-1.
salary_df.drop(columns=salary_df.columns[0], inplace=True)
# reset_index() returns a new frame; without assigning it back the index
# would keep its old labels.
salary_df = salary_df.reset_index(drop=True)
salary_df

In [None]:
# Drop all pitchers (positions 'SP' and 'RP') in one pass.
pitcher_rows = salary_df.index[salary_df['position'].isin(['SP', 'RP'])]
salary_df.drop(pitcher_rows, inplace=True)
# Collapse LF, CF and RF into a single 'OF' label.
# The result is assigned back to the column on purpose: calling
# replace(inplace=True) on an attribute-accessed column mutates a temporary
# Series, which leaves salary_df unchanged when pandas Copy-on-Write is active.
salary_df['position'] = salary_df['position'].replace(['LF', 'CF', 'RF'], 'OF')

In [None]:
# Order the contracts by player name; ignore_index renumbers the rows
# 0..n-1 as part of the same in-place sort.
salary_df.sort_values(by='name', inplace=True, ignore_index=True)

In [None]:
# Do any two different players share the same name?
# Step 1: collapse the table to one row per spotracID.
unique = salary_df.drop_duplicates(subset='spotracID')
# Step 2: list every remaining row whose name occurs more than once.
shares_name = unique.duplicated(subset='name', keep=False)
unique[shares_name]
# (the author's run found no such rows)

## Look up FanGraphs player IDs using their names

In [None]:
# Build the player-search table: one row per distinct name (the first
# occurrence wins), carrying the Spotrac identifiers along.
search_columns = ['name', 'spotracID', 'spotracLink']
player_search = (
    salary_df[search_columns]
    .drop_duplicates(subset=['name'], keep='first')
    .copy()
)

In [None]:
# Derive lower-cased first_name / last_name columns from the full name.
for idx, full_name in player_search['name'].items():
    tokens = full_name.split()
    player_search.at[idx, 'first_name'] = tokens[0].lower()
    # With one or two tokens the last token is the surname; with three or
    # more the second token is used instead.
    surname = tokens[-1] if len(tokens) <= 2 else tokens[1]
    player_search.at[idx, 'last_name'] = surname.lower()

In [None]:
# Insert a space after the first period of a first name, e.g. a.j. -> a. j.
for idx, first in player_search['first_name'].items():
    if '.' in first:
        player_search.at[idx, 'first_name'] = first.replace('.', '. ', 1)
player_search

In [None]:
# Pair up each player's (last, first) name; this list of tuples is what gets
# passed to the player lookup function.
name_list = list(zip(player_search['last_name'], player_search['first_name']))
name_list

In [None]:
# Look up every (last, first) pair with pybaseball's player_search_list.
playerID = player_search_list(name_list)

In [None]:
# Show the lookup results.
playerID

In [None]:
# Look for names that matched more than one record, then remove every
# player whose mlb_played_last is 2010 or earlier.

pd.set_option('display.max_rows', 10)
repeated_name = playerID.duplicated(subset=['name_last', 'name_first'], keep=False)
playerID[repeated_name]
stale_rows = playerID.index[playerID['mlb_played_last'] <= 2010]
playerID.drop(index=stale_rows, inplace=True)

In [None]:
# Check whether any name still maps to more than one record.

still_repeated = playerID.duplicated(subset=['name_last', 'name_first'], keep=False)
playerID[still_repeated]
# FanGraphs IDs 3196 (chris young) and 2591 (taylor michael) are namesakes,
# not the players we are looking for, so remove both.
namesake_rows = playerID.index[playerID['key_fangraphs'].isin([3196, 2591])]
playerID.drop(index=namesake_rows, inplace=True)

In [None]:
# Compare how many names were searched with how many records remain.
searched_count = len(name_list)
found_count = len(playerID)
print(f"Number of players we searched: {searched_count}")
print(f"Number of result that we got: {found_count}")

In [None]:
# (last, first) pairs for which the lookup returned a record
name_list_positive = list(zip(playerID['name_last'], playerID['name_first']))
# searched pairs that have no record: the players still missing
name_list_missing = [pair for pair in name_list if pair not in name_list_positive]
name_list_missing

In [None]:
# Build a 'full_name' join key on both tables: the lower-cased name with
# all spaces removed.

# salary_df: derived from the single 'name' column
salary_key = salary_df['name'].str.lower()
salary_df['full_name'] = salary_key.str.replace(' ', '')

# playerID: first and last name glued together, then normalised the same way
lookup_key = (playerID['name_first'] + playerID['name_last']).str.lower()
playerID['full_name'] = lookup_key.str.replace(' ', '')

## Merge salary and playerID

In [None]:
# Inner join on the normalised name: only rows whose full_name appears in
# both tables are kept.
salaryFinalDF = salary_df.merge(playerID, how='inner', on='full_name')
salaryFinalDF

In [None]:
# Columns that are not needed after the join.
unwanted_columns = [
    'contract_length', 'total_salary', 'full_name',
    'name_last', 'name_first',
    'key_mlbam', 'key_retro', 'key_bbref',
    'spotracID', 'spotracLink',
]
salaryFinalDF.drop(columns=unwanted_columns, inplace=True)
salaryFinalDF

In [None]:
# Map the merged column names onto the names used from here on.
column_renames = {
    'year': 'year_fa',
    'key_fangraphs': 'IDfg',
    'mlb_played_first': 'first_played',
    'mlb_played_last': 'last_played',
    'to_tam': 'to_team',
}
salaryFinalDF.rename(columns=column_renames, inplace=True)
salaryFinalDF

## Import MLB batting stats from 2007-2020 and clean the data further
1. Remove entries with fewer than 400 at-bats (AB)
2. Remove unwanted variables

In [None]:
# Load all MLB batting stats for 2007-2020 and report the table's shape.
batting_df = pd.read_csv('mlb-batting.csv')
batting_df.head()
print(batting_df.shape)
# Disabled filter, kept for reference: drop rows with AB < 400.
# batting_df = batting_df.drop(batting_df[batting_df['AB']<400].index)
# batting_df.shape

In [None]:
# Lift the column display limit and preview the frame; nothing is dropped
# here (the unwanted stats are removed by the column selection that follows).
pd.set_option('display.max_columns', None)
batting_df.head()

In [None]:
# Batting columns carried forward (a subset, not the full list of stats).
var_list_basic = [
    'IDfg', 'Season', 'Name', 'Team', 'Age',
    'G', 'PA', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI',
    'SB', 'CS', 'BB', 'SO', 'GDP', 'HBP', 'SH', 'SF', 'IBB',
    'AVG', 'OBP', 'SLG', 'BABIP', 'ISO', 'OPS',
    'wRC', 'wRAA', 'wOBA', 'wRC+', 'WPA', 'WAR',
]
# Keep only those columns, ordered by player and then by season, with a
# fresh 0..n-1 index.
batting_basic_df = (
    batting_df[var_list_basic]
    .sort_values(by=['IDfg', 'Season'], ascending=True)
    .reset_index(drop=True)
)
batting_basic_df

# Merge salary and batting for EDA

In [None]:
# Re-key the batting table by player: renumber the rows, make IDfg the
# index, then sort on that index.
batting_basic_df = (
    batting_basic_df
    .reset_index(drop=True)
    .set_index(['IDfg'])
    .sort_index(level=['IDfg'])
)

In [None]:
# Aggregate each free agent's batting stats over the seasons that precede
# the free-agency year, and attach that year and the average salary.

# How each stat is combined across the look-back window: counting stats are
# summed, rate stats are averaged (unweighted), Age takes the latest value.
agg_method = {
    'Age': 'max',
    'G': 'sum',
    'PA': 'sum',
    'AB': 'sum',
    'R': 'sum',
    'H': 'sum',
    '2B': 'sum',
    '3B': 'sum',
    'HR': 'sum',
    'RBI': 'sum',
    'SB': 'sum',
    'CS': 'sum',
    'BB': 'sum',
    'SO': 'sum',
    'GDP': 'sum',
    'HBP': 'sum',
    'SH': 'sum',
    'SF': 'sum',
    'IBB': 'sum',
    'AVG': 'mean',
    'OBP': 'mean',
    'SLG': 'mean',
    'BABIP': 'mean',
    'ISO': 'mean',
    'OPS': 'mean',
    'wRC': 'sum',
    'wRAA': 'sum',
    # wRC+ is a league-indexed rate (100 = average), so it is averaged like
    # the other rate stats; adding seasons together has no meaning.
    'wRC+': 'mean',
    'WPA': 'sum',
    'WAR': 'sum',
}
# NOTE(review): 'wOBA' is selected in var_list_basic but has no entry here,
# so it is absent from the aggregated table -- confirm that is intended.

# Number of seasons before the free-agency year that are aggregated.
LOOKBACK_YEARS = 5

player_frames = []
for row in salaryFinalDF.itertuples():
    # A player without any batting rows has nothing to aggregate; looking
    # the ID up with .loc would raise KeyError.
    if row.IDfg not in batting_basic_df.index:
        continue
    selected_years = [row.year_fa - offset for offset in range(1, LOOKBACK_YEARS + 1)]
    # List-style .loc always yields a DataFrame; a scalar label would return
    # a Series for a player with a single season and break the filter below.
    player = batting_basic_df.loc[[row.IDfg]]
    player = (
        player[player['Season'].isin(selected_years)]
        .groupby(by=['IDfg', 'Name'])
        .agg(agg_method)
    )
    # No seasons inside the window: contributes no row to the result.
    if player.empty:
        continue
    player['Year_FA'] = row.year_fa
    player['Salary'] = row.avg_salary
    player_frames.append(player)

# Concatenate once at the end: DataFrame.append was removed in pandas 2.0,
# and growing a frame inside the loop copies it on every iteration.
batting_aggDF = pd.concat(player_frames) if player_frames else pd.DataFrame()

In [None]:
# Add the natural log of Salary as a new last column, 'Salary_log'.
log_salary = np.log(batting_aggDF['Salary'])
batting_aggDF.insert(loc=batting_aggDF.shape[1], column='Salary_log', value=log_salary)

In [None]:
# Show the aggregated batting table.
batting_aggDF