In [18]:
!pip install fuzzywuzzy[speedup]

Collecting fuzzywuzzy[speedup]
  Downloading https://files.pythonhosted.org/packages/d8/f1/5a267addb30ab7eaa1beab2b9323073815da4551076554ecc890a3595ec9/fuzzywuzzy-0.17.0-py2.py3-none-any.whl
Collecting python-levenshtein>=0.12; extra == "speedup"
[?25l  Downloading https://files.pythonhosted.org/packages/42/a9/d1785c85ebf9b7dfacd08938dd028209c34a0ea3b1bcdb895208bd40a67d/python-Levenshtein-0.12.0.tar.gz (48kB)
[K     |████████████████████████████████| 51kB 3.0MB/s eta 0:00:011
Building wheels for collected packages: python-levenshtein
  Building wheel for python-levenshtein (setup.py) ... [?25ldone
[?25h  Created wheel for python-levenshtein: filename=python_Levenshtein-0.12.0-cp37-cp37m-linux_x86_64.whl size=167763 sha256=e1682653e166c24580318962d4976aadf7e08b19c4c928e99a2453d98dd23e6f
  Stored in directory: /home/jup/.cache/pip/wheels/de/c2/93/660fd5f7559049268ad2dc6d81c4e39e9e36518766eaf7e342
Successfully built python-levenshtein
Installing collected packages: python-levenshtein,

In [4]:
import pandas as pd
from fuzzywuzzy import process

In [42]:
# Parsed player table, kept in memory so repeated lookups don't re-download the CSV.
_players_cache = {}


def get_players(refresh=False):
    """Return the player search list, most popular players first.

    The CSV is downloaded on the first call and the parsed frame is cached;
    later calls reuse the cached frame instead of hitting the network again.

    Parameters
    ----------
    refresh : bool, default False
        When True, download and parse the CSV again even if a cached frame
        exists.

    Returns
    -------
    pandas.DataFrame
        The columns that have at least 1000 non-null values, sorted by
        ``pop_rank`` in descending order. A fresh copy is returned on every
        call, so callers can modify the result without affecting the cache.
    """
    if refresh or "players" not in _players_cache:
        url = "https://d3k2oh6evki4b7.cloudfront.net/short/inc/players_search_list.csv"
        _players_cache["players"] = (
            pd.read_csv(url,
                        names=["key", "name", "years", "is_active", "1", "2", "3", "4", "pop_rank"])
            # thresh=1000 keeps only columns with >= 1000 non-null values,
            # which removes the (mostly) empty placeholder columns.
            .dropna(thresh=1000, axis=1)
            # put the most popular searches up top
            .sort_values("pop_rank", ascending=False)
        )
    return _players_cache["players"].copy()

def get_player_info(search_string, verbose=True):
    """Fuzzy-match ``search_string`` against player names and return the best row.

    Parameters
    ----------
    search_string : str
        Free-text player name; it does not have to be spelled exactly.
    verbose : bool, default True
        When True, print the matched name and its match score.

    Returns
    -------
    pandas.Series
        The row of the table returned by ``get_players`` for the best match.

    Raises
    ------
    ValueError
        If the matcher yields no candidate at all (e.g. the player table is
        empty).
    """
    players = get_players()
    # For a Series of choices, extractOne returns the single best
    # (name, score, index label) triple, or None when nothing can be matched.
    best_match = process.extractOne(search_string, players["name"])
    if best_match is None:
        raise ValueError(f"no player found matching {search_string!r}")
    name, pct_sure, index = best_match
    if verbose:
        print(f"I'm {pct_sure}% sure that you want {name}")
    return players.loc[index]

In [55]:
# Load the player table and preview its first 50 rows
# (get_players sorts by pop_rank, highest first).
df = get_players()
df.head(50)

Unnamed: 0,key,name,years,is_active,pop_rank
1621,bondsba01,Barry Bonds,1986-2007,0,2191.8
3239,clemero02,Roger Clemens,1984-2007,0,2170.2
15100,rodrial01,Alex Rodriguez,1994-2016,0,2155.78
11291,mayswi01,Willie Mays,1951-1973,0,2152.4
7685,henderi01,Rickey Henderson,1979-2003,0,2143.21
1,aaronha01,Hank Aaron,1954-1976,0,2142.01
10754,maddugr01,Greg Maddux,1986-2008,0,2139.57
14323,pujolal01,Albert Pujols,2001-2019,1,2138.28
1179,beltrad01,Adrian Beltre,1998-2018,0,2134.61
8851,johnsra05,Randy Johnson,1988-2009,0,2134.07


In [51]:
# NOTE(review): get_player_info is also defined in an earlier cell of this
# notebook; this later definition is the one in effect after a top-to-bottom
# run. Consider keeping only one of the two.
def get_player_info(search_string, verbose=True):
    """Fuzzy-match ``search_string`` against player names and return the best row.

    Parameters
    ----------
    search_string : str
        Free-text player name; it does not have to be spelled exactly.
    verbose : bool, default True
        When True, print the matched name and its match score.

    Returns
    -------
    pandas.Series
        The row of the table returned by ``get_players`` for the best match.

    Raises
    ------
    ValueError
        If the matcher yields no candidate at all (e.g. the player table is
        empty).
    """
    players = get_players()
    # For a Series of choices, extractOne returns the single best
    # (name, score, index label) triple, or None when nothing can be matched.
    best_match = process.extractOne(search_string, players["name"])
    if best_match is None:
        raise ValueError(f"no player found matching {search_string!r}")
    name, pct_sure, index = best_match
    if verbose:
        print(f"I'm {pct_sure}% sure that you want {name}")
    return players.loc[index]

In [52]:
# Word order and spelling don't have to be exact: the closest player name wins.
get_player_info("herman baby")

I'm 86% that you want Babe Herman


key            hermaba01
name         Babe Herman
years          1926-1945
is_active              0
pop_rank          1999.3
Name: 7757, dtype: object

In [54]:
# A row is returned even when the match score is low (70% in the saved output),
# so check the printed name before trusting the key.
get_player_info("hulio dubo")["key"]

I'm 70% that you want Julio Lugo


'lugoju01'

# Let's work on getting the standard pitching and standard batting tables

# overview

https://widgets.sports-reference.com/wg.fcgi?css=1&site=br&url=%2Fplayers%2Fz%2Fzychto01.shtml&div=div_pitching_standard

https://widgets.sports-reference.com/wg.fcgi?css=1&site=br&url=%2Fplayers%2Fz%2Fzychto01.shtml&div=div_batting_value
    
https://widgets.sports-reference.com/wg.fcgi?css=1&site=br&url=%2Fplayers%2Fz%2Fzychto01.shtml&div=div_appearances
    
https://widgets.sports-reference.com/wg.fcgi?css=1&site=br&url=%2Fplayers%2Fz%2Fzychto01.shtml&div=div_br-salaries
    
# splits    
    
https://widgets.sports-reference.com/wg.fcgi?css=1&site=br&url=%2Fplayers%2Fsplit.fcgi%3Fid%3Dzychto01%26year%3D2017%26t%3Dp&div=div_total
    
https://widgets.sports-reference.com/wg.fcgi?css=1&site=br&url=%2Fplayers%2Fsplit.fcgi%3Fid%3Dzychto01%26year%3DCareer%26t%3Db&div=div_total
    
# logs
https://widgets.sports-reference.com/wg.fcgi?css=1&site=br&url=%2Fplayers%2Fgl.fcgi%3Fid%3Dzychto01%26t%3Dp%26year%3D2015&div=div_pitching_gamelogs

    

In [86]:
from functools import partial

def _to_numeric_where_possible(column):
    """Convert ``column`` to a numeric dtype, or return it unchanged if it can't be parsed."""
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


def player_overview(key, table_type="appearances"):
    """Fetch one overview table for a player from the sports-reference widget endpoint.

    Parameters
    ----------
    key : str
        Player key as found in the ``key`` column of the player table
        (e.g. ``"lugoju01"``). Its first character is used as the directory
        part of the player page URL.
    table_type : str, default "appearances"
        Which table to fetch; must be one of the names in ``valid_info_types``.

    Returns
    -------
    pandas.DataFrame
        The first table parsed from the widget response, restricted to rows
        whose ``Lg`` column is ``'NL'`` or ``'AL'``, with every column that
        parses as numbers converted to a numeric dtype.

    Raises
    ------
    Exception
        If ``table_type`` is not a valid table name, or if the table cannot be
        fetched, parsed or filtered. In the latter case the underlying error is
        attached as ``__cause__``.
    """
    valid_info_types = ["pitching_standard", "pitching_value", "batting_standard", "batting_value", "standard_fielding", "appearances"]

    # Reject unknown table names before making any network request.
    if table_type not in valid_info_types:
        raise Exception(f"{table_type} is not valid, valid info types are {valid_info_types}")

    player_overview_url = f"https://widgets.sports-reference.com/wg.fcgi?css=1&site=br&url=%2Fplayers%2F{key[0]}%2F{key}.shtml&div=div_{table_type}"

    # Grab the table. Catch Exception (not a bare except) so KeyboardInterrupt
    # and SystemExit still propagate, and chain the original error so the real
    # cause stays visible in the traceback.
    try:
        df = pd.read_html(player_overview_url)[0].query("Lg == 'NL' or Lg == 'AL'")
        # Equivalent to the deprecated pd.to_numeric(..., errors="ignore"):
        # columns that cannot be parsed as numbers are left as they are.
        df = df.apply(_to_numeric_where_possible)
    except Exception as err:
        raise Exception(f"error getting {table_type} for key {key}. probably because the table doesn't exist on the page.") from err

    return df

In [92]:
# Resolve the player once: the lookup does not depend on the table type, so
# there is no need to repeat the fuzzy search on every iteration.
player_key = get_player_info("jiman choi")["key"]

for tp in ["pitching_standard", "pitching_value", "batting_standard", "batting_value", "standard_fielding", "appearances"]:
    print(tp)
    try:
        display(player_overview(player_key, tp))
    except Exception as err:
        # Not every table exists for every player (the saved output shows no
        # pitching tables for this one). Keep going, but say why a table was
        # skipped instead of hiding the error.
        print(f"  skipped: {err}")

pitching_standard
I'm 95% that you want Ji-Man Choi
pitching_value
I'm 95% that you want Ji-Man Choi
batting_standard
I'm 95% that you want Ji-Man Choi


Unnamed: 0,Year,Age,Tm,Lg,G,PA,AB,R,H,2B,...,OPS,OPS+,TB,GDP,HBP,SH,SF,IBB,Pos,Awards
6,2016,25,LAA,AL,54.0,129.0,112.0,9.0,19.0,4.0,...,0.611,68.0,38.0,2.0,0.0,0.0,1.0,1.0,37/D,
8,2017,26,NYY,AL,6.0,18.0,15.0,2.0,4.0,1.0,...,1.067,169.0,11.0,1.0,0.0,0.0,1.0,0.0,/3,
11,2018,27,MIL,NL,12.0,32.0,30.0,4.0,7.0,2.0,...,0.781,104.0,15.0,1.0,0.0,0.0,0.0,1.0,/D37,
12,2018,27,TBR,AL,49.0,189.0,160.0,21.0,43.0,12.0,...,0.877,141.0,81.0,0.0,3.0,0.0,2.0,0.0,D/3,
13,2019,28,TBR,AL,127.0,487.0,410.0,54.0,107.0,20.0,...,0.822,119.0,188.0,7.0,6.0,0.0,7.0,2.0,3D,


batting_value
I'm 95% that you want Ji-Man Choi


Unnamed: 0,Year,Age,Tm,Lg,G,PA,Rbat,Rbaser,Rdp,Rfield,...,RAR,WAR,waaWL%,162WL%,oWAR,dWAR,oRAR,Salary,Pos,Awards
0,2016,25,LAA,AL,54.0,129.0,-5.0,-2.0,0.0,0.0,...,-5.0,-0.6,0.48,0.493,-0.6,-0.3,-5.0,"$650,000",37/D,
1,2017,26,NYY,AL,6.0,18.0,1.0,0.0,0.0,0.0,...,1.0,0.2,0.513,0.5,0.2,0.0,1.0,,/3,
2,2018,27,MIL,NL,12.0,32.0,0.0,0.0,0.0,1.0,...,1.0,0.1,0.498,0.5,0.0,0.0,0.0,,/D37,
3,2018,27,TBR,AL,49.0,189.0,8.0,-2.0,2.0,-1.0,...,10.0,1.0,0.506,0.502,1.1,-0.5,11.0,"$850,000",D/3,
4,2019,28,TBR,AL,127.0,487.0,11.0,-2.0,1.0,-1.0,...,20.0,2.0,0.502,0.501,2.0,-0.7,21.0,"$850,000",3D,


standard_fielding
I'm 95% that you want Ji-Man Choi


Unnamed: 0,Year,Age,Tm,Pos,Lg,G,GS,CG,Inn,Ch,...,Rtot,Rdrs,Rtot/yr,Rdrs/yr,RF/9,RF/G,lgFld%,lgRF9,lgRFG,Awards
0,2016,25,LAA,1B,AL,27,15,12.0,152.0,139.0,...,0.0,2.0,3.0,17.0,8.17,5.11,0.994,8.97,8.87,
1,2016,25,LAA,LF,AL,20,14,6.0,113.0,26.0,...,1.0,-2.0,7.0,-22.0,2.07,1.3,0.984,1.86,1.84,
2,2016,25,LAA,OF,AL,20,14,6.0,113.0,26.0,...,1.0,-2.0,7.0,-21.0,2.07,1.3,0.986,2.15,2.12,
3,2016,25,LAA,DH,AL,6,2,,,,...,,,,,,,,,,
4,2017,26,NYY,1B,AL,6,4,3.0,40.0,37.0,...,0.0,0.0,6.0,0.0,8.1,6.0,0.993,8.64,8.55,
7,2018,27,MIL,DH,NL,6,6,,,,...,,,,,,,,,,
8,2018,27,MIL,1B,NL,2,2,2.0,18.0,12.0,...,1.0,1.0,33.0,73.0,6.0,6.0,0.994,8.65,8.65,
9,2018,27,MIL,LF,NL,1,0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,1.8,1.9,
10,2018,27,MIL,OF,NL,1,0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,1.8,2.1,
11,2018,27,TBR,DH,AL,44,44,,,,...,,,,,,,,,,


appearances
I'm 95% that you want Ji-Man Choi


Unnamed: 0,Year,Age,Tm,Lg,G,GS,Batting,Defense,P,C,...,2B,3B,SS,LF,CF,RF,OF,DH,PH,PR
0,2016,25,LAA,AL,54,31,54,39,0,0,...,0,0,0,20,0,0,20,6,12,5
1,2017,26,NYY,AL,6,4,6,6,0,0,...,0,0,0,0,0,0,0,0,2,0
2,2018,27,TBR,AL,49,45,49,1,0,0,...,0,0,0,0,0,0,0,44,4,0
3,2018,27,MIL,NL,12,8,12,3,0,0,...,0,0,0,1,0,0,1,6,4,0
4,2019,28,TBR,AL,127,106,127,103,0,0,...,0,0,0,0,0,0,0,16,20,1


In [89]:
 # No table_type is passed, so the default ("appearances") table is fetched.
 player_overview(get_player_info("jiman choi", True)["key"])

I'm 95% that you want Ji-Man Choi


Unnamed: 0,Year,Age,Tm,Lg,G,GS,Batting,Defense,P,C,...,2B,3B,SS,LF,CF,RF,OF,DH,PH,PR
0,2016,25,LAA,AL,54,31,54,39,0,0,...,0,0,0,20,0,0,20,6,12,5
1,2017,26,NYY,AL,6,4,6,6,0,0,...,0,0,0,0,0,0,0,0,2,0
2,2018,27,TBR,AL,49,45,49,1,0,0,...,0,0,0,0,0,0,0,44,4,0
3,2018,27,MIL,NL,12,8,12,3,0,0,...,0,0,0,1,0,0,1,6,4,0
4,2019,28,TBR,AL,127,106,127,103,0,0,...,0,0,0,0,0,0,0,16,20,1


In [None]:
# Team search list CSV (same URL that get_teams reads).
teams = "https://d3k2oh6evki4b7.cloudfront.net/short/inc/teams_search_list.csv"

In [19]:
def get_teams():
    """Download the team search list and return it as a DataFrame.

    The CSV is read with fixed column names. Every column that contains at
    least one missing value is then dropped, followed by the column named
    ``"0"``.
    """
    column_names = ["abbr", "name", "years", "0", "1", "2", "3", "4", "5"]
    raw_teams = pd.read_csv(
        "https://d3k2oh6evki4b7.cloudfront.net/short/inc/teams_search_list.csv",
        names=column_names,
    )
    # Remove columns with missing values first, then the leftover "0" column.
    complete_columns = raw_teams.dropna(axis=1)
    return complete_columns.drop("0", axis=1)


def get_team_info(search_string, verbose=True):
    """Fuzzy-match ``search_string`` against team names and abbreviations.

    Parameters
    ----------
    search_string : str
        Free-text team name or abbreviation.
    verbose : bool, default True
        When True, print the matched text and its match score.

    Returns
    -------
    pandas.Series
        The row of the table returned by ``get_teams`` for the best match.

    Raises
    ------
    ValueError
        If the matcher yields no candidate at all (e.g. the team table is
        empty).
    """
    team_table = get_teams()
    # Search full names and abbreviations together. Series.append was removed
    # in pandas 2.0, so the two columns are joined with pd.concat. Each entry
    # keeps its row's index label, so whichever column the hit comes from, the
    # returned label points at the right row of team_table.
    choices = pd.concat([team_table["name"], team_table["abbr"]])
    # extractOne returns the single best (text, score, index label) triple,
    # or None when nothing can be matched.
    best_match = process.extractOne(search_string, choices)
    if best_match is None:
        raise ValueError(f"no team found matching {search_string!r}")
    name, pct_sure, index = best_match
    if verbose:
        print(f"I'm {pct_sure}% sure that you want {name}")
    return team_table.loc[index]

In [34]:
get_team_info("pads")

I'm 68% that you want San Diego Padres


abbr                  SDP
name     San Diego Padres
years           1969-2019
Name: 22, dtype: object

In [35]:
# Baseball-Reference page for the team whose code in the path is "TBD"
# (presumably a batting-leaders page, going by the file name; confirm).
url = "https://www.baseball-reference.com/teams/TBD/leaders_bat_50.shtml"

In [39]:
# Only parse tables whose HTML class attribute is "columns".
pd.read_html(url, attrs={"class": "columns"})

[       0                   1     2
 0    1.0       Evan Longoria  49.8
 1    2.0         Ben Zobrist  36.0
 2    3.0       Carl Crawford  35.6
 3    4.0     Kevin Kiermaier  26.2
 4    5.0         Carlos Pena  18.1
 5    6.0    Melvin Upton Jr.  15.6
 6    7.0          Julio Lugo  13.5
 7    8.0    Desmond Jennings  13.4
 8    9.0         Aubrey Huff  11.8
 9   10.0      Jason Bartlett  10.4
 10  11.0       Matthew Joyce   9.8
 11  12.0      Rocco Baldelli   9.6
 12  13.0      Logan Forsythe   9.4
 13  14.0        Fred McGriff   9.0
 14  15.0      Sean Rodriguez   7.7
 15  16.0          Randy Winn   7.4
 16  17.0     Akinori Iwamura   6.6
 17  18.0          Tommy Pham   6.3
 18  19.0        Willy Adames   6.2
 19  20.0    Steven Souza Jr.   6.1
 20  21.0           Toby Hall   5.7
 21  22.0         Greg Vaughn   5.3
 22  23.0         Joey Wendle   5.2
 23  24.0        Mallex Smith   5.1
 24  25.0       Brandon Guyer   4.5
 25   NaN          Travis Lee   4.5
 26  27.0     Corey Dickerso

In [38]:
# Without an attrs filter, read_html returns every table it can parse from the page.
pd.read_html(url)

[       0                   1     2
 0    1.0       Evan Longoria  49.8
 1    2.0         Ben Zobrist  36.0
 2    3.0       Carl Crawford  35.6
 3    4.0     Kevin Kiermaier  26.2
 4    5.0         Carlos Pena  18.1
 5    6.0    Melvin Upton Jr.  15.6
 6    7.0          Julio Lugo  13.5
 7    8.0    Desmond Jennings  13.4
 8    9.0         Aubrey Huff  11.8
 9   10.0      Jason Bartlett  10.4
 10  11.0       Matthew Joyce   9.8
 11  12.0      Rocco Baldelli   9.6
 12  13.0      Logan Forsythe   9.4
 13  14.0        Fred McGriff   9.0
 14  15.0      Sean Rodriguez   7.7
 15  16.0          Randy Winn   7.4
 16  17.0     Akinori Iwamura   6.6
 17  18.0          Tommy Pham   6.3
 18  19.0        Willy Adames   6.2
 19  20.0    Steven Souza Jr.   6.1
 20  21.0           Toby Hall   5.7
 21  22.0         Greg Vaughn   5.3
 22  23.0         Joey Wendle   5.2
 23  24.0        Mallex Smith   5.1
 24  25.0       Brandon Guyer   4.5
 25   NaN          Travis Lee   4.5
 26  27.0     Corey Dickerso