## Load the libraries

In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#import seaborn as sns
from sklearn import neighbors, datasets, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier

#from sklearn.linear_model import LogisticRegression

## Load the data

In [2]:
# how can I put the names into a list and use a function to generate the dataframes (along with the correct names)?

master_df = pd.read_csv('Master.csv')
pitching_df = pd.read_csv('Pitching.csv')
batting_df = pd.read_csv('Batting.csv')
fielding_df = pd.read_csv('Fielding.csv')
awards_df = pd.read_csv('AwardsPlayers.csv')
allstar_df = pd.read_csv('AllstarFull.csv')
hof_df = pd.read_csv('HallOfFame.csv')
appearances_df = pd.read_csv('Appearances.csv')

## Display options

In [3]:
pd.set_option('display.max_columns', 105)
pd.set_option('display.max_rows', 85)

## Display check

In [4]:
#batting_df.head()

## Multiple stints per year
Necessary to aggregate properly in the next step.  If a player was demoted to minor leagues and called back up the same year, he will have at least 2 stints that year in the batting_df.

In [5]:
#batting_df['stint'].value_counts()

## Instantiate the source tables
### years_played

In [6]:
years_played = batting_df.groupby('playerID').nunique()  # the nunique handles the fact that there is a row for each stint/year combination in batting_df
years_played = years_played[['yearID']].rename(columns = {'yearID': 'years_played'})
years_played

Unnamed: 0_level_0,years_played
playerID,Unnamed: 1_level_1
aardsda01,9
aaronha01,23
aaronto01,7
aasedo01,13
abadan01,3
...,...
zupofr01,3
zuvelpa01,9
zuverge01,8
zwilldu01,4


### pitching_career stats

In [7]:
pitching_df

Unnamed: 0,playerID,yearID,stint,teamID,lgID,W,L,G,GS,CG,SHO,SV,IPouts,H,ER,HR,BB,SO,BAOpp,ERA,IBB,WP,HBP,BK,BFP,GF,R,SH,SF,GIDP
0,bechtge01,1871,1,PH1,,1,2,3,3,2,0,0,78,43,23,0,11,1,,7.96,,,,0,,,42,,,
1,brainas01,1871,1,WS3,,12,15,30,30,30,0,0,792,361,132,4,37,13,,4.50,,,,0,,,292,,,
2,fergubo01,1871,1,NY2,,0,0,1,0,0,0,0,3,8,3,0,0,0,,27.00,,,,0,,,9,,,
3,fishech01,1871,1,RC1,,4,16,24,24,22,1,0,639,295,103,3,31,15,,4.35,,,,0,,,257,,,
4,fleetfr01,1871,1,NY2,,0,1,1,1,1,0,0,27,20,10,0,3,0,,10.00,,,,0,,,21,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44958,zastrro01,2016,1,CHN,NL,1,0,8,1,0,0,0,48,12,2,0,5,17,0.207,1.13,0.0,0.0,1.0,0,66.0,1.0,3,0.0,2.0,1.0
44959,zieglbr01,2016,1,ARI,NL,2,3,36,0,0,0,18,115,41,12,1,15,27,0.281,2.82,5.0,0.0,2.0,0,165.0,30.0,13,1.0,1.0,10.0
44960,zieglbr01,2016,2,BOS,AL,2,4,33,0,0,0,4,89,26,5,1,11,31,0.234,1.52,2.0,1.0,1.0,0,124.0,12.0,8,1.0,0.0,6.0
44961,zimmejo02,2016,1,DET,AL,9,7,19,18,0,0,0,316,118,57,14,26,66,0.284,4.87,0.0,3.0,2.0,0,450.0,1.0,63,1.0,5.0,8.0


In [8]:
pitching_career = pitching_df.drop(['yearID', 'stint', 'teamID', 'lgID', 'BAOpp', 'ERA'], axis=1).groupby(by = 'playerID').sum()

pitching_career = pitching_career.rename(columns = {'G': 'G_pitcher'})

pitching_career

Unnamed: 0_level_0,W,L,G_pitcher,GS,CG,SHO,SV,IPouts,H,ER,HR,BB,SO,IBB,WP,HBP,BK,BFP,GF,R,SH,SF,GIDP
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
aardsda01,16,18,331,0,0,0,69,1011,296,160,41,183,340,22.0,12.0,16.0,1,1475.0,141.0,169,17.0,11.0,21.0
aasedo01,66,60,448,91,22,5,82,3328,1085,468,89,457,641,45.0,21.0,7.0,3,4730.0,234.0,503,0.0,0.0,0.0
abadfe01,6,26,315,6,0,0,1,822,260,113,36,99,234,9.0,9.0,11.0,1,1168.0,81.0,119,7.0,10.0,18.0
abbeybe01,22,40,79,66,52,0,1,1704,686,285,18,192,161,0.0,18.0,0.0,0,0.0,12.0,442,0.0,0.0,0.0
abbeych01,0,0,1,0,0,0,0,6,6,1,0,0,0,0.0,1.0,0.0,0,0.0,1.0,3,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zoldasa01,43,53,250,93,30,5,8,2788,956,366,54,301,207,0.0,8.0,3.0,4,3946.0,78.0,423,0.0,0.0,0.0
zuberbi01,43,42,224,65,23,3,6,2358,767,374,35,468,383,0.0,28.0,4.0,1,3476.0,90.0,418,0.0,0.0,0.0
zumayjo01,13,12,171,0,0,0,5,629,169,71,18,114,210,11.0,16.0,4.0,0,911.0,35.0,80,6.0,10.0,10.0
zuverge01,32,36,265,31,9,2,40,1927,660,253,56,203,223,29.0,10.0,27.0,1,2746.0,139.0,296,0.0,0.0,0.0


### batting_career stats

In [9]:
batting_career = batting_df.drop(['yearID', 'stint', 'teamID', 'lgID'], axis=1).groupby(by = 'playerID').sum()
#batting_career

### fielding_career stats

In [10]:
fielding_career = fielding_df[['playerID', 'G', 'GS', 'PO', 'A', 'E', 'DP']].groupby(by = 'playerID').sum()
#fielding_career

### awards

In [11]:
awards_career = pd.pivot_table(awards_df, values = 'yearID', index = 'playerID', columns = 'awardID', aggfunc='count')
#awards_career

### all-star selections

In [12]:
years_allstar = allstar_df.groupby('playerID').nunique()   # some years had multiple all-star games
years_allstar = years_allstar[['yearID']].rename(columns = {'yearID': 'years_allstar'})

### target variable = Hall of Fame selection

In [13]:
filt = (hof_df['inducted'] == 'Y') & (hof_df['category'] == 'Player') 
hof_df = hof_df[filt]
hof_df

Unnamed: 0,playerID,yearid,votedBy,ballots,needed,votes,inducted,category,needed_note
0,cobbty01,1936,BBWAA,226.0,170.0,222.0,Y,Player,
1,ruthba01,1936,BBWAA,226.0,170.0,215.0,Y,Player,
2,wagneho01,1936,BBWAA,226.0,170.0,215.0,Y,Player,
3,mathech01,1936,BBWAA,226.0,170.0,205.0,Y,Player,
4,johnswa01,1936,BBWAA,226.0,170.0,189.0,Y,Player,
...,...,...,...,...,...,...,...,...,...
4088,griffke02,2016,BBWAA,440.0,330.0,437.0,Y,Player,
4089,piazzmi01,2016,BBWAA,440.0,330.0,365.0,Y,Player,
4120,bagweje01,2017,BBWAA,442.0,332.0,381.0,Y,Player,
4121,raineti01,2017,BBWAA,442.0,332.0,380.0,Y,Player,


### time spent at each position, as a proportion of total career appearances

In [14]:
career_appearances = appearances_df.drop(['yearID', 'teamID', 'lgID'], axis=1).groupby(by = 'playerID').sum()
career_appearances['prop_games_pitcher'] = career_appearances.G_p/career_appearances.G_all
career_appearances['prop_games_catcher'] = career_appearances.G_c/career_appearances.G_all
#career_appearances

### join source tables into single flat file

In [15]:
career = pd.merge(years_played, pitching_career, how = 'inner', on = 'playerID'
                 
              #    ).merge(batting_career, how = 'left', on = 'playerID'
                        ).merge(fielding_career, how = 'inner', on = 'playerID'
                             ).merge(awards_career, how = 'left', on = 'playerID'
                                       ).merge(years_allstar, how = 'left', on = 'playerID'
                                              ).merge(hof_df, how = 'left', on = 'playerID'
                                                     ).merge(career_appearances, how = 'inner', on = 'playerID'
                                                            )#.merge(master_df, how = 'inner', on = 'playerID')  # save this dataframe for after the modeling, it's only labels
career

Unnamed: 0,playerID,years_played,W,L,G_pitcher,GS_x,CG,SHO,SV,IPouts,H,ER,HR,BB,SO,IBB,WP,HBP,BK,BFP,GF,R,SH,SF,GIDP,G,GS_y,PO,A,E,DP,ALCS MVP,All-Star Game MVP,Babe Ruth Award,Baseball Magazine All-Star,Branch Rickey Award,Comeback Player of the Year,Cy Young Award,Gold Glove,Hank Aaron Award,Hutch Award,Lou Gehrig Memorial Award,Most Valuable Player,NLCS MVP,Outstanding DH Award,Pitching Triple Crown,Reliever of the Year Award,Roberto Clemente Award,Rolaids Relief Man Award,Rookie of the Year,Silver Slugger,TSN All-Star,TSN Fireman of the Year,TSN Guide MVP,TSN Major League Player of the Year,TSN Pitcher of the Year,TSN Player of the Year,TSN Reliever of the Year,Triple Crown,World Series MVP,years_allstar,yearid,votedBy,ballots,needed,votes,inducted,category,needed_note,G_all,GS,G_batting,G_defense,G_p,G_c,G_1b,G_2b,G_3b,G_ss,G_lf,G_cf,G_rf,G_of,G_dh,G_ph,G_pr,prop_games_pitcher,prop_games_catcher
0,aardsda01,9,16,18,331,0,0,0,69,1011,296,160,41,183,340,22.0,12.0,16.0,1,1475.0,141.0,169,17.0,11.0,21.0,331,0.0,11,29.0,3.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,331,0.0,139,331,331,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,1.000000,0.0
1,aasedo01,13,66,60,448,91,22,5,82,3328,1085,468,89,457,641,45.0,21.0,7.0,3,4730.0,234.0,503,0.0,0.0,0.0,448,91.0,67,135.0,13.0,10.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,,,,,,,448,91.0,81,448,448,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,1.000000,0.0
2,abadfe01,7,6,26,315,6,0,0,1,822,260,113,36,99,234,9.0,9.0,11.0,1,1168.0,81.0,119,7.0,10.0,18.0,315,6.0,7,31.0,2.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,315,6.0,135,315,315,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,1.000000,0.0
3,abbeybe01,5,22,40,79,66,52,0,1,1704,686,285,18,192,161,0.0,18.0,0.0,0,0.0,12.0,442,0.0,0.0,0.0,79,0.0,17,134.0,22.0,4.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,79,0.0,79,79,79,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,1.000000,0.0
4,abbeych01,5,0,0,1,0,0,0,0,6,6,1,0,0,0,0.0,1.0,0.0,0,0.0,1.0,3,0.0,0.0,0.0,451,0.0,917,90.0,99.0,18.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,452,0.0,452,452,1,0,0,0,0,0,92,185,173,451,0.0,0.0,0.0,0.002212,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9297,zoldasa01,9,43,53,250,93,30,5,8,2788,956,366,54,301,207,0.0,8.0,3.0,4,3946.0,78.0,423,0.0,0.0,0.0,250,0.0,54,200.0,4.0,14.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,251,93.0,251,250,250,0,0,0,0,0,0,0,0,0,0.0,2.0,1.0,0.996016,0.0
9298,zuberbi01,11,43,42,224,65,23,3,6,2358,767,374,35,468,383,0.0,28.0,4.0,1,3476.0,90.0,418,0.0,0.0,0.0,224,0.0,29,107.0,5.0,9.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,224,65.0,224,224,224,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,1.000000,0.0
9299,zumayjo01,5,13,12,171,0,0,0,5,629,169,71,18,114,210,11.0,16.0,4.0,0,911.0,35.0,80,6.0,10.0,10.0,171,0.0,7,14.0,2.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,171,0.0,14,171,171,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,1.000000,0.0
9300,zuverge01,8,32,36,265,31,9,2,40,1927,660,253,56,203,223,29.0,10.0,27.0,1,2746.0,139.0,296,0.0,0.0,0.0,265,31.0,45,145.0,7.0,10.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,266,31.0,266,265,265,0,0,0,0,0,0,0,0,0,0.0,0.0,1.0,0.996241,0.0


In [16]:
filt = (career.prop_games_pitcher > 0.9) #& (career.prop_games_catcher < 0.10)

In [17]:
pitchers = career[filt]
pitchers

Unnamed: 0,playerID,years_played,W,L,G_pitcher,GS_x,CG,SHO,SV,IPouts,H,ER,HR,BB,SO,IBB,WP,HBP,BK,BFP,GF,R,SH,SF,GIDP,G,GS_y,PO,A,E,DP,ALCS MVP,All-Star Game MVP,Babe Ruth Award,Baseball Magazine All-Star,Branch Rickey Award,Comeback Player of the Year,Cy Young Award,Gold Glove,Hank Aaron Award,Hutch Award,Lou Gehrig Memorial Award,Most Valuable Player,NLCS MVP,Outstanding DH Award,Pitching Triple Crown,Reliever of the Year Award,Roberto Clemente Award,Rolaids Relief Man Award,Rookie of the Year,Silver Slugger,TSN All-Star,TSN Fireman of the Year,TSN Guide MVP,TSN Major League Player of the Year,TSN Pitcher of the Year,TSN Player of the Year,TSN Reliever of the Year,Triple Crown,World Series MVP,years_allstar,yearid,votedBy,ballots,needed,votes,inducted,category,needed_note,G_all,GS,G_batting,G_defense,G_p,G_c,G_1b,G_2b,G_3b,G_ss,G_lf,G_cf,G_rf,G_of,G_dh,G_ph,G_pr,prop_games_pitcher,prop_games_catcher
0,aardsda01,9,16,18,331,0,0,0,69,1011,296,160,41,183,340,22.0,12.0,16.0,1,1475.0,141.0,169,17.0,11.0,21.0,331,0.0,11,29.0,3.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,331,0.0,139,331,331,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,1.000000,0.0
1,aasedo01,13,66,60,448,91,22,5,82,3328,1085,468,89,457,641,45.0,21.0,7.0,3,4730.0,234.0,503,0.0,0.0,0.0,448,91.0,67,135.0,13.0,10.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,,,,,,,448,91.0,81,448,448,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,1.000000,0.0
2,abadfe01,7,6,26,315,6,0,0,1,822,260,113,36,99,234,9.0,9.0,11.0,1,1168.0,81.0,119,7.0,10.0,18.0,315,6.0,7,31.0,2.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,315,6.0,135,315,315,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,1.000000,0.0
3,abbeybe01,5,22,40,79,66,52,0,1,1704,686,285,18,192,161,0.0,18.0,0.0,0,0.0,12.0,442,0.0,0.0,0.0,79,0.0,17,134.0,22.0,4.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,79,0.0,79,79,79,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,1.000000,0.0
5,abbotda01,1,0,2,3,1,1,0,1,39,19,9,0,8,1,0.0,0.0,0.0,0,0.0,2.0,14,0.0,0.0,0.0,3,0.0,3,4.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3,0.0,3,3,3,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,1.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9297,zoldasa01,9,43,53,250,93,30,5,8,2788,956,366,54,301,207,0.0,8.0,3.0,4,3946.0,78.0,423,0.0,0.0,0.0,250,0.0,54,200.0,4.0,14.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,251,93.0,251,250,250,0,0,0,0,0,0,0,0,0,0.0,2.0,1.0,0.996016,0.0
9298,zuberbi01,11,43,42,224,65,23,3,6,2358,767,374,35,468,383,0.0,28.0,4.0,1,3476.0,90.0,418,0.0,0.0,0.0,224,0.0,29,107.0,5.0,9.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,224,65.0,224,224,224,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,1.000000,0.0
9299,zumayjo01,5,13,12,171,0,0,0,5,629,169,71,18,114,210,11.0,16.0,4.0,0,911.0,35.0,80,6.0,10.0,10.0,171,0.0,7,14.0,2.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,171,0.0,14,171,171,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,1.000000,0.0
9300,zuverge01,8,32,36,265,31,9,2,40,1927,660,253,56,203,223,29.0,10.0,27.0,1,2746.0,139.0,296,0.0,0.0,0.0,265,31.0,45,145.0,7.0,10.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,266,31.0,266,265,265,0,0,0,0,0,0,0,0,0,0.0,0.0,1.0,0.996241,0.0


In [18]:
pitchers = pitchers.fillna(0)

In [19]:
print(len(pitchers))

8262


In [20]:
print(len(pitchers[pitchers['inducted'] == 'Y']))

62


In [21]:
print(len(pitchers[pitchers['inducted'] == 0]))

8200


In [22]:
filt = (pitchers['inducted'] == 0)
pitchers.loc[filt, 'HoF'] = 'N'
pitchers

Unnamed: 0,playerID,years_played,W,L,G_pitcher,GS_x,CG,SHO,SV,IPouts,H,ER,HR,BB,SO,IBB,WP,HBP,BK,BFP,GF,R,SH,SF,GIDP,G,GS_y,PO,A,E,DP,ALCS MVP,All-Star Game MVP,Babe Ruth Award,Baseball Magazine All-Star,Branch Rickey Award,Comeback Player of the Year,Cy Young Award,Gold Glove,Hank Aaron Award,Hutch Award,Lou Gehrig Memorial Award,Most Valuable Player,NLCS MVP,Outstanding DH Award,Pitching Triple Crown,Reliever of the Year Award,Roberto Clemente Award,Rolaids Relief Man Award,Rookie of the Year,Silver Slugger,TSN All-Star,TSN Fireman of the Year,TSN Guide MVP,TSN Major League Player of the Year,TSN Pitcher of the Year,TSN Player of the Year,TSN Reliever of the Year,Triple Crown,World Series MVP,years_allstar,yearid,votedBy,ballots,needed,votes,inducted,category,needed_note,G_all,GS,G_batting,G_defense,G_p,G_c,G_1b,G_2b,G_3b,G_ss,G_lf,G_cf,G_rf,G_of,G_dh,G_ph,G_pr,prop_games_pitcher,prop_games_catcher,HoF
0,aardsda01,9,16,18,331,0,0,0,69,1011,296,160,41,183,340,22.0,12.0,16.0,1,1475.0,141.0,169,17.0,11.0,21.0,331,0.0,11,29.0,3.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0,0,331,0.0,139,331,331,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,1.000000,0.0,N
1,aasedo01,13,66,60,448,91,22,5,82,3328,1085,468,89,457,641,45.0,21.0,7.0,3,4730.0,234.0,503,0.0,0.0,0.0,448,91.0,67,135.0,13.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0,0.0,0.0,0.0,0,0,0,448,91.0,81,448,448,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,1.000000,0.0,N
2,abadfe01,7,6,26,315,6,0,0,1,822,260,113,36,99,234,9.0,9.0,11.0,1,1168.0,81.0,119,7.0,10.0,18.0,315,6.0,7,31.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0,0,315,6.0,135,315,315,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,1.000000,0.0,N
3,abbeybe01,5,22,40,79,66,52,0,1,1704,686,285,18,192,161,0.0,18.0,0.0,0,0.0,12.0,442,0.0,0.0,0.0,79,0.0,17,134.0,22.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0,0,79,0.0,79,79,79,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,1.000000,0.0,N
5,abbotda01,1,0,2,3,1,1,0,1,39,19,9,0,8,1,0.0,0.0,0.0,0,0.0,2.0,14,0.0,0.0,0.0,3,0.0,3,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0,0,3,0.0,3,3,3,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,1.000000,0.0,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9297,zoldasa01,9,43,53,250,93,30,5,8,2788,956,366,54,301,207,0.0,8.0,3.0,4,3946.0,78.0,423,0.0,0.0,0.0,250,0.0,54,200.0,4.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0,0,251,93.0,251,250,250,0,0,0,0,0,0,0,0,0,0.0,2.0,1.0,0.996016,0.0,N
9298,zuberbi01,11,43,42,224,65,23,3,6,2358,767,374,35,468,383,0.0,28.0,4.0,1,3476.0,90.0,418,0.0,0.0,0.0,224,0.0,29,107.0,5.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0,0,224,65.0,224,224,224,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,1.000000,0.0,N
9299,zumayjo01,5,13,12,171,0,0,0,5,629,169,71,18,114,210,11.0,16.0,4.0,0,911.0,35.0,80,6.0,10.0,10.0,171,0.0,7,14.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0,0,171,0.0,14,171,171,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,1.000000,0.0,N
9300,zuverge01,8,32,36,265,31,9,2,40,1927,660,253,56,203,223,29.0,10.0,27.0,1,2746.0,139.0,296,0.0,0.0,0.0,265,31.0,45,145.0,7.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0,0,266,31.0,266,265,265,0,0,0,0,0,0,0,0,0,0.0,0.0,1.0,0.996241,0.0,N


In [23]:
filt = (pitchers['inducted'] == 'Y')
pitchers.loc[filt, 'HoF'] = 'Y'
pitchers

Unnamed: 0,playerID,years_played,W,L,G_pitcher,GS_x,CG,SHO,SV,IPouts,H,ER,HR,BB,SO,IBB,WP,HBP,BK,BFP,GF,R,SH,SF,GIDP,G,GS_y,PO,A,E,DP,ALCS MVP,All-Star Game MVP,Babe Ruth Award,Baseball Magazine All-Star,Branch Rickey Award,Comeback Player of the Year,Cy Young Award,Gold Glove,Hank Aaron Award,Hutch Award,Lou Gehrig Memorial Award,Most Valuable Player,NLCS MVP,Outstanding DH Award,Pitching Triple Crown,Reliever of the Year Award,Roberto Clemente Award,Rolaids Relief Man Award,Rookie of the Year,Silver Slugger,TSN All-Star,TSN Fireman of the Year,TSN Guide MVP,TSN Major League Player of the Year,TSN Pitcher of the Year,TSN Player of the Year,TSN Reliever of the Year,Triple Crown,World Series MVP,years_allstar,yearid,votedBy,ballots,needed,votes,inducted,category,needed_note,G_all,GS,G_batting,G_defense,G_p,G_c,G_1b,G_2b,G_3b,G_ss,G_lf,G_cf,G_rf,G_of,G_dh,G_ph,G_pr,prop_games_pitcher,prop_games_catcher,HoF
0,aardsda01,9,16,18,331,0,0,0,69,1011,296,160,41,183,340,22.0,12.0,16.0,1,1475.0,141.0,169,17.0,11.0,21.0,331,0.0,11,29.0,3.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0,0,331,0.0,139,331,331,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,1.000000,0.0,N
1,aasedo01,13,66,60,448,91,22,5,82,3328,1085,468,89,457,641,45.0,21.0,7.0,3,4730.0,234.0,503,0.0,0.0,0.0,448,91.0,67,135.0,13.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0,0.0,0.0,0.0,0,0,0,448,91.0,81,448,448,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,1.000000,0.0,N
2,abadfe01,7,6,26,315,6,0,0,1,822,260,113,36,99,234,9.0,9.0,11.0,1,1168.0,81.0,119,7.0,10.0,18.0,315,6.0,7,31.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0,0,315,6.0,135,315,315,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,1.000000,0.0,N
3,abbeybe01,5,22,40,79,66,52,0,1,1704,686,285,18,192,161,0.0,18.0,0.0,0,0.0,12.0,442,0.0,0.0,0.0,79,0.0,17,134.0,22.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0,0,79,0.0,79,79,79,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,1.000000,0.0,N
5,abbotda01,1,0,2,3,1,1,0,1,39,19,9,0,8,1,0.0,0.0,0.0,0,0.0,2.0,14,0.0,0.0,0.0,3,0.0,3,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0,0,3,0.0,3,3,3,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,1.000000,0.0,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9297,zoldasa01,9,43,53,250,93,30,5,8,2788,956,366,54,301,207,0.0,8.0,3.0,4,3946.0,78.0,423,0.0,0.0,0.0,250,0.0,54,200.0,4.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0,0,251,93.0,251,250,250,0,0,0,0,0,0,0,0,0,0.0,2.0,1.0,0.996016,0.0,N
9298,zuberbi01,11,43,42,224,65,23,3,6,2358,767,374,35,468,383,0.0,28.0,4.0,1,3476.0,90.0,418,0.0,0.0,0.0,224,0.0,29,107.0,5.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0,0,224,65.0,224,224,224,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,1.000000,0.0,N
9299,zumayjo01,5,13,12,171,0,0,0,5,629,169,71,18,114,210,11.0,16.0,4.0,0,911.0,35.0,80,6.0,10.0,10.0,171,0.0,7,14.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0,0,171,0.0,14,171,171,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,1.000000,0.0,N
9300,zuverge01,8,32,36,265,31,9,2,40,1927,660,253,56,203,223,29.0,10.0,27.0,1,2746.0,139.0,296,0.0,0.0,0.0,265,31.0,45,145.0,7.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0,0,266,31.0,266,265,265,0,0,0,0,0,0,0,0,0,0.0,0.0,1.0,0.996241,0.0,N


In [24]:
target = pitchers['HoF']

In [25]:
#features = pitchers.drop(['playerID', 'prop_games_pitcher', 'prop_games_catcher', 'HoF'], axis=1)

features = pitchers[['years_played', 'W', 'SO', 'BFP']]

In [30]:
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.1)

In [31]:
y_test

5871    N
3123    N
6199    N
8650    N
6032    N
       ..
4132    N
1677    N
2073    N
7386    N
2438    N
Name: HoF, Length: 827, dtype: object

In [32]:
rfc = RandomForestClassifier()

In [33]:
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [34]:
print(rfc.feature_importances_)

[0.12783835 0.37830929 0.18014759 0.31370477]


In [43]:
y_pred = rfc.predict(X_test)
#y_pred

In [37]:
print(confusion_matrix(y_test, y_pred))

[[816   2]
 [  3   6]]


In [38]:
accuracy_score(y_test, y_pred)

0.9939540507859734

In [41]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           N       1.00      1.00      1.00       818
           Y       0.75      0.67      0.71         9

    accuracy                           0.99       827
   macro avg       0.87      0.83      0.85       827
weighted avg       0.99      0.99      0.99       827

