In [1]:
#!pip install tpot --user --no-warn-script-location
#!conda install -c conda-forge tpot
#!pip install xgboost

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from tpot import TPOTClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import train_test_split as tts
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score as acc



In [3]:
data = pd.read_csv('cleaned_draftdata2.csv', index_col = 'Unnamed: 0')
data

Unnamed: 0,Year,Round,OverallPick,RoundPick,Team,Signed,Name,Position,WAR,GamesBatted,...,ERA,WHIP,Saves,Type,DraftedFrom,CBA,State,Region,Division,PositionGroup
0,2007,1,1,1,Rays,Y,David Price (minors)\priceda01,LHP,39.4,21.0,...,3.31,1.15,0.0,4Yr,Vanderbilt University (Nashville TN),1,TN,South,AL East,Pitcher
1,2007,1,2,2,Royals,Y,Mike Moustakas (minors)\moustmi01,SS,15.3,1131.0,...,,,,HS,Chatsworth HS (Chatsworth CA),1,CA,California,AL Central,Infield
2,2007,1,3,3,Cubs,Y,Josh Vitters (minors)\vittejo01,3B,-1.3,36.0,...,,,,HS,Cypress HS (Cypress CA),1,CA,California,NL Central,Infield
3,2007,1,4,4,Pirates,Y,Daniel Moskos (minors)\moskoda01,LHP,0.2,30.0,...,2.96,1.56,0.0,4Yr,Clemson University (Clemson SC),1,SC,South,NL Central,Pitcher
4,2007,1,5,5,Orioles,Y,Matt Wieters (minors)\wietema01,C,18.2,1148.0,...,,,,4Yr,Georgia Institute of Technology (Atlanta GA),1,GA,South,AL East,Catcher/Utility
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8075,2019,9,283,26,Brewers,Y,Darrien Miller (minors),C,,,...,,,,HS,Clovis HS (Clovis CA),3,CA,California,NL Central,Catcher/Utility
8076,2019,9,284,27,Athletics,Y,Colin Peluse (minors),P,,,...,,,,4Yr,Wake Forest University (Winston-Salem NC),3,NC,South,AL West,Pitcher
8077,2019,9,285,28,Yankees,Y,Spencer Henson (minors),1B,,,...,,,,4Yr,Oral Roberts University (Tulsa OK),3,OK,Midwest,AL East,Infield
8078,2019,9,286,29,Astros,Y,Peyton Battenfield (minors),P,,,...,,,,4Yr,Oklahoma State University (Stillwater OK),3,OK,Midwest,AL West,Pitcher


In [4]:
#Encoder for Signed values
labelenc = LabelEncoder()
labelenc.fit(data['Signed'])
print(labelenc.classes_)
data['Signed'] = labelenc.transform(data['Signed']).ravel()

['N' 'Y']


In [5]:
not_signed = data[data['Signed'] == 0]
signed = data[data['Signed'] == 1]
#data[data['Signed'] == 'Unk']

In [6]:
#High School Subset
hsdata = data[data['Type'] == 'HS']
hsdata

Unnamed: 0,Year,Round,OverallPick,RoundPick,Team,Signed,Name,Position,WAR,GamesBatted,...,ERA,WHIP,Saves,Type,DraftedFrom,CBA,State,Region,Division,PositionGroup
1,2007,1,2,2,Royals,1,Mike Moustakas (minors)\moustmi01,SS,15.3,1131.0,...,,,,HS,Chatsworth HS (Chatsworth CA),1,CA,California,AL Central,Infield
2,2007,1,3,3,Cubs,1,Josh Vitters (minors)\vittejo01,3B,-1.3,36.0,...,,,,HS,Cypress HS (Cypress CA),1,CA,California,NL Central,Infield
8,2007,1,9,9,Diamondbacks,1,Jarrod Parker (minors)\parkeja02,RHP,6.5,4.0,...,3.68,1.24,0.0,HS,Norwell HS (Ossian IN),1,IN,Midwest,NL West,Pitcher
9,2007,1,10,10,Giants,1,Madison Bumgarner (minors)\bumgama01,LHP,36.8,288.0,...,3.13,1.11,0.0,HS,South Caldwell HS (Hudson NC),1,NC,South,NL West,Pitcher
10,2007,1,11,11,Mariners,1,Phillippe Aumont (minors)\aumonph01,RHP,-0.7,44.0,...,6.80,1.99,2.0,HS,Ecole Du Versant (Gatineau QC),1,QC,Non-50 States,AL West,Pitcher
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8044,2019,8,252,25,Cubs,1,Davidjohn Herz (minors),P,,,...,,,,HS,Terry Sanford HS (Fayetteville NC),3,NC,South,NL Central,Pitcher
8046,2019,8,254,27,Athletics,1,Jose Dicochea (minors),P,,,...,,,,HS,Sahuarita HS (Sahuarita AZ),3,AZ,West,AL West,Pitcher
8072,2019,9,280,23,Indians,1,Will Bartlett (minors),C,,,...,,,,HS,IMG Academy (Bradenton FL),3,FL,Florida,AL Central,Catcher/Utility
8074,2019,9,282,25,Cubs,1,Tyler Schlaffer (minors),P,,,...,,,,HS,Homewood-Flossmoor HS (Flossmoor IL),3,IL,Midwest,NL Central,Pitcher


In [7]:
modeling_data = hsdata[['Round', 'CBA', 'Region', 'Division', 'PositionGroup', 'Signed']] #'Team', 'Position',
modeling_data

Unnamed: 0,Round,CBA,Region,Division,PositionGroup,Signed
1,1,1,California,AL Central,Infield,1
2,1,1,California,NL Central,Infield,1
8,1,1,Midwest,NL West,Pitcher,1
9,1,1,South,NL West,Pitcher,1
10,1,1,Non-50 States,AL West,Pitcher,1
...,...,...,...,...,...,...
8044,8,3,South,NL Central,Pitcher,1
8046,8,3,West,AL West,Pitcher,1
8072,9,3,Florida,AL Central,Catcher/Utility,1
8074,9,3,Midwest,NL Central,Pitcher,1


In [21]:
ct = ColumnTransformer(
[('one_hot_encoder', OneHotEncoder(categories='auto', sparse=False), [1,2,3,4])],
remainder='passthrough'
)
datamod = np.array(ct.fit_transform(modeling_data))
datamod[100]

array([ 1.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  1.,  0., 19.,  0.])

In [9]:
#Subsetting out X and Y
X = datamod[:, :-1]
y = datamod[:, -1]

In [10]:
#Train/test Split
X_train, X_test, y_train, y_test = tts(X, y, test_size = 0.2, random_state = 1693)
X_test.shape

(428, 22)

In [36]:
#Logistic Regression?
log_class = LogisticRegression(solver = 'lbfgs')
log_class.fit(X_train,y_train)
yhat_log = log_class.predict(X_test)
print(log_class.coef_)
acc(y_test, yhat_log)

[[-0.48474616  0.21134672  0.27178333 -0.36809471 -0.15745981 -0.15607672
  -0.17849288  1.20628934 -0.11167323 -0.19407737 -0.04203073  0.00423519
  -0.11372549 -0.23428218  0.06177015  0.28669796 -0.00631173  0.33163014
  -0.17179226  0.11869352 -0.2801475  -0.19487261]]


0.822429906542056

In [38]:
spc = ['Did Not Sign', 'Signed']
cm = confusion_matrix(y_test, yhat_log)
pd.DataFrame(cm, columns=spc, index=spc)

Unnamed: 0,Did Not Sign,Signed
Did Not Sign,25,54
Signed,22,327


In [37]:
print(y_test)
print()
print(yhat_log)

[0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 0. 1. 0.
 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 0. 1. 1. 1. 1. 0. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 0. 0. 1. 1. 1. 0.
 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 0. 0. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 0. 0. 1. 1. 1. 1. 1. 0. 1. 0. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 1. 1. 1. 1. 0. 0. 0. 1. 1. 1.
 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 0. 0. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 0. 1. 0. 1. 0. 1. 0. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 0. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 1. 1. 0. 0. 1. 0. 1. 0. 1. 1. 1. 1.
 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 0. 1. 0. 1. 1. 1. 0. 1. 1. 1. 0. 0. 1.
 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 1. 1. 0. 0. 1. 1.

In [39]:
#Gradient Boosting Classifer 2
gbc = GradientBoostingClassifier(random_state = 1693)
gbc.fit(X_train,y_train)
yhat = gbc.predict(X_test)
acc(y_test, yhat)
spc = ['Did Not Sign', 'Signed']
cm = confusion_matrix(y_test, yhat)
pd.DataFrame(cm, columns=spc, index=spc)

Unnamed: 0,Did Not Sign,Signed
Did Not Sign,26,53
Signed,28,321


In [40]:
#Linear SVC
svc = SVC(kernel = 'linear', random_state = 1693)
svc.fit(X_train, y_train)
yhat_svc = svc.predict(X_test)
acc(y_test, yhat_svc)
spc = ['Did Not Sign', 'Signed']
cm = confusion_matrix(y_test, yhat_svc)
pd.DataFrame(cm, columns=spc, index=spc)

Unnamed: 0,Did Not Sign,Signed
Did Not Sign,0,79
Signed,0,349


In [41]:
#Gaussian NB
nb = GaussianNB()
nb.fit(X_train, y_train)
yhat_nb = nb.predict(X_test)
acc(y_test, yhat_nb)
spc = ['Did Not Sign', 'Signed']
cm = confusion_matrix(y_test, yhat_nb)
pd.DataFrame(cm, columns=spc, index=spc)

Unnamed: 0,Did Not Sign,Signed
Did Not Sign,42,37
Signed,52,297


In [15]:
X_train, X_test, y_train, y_test = tts(X, y, test_size = 0.2, random_state = 1515)

log_class2 = LogisticRegression(solver = 'liblinear')
log_class2.fit(X_train,y_train)
yhat_log2 = log_class2.predict(X_test)
print(log_class2.coef_)
acc(y_test, yhat_log2)

[[ 0.15921352  0.84796628  0.89914137 -0.1219434   0.07879297  0.08535066
   0.06182715  1.43315582  0.13006109  0.04555847  0.1935184   0.32081821
   0.20694806  0.08717212  0.37918271  0.60117264  0.31102743  0.7914659
   0.31139081  0.59775804  0.20570643 -0.19137316]]


0.822429906542056

In [16]:
log_class3 = LogisticRegression(solver = 'newton-cg')
log_class3.fit(X_train,y_train)
yhat_log3 = log_class3.predict(X_test)
print(log_class3.coef_)
acc(y_test, yhat_log3)

[[-0.48424055  0.21170999  0.27253034 -0.36803288 -0.15742062 -0.15574418
  -0.17811527  1.20623275 -0.11135911 -0.19396644 -0.04159448  0.00456836
  -0.11357042 -0.23407713  0.06193761  0.28721227 -0.00607092  0.33217625
  -0.17153169  0.11921227 -0.27985705 -0.19488588]]


0.822429906542056

In [44]:
#Random Forest 1?
RFdata = pd.read_csv('cleaned_draftdata.csv', index_col = 'Unnamed: 0')
labelenc = LabelEncoder()
labelenc.fit(data['Signed'])
data['Signed'] = labelenc.transform(data['Signed']).ravel()



rf = RandomForestClassifier()
rf.fit(X_train,y_train)
y_rf_class = rf.predict(X_test)
acc(y_test, y_rf_class)
spc = ['Did Not Sign', 'Signed']
cm = confusion_matrix(y_test, y_rf_class)
pd.DataFrame(cm, columns=spc, index=spc)

Unnamed: 0,Did Not Sign,Signed
Did Not Sign,29,50
Signed,33,316


In [45]:
#KNN Attempt 1 (tts state 1693)
knn = KNeighborsClassifier(weights = 'distance')
knn.fit(X_train, y_train)
yhat = knn.predict(X_test)
acc(y_test, yhat)
spc = ['Did Not Sign', 'Signed']
cm = confusion_matrix(y_test, yhat)
pd.DataFrame(cm, columns=spc, index=spc)

Unnamed: 0,Did Not Sign,Signed
Did Not Sign,32,47
Signed,39,310


## Moving onto 2020 data

In [22]:
data2020 = pd.read_csv('2020list.csv')
data2020.head()
data2020.shape

(160, 25)

In [23]:
data2020['CBA'] = 3
teamlist = [None] * len(data2020)
states = [None] * len(data2020)
region = [None] * len(data2020)
division = [None] * len(data2020)
posgroup = [None] * len(data2020)

for i in range(0, len(data2020)):
    team = data2020['Team'][i].split()
    if team[0] == 'Devil':
        teamlist[i] = 'Rays'
    elif team[0] in ['Blue', 'White', 'Red']:
        teamlist[i] = team[0] + ' ' + team[1]
    elif team[0] == "D'backs":
        teamlist[i] = 'Diamondbacks'
    else:
        teamlist[i] = team[0]
data2020['Team'] = teamlist

for i in range(len(data2020)):
    draftedfrom = data2020['DraftedFrom'][i]
    if draftedfrom is not np.nan:
        draft_list = draftedfrom.split()
        last = draft_list[-1]
        state = last[:2]
        states[i] = state
data2020['State'] = states
data2020['State'] = data2020['State'].replace(to_replace = {'Ri': 'PR', 'Is':'VI', 'Ge': 'GM'})
        
for i in range(len(data2020)):
    state = data2020['State'][i]
    if state is not np.nan:
        if state in ['AB', 'BC', 'ON', 'QC', 'PR', 'GM', 'VI', 'CU']:
            region[i] = 'Non-50 States'
        elif state == 'CA':
            region[i] = 'California'
        elif state in ['WA', 'OR', 'ID', 'MT', 'NV', 'UT', 'AZ', 'NM', 'AK', 'HI']:
            region[i] = 'West'
        elif state in ['ND', 'SD', 'NE', 'KS', 'IA', 'MO', 'OK', 'WY', 'CO', 
                       'WI', 'MI', 'IL', 'IN', 'OH', 'KY', 'WV', 'MN']:
            region[i] = 'Midwest'
        elif state == 'TX':
            region[i] = 'Texas'
        elif state in ['NY', 'CT', 'MA', 'VT', 'NH', 'ME', 'RI', 'PA', 'NJ', 'DE', 'MD', 'DC', 'VA']:
            region[i] = 'East'
        elif state == 'FL':
            region[i] = 'Florida'
        elif state in ['NC', 'SC', 'GA', 'AL', 'MS', 'LA', 'TN', 'AR']:
            region[i] = 'South'
            
for i in range(len(data2020)):
    if data2020['Team'][i] in ['Rays', 'Red Sox', 'Yankees', 'Blue Jays', 'Orioles']:
        division[i] = 'AL East'
    elif data2020['Team'][i] in ['White Sox', 'Twins', 'Indians', 'Royals', 'Tigers']:
        division[i] = 'AL Central'
    elif data2020['Team'][i] in ['Astros', 'Angels', 'Mariners', 'Athletics', 'Rangers']:
        division[i] = 'AL West'
    elif data2020['Team'][i] in ['Mets', 'Nationals', 'Braves', 'Phillies', 'Marlins']:
        division[i] = 'NL East'
    elif data2020['Team'][i] in ['Reds', 'Cubs', 'Cardinals', 'Brewers', 'Pirates']:
        division[i] = 'NL Central'
    elif data2020['Team'][i] in ['Dodgers', 'Giants', 'Diamondbacks', 'Rockies', 'Padres']:
        division[i] = 'NL West'
        
for i in range(len(data2020)):
    if data2020['Position'][i] in ['LHP', 'RHP', 'P']:
        posgroup[i] = 'Pitcher'
    elif data2020['Position'][i] in ['1B', '2B', '3B', 'SS', 'IF', 'INF']:
        posgroup[i] = 'Infield'
    elif data2020['Position'][i] in ['LF', 'RF', 'CF', 'OF']:
        posgroup[i] = 'Outfield'
    else:
        posgroup[i] = 'Catcher/Utility'

data2020['PositionGroup'] = posgroup        
data2020['Division'] = division
data2020['Region'] = region
data2020 = data2020.replace(to_replace = {'1s':'1', '2s':'2', '3s':'3'})
data2020.head()

Unnamed: 0,Year,Round,DT,OverallPick,FrRnd,RoundPick,Team,Signed,Bonus,Name,...,ERA,WHIP,Saves,Type,DraftedFrom,CBA,State,PositionGroup,Division,Region
0,2020,1,,1,FrRnd,1,Tigers,Y,"$8,416,300",Spencer Torkelson (minors),...,,,,4Yr,Arizona State University (Tempe AZ),3,AZ,Infield,AL Central,West
1,2020,1,,2,FrRnd,2,Orioles,Y,"$5,200,000",Heston Kjerstad (minors),...,,,,4Yr,University of Arkansas (Fayetteville AR),3,AR,Outfield,AL East,South
2,2020,1,,3,FrRnd,3,Marlins,Y,"$6,700,000",Max Meyer (minors),...,,,,4Yr,University of Minnesota (Minneapolis MN),3,MN,Pitcher,NL East,Midwest
3,2020,1,,4,FrRnd,4,Royals,Y,"$6,670,000",Asa Lacy (minors),...,,,,4Yr,Texas A&M University (College Station TX),3,TX,Pitcher,AL Central,Texas
4,2020,1,,5,FrRnd,5,Blue Jays,N,,Austin Martin (minors),...,,,,4Yr,Vanderbilt University (Nashville TN),3,TN,Infield,AL East,South


In [24]:
hsdata2020 = data2020[data2020['Type'] == 'HS']
test_set = hsdata2020[['Round', 'CBA', 'Region', 'Division', 'PositionGroup', 'Signed']]
#hsdata2020
#test_set
labelenc2 = LabelEncoder()
labelenc2.fit(test_set['Signed'])
test_set['Signed'] = labelenc2.transform(test_set['Signed']).ravel()
datamod2020 = np.array(ct.transform(test_set))
datamod2020[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


array([0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1.,
       0., 0., 1., 0., 1., 1.])

In [25]:
hsdata2020.shape

(47, 30)

In [26]:
test_X = datamod2020[:, :-1]
test_target = datamod2020[:, -1]

In [46]:
target_log = log_class.predict(test_X)
print("The actual values for the 2020 draft class:")
print()
print(test_target)
print()
print('The logistic model predicted values for the 2020 draft class:')
print()
print(target_log)

The actual values for the 2020 draft class:

[1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 0. 1. 0. 1. 1. 1. 1. 1. 0. 1. 1.
 1. 0. 0. 1. 0. 1. 0. 1. 0. 0. 1. 0. 1. 1. 0. 0. 1. 1. 0. 1. 1. 1. 1.]

The logistic model predicted values for the 2020 draft class:

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [49]:
print(acc(test_target, target_log))
spc = ['Did Not Sign', 'Signed']
cm = confusion_matrix(test_target, target_log)
pd.DataFrame(cm, columns=spc, index=spc)

0.7021276595744681


Unnamed: 0,Did Not Sign,Signed
Did Not Sign,0,14
Signed,0,33


In [28]:
target_log2 = log_class2.predict(test_X)
print(acc(test_target, target_log2))
print(test_target)
print(target_log2)

0.7021276595744681
[1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 0. 1. 0. 1. 1. 1. 1. 1. 0. 1. 1.
 1. 0. 0. 1. 0. 1. 0. 1. 0. 0. 1. 0. 1. 1. 0. 0. 1. 1. 0. 1. 1. 1. 1.]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [29]:
target_rf = rf.predict(test_X)
print(acc(test_target, target_rf))
print(test_target)
print(target_rf)

NameError: name 'rf' is not defined

In [30]:
target_svc = svc.predict(test_X)
print(acc(test_target, target_svc))
print(test_target)
print(target_svc)

0.7021276595744681
[1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 0. 1. 0. 1. 1. 1. 1. 1. 0. 1. 1.
 1. 0. 0. 1. 0. 1. 0. 1. 0. 0. 1. 0. 1. 1. 0. 0. 1. 1. 0. 1. 1. 1. 1.]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [31]:
target_nb = nb.predict(test_X)
print(acc(test_target, target_nb))
print(test_target)
print(target_nb)

0.7021276595744681
[1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 0. 1. 0. 1. 1. 1. 1. 1. 0. 1. 1.
 1. 0. 0. 1. 0. 1. 0. 1. 0. 0. 1. 0. 1. 1. 0. 0. 1. 1. 0. 1. 1. 1. 1.]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


#### What if we built a model based only on rounds 1-5 (and ignoring CBA) and checked 2020 on that?

In [None]:
firstfive = hsdata[['Round', 'CBA', 'Region', 'Division', 'PositionGroup', 'Signed']] #'Team', 'Position',
firstfive = firstfive[firstfive['Round'] <= 5]
firstfive
ct = ColumnTransformer(
[('one_hot_encoder', OneHotEncoder(categories='auto', sparse=False), [1,2,3,4])],
remainder='passthrough'
)
datamodfive = np.array(ct.fit_transform(firstfive))
Xfive = datamodfive[:, :-1]
yfive = datamodfive[:, -1]

In [None]:
#Logistic Regression
log_five = LogisticRegression(solver = 'lbfgs')
log_five.fit(Xfive,yfive)
ylog_five = log_five.predict(test_X)
print(log_five.coef_)
print(acc(test_target, ylog_five))
print(test_target)
print(ylog_five)

In [None]:
#Random Forest
rf = RandomForestClassifier()
rf.fit(Xfive,yfive)
y_rf_five = rf.predict(test_X)
print(acc(test_target, y_rf_five))
print(test_target)
print(y_rf_five)