In [7]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [8]:
# Read the player statistics and the salary table from their CSV files.
dh_data, dh_salary = map(pd.read_csv, ('Final_MLB_Data.csv', 'player_salaries.csv'))

In [9]:
def remove_non_letters(input_string):
    """Strip every character that is not a letter or a plain space.

    Keeps ASCII letters, the accented Latin-1 letters (À-Ö, Ø-ö, ø-ÿ) and the
    space character; everything else (digits, punctuation, symbols) is removed.

    Args:
        input_string: text to clean.

    Returns:
        The cleaned string.
    """
    # The Latin-1 span À-ÿ also contains the symbols × (U+00D7) and ÷ (U+00F7),
    # so the range is split around them to keep letters only.
    cleaned_string = re.sub(r'[^a-zA-ZÀ-ÖØ-öø-ÿ ]', '', input_string)
    return cleaned_string

# Normalise player names so rows that differ only by punctuation or marker
# characters end up with the same "Name" key.
dh_data["Name"] = dh_data["Name"].apply(remove_non_letters)
# Aggregate duplicate players by averaging their statistics.  numeric_only=True
# restricts the mean to numeric columns; without it pandas >= 2.0 raises a
# TypeError as soon as the frame holds any non-numeric column besides "Name".
dh_data = dh_data.groupby('Name', as_index=False).mean(numeric_only=True)
dh_data

Unnamed: 0,Name,Age,G,PA,AB,R,H,2B,3B,HR,...,TB,GDP,HBP,SH,SF,IBB,wOBA,OAA,OFF,DEF
0,AJ Pollock,34.0,138.0,527.0,489.0,61.0,120.0,26.0,1.0,14.0,...,190.0,13.0,2.0,0.0,4.0,0.0,0.299564,12.0,0.299564,12.0
1,Aaron Bummer,28.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0
2,Aaron Hicks,32.0,130.0,453.0,384.0,54.0,83.0,9.0,2.0,8.0,...,120.0,10.0,4.0,1.0,2.0,3.0,0.294031,0.0,0.294031,0.0
3,Aaron Judge,30.0,157.0,696.0,570.0,133.0,177.0,28.0,0.0,62.0,...,391.0,14.0,6.0,0.0,5.0,19.0,0.462095,-4.0,0.462095,-4.0
4,Aaron Whitefield,25.0,5.0,11.0,11.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
784,Zach McKinstry,27.0,28.5,92.5,83.0,10.5,16.5,3.0,1.5,2.5,...,30.0,0.0,0.5,1.0,0.0,0.0,0.289402,2.0,0.289402,2.0
785,Zach Pop,25.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0
786,Zach Reks,28.0,16.0,34.0,34.0,3.0,9.0,1.0,0.0,0.0,...,10.0,0.0,0.0,0.0,0.0,0.0,0.246765,-1.0,0.246765,-1.0
787,Zack Collins,27.0,18.0,54.0,48.5,4.5,7.5,2.0,0.0,2.0,...,15.5,0.0,0.5,0.0,0.5,0.0,0.200908,1.0,0.200908,1.0


In [10]:
# Attach salary information to every player, then carve out the two salary
# extremes used as training examples of valuable / non-valuable DHs.
combined_df = pd.merge(dh_data, dh_salary, how="left", left_on="Name", right_on="player name")

# The .str accessor implies salary is stored as text; thousands separators are
# stripped before the column is converted to float.
combined_df['salary'] = combined_df['salary'].str.replace(',', '').astype(float)

# Cut-offs: 90th and 10th salary percentiles, truncated to whole numbers.
salary_p90 = int(combined_df["salary"].quantile(0.9))
salary_p10 = int(combined_df["salary"].quantile(0.1))

# Rows without a matched salary are NaN and fall into neither group.
is_top_paid = combined_df["salary"] >= salary_p90
is_bottom_paid = combined_df["salary"] <= salary_p10

high_value_dh = combined_df[is_top_paid].drop(columns="salary")
low_value_dh = combined_df[is_bottom_paid].drop(columns="salary")

# salary was only needed to pick the two groups; drop it from the merged frame.
combined_df = combined_df.drop(columns="salary")

In [11]:
# Label the two salary groups (valuable=True / False) and stack them into a
# single labelled data set.
high_value_dh, low_value_dh = (
    frame.assign(valuable=flag)
    for frame, flag in ((high_value_dh, True), (low_value_dh, False))
)
truth_set = pd.concat([high_value_dh, low_value_dh])

In [13]:
# Candidate feature columns and the label column of the truth set.
feature_columns = ['Age', 'G', 'PA', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'SB',
                   'CS', 'BB', 'SO', 'BA', 'OBP', 'SLG', 'OPS', 'OPS+', 'TB', 'GDP', 'HBP',
                   'SH', 'SF', 'IBB', 'wOBA', 'OAA', 'OFF', 'DEF', 'pos', 'ht', 'wt', 'ba',
                   'th', 'born', 'hilvl', 'mlb years', 'stat years']
target_columns = "valuable"

# Object-dtype (text) columns are excluded from the features; only the
# remaining columns are passed to the classifier.
categorical_features = truth_set[feature_columns].select_dtypes(include=['object']).columns
feature_columns = [x for x in feature_columns if x not in categorical_features]

X = truth_set[feature_columns]
y = truth_set[target_columns]

# Fix the seed so the 80/20 split -- and therefore the fitted model and its
# predictions -- is identical on every Restart & Run All.
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.2, random_state=RANDOM_STATE
)

# Fit a Gaussian naive Bayes classifier and predict the held-out rows.
gnb = GaussianNB()
gnb.fit(X_train, y_train)

y_pred = gnb.predict(X_test)



In [14]:
# Display the labels the classifier predicted for the held-out test rows.
y_pred

array([False, False, False, False, False, False, False, False, False,
       False,  True, False,  True, False,  True, False, False, False,
       False, False,  True, False, False, False, False, False,  True,
       False, False, False, False,  True, False, False, False, False,
        True, False, False, False,  True, False,  True, False, False,
       False, False, False, False, False,  True,  True,  True, False,
        True, False, False, False, False,  True,  True,  True,  True,
       False, False, False, False, False, False, False, False, False,
       False, False, False,  True, False, False,  True,  True, False,
       False, False, False,  True,  True, False,  True,  True, False,
       False, False,  True, False, False,  True, False, False, False,
       False, False, False, False, False,  True, False, False, False,
       False,  True, False,  True, False, False, False, False, False])