In [80]:
import pandas as pd
import sklearn
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn import utils
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

In [23]:
def sort_dataset(dataset_df):
    sorted_dataset = dataset_df.sort_values(by='year')
    return sorted_dataset

In [68]:
def split_dataset(dataset_df):
    target = dataset_df['salary'] * 0.001
    data = dataset_df.drop('salary', axis='columns')
    
    X_train = data[:1718]
    X_test = data[1718:]
    Y_train = target[:1718]
    Y_test = target[1718:]
    
    return X_train, X_test, Y_train, Y_test
    

In [69]:
def extract_numerical_cols(dataset_df):
    extracted_dataset = dataset_df.drop(['batter_name', 'TB', 'GB', 'BU', 'year', 'year_born', 'hand2', 'cp', 
                                        'tp', '1B', 'FBP', 'avg', 'OBP', 'SLG', 'OPS', 'p_year', 'YAB', 'YOPS'], axis='columns')
    
    return extracted_dataset

In [70]:
def train_predict_decision_tree(X_train, Y_train, X_test):
    dt_cls = DecisionTreeClassifier()    
    lab = preprocessing.LabelEncoder()
    transformed_Y_train = lab.fit_transform(Y_train)
    dt_cls.fit(X_train, transformed_Y_train)
    
    return dt_cls.predict(X_test)

In [76]:
def train_predict_random_forest(X_train, Y_train, X_test):
    dt_cls = RandomForestClassifier()    
    lab = preprocessing.LabelEncoder()
    transformed_Y_train = lab.fit_transform(Y_train)
    dt_cls.fit(X_train, transformed_Y_train)
    
    return dt_cls.predict(X_test)

In [85]:
def train_predict_svm(X_train, Y_train, X_test):
    svm_pipe = Pipeline([('scaler', StandardScaler()), ('svm', SVC())])
    lab = preprocessing.LabelEncoder()
    transformed_Y_train = lab.fit_transform(Y_train)
    svm_pipe.fit(X_train, transformed_Y_train)
    
    return svm_pipe.predict(X_test)

In [89]:
def calculate_RMSE(labels, predictions):
    return np.sqrt(np.mean((predictions - labels)**2))

In [91]:
if __name__=='__main__':
	#DO NOT MODIFY THIS FUNCTION UNLESS PATH TO THE CSV MUST BE CHANGED.
	data_df = pd.read_csv('2019_kbo_for_kaggle_v2.csv')
	
	sorted_df = sort_dataset(data_df)	
	X_train, X_test, Y_train, Y_test = split_dataset(sorted_df)
	
	X_train = extract_numerical_cols(X_train)
	X_test = extract_numerical_cols(X_test)

	dt_predictions = train_predict_decision_tree(X_train, Y_train, X_test)
	rf_predictions = train_predict_random_forest(X_train, Y_train, X_test)
	svm_predictions = train_predict_svm(X_train, Y_train, X_test)
	
	print ("Decision Tree Test RMSE: ", calculate_RMSE(Y_test, dt_predictions))	
	print ("Random Forest Test RMSE: ", calculate_RMSE(Y_test, rf_predictions))	
	print ("SVM Test RMSE: ", calculate_RMSE(Y_test, svm_predictions))

Decision Tree Test RMSE:  50.03821616447849
Random Forest Test RMSE:  45.9659573028696
SVM Test RMSE:  40.146354051018676
