In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import numpy as np

def sort_dataset(dataset_df):
    """Return a copy of ``dataset_df`` ordered by its 'year' column, ascending.

    The input DataFrame is not modified; rows keep their original index
    labels in the returned frame.
    """
    # ``ascending=True`` is the pandas default, so it is left implicit here.
    return dataset_df.sort_values('year')

def split_dataset(dataset_df, train_size=1718):
    """Split a DataFrame positionally into train/test features and labels.

    The 'salary' column is multiplied by 0.001 before splitting. The scaling
    is applied to a copy, so the caller's DataFrame is left untouched and
    calling this function repeatedly on the same frame does not compound
    the rescaling.

    Args:
        dataset_df: DataFrame containing a 'salary' column plus every column
            required by ``extract_numerical_cols``. Rows are split by
            position, so the frame should already be in the desired order.
        train_size: Number of leading rows used for training; all remaining
            rows form the test set. Defaults to 1718, the split point this
            script was written with.

    Returns:
        Tuple ``(X_train, X_test, Y_train, Y_test)`` where the ``X_*`` values
        are feature DataFrames and the ``Y_*`` values are the rescaled
        'salary' Series for the matching rows.
    """
    # ``assign`` returns a new DataFrame, so the input is never mutated.
    scaled_df = dataset_df.assign(salary=dataset_df['salary'] * 0.001)

    train_data = scaled_df.iloc[:train_size]
    test_data = scaled_df.iloc[train_size:]

    X_train = extract_numerical_cols(train_data)
    Y_train = train_data['salary']

    X_test = extract_numerical_cols(test_data)
    Y_test = test_data['salary']

    return X_train, X_test, Y_train, Y_test

def extract_numerical_cols(dataset_df):
    """Return a DataFrame holding only the fixed set of feature columns.

    The columns are selected by name, in the order listed below; any other
    column of ``dataset_df`` is dropped. A missing column raises ``KeyError``
    from pandas.
    """
    feature_names = [
        'age', 'G', 'PA', 'AB', 'R', 'H',
        '2B', '3B', 'HR', 'RBI', 'SB', 'CS',
        'BB', 'HBP', 'SO', 'GDP', 'fly', 'war',
    ]
    return dataset_df[feature_names]

def train_predict_decision_tree(X_train, Y_train, X_test):
    """Fit a decision-tree regressor on the training data and predict X_test.

    The estimator is created with ``random_state=42`` and otherwise default
    settings. Returns the predictions produced for ``X_test``.
    """
    regressor = DecisionTreeRegressor(random_state=42)
    # ``fit`` returns the fitted estimator, so training and prediction chain.
    return regressor.fit(X_train, Y_train).predict(X_test)

def train_predict_random_forest(X_train, Y_train, X_test):
    """Fit a random-forest regressor on the training data and predict X_test.

    The estimator is created with ``random_state=42`` and otherwise default
    settings. Returns the predictions produced for ``X_test``.
    """
    regressor = RandomForestRegressor(random_state=42)
    # ``fit`` returns the fitted estimator, so training and prediction chain.
    return regressor.fit(X_train, Y_train).predict(X_test)

def train_predict_svm(X_train, Y_train, X_test):
    """Fit a scaled support-vector regressor and predict X_test.

    Features pass through a ``StandardScaler`` before reaching an ``SVR``
    with default settings; both steps live in one ``Pipeline`` so the scaler
    is fitted on the training data only and then reused for ``X_test``.
    Returns the predictions produced for ``X_test``.
    """
    steps = [
        ('scaler', StandardScaler()),
        ('svm', SVR()),
    ]
    scaled_svr = Pipeline(steps)
    # ``Pipeline.fit`` returns the fitted pipeline, allowing the chained call.
    return scaled_svr.fit(X_train, Y_train).predict(X_test)

def calculate_RMSE(labels, predictions):
    """Return the root-mean-squared error between labels and predictions.

    Computed as the square root of scikit-learn's ``mean_squared_error``.
    """
    return np.sqrt(mean_squared_error(labels, predictions))

if __name__ == '__main__':
    # Load the CSV, order it by year, and split it into train/test sets.
    data_df = pd.read_csv('2019_kbo_for_kaggle_v2.csv')
    sorted_df = sort_dataset(data_df)
    X_train, X_test, Y_train, Y_test = split_dataset(sorted_df)

    # Train every model before reporting, so nothing is printed unless all
    # three fits succeed.
    dt_predictions = train_predict_decision_tree(X_train, Y_train, X_test)
    rf_predictions = train_predict_random_forest(X_train, Y_train, X_test)
    svm_predictions = train_predict_svm(X_train, Y_train, X_test)

    # Report the test-set RMSE of each model, in a fixed order.
    reports = (
        ("Decision Tree Test RMSE: ", dt_predictions),
        ("Random Forest Test RMSE: ", rf_predictions),
        ("SVM Test RMSE: ", svm_predictions),
    )
    for label, predictions in reports:
        print(label, calculate_RMSE(Y_test, predictions))

Decision Tree Test RMSE:  31.018574460127297
Random Forest Test RMSE:  22.82450903865859
SVM Test RMSE:  32.3804844983029
