In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error
import numpy as np

def sort_dataset(dataset_df):
    """Return a copy of ``dataset_df`` with rows ordered by ascending ``year``.

    The input frame is left untouched; ``sort_values`` produces a new
    DataFrame that keeps the original index labels.
    """
    return dataset_df.sort_values(by='year', ascending=True)

def split_dataset(dataset_df, train_size=1718):
    """Rescale the ``salary`` label and split the frame into train/test parts.

    The split is positional: the first ``train_size`` rows form the training
    set and every remaining row forms the test set, so the caller controls
    which rows end up where through the row order of ``dataset_df``.

    Args:
        dataset_df: DataFrame containing a ``salary`` column (the label)
            alongside the feature columns.
        train_size: Number of leading rows assigned to the training set.
            Defaults to 1718.

    Returns:
        A tuple ``(X_train, X_test, Y_train, Y_test)`` where the ``X_*``
        frames hold every column except ``salary`` and the ``Y_*`` series
        hold ``salary`` multiplied by 0.001.
    """
    # Rescale on a new frame: an in-place ``*=`` would modify the caller's
    # DataFrame, so calling this function twice would shrink the label twice.
    scaled_df = dataset_df.assign(salary=dataset_df['salary'] * 0.001)

    # Positional split: rows [:train_size] for training, the rest for testing.
    train_df = scaled_df.iloc[:train_size, :]
    test_df = scaled_df.iloc[train_size:, :]

    # Separate features (X) and labels (Y).
    X_train = train_df.drop('salary', axis=1)
    Y_train = train_df['salary']

    X_test = test_df.drop('salary', axis=1)
    Y_test = test_df['salary']

    return X_train, X_test, Y_train, Y_test

def extract_numerical_cols(dataset_df):
    """Return only the numerical feature columns of ``dataset_df``.

    The result contains the columns listed below, in this order. A
    ``KeyError`` is raised by pandas if any of them is missing.
    """
    wanted = (
        'age', 'G', 'PA', 'AB', 'R', 'H', '2B', '3B', 'HR',
        'RBI', 'SB', 'CS', 'BB', 'HBP', 'SO', 'GDP', 'fly', 'war',
    )
    return dataset_df[list(wanted)]

def train_predict_decision_tree(X_train, Y_train, X_test):
    """Fit a default ``DecisionTreeRegressor`` and predict on ``X_test``.

    Args:
        X_train: Training features.
        Y_train: Training labels.
        X_test: Features to predict for.

    Returns:
        The model's predictions for ``X_test``.
    """
    # ``fit`` returns the fitted estimator, so training and prediction chain.
    return DecisionTreeRegressor().fit(X_train, Y_train).predict(X_test)

def train_predict_random_forest(X_train, Y_train, X_test):
    """Fit a default ``RandomForestRegressor`` and predict on ``X_test``.

    Args:
        X_train: Training features.
        Y_train: Training labels.
        X_test: Features to predict for.

    Returns:
        The model's predictions for ``X_test``.
    """
    forest = RandomForestRegressor()
    # ``fit`` returns the fitted estimator itself.
    fitted_forest = forest.fit(X_train, Y_train)
    return fitted_forest.predict(X_test)

def train_predict_svm(X_train, Y_train, X_test):
    """Fit a ``StandardScaler`` + ``SVR`` pipeline and predict on ``X_test``.

    The scaler is part of the pipeline, so it is fitted on ``X_train`` only
    and then applied to ``X_test`` at prediction time.

    Args:
        X_train: Training features.
        Y_train: Training labels.
        X_test: Features to predict for.

    Returns:
        The pipeline's predictions for ``X_test``.
    """
    scaler = StandardScaler()
    regressor = SVR()
    model = make_pipeline(scaler, regressor)

    model.fit(X_train, Y_train)
    return model.predict(X_test)

def calculate_RMSE(labels, predictions):
    """Return the root mean squared error between labels and predictions.

    Both inputs are converted to float numpy arrays first, so the values are
    compared position by position. This keeps two pandas Series with
    different indexes from being aligned on their index labels (which would
    yield NaN differences) and lets plain Python lists be passed as well.

    Args:
        labels: Array-like of true target values.
        predictions: Array-like of predicted values, in the same order.

    Returns:
        The RMSE, i.e. the square root of the mean squared difference.
    """
    y_true = np.asarray(labels, dtype=float)
    y_pred = np.asarray(predictions, dtype=float)

    # Mean Squared Error (MSE), then its square root.
    mse = np.mean((y_pred - y_true) ** 2)
    return np.sqrt(mse)

# Script entry point: load the CSV, build a positional train/test split on the
# year-sorted data, train three regressors, and print each model's test RMSE.
if __name__=='__main__':
	#DO NOT MODIFY THIS FUNCTION UNLESS PATH TO THE CSV MUST BE CHANGED.
	# This was run in a Google Colab environment, so the path is set as below.
	data_df = pd.read_csv('/content/sample_data/2019_kbo_for_kaggle_v2.csv')

	# Sort by year first because split_dataset splits by row position.
	sorted_df = sort_dataset(data_df)
	X_train, X_test, Y_train, Y_test = split_dataset(sorted_df)

	# Keep only the numerical feature columns for both splits.
	X_train = extract_numerical_cols(X_train)
	X_test = extract_numerical_cols(X_test)

	# Train each model on the training split and predict on the test split.
	dt_predictions = train_predict_decision_tree(X_train, Y_train, X_test)
	rf_predictions = train_predict_random_forest(X_train, Y_train, X_test)
	svm_predictions = train_predict_svm(X_train, Y_train, X_test)

	# Report the test RMSE of each model (labels were rescaled by 0.001).
	print ("Decision Tree Test RMSE: ", calculate_RMSE(Y_test, dt_predictions))
	print ("Random Forest Test RMSE: ", calculate_RMSE(Y_test, rf_predictions))
	print ("SVM Test RMSE: ", calculate_RMSE(Y_test, svm_predictions))

Decision Tree Test RMSE:  30.405594331745373
Random Forest Test RMSE:  22.757052149093
SVM Test RMSE:  32.38048449830289
