In [47]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error
import xgboost as xgb

## Data Preprocessing

In [14]:
# Source: UCI Machine Learning Repository, YearPredictionMSD
#   https://archive.ics.uci.edu/ml/datasets/YearPredictionMSD
# header=None: the first line of the file is treated as data, so pandas
# assigns integer column labels instead of reading names from the file.
data = pd.read_csv(
    'YearPredictionMSD.txt',
    sep=",",
    header=None,
)

In [15]:
# Preview the first five rows of the raw frame as a sanity check on parsing.
data.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,81,82,83,84,85,86,87,88,89,90
0,2001,49.94357,21.47114,73.0775,8.74861,-17.40628,-13.09905,-25.01202,-12.23257,7.83089,...,13.0162,-54.40548,58.99367,15.37344,1.11144,-23.08793,68.40795,-1.82223,-27.46348,2.26327
1,2001,48.73215,18.4293,70.32679,12.94636,-10.32437,-24.83777,8.7663,-0.92019,18.76548,...,5.66812,-19.68073,33.04964,42.87836,-9.90378,-32.22788,70.49388,12.04941,58.43453,26.92061
2,2001,50.95714,31.85602,55.81851,13.41693,-6.57898,-18.5494,-3.27872,-2.35035,16.07017,...,3.038,26.05866,-50.92779,10.93792,-0.07568,43.2013,-115.00698,-0.05859,39.67068,-0.66345
3,2001,48.2475,-1.89837,36.29772,2.58776,0.9717,-26.21683,5.05097,-10.34124,3.55005,...,34.57337,-171.70734,-16.96705,-46.67617,-12.51516,82.58061,-72.08993,9.90558,199.62971,18.85382
4,2001,50.9702,42.20998,67.09964,8.46791,-15.85279,-16.81409,-12.48207,-9.37636,12.63699,...,9.92661,-55.95724,64.92712,-17.72522,-1.49237,-7.50035,51.76631,7.88713,55.66926,28.74903


In [16]:
# (rows, columns) of the full dataset; the row count is what the fixed
# train/test cut-off used further down is measured against.
data.shape

(515345, 91)

In [24]:
# Training partition: the first 463,715 rows, selected by position.
# NOTE(review): this cut-off appears to follow the fixed train/test split
# recommended on the dataset's UCI page — confirm against that page.
training_data = data.iloc[:463715]
training_data.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,81,82,83,84,85,86,87,88,89,90
0,2001,49.94357,21.47114,73.0775,8.74861,-17.40628,-13.09905,-25.01202,-12.23257,7.83089,...,13.0162,-54.40548,58.99367,15.37344,1.11144,-23.08793,68.40795,-1.82223,-27.46348,2.26327
1,2001,48.73215,18.4293,70.32679,12.94636,-10.32437,-24.83777,8.7663,-0.92019,18.76548,...,5.66812,-19.68073,33.04964,42.87836,-9.90378,-32.22788,70.49388,12.04941,58.43453,26.92061
2,2001,50.95714,31.85602,55.81851,13.41693,-6.57898,-18.5494,-3.27872,-2.35035,16.07017,...,3.038,26.05866,-50.92779,10.93792,-0.07568,43.2013,-115.00698,-0.05859,39.67068,-0.66345
3,2001,48.2475,-1.89837,36.29772,2.58776,0.9717,-26.21683,5.05097,-10.34124,3.55005,...,34.57337,-171.70734,-16.96705,-46.67617,-12.51516,82.58061,-72.08993,9.90558,199.62971,18.85382
4,2001,50.9702,42.20998,67.09964,8.46791,-15.85279,-16.81409,-12.48207,-9.37636,12.63699,...,9.92661,-55.95724,64.92712,-17.72522,-1.49237,-7.50035,51.76631,7.88713,55.66926,28.74903


In [32]:
# Training target: every row of the first column (position 0), which is
# the value the regressor is fitted to predict.
y_train = training_data.iloc[:, 0]
y_train.head(5)

0    2001
1    2001
2    2001
3    2001
4    2001
Name: 0, dtype: int64

In [33]:
# Training features: every column after the first (position 1 onward),
# i.e. everything except the target column.
x_train = training_data.iloc[:, 1:]
x_train.head(5)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,81,82,83,84,85,86,87,88,89,90
0,49.94357,21.47114,73.0775,8.74861,-17.40628,-13.09905,-25.01202,-12.23257,7.83089,-2.46783,...,13.0162,-54.40548,58.99367,15.37344,1.11144,-23.08793,68.40795,-1.82223,-27.46348,2.26327
1,48.73215,18.4293,70.32679,12.94636,-10.32437,-24.83777,8.7663,-0.92019,18.76548,4.5921,...,5.66812,-19.68073,33.04964,42.87836,-9.90378,-32.22788,70.49388,12.04941,58.43453,26.92061
2,50.95714,31.85602,55.81851,13.41693,-6.57898,-18.5494,-3.27872,-2.35035,16.07017,1.39518,...,3.038,26.05866,-50.92779,10.93792,-0.07568,43.2013,-115.00698,-0.05859,39.67068,-0.66345
3,48.2475,-1.89837,36.29772,2.58776,0.9717,-26.21683,5.05097,-10.34124,3.55005,-6.36304,...,34.57337,-171.70734,-16.96705,-46.67617,-12.51516,82.58061,-72.08993,9.90558,199.62971,18.85382
4,50.9702,42.20998,67.09964,8.46791,-15.85279,-16.81409,-12.48207,-9.37636,12.63699,0.93609,...,9.92661,-55.95724,64.92712,-17.72522,-1.49237,-7.50035,51.76631,7.88713,55.66926,28.74903


In [18]:
# Held-out partition: every row from position 463,715 to the end, i.e. the
# rows that are not part of the training slice.
test_data = data.iloc[463715:]
test_data.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,81,82,83,84,85,86,87,88,89,90
463715,2007,45.442,-30.74976,31.78587,4.63569,-15.14894,0.2337,-11.97968,-9.59708,6.48111,...,-8.84046,-0.15439,137.4421,77.54739,-4.22875,-61.92657,-33.52722,-3.86253,36.424,7.17309
463716,2003,52.67814,-2.88914,43.95268,-1.39209,-14.93379,-15.86877,1.19379,0.31401,-4.44235,...,-5.74356,-42.5791,-2.91103,48.72805,-3.08183,-9.38888,-7.27179,-4.00966,-68.96211,-5.21525
463717,2005,45.74235,12.02291,11.03009,-11.60763,11.80054,-11.12389,-5.39058,-1.11981,-7.74086,...,-4.70606,-24.22599,-35.22686,27.77729,15.38934,58.20036,-61.12698,-10.92522,26.75348,-5.78743
463718,2003,52.55883,2.87222,27.38848,-5.76235,-15.35766,-15.01592,-5.86893,-0.31447,-5.06922,...,-8.35215,-16.86791,-10.58277,40.10173,-0.54005,-11.54746,-45.3586,-4.55694,-43.17368,-3.33725
463719,2005,51.34809,9.02702,25.33757,-6.62537,0.03367,-12.69565,-3.134,2.98649,-6.7175,...,-6.87366,-20.03371,-66.3894,50.56569,0.27747,67.05657,-55.58846,-7.50859,28.23511,-0.72045


In [34]:
# Held-out target: first column (position 0) of the test partition.
# The original row labels are kept, so the index starts at 463715.
y_test = test_data.iloc[:, 0]
y_test.head(5)

463715    2007
463716    2003
463717    2005
463718    2003
463719    2005
Name: 0, dtype: int64

In [35]:
# Held-out features: every column after the first, matching the column
# selection used for the training features.
x_test = test_data.iloc[:, 1:]
x_test.head(5)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,81,82,83,84,85,86,87,88,89,90
463715,45.442,-30.74976,31.78587,4.63569,-15.14894,0.2337,-11.97968,-9.59708,6.48111,-8.89073,...,-8.84046,-0.15439,137.4421,77.54739,-4.22875,-61.92657,-33.52722,-3.86253,36.424,7.17309
463716,52.67814,-2.88914,43.95268,-1.39209,-14.93379,-15.86877,1.19379,0.31401,-4.44235,-5.78934,...,-5.74356,-42.5791,-2.91103,48.72805,-3.08183,-9.38888,-7.27179,-4.00966,-68.96211,-5.21525
463717,45.74235,12.02291,11.03009,-11.60763,11.80054,-11.12389,-5.39058,-1.11981,-7.74086,-3.33421,...,-4.70606,-24.22599,-35.22686,27.77729,15.38934,58.20036,-61.12698,-10.92522,26.75348,-5.78743
463718,52.55883,2.87222,27.38848,-5.76235,-15.35766,-15.01592,-5.86893,-0.31447,-5.06922,-4.62734,...,-8.35215,-16.86791,-10.58277,40.10173,-0.54005,-11.54746,-45.3586,-4.55694,-43.17368,-3.33725
463719,51.34809,9.02702,25.33757,-6.62537,0.03367,-12.69565,-3.134,2.98649,-6.7175,-1.85804,...,-6.87366,-20.03371,-66.3894,50.56569,0.27747,67.05657,-55.58846,-7.50859,28.23511,-0.72045


## Model Training + Evaluation

In [36]:
# Baseline gradient-boosted regressor with library-default hyperparameters.
model = xgb.XGBRegressor()
# fit() is kept as the last expression so the fitted estimator's repr
# (its full hyperparameter set) is displayed as the cell output.
model.fit(X=x_train, y=y_train)

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \




XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0,
       importance_type='gain', learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=None, subsample=1, verbosity=1)

In [37]:
# Predict the target for every row of the held-out features; the result is
# compared against y_test by the metric cells below.
preds = model.predict(x_test)

### Average Absolute Error

In [45]:
# Mean absolute error on the held-out set.
# The reference paper reports 6.14 for this metric.
# scikit-learn's documented argument order is (y_true, y_pred): ground truth
# first, predictions second. MAE is symmetric, so the value is the same either
# way, but following the documented order keeps the call correct if it is
# later swapped for an asymmetric metric.
mean_absolute_error(y_test, preds)

6.647365714144665

### Square Root of Average Squared Error

In [46]:
# Root mean squared error on the held-out set.
# The reference paper reports 8.76 for this metric.
# Arguments follow scikit-learn's documented (y_true, y_pred) order; squared
# error is symmetric, so the numeric result is unchanged.
np.sqrt(mean_squared_error(y_test, preds))

9.353678731132687

### Save Data to CSV

In [52]:
# Persist the training partition (target column plus features) without the
# row index.
# NOTE(review): `header` is left at its default, so the integer column labels
# should be written as the first line, unlike the header-less source file —
# confirm downstream readers expect a header row.
training_data.to_csv(
    "msd_training_data.csv",
    index=False,
)

In [53]:
# Persist the held-out partition (target column plus features) without the
# row index.
# NOTE(review): `header` is left at its default, so the integer column labels
# should be written as the first line, unlike the header-less source file —
# confirm downstream readers expect a header row.
test_data.to_csv(
    "msd_test_data.csv",
    index=False,
)