## Demonstration of scikit-learn regression, for comparison with Tribuo regression

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

In [2]:
# Input data: the cleaned cars dataset prepared in the notebook
# "scikit-learn Regressor - Data Cleanup".
# Prerequisite: unzip cleanedCars.zip before running this cell.
df = pd.read_csv(filepath_or_buffer='../../data/cleanedCars.csv')

In [3]:
# Separate the regression target (price_usd) from the remaining feature columns.
# The target is kept as a one-column DataFrame, so df_y.values is a 2-D array.
df_y = df[['price_usd']]
df_X = df.drop(columns=['price_usd'])

# Hold out 20% of the rows for testing; random_state=1 is passed so the split
# is repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df_X.values,
    df_y.values,
    test_size=0.2,
    random_state=1,
)

# Every column of df except the target is counted as a feature.
print('Training data size = %d, number of features = %d'
      % (len(X_train), len(df.columns) - 1))
print('Testing data size = %d, number of features = %d'
      % (len(X_test), len(df.columns) - 1))

Training data size = 30816, number of features = 1186
Testing data size = 7705, number of features = 1186


In [4]:
def evaluate(actual, predicted):
    """Print RMSE, MAE and R^2 for a set of predictions.

    Args:
        actual: ground-truth target values.
        predicted: model predictions to score against ``actual``.

    Returns:
        None. The three metrics are printed with two decimal places.
    """
    print('Evaluation (test):')
    # Take the square root of the MSE explicitly instead of passing
    # squared=False: that keyword was deprecated in scikit-learn 1.4 and has
    # since been removed. For a single target column the value is the same.
    print('  RMSE: %.2f' % np.sqrt(mean_squared_error(actual, predicted)))
    print('  MAE:  %.2f' % mean_absolute_error(actual, predicted))
    print('  R^2:  %.2f' % r2_score(actual, predicted))

In [5]:
# Estimators compared in this notebook.
# Note: sgd and cart are created without a random_state, so their results can
# differ from one run to the next.
sgd = SGDRegressor(learning_rate='constant', eta0=0.01)

lr = LinearRegression()

cart = DecisionTreeRegressor(max_depth=10)

# criterion is left at its default (mean squared error). The explicit spelling
# 'mse' was deprecated in scikit-learn 1.0 and removed in 1.2 (renamed to
# 'squared_error'); omitting the argument works on both old and new versions.
rfr = RandomForestRegressor(n_estimators=150, random_state=1, n_jobs=-1)

In [6]:
%time sgd.fit(X_train, y_train.ravel())

# run 1
# time:  0.90 s

# run 2
# time:  1.44 s

# run 3
# time:  0.81 s

CPU times: user 791 ms, sys: 5.62 ms, total: 797 ms
Wall time: 805 ms


SGDRegressor(learning_rate='constant')

In [7]:
# Score the SGD regressor on the held-out test split.
predicted = sgd.predict(X=X_test)
evaluate(actual=y_test, predicted=predicted)

# Metrics noted over three separate runs (the values change between runs):
#
# run 1
#   RMSE: 702927861133705216000.00
#   MAE:  614450579929122078720.00
#   R^2:  -11742524120677054383333060064051200.00
#
# run 2
#   RMSE: 419040872859692302336.00
#   MAE:  366281277131313446912.00
#   R^2:  -4173041635673082319358243373056000.00
#
# run 3
#   RMSE: 397216533342632148992.00
#   MAE:  347224843698021138432.00
#   R^2:  -3749683226619859872732929331298304.00
#
# NOTE(review): errors of this magnitude suggest the SGD fit diverged --
# possibly because the features are not scaled; confirm.

Evaluation (test):
  RMSE: 397216533342632148992.00
  MAE:  347224843698021138432.00
  R^2:  -3749683226619859872732929331298304.00


In [8]:
%time lr = lr.fit(X_train,y_train)
# run 1
# time:  4.82 s

# run 2
# time:  4.48 s

# run 3
# time:  4.53 s

CPU times: user 7.66 s, sys: 287 ms, total: 7.95 s
Wall time: 4.53 s


In [9]:
# Score the linear regression model on the held-out test split.
predicted = lr.predict(X=X_test)
evaluate(actual=y_test, predicted=predicted)

# Metrics noted over three separate runs (identical each time):
#
# run 1
#   RMSE: 2845.83
#   MAE:  1728.60
#   R^2:  0.81
#
# run 2
#   RMSE: 2845.83
#   MAE:  1728.60
#   R^2:  0.81
#
# run 3
#   RMSE: 2845.83
#   MAE:  1728.60
#   R^2:  0.81

Evaluation (test):
  RMSE: 2845.83
  MAE:  1728.60
  R^2:  0.81


In [10]:
%time cart = cart.fit(X_train,y_train)
# run 1
# time:  2.19 s

# run 2
# time:  2.15 s

# run 3
# time:  2.18 s

CPU times: user 2.09 s, sys: 63.9 ms, total: 2.16 s
Wall time: 2.18 s


In [11]:
# Score the decision tree regressor on the held-out test split.
predicted = cart.predict(X=X_test)
evaluate(actual=y_test, predicted=predicted)

# Metrics noted over three separate runs (small differences between runs):
#
# run 1
#   RMSE: 2511.46
#   MAE:  1474.86
#   R^2:  0.85
#
# run 2
#   RMSE: 2503.18
#   MAE:  1472.68
#   R^2:  0.85
#
# run 3
#   RMSE: 2503.72
#   MAE:  1472.48
#   R^2:  0.85

Evaluation (test):
  RMSE: 2503.72
  MAE:  1472.48
  R^2:  0.85


In [12]:
%time rfr.fit(X_train, y_train.ravel())
# run 1
# time:  2min 11s

# run 2
# time:  2min 13s

# run 3
# time:  2min 6s

CPU times: user 7min 23s, sys: 1.66 s, total: 7min 25s
Wall time: 2min 6s


RandomForestRegressor(n_estimators=150, n_jobs=-1, random_state=1)

In [13]:
# Score the random forest regressor on the held-out test split.
predicted = rfr.predict(X=X_test)
evaluate(actual=y_test, predicted=predicted)

# Metrics noted over three separate runs (identical each time):
#
# run 1
#   RMSE: 1899.01
#   MAE:  1076.23
#   R^2:  0.91
#
# run 2
#   RMSE: 1899.01
#   MAE:  1076.23
#   R^2:  0.91
#
# run 3
#   RMSE: 1899.01
#   MAE:  1076.23
#   R^2:  0.91

Evaluation (test):
  RMSE: 1899.01
  MAE:  1076.23
  R^2:  0.91
