## This demonstrates scikit-learn regression for comparison with Tribuo regression

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

In [2]:
# Load the cleaned cars dataset that the rest of this notebook works on.
# The CSV is prepared in the notebook "scikit-learn Regressor - Data Cleanup".
# Prerequisite: unzip cleanedCars.zip (presumably yields data/cleanedCars.csv — confirm).
df = pd.read_csv('data/cleanedCars.csv')
# Debug aid, intentionally disabled: dumps the whole frame.
# print(df)

In [3]:
# The regression target is the price_usd column; every other column of df is a feature.
df_y = df[['price_usd']]
df_X = df.drop(columns=['price_usd'])

# Hold out 20% of the rows for testing. The split operates on the raw NumPy
# values and is seeded (random_state=1) so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df_X.values,
    df_y.values,
    test_size=0.2,
    random_state=1,
)

# Report the row count of each split; the feature count is the number of
# columns in df minus the single target column.
print('Training data size = %d, number of features = %d' % (X_train.shape[0], df.shape[1] - 1))
print('Testing data size = %d, number of features = %d' % (X_test.shape[0], df.shape[1] - 1))

Training data size = 30816, number of features = 1186
Testing data size = 7705, number of features = 1186


In [4]:
def evaluate(actual, predicted):
    """Print RMSE, MAE and R^2 for a set of predictions.

    Args:
        actual: Ground-truth target values.
        predicted: Model predictions, aligned row-for-row with ``actual``.

    Returns:
        None. The three metrics are printed, each formatted to two decimals.
    """
    print('Evaluation (test):')
    # RMSE is derived as sqrt(MSE) instead of passing ``squared=False``:
    # that keyword was deprecated and later removed from
    # ``mean_squared_error``, so relying on it breaks on newer scikit-learn.
    # For a single target column (as used in this notebook) the two forms
    # give the same value.
    rmse = np.sqrt(mean_squared_error(actual, predicted))
    print('  RMSE: %.2f' % rmse)
    print('  MAE:  %.2f' % mean_absolute_error(actual, predicted))
    print('  R^2:  %.2f' % r2_score(actual, predicted))

In [5]:
# Estimators evaluated in this notebook (scikit-learn side of the Tribuo comparison).
# Stochastic estimators are seeded with random_state=1 so that a fresh
# "Restart & Run All" reproduces the same fits.

# Linear model trained by SGD with a constant learning rate of 0.01.
# NOTE(review): the recorded run of this model diverged (RMSE on the order of
# 1e20). The features are passed to fit() unscaled, and SGD is sensitive to
# feature scale — consider standardizing the inputs (sklearn.preprocessing is
# imported but not used in this notebook). Left unchanged here so the
# configuration stays comparable; confirm against the Tribuo setup.
sgd = SGDRegressor(learning_rate='constant', eta0=0.01, random_state=1)

# Ordinary least-squares baseline.
lr = LinearRegression()

# Single regression tree, depth-limited to 10 levels.
cart = DecisionTreeRegressor(max_depth=10, random_state=1)

# Random forest of 150 trees using all available cores (n_jobs=-1).
# The split criterion is left at its default (squared error): the old explicit
# spelling criterion='mse' was renamed to 'squared_error' and is rejected by
# newer scikit-learn releases, while the default works on every version.
rfr = RandomForestRegressor(n_estimators=150, random_state=1, n_jobs=-1)

In [6]:
# Fit the SGD regressor on the training split; %time reports CPU and wall time.
# y_train is flattened with ravel() before being handed to fit().
%time sgd.fit(X_train, y_train.ravel())

# Manually recorded wall-clock times for repeated runs of this cell.
# run 1
# time:  0.90s

# run 2
# time:  (not recorded)

# run 3
# time:  (not recorded)

CPU times: user 815 ms, sys: 12.8 ms, total: 828 ms
Wall time: 901 ms


SGDRegressor(learning_rate='constant')

In [7]:
# Predict on the held-out test split with the fitted SGD model and print
# RMSE / MAE / R^2 via evaluate().
# NOTE(review): the output recorded for this cell shows an RMSE around 6e20 and
# a hugely negative R^2, i.e. the SGD fit diverged — treat these numbers as a
# failed run, not as a meaningful baseline.
predicted = sgd.predict(X_test)
evaluate(y_test, predicted)

# Placeholders for manually recorded metrics of repeated runs.
# run 1
# (not recorded)

# run 2
# (not recorded)

# run 3
# (not recorded)

Evaluation (test):
  RMSE: 639742832053393162240.00
  MAE:  559204384222139645952.00
  R^2:  -9726370381207040741037385365585920.00


In [8]:
# Fit the linear regression model on the training split; %time reports CPU and
# wall time. fit() returns the estimator, which is re-bound to lr.
%time lr = lr.fit(X_train,y_train)
# Manually recorded wall-clock times for repeated runs of this cell.
# run 1
# time:  5.60s

# run 2
# time:  (not recorded)

# run 3
# time:  (not recorded)

CPU times: user 8.7 s, sys: 392 ms, total: 9.09 s
Wall time: 5.6 s


In [9]:
# Predict on the held-out test split with the fitted linear regression model
# and print RMSE / MAE / R^2 via evaluate().
predicted = lr.predict(X_test)
evaluate(y_test, predicted)

# Placeholders for manually recorded metrics of repeated runs.
# run 1
# (not recorded)

# run 2
# (not recorded)

# run 3
# (not recorded)

Evaluation (test):
  RMSE: 2845.83
  MAE:  1728.60
  R^2:  0.81


In [10]:
# Fit the depth-limited regression tree on the training split; %time reports
# CPU and wall time. fit() returns the estimator, which is re-bound to cart.
%time cart = cart.fit(X_train,y_train)
# Manually recorded wall-clock times for repeated runs of this cell.
# run 1
# time:  2.25s

# run 2
# time:  (not recorded)

# run 3
# time:  (not recorded)

CPU times: user 2.14 s, sys: 67.3 ms, total: 2.2 s
Wall time: 2.25 s


In [11]:
# Predict on the held-out test split with the fitted regression tree and print
# RMSE / MAE / R^2 via evaluate().
predicted = cart.predict(X_test)
evaluate(y_test, predicted)

# Placeholders for manually recorded metrics of repeated runs.
# run 1
# (not recorded)

# run 2
# (not recorded)

# run 3
# (not recorded)

Evaluation (test):
  RMSE: 2498.51
  MAE:  1471.50
  R^2:  0.85


In [12]:
# Fit the random forest on the training split; %time reports CPU and wall time.
# y_train is flattened with ravel() before being handed to fit().
%time rfr.fit(X_train, y_train.ravel())
# Manually recorded wall-clock times for repeated runs of this cell.
# run 1
# time:  2min 37s  (matches the recorded "Wall time"; previously noted as 2.37s)

# run 2
# time:  (not recorded)

# run 3
# time:  (not recorded)

CPU times: user 7min 37s, sys: 4.55 s, total: 7min 41s
Wall time: 2min 37s


RandomForestRegressor(n_estimators=150, n_jobs=-1, random_state=1)

In [13]:
# Predict on the held-out test split with the fitted random forest and print
# RMSE / MAE / R^2 via evaluate().
predicted = rfr.predict(X_test)
evaluate(y_test, predicted)

# Placeholders for manually recorded metrics of repeated runs.
# run 1
# (not recorded)

# run 2
# (not recorded)

# run 3
# (not recorded)

Evaluation (test):
  RMSE: 1899.01
  MAE:  1076.23
  R^2:  0.91
