In this notebook, we will look at building regression models for decision tree, random forest and SVM.

We are going to start with Linear Regression on teh Car purchase dataset. Previously, we considered this dataset for linear regression. 

In [2]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import train_test_split

In [3]:
auto = pd.read_csv('Car_Purchasing_Data.csv')
print(auto.head())

print(auto.shape)


auto.head()

print(auto.columns)
print(auto.dtypes)

print(auto.Country.unique())

     Customer Name                                    Customer e-mail Country  \
0    Martina Avila  cubilia.Curae.Phasellus@quisaccumsanconvallis.edu     USA   
1    Harlan Barnes                                eu.dolor@diam.co.uk     USA   
2  Naomi Rodriquez  vulputate.mauris.sagittis@ametconsectetueradip...     USA   
3  Jade Cunningham                            malesuada@dignissim.com     USA   
4     Cedric Leach     felis.ullamcorper.viverra@egetmollislectus.net     USA   

   Gender  Age  Annual Salary  Credit Card Debt    Net Worth  \
0       0   42    62812.09301      11609.380910  238961.2505   
1       0   41    66646.89292       9572.957136  530973.9078   
2       1   43    53798.55112      11160.355060  638467.1773   
3       1   58    79370.03798      14426.164850  548599.0524   
4       1   57    59729.15130       5358.712177  560304.0671   

   Car Purchase Amount  
0          35321.45877  
1          45115.52566  
2          42925.70921  
3          67422.36313  
4  

In [4]:
# dropping 

auto.drop(["Customer Name", "Customer e-mail", "Country"], axis=1, inplace=True)

#df.drop(['column_name1', 'column_name2'], axis=1, inplace=True)

print(auto.shape)

print(auto.columns)
print(auto.info())

(500, 6)
Index(['Gender', 'Age', 'Annual Salary', 'Credit Card Debt', 'Net Worth',
       'Car Purchase Amount'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Gender               500 non-null    int64  
 1   Age                  500 non-null    int64  
 2   Annual Salary        500 non-null    float64
 3   Credit Card Debt     500 non-null    float64
 4   Net Worth            500 non-null    float64
 5   Car Purchase Amount  500 non-null    float64
dtypes: float64(4), int64(2)
memory usage: 23.6 KB
None


In [5]:
# creating an instance of LinearRegression class
reg = linear_model.LinearRegression()
print(reg)

LinearRegression()


In [6]:
from sklearn.preprocessing import StandardScaler

In [7]:
a_scaler = StandardScaler() # creating an instance of Standard Scaler

auto_independent = a_scaler.fit_transform(auto[['Age','Annual Salary', 'Credit Card Debt', 'Net Worth']])

auto_target = auto["Car Purchase Amount"]

# using train_test_split(), we are splitting the data into training and test 
# x_train, x_test, y_train, y_test

x_train, x_test, y_train, y_test = train_test_split(auto_independent, auto_target, test_size=0.2, random_state=4)

### Linear Regression

In [9]:
# we have to do the fit on the training data
reg.fit(x_train, y_train)

# y = mx + b here m is the coefficient (or slope) of x 
# and b is the intercept

print(reg.coef_) # 
print(reg.intercept_)


yhat = reg.predict(x_test)

from sklearn.metrics import mean_squared_error

mse_test = mean_squared_error(y_test, yhat)
print(mse_test)


# y_t_predict is the predicted y values for the x_train data
y_t_predict = reg.predict(x_train)

# note that y_train is the true y value
mse_train = mean_squared_error(y_train, y_t_predict)
print(mse_train)


from sklearn.metrics import r2_score

print("r-squared for the test data: ", r2_score(y_test, yhat))
    
print("r-squared for the train data: ", r2_score(y_train, y_t_predict))

[6698.49987763 6568.62278558   20.6676723  5020.44202563]
44203.966801967785
60782.516700413435
57257.252200951014
r-squared for the test data:  0.9994342211483591
r-squared for the train data:  0.999507487102516


#### Linear Regression Equation

The relationship between the independent and dependent variables is 
'Age','Annual Salary', 'Credit Card Debt', 'Net Worth' independent variables

car purchase amount is the dependent variable

car_purchase = 6698.49* age + 6568.62 * annual_salary + 20.66 * credit_debt + 5050.44 * net_worth + 44203.96

### Decision Tree Regressor

In [12]:
from sklearn.tree import DecisionTreeRegressor

dt = DecisionTreeRegressor(random_state=0)
dt.fit(x_train, y_train)

yhat_dt = dt.predict(x_test)


mse_test_dt = mean_squared_error(y_test, yhat_dt)
print(mse_test_dt)


# y_t_predict is the predicted y values for the x_train data
y_t_predict_dt = dt.predict(x_train)

# note that y_train is the true y value
mse_train_dt = mean_squared_error(y_train, y_t_predict_dt)
print(mse_train_dt)


from sklearn.metrics import r2_score

print("r-squared for the test data: ", r2_score(y_test, yhat_dt))
    
print("r-squared for the train data: ", r2_score(y_train, y_t_predict_dt))

16295203.887030277
0.0
r-squared for the test data:  0.848320170948182
r-squared for the train data:  1.0


#### Random Forest Regressor

In [14]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(max_depth=3, random_state=0)
rf.fit(x_train, y_train)


yhat_rf = rf.predict(x_test)


mse_test_rf = mean_squared_error(y_test, yhat_rf)
print(mse_test_rf)


# y_t_predict is the predicted y values for the x_train data
y_t_predict_rf = rf.predict(x_train)

# note that y_train is the true y value
mse_train_rf = mean_squared_error(y_train, y_t_predict_rf)
print(mse_train_rf)


print("r-squared for the test data: ", r2_score(y_test, yhat_rf))
    
print("r-squared for the train data: ", r2_score(y_train, y_t_predict_rf))

25832764.853058342
20564601.174725942
r-squared for the test data:  0.7595421705667412
r-squared for the train data:  0.8231083239094181


### SVM Regressor

In [16]:
from sklearn.svm import SVR

svm = SVR(kernel="linear", C=1.0, epsilon=0.2)
svm.fit(x_train, y_train)

yhat_svm = svm.predict(x_test)


mse_test_svm = mean_squared_error(y_test, yhat_svm)
print(mse_test_svm)


# y_t_predict is the predicted y values for the x_train data
y_t_predict_svm = svm.predict(x_train)

# note that y_train is the true y value
mse_train_svm = mean_squared_error(y_train, y_t_predict_svm)
print(mse_train_svm)

print("r-squared for the test data: ", r2_score(y_test, yhat_svm))
    
print("r-squared for the train data: ", r2_score(y_train, y_t_predict_svm))

107124728.95281684
109419302.31971906
r-squared for the test data:  0.0028562583547495635
r-squared for the train data:  0.0588018887628563


### Conclusion

Comparing all posible models, linear regression performed the best.
SVM regressor performed the worst.