In [1]:
#import and alias pandas and numpy
import pandas as pd
import numpy as np

In [2]:
# Load the car purchasing CSV into a DataFrame (decoded as latin-1) and preview
# the first rows.
# NOTE(review): the path is absolute and specific to one machine; it has to be
# adjusted before the notebook can run anywhere else.
data = pd.read_csv(
    "/Users/christopherbond/Desktop/Portfolio Projects/car_purchasing.csv",
    encoding="latin-1",
)
data.head()

Unnamed: 0,customer name,customer e-mail,country,gender,age,annual Salary,credit card debt,net worth,car purchase amount
0,Martina Avila,cubilia.Curae.Phasellus@quisaccumsanconvallis.edu,Bulgaria,0,41.85172,62812.09301,11609.38091,238961.2505,35321.45877
1,Harlan Barnes,eu.dolor@diam.co.uk,Belize,0,40.870623,66646.89292,9572.957136,530973.9078,45115.52566
2,Naomi Rodriquez,vulputate.mauris.sagittis@ametconsectetueradip...,Algeria,1,43.152897,53798.55112,11160.35506,638467.1773,42925.70921
3,Jade Cunningham,malesuada@dignissim.com,Cook Islands,1,58.271369,79370.03798,14426.16485,548599.0524,67422.36313
4,Cedric Leach,felis.ullamcorper.viverra@egetmollislectus.net,Brazil,1,57.313749,59729.1513,5358.712177,560304.0671,55915.46248


In [32]:
# Report the (rows, columns) shape of the DataFrame.
# NOTE(review): this cell carries execution count 32, so it was run after the
# later cells. The recorded output (500, 6) matches the six-column frame built
# by the column-selection cell further down, not the nine-column frame that
# exists at this point when the notebook is run top to bottom.
data.shape

(500, 6)

In [3]:
# Collect the DataFrame's column labels into a plain Python list and display it.
col_list = list(data.columns)
col_list

['customer name',
 'customer e-mail',
 'country',
 'gender',
 'age',
 'annual Salary',
 'credit card debt',
 'net worth',
 'car purchase amount']

In [4]:
# Keep only the six columns used for modelling; the customer name, e-mail and
# country columns are left out.
data = data.loc[
    :,
    [
        "gender",
        "age",
        "annual Salary",
        "credit card debt",
        "net worth",
        "car purchase amount",
    ],
]

data.head()

Unnamed: 0,gender,age,annual Salary,credit card debt,net worth,car purchase amount
0,0,41.85172,62812.09301,11609.38091,238961.2505,35321.45877
1,0,40.870623,66646.89292,9572.957136,530973.9078,45115.52566
2,1,43.152897,53798.55112,11160.35506,638467.1773,42925.70921
3,1,58.271369,79370.03798,14426.16485,548599.0524,67422.36313
4,1,57.313749,59729.1513,5358.712177,560304.0671,55915.46248


In [5]:
# Count missing values per column to see whether anything needs to be handled
# before modelling.
data.isnull().sum()

gender                 0
age                    0
annual Salary          0
credit card debt       0
net worth              0
car purchase amount    0
dtype: int64

In [6]:
# Round every value to two decimal places, then preview the result.
data = data.round(decimals=2)
data.head()

Unnamed: 0,gender,age,annual Salary,credit card debt,net worth,car purchase amount
0,0,41.85,62812.09,11609.38,238961.25,35321.46
1,0,40.87,66646.89,9572.96,530973.91,45115.53
2,1,43.15,53798.55,11160.36,638467.18,42925.71
3,1,58.27,79370.04,14426.16,548599.05,67422.36
4,1,57.31,59729.15,5358.71,560304.07,55915.46


In [7]:
# Compute the pairwise (Pearson) correlation between all columns.
# Observation: credit card debt turned out to be a surprisingly weak factor.
corr_matrix = data.corr(method="pearson")
corr_matrix

Unnamed: 0,gender,age,annual Salary,credit card debt,net worth,car purchase amount
gender,1.0,-0.064487,-0.036499,0.024193,-0.008395,-0.066408
age,-0.064487,1.0,9.9e-05,0.034689,0.020365,0.63285
annual Salary,-0.036499,9.9e-05,1.0,0.049599,0.014767,0.617862
credit card debt,0.024193,0.034689,0.049599,1.0,-0.049378,0.028883
net worth,-0.008395,0.020365,0.014767,-0.049378,1.0,0.48858
car purchase amount,-0.066408,0.63285,0.617862,0.028883,0.48858,1.0


In [8]:
# Rank every column by its correlation with the target column
# "car purchase amount", strongest positive correlation first.
corr_matrix.loc[:, "car purchase amount"].sort_values(ascending=False)

car purchase amount    1.000000
age                    0.632850
annual Salary          0.617862
net worth              0.488580
credit card debt       0.028883
gender                -0.066408
Name: car purchase amount, dtype: float64

In [9]:
# Pull out the "car purchase amount" column; this Series is the target data.
data_target = data.loc[:, "car purchase amount"]

In [10]:
# Everything except the "car purchase amount" column forms the feature data.
data_features = data.drop(columns=["car purchase amount"])

In [11]:
# Import train_test_split from sklearn and split the feature (X) and target (y)
# data into training and test sets.
from sklearn.model_selection import train_test_split

# Split features and target in ONE call so the rows of X and y are shuffled
# together and stay aligned by construction, instead of relying on two separate
# calls happening to use the same random_state. 20% of the rows are held out
# for testing; random_state fixes the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    data_features, data_target, test_size=0.2, random_state=42
)

In [12]:
# Baseline model 1: GradientBoostingRegressor from sklearn.
from sklearn.ensemble import GradientBoostingRegressor

# Create the regressor with a fixed random_state.
gbr_reg = GradientBoostingRegressor(random_state=42)

# Train it on the training features and targets.
gbr_reg.fit(X=X_train, y=y_train)

In [13]:
# Baseline model 2: DecisionTreeRegressor from sklearn.
from sklearn.tree import DecisionTreeRegressor

# Create the regressor with a fixed random_state.
tree_reg = DecisionTreeRegressor(random_state=42)

# Train it on the training features and targets.
tree_reg.fit(X=X_train, y=y_train)

In [14]:
# Baseline model 3: RandomForestRegressor from sklearn.
from sklearn.ensemble import RandomForestRegressor

# Create the regressor with a fixed random_state.
forest_reg = RandomForestRegressor(random_state=42)

# Train it on the training features and targets.
forest_reg.fit(X=X_train, y=y_train)

In [15]:
# Import cross_val_score from sklearn.
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the GradientBoostingRegressor on the training
# data. The scorer returns negated RMSE values, so the sign is flipped to get
# positive per-fold RMSEs.
gbr_rmse = -cross_val_score(
    estimator=gbr_reg,
    X=X_train,
    y=y_train,
    scoring="neg_root_mean_squared_error",
    cv=10,
)

In [16]:
# 10-fold cross-validation of the DecisionTreeRegressor on the training data.
# The scorer returns negated RMSE values, so the sign is flipped to get
# positive per-fold RMSEs.
tree_rmse = -cross_val_score(
    estimator=tree_reg,
    X=X_train,
    y=y_train,
    scoring="neg_root_mean_squared_error",
    cv=10,
)

In [17]:
# 10-fold cross-validation of the RandomForestRegressor on the training data.
# The scorer returns negated RMSE values, so the sign is flipped to get
# positive per-fold RMSEs.
forest_rmse = -cross_val_score(
    estimator=forest_reg,
    X=X_train,
    y=y_train,
    scoring="neg_root_mean_squared_error",
    cv=10,
)

In [18]:
# Report the mean cross-validated RMSE of each baseline model.
# Gradient Boost appears to be the best so far.
print(f"GBR RMSE : {pd.Series(gbr_rmse).mean()!s}")
print(f"Tree RMSE: {pd.Series(tree_rmse).mean()!s}")
print(f"Forest RMSE: {pd.Series(forest_rmse).mean()!s}")

GBR RMSE : 1975.295235847699
Tree RMSE: 4114.8470994099325
Forest RMSE: 2707.673749257173


In [19]:
# GradientBoostingRegressor grid search.
# Numerous grid searches were performed previously to zero in on these parameters.

# Import the GridSearchCV class from the sklearn.model_selection library.
from sklearn.model_selection import GridSearchCV

# Search space: only n_estimators varies; max_depth and learning_rate are
# pinned to the values found in the earlier searches.
param_grid = [
    {"max_depth": [2], "n_estimators": [1475, 1500, 1525], "learning_rate": [0.3]},
]

# Instantiate the GradientBoostingRegressor model to tune.
opt_gbr_reg = GradientBoostingRegressor(random_state=42)

# Instantiate the grid search with 3-fold cross-validation.
# The scoring is set explicitly to negated RMSE so that the parameters are
# selected with the same metric used to compare models everywhere else in this
# notebook; without it GridSearchCV falls back to the estimator's default
# score (R^2 for regressors).
gbr_grid_search_cv = GridSearchCV(
    opt_gbr_reg,
    param_grid,
    scoring="neg_root_mean_squared_error",
    cv=3,
)

# Run the grid search on the training data.
gbr_grid_search_cv.fit(X_train, y_train)

# Print the best parameters.
print("The best parameters are: ", gbr_grid_search_cv.best_params_)

The best parameters are:  {'learning_rate': 0.3, 'max_depth': 2, 'n_estimators': 1500}


In [20]:
# Build a GradientBoostingRegressor from the best_params_ attribute of
# gbr_grid_search_cv, keeping the same fixed random_state.
optimal_gbr = GradientBoostingRegressor(
    random_state=42,
    **gbr_grid_search_cv.best_params_,
)

# Train the tuned regressor on the training data.
optimal_gbr.fit(X=X_train, y=y_train)

In [21]:
# 10-fold cross-validation of the tuned GradientBoostingRegressor on the
# training data. The scorer returns negated RMSE values, so the sign is flipped
# to get positive per-fold RMSEs.
gbr_opt_rmse = -cross_val_score(optimal_gbr, X_train, y_train,
                              scoring="neg_root_mean_squared_error", cv=10)

# Display the per-fold scores; a bare assignment would show nothing, while the
# recorded output of this cell is the score array.
gbr_opt_rmse

array([1589.66508541, 1782.4007327 , 1617.25731331, 1461.36396636,
       1742.05128581, 1791.93721193, 1584.63679331, 1513.69293508,
       1728.33725982, 1704.13874551])

In [22]:
# RandomForestRegressor grid search.

# Search space: max_depth and n_estimators vary; min_samples_split is pinned.
param_grid = [
    {
        "min_samples_split": [2],
        "max_depth": [15, 16, 17, 18, 19],
        "n_estimators": [34, 35, 36],
    }
]

# Instantiate the RandomForestRegressor model to tune.
opt_forest_reg = RandomForestRegressor(random_state=42)

# Instantiate the grid search with 3-fold cross-validation.
# The scoring is set explicitly to negated RMSE so that the parameters are
# selected with the same metric used to compare models everywhere else in this
# notebook; without it GridSearchCV falls back to the estimator's default
# score (R^2 for regressors).
forest_grid_search_cv = GridSearchCV(
    opt_forest_reg,
    param_grid,
    scoring="neg_root_mean_squared_error",
    cv=3,
)

# Run the grid search on the training data.
forest_grid_search_cv.fit(X_train, y_train)

# Print the best parameters.
print("The best parameters are: ", forest_grid_search_cv.best_params_)

The best parameters are:  {'max_depth': 17, 'min_samples_split': 2, 'n_estimators': 35}


In [23]:
# Build a RandomForestRegressor from the best_params_ attribute of
# forest_grid_search_cv, keeping the same fixed random_state.
opt_forest = RandomForestRegressor(
    random_state=42,
    **forest_grid_search_cv.best_params_,
)

# Train the tuned regressor on the training data.
opt_forest.fit(X=X_train, y=y_train)

In [24]:
# 10-fold cross-validation of the tuned RandomForestRegressor on the training
# data. The scorer returns negated RMSE values, so the sign is flipped to get
# positive per-fold RMSEs.
forest_opt_rmse = -cross_val_score(opt_forest, X_train, y_train,
                              scoring="neg_root_mean_squared_error", cv=10)

# Display the per-fold scores; a bare assignment would show nothing, while the
# recorded output of this cell is the score array.
forest_opt_rmse

array([2173.27607222, 3631.60080602, 2677.76703376, 2888.07769138,
       3461.99299627, 3563.25605457, 1267.08596968, 2356.07980699,
       2788.31156919, 2314.71926148])

In [25]:
# Report the mean cross-validated RMSE of the two tuned models.
# Gradient Boost appears to be the best so far.
print(f"Optimal GBR RMSE : {pd.Series(gbr_opt_rmse).mean()!s}")
print(f"Optimal Forest RMSE: {pd.Series(forest_opt_rmse).mean()!s}")

Optimal GBR RMSE : 1651.5481329233469
Optimal Forest RMSE: 2712.216726155525


In [30]:
# Import mean_squared_error here: no other cell of the notebook imports it, so
# on a fresh kernel the call below would otherwise raise a NameError.
from sklearn.metrics import mean_squared_error

# Generate predictions on the held-out X_test data using the tuned GBR model.
final_pred = optimal_gbr.predict(X_test)

# Calculate the RMSE between the predictions and y_test, rounded to 2 decimals.
# The square root is taken explicitly instead of passing squared=False, which
# is deprecated and no longer accepted by recent scikit-learn releases.
final_rmse = round(np.sqrt(mean_squared_error(y_test, final_pred)), 2)
final_rmse

1420.42

In [31]:
# Report the test-set RMSE of the final GBR model.
print(f"Final RMSE : {final_rmse!s}")

Final RMSE : 1420.42
