Price Prediction With Regression Analysis in R

In [4]:
# Install the third-party packages used in this notebook, but only those that
# are not already available, so re-running the cell does not download and
# reinstall them every time.
required_pkgs <- c("corrplot", "glmnet")
is_installed <- vapply(
  required_pkgs, requireNamespace, logical(1),
  quietly = TRUE
)
missing_pkgs <- required_pkgs[!is_installed]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}



The downloaded binary packages are in
	/var/folders/11/mktbwy31519g01sbl5b6y3_c0000gn/T//RtmpeZeGIR/downloaded_packages

The downloaded binary packages are in
	/var/folders/11/mktbwy31519g01sbl5b6y3_c0000gn/T//RtmpeZeGIR/downloaded_packages


In [5]:
library(corrplot)
library(glmnet)

corrplot 0.92 loaded

Loading required package: Matrix

Loaded glmnet 4.1-8



Task 2: Load the Dataset

In [None]:
# Read the CSV file into a data frame
dataset <- read.csv(file = "scrap_price.csv")
# Print the whole data frame for a first look at its contents
print(dataset)

Task 3: Explore the Dataset

In [None]:
# Report the dimensions of the data frame (number of rows, number of columns)
dim(dataset)
# Display the internal structure: each column's type and its first values
str(dataset)

In [None]:
# Collect the distinct values of every column
ulist <- lapply(X = dataset, FUN = unique)
# Count how many distinct values each column has
ulen <- vapply(ulist, length, integer(1))
# Show the per-column counts
print(ulen)

Task 4: Preprocess the Dataset

In [None]:
# Drop the first and third columns; all remaining columns are kept
newdata <- dataset[, -c(1, 3)]

# Show the first six rows of the reduced data frame
print(head(newdata, n = 6))

In [None]:
# Convert the categorical columns to integer codes.
# Since R 4.0, read.csv() returns text columns as character vectors, and
# unclass() leaves character vectors unchanged, so the columns would stay
# non-numeric and the later scale() call would stop with
# "'x' must be numeric". factor() assigns codes from the sorted unique values,
# which matches what unclass() yields when the columns are already factors.
categorical_cols <- c(
  "fueltypes", "aspiration", "doornumbers", "carbody", "drivewheels",
  "enginelocation", "enginetype", "cylindernumber", "fuelsystem"
)
newdata[categorical_cols] <- lapply(
  newdata[categorical_cols],
  function(column) as.integer(factor(column))
)

# Print the head of the preprocessed data frame
print(head(newdata))

In [None]:
# Centre every column on its mean and divide by its standard deviation
newdata <- scale(newdata, center = TRUE, scale = TRUE)

# scale() returns a matrix, so turn the result back into a data frame
dataframe <- as.data.frame(newdata)

# Show the first rows of the scaled data frame
print(head(dataframe))

Task 5: Find Outliers in the Dataset

In [None]:
# Draw one boxplot per column, arranged four to a page in a 2 x 2 grid
par(mfrow = c(2, 2))
# seq_along() gives an empty sequence when there are no columns, whereas
# 1:length() would iterate over c(1, 0)
for (i in seq_along(dataframe)) {
  boxplot(
    dataframe[, i],
    main = names(dataframe)[i], type = "l", outcol = "blue"
  )
}

Task 6: Feature Correlation Visualisation

In [None]:
# Compute the pairwise (Pearson) correlation matrix of all columns
dataframe.cor <- cor(x = dataframe, method = "pearson")
# Visualise the correlation matrix
corrplot(corr = dataframe.cor)

Linear Regression

Task 7: Split Test and Train Data

In [None]:
# Fix the random seed so the split below is reproducible
set.seed(1)

# Label each row 1 (training, probability 0.7) or 2 (test, probability 0.3);
# the split is random per row, so the resulting sizes are only roughly 70/30
sample <- sample(
  1:2,
  size = nrow(dataframe), replace = TRUE, prob = c(0.7, 0.3)
)
test <- dataframe[sample == 2, ]
train_set <- dataframe[sample == 1, ]

In [None]:
# Separate column 24 (used as the target) from the remaining columns for the
# complete dataset
# NOTE(review): assumes column 24 is `price`, the response in the lm() formula
y_var_df <- dataframe[[24]]
x_vars_df <- dataframe[, -24]

# Do the same for the test data
test_set_y <- test[[24]]
test_set_x <- test[, -24]

In [None]:
# Print the number of rows in the training set, then in the test set
print(nrow(train_set))
print(nrow(test))

Task 8: Train the Model 

In [None]:
# Fit an ordinary least-squares model of price on the 23 listed predictors,
# using the training rows only
fit_train <- lm(
  formula = price ~ symboling + fueltypes + aspiration + doornumbers +
    carbody + drivewheels + enginelocation + wheelbase + carlength +
    carwidth + carheight + curbweight + enginetype + cylindernumber +
    enginesize + fuelsystem + boreratio + stroke + compressionratio +
    horsepower + peakrpm + citympg + highwaympg,
  data = train_set
)

In [None]:
# Print the summary of the fitted linear model
print(summary(fit_train))

Task 9: Test the Model

In [None]:
# Predict the target for the test rows with the fitted linear model
fit_test <- predict(object = fit_train, newdata = test_set_x)

In [None]:
# Plot the linear-model predictions against each test feature, four plots to
# a page in a 2 x 2 grid
par(mfrow = c(2, 2))
# seq_along() gives an empty sequence when there are no columns, whereas
# 1:length() would iterate over c(1, 0)
for (i in seq_along(test_set_x)) {
  feature_name <- names(test_set_x)[i]
  main_label <- paste("Relation b/w", feature_name, "& Price")
  plot(
    test_set_x[, i], fit_test,
    main = main_label, xlab = feature_name, ylab = "Price ", pch = 19
  )
  # Blue: lowess smooth of the predictions; red: lowess smooth of the
  # observed test values
  lines(lowess(test_set_x[, i], fit_test), col = "blue")
  lines(lowess(test_set_x[, i], test_set_y), col = "red")
}

Task 10: Evaluate the Model

In [None]:
# Mean squared error of the linear-model predictions on the test data
squared_errors <- (fit_test - test_set_y)^2
mse <- mean(squared_errors)
print(mse)

In [None]:
# Sum of squared errors between predictions and observed test values
sse <- sum((fit_test - test_set_y)^2)
# Total sum of squares of the observed test values around their mean
sst <- sum((test_set_y - mean(test_set_y))^2)

# R-squared on the test data
rsq <- 1 - sse / sst
print(rsq)

Lasso Regression

Task 11: Prepare the Data 

In [None]:
# Convert the data to the matrix/vector form used by the lasso cells below

# Complete dataset
x_vars <- as.matrix(x_vars_df)
y_var <- as.matrix(y_var_df)

# Training data: every column except column 24 as predictors, column 24 as
# the target
train <- as.matrix(train_set)
x_train <- as.matrix(train_set[, -24])
y_train <- as.vector(train_set[, 24])

# Test data
x_test <- as.matrix(test_set_x)
y_test <- as.vector(test_set_y)

In [None]:
#Verify the split
print(length(y_train))
print(length(y_test))

Task 12: Create and Run Lasso Regression Model

In [None]:
# Candidate penalty values: 10^-2 up to 10^2 in exponent steps of 0.1
lambda_seq <- 10^seq(from = -2, to = 2, by = 0.1)

# Cross-validate the lasso (alpha = 1) over the candidate penalties, 5 folds
cv_output <- cv.glmnet(
  x_train, y_train,
  alpha = 1, lambda = lambda_seq, nfolds = 5
)

Task 13: Find the Best Lambda Value

In [None]:
# Plot the cross-validation result object returned by cv.glmnet()
plot(cv_output)

In [None]:
# Take the lambda.min value stored in the cross-validation result
best_lam <- cv_output[["lambda.min"]]
print(best_lam)

Task 14: Build Lasso Model With the Best Lambda

In [None]:
# Fit the lasso model on the training data at the selected penalty
lasso_best <- glmnet(x_train, y_train, alpha = 1, lambda = best_lam)

# Extract the fitted coefficients once and display all of them
lasso_coefs <- coef(lasso_best)
print(lasso_coefs)

# List the coefficients that are exactly zero
print("Following variables are declared irrelevant by the model:")
print(lasso_coefs[lasso_coefs[, 1] == 0, ])

Task 15: Test the Model

In [None]:
# Predict the target for the test rows with the lasso model at best_lam
pred <- predict(object = lasso_best, newx = x_test, s = best_lam)

In [None]:
# Plot the lasso predictions against each test feature, four plots to a page
# in a 2 x 2 grid
par(mfrow = c(2, 2))
# seq_along() gives an empty sequence when there are no columns, whereas
# 1:length() would iterate over c(1, 0)
for (i in seq_along(test_set_x)) {
  feature_name <- names(test_set_x)[i]
  main_label <- paste("Relation b/w", feature_name, "& Price")
  plot(
    test_set_x[, i], pred,
    main = main_label, xlab = feature_name, ylab = "Price ", pch = 19
  )
  # Blue: lowess smooth of the predictions; red: lowess smooth of the
  # observed test values
  lines(lowess(test_set_x[, i], pred), col = "blue")
  lines(lowess(test_set_x[, i], y_test), col = "red")
}

Task 16: Evaluate the Model

In [None]:
# Mean squared error of the lasso predictions on the test data
squared_errors <- (pred - y_test)^2
mse <- mean(squared_errors)
print(mse)

In [None]:
# Sum of squared errors between lasso predictions and observed test values
sse <- sum((pred - y_test)^2)
# Total sum of squares of the observed test values around their mean
sst <- sum((y_test - mean(y_test))^2)

# R-squared on the test data
rsq <- 1 - sse / sst
print(rsq)