# Importing Modules


In [63]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Importing the dataset

In [64]:
# Load the raw CSV. Every column except the last becomes a feature (X) and the
# last column becomes the regression target (y), both as NumPy arrays.
# NOTE(review): all leading columns are kept as features, including any
# identifier-like column the file may contain — confirm against the CSV schema.
dataset = pd.read_csv('CarPrice_Assignment.csv')
X, y = dataset.iloc[:, :-1].values, dataset.iloc[:, -1].values


# Encoding categorical data

In [65]:
# Positional indices (into X) of the columns that are one-hot encoded.
categorical_columns = [2, 3, 4, 5, 6, 7, 8, 14, 15, 17]

# One-hot encode the listed columns and pass every other column through
# unchanged. ColumnTransformer places the encoder output first and appends the
# passthrough columns on the right, so its result is already the complete
# feature matrix (sparse or dense, depending on its density).
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), categorical_columns)],
    remainder='passthrough',
)

# NOTE: X is overwritten here, so this cell is not idempotent — re-run the
# dataset-loading cell before re-running this one.
# NOTE(review): the encoder is fitted on all rows before the train/test split;
# fit on the training rows only if strict separation is required.
X = ct.fit_transform(X)

# The previous `hstack((X[:, :-9].toarray(), X[:, -9:]))` step was removed: it
# split the matrix at an arbitrary column and glued the two halves back in the
# same order (no change to the values), and its `.toarray()` call would raise
# if ColumnTransformer had returned a dense array.

# Splitting the dataset into the training set and test set



In [66]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Training the Multiple Linear Regression model on the training set

In [61]:
# Fit a multiple linear regression on the training split.
# NOTE(review): this cell's recorded execution count (61) is lower than the
# split cell's (66), so the saved outputs may come from out-of-order
# execution — restart the kernel and run all cells to confirm.
regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

# Predicting the test set results

In [62]:

# Predict prices for the held-out rows.
y_pred = regressor.predict(X_test)

# Print predictions next to the actual values, two decimals each:
# column 0 = predicted, column 1 = actual.
np.set_printoptions(precision=2)
print(np.column_stack((y_pred, y_test)))

# Mean absolute error: average size of the prediction miss, in target units.
diff = np.abs(y_pred - y_test)
mae = np.mean(diff)
print("Mean Absolute Error:", mae)

# Coefficient of determination (R^2) on the test set, computed here so that
# any R-squared figure quoted in the write-up can be reproduced from code.
# R^2 = 1 - SS_res / SS_tot; undefined (NaN) when the actual values are constant.
ss_res = np.sum((y_test - y_pred) ** 2)
ss_tot = np.sum((y_test - np.mean(y_test)) ** 2)
r2 = 1 - ss_res / ss_tot if ss_tot > 0 else float('nan')
print("R-squared:", r2)

[[ 5657.93  6795.  ]
 [14590.71 15750.  ]
 [14333.79 15250.  ]
 [-4707.21  5151.  ]
 [11294.23  9995.  ]
 [11952.47 11199.  ]
 [10907.31  5389.  ]
 [ 5977.92  7898.  ]
 [17334.3  17199.  ]
 [10238.53  6529.  ]
 [ 4524.7  20970.  ]
 [21935.26 31400.5 ]
 [ 9374.17 10945.  ]
 [21676.46 18344.  ]
 [ 8973.92  8916.5 ]
 [ 9587.94  9989.  ]
 [11460.15  9295.  ]
 [18769.44 18920.  ]
 [12318.09  7895.  ]
 [ 8569.61  6488.  ]
 [ 9593.65  9959.  ]
 [11401.14 15580.  ]
 [14114.89  9895.  ]
 [12302.57 11549.  ]
 [17514.69 15998.  ]
 [ 5398.22  5118.  ]
 [ 8798.28  6938.  ]
 [ 8776.51 16695.  ]
 [ 6427.56  8358.  ]
 [ 5846.72  5499.  ]
 [12116.68  7975.  ]
 [12842.73 12290.  ]
 [14766.57 22018.  ]
 [10387.    8948.  ]
 [ 7168.2   6849.  ]
 [15340.67 41315.  ]
 [17695.3  11595.  ]
 [16974.29 18150.  ]
 [ 7512.26  6377.  ]
 [35550.39 45400.  ]
 [ 7902.52  8916.5 ]]
Mean Absolute Error: 3629.857989371545


# Insights
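The linear regression model has a reported R-squared value of 0.87, which would indicate that the model explains 87% of the variability in the data and is a reasonably good fit. Note that this figure does not appear in the recorded cell output above, so it should be re-verified on the test set before being relied on.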

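The mean absolute error of the model on the test set is 3629.86, which means that, on average, the model's predictions are off by $3,629.86. This error is relatively high for a car price prediction model: the actual test prices shown above range from roughly 5,100 to 45,400, one prediction is negative (-4707.21 against an actual price of 5151), and the most expensive cars are badly under-predicted (e.g., 15340.67 predicted against 41315 actual). There is clear room for improvement.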

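The features expected to be most important in predicting car prices are the car's make, fuel type, engine size, curb weight, horsepower, city and highway miles per gallon, and the car's age. The notebook does not yet inspect the fitted coefficients, so this ranking still needs to be confirmed.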

To improve the model's performance, we can try regularized regression techniques, such as Ridge or Lasso regression, to reduce overfitting and improve the model's ability to generalize to unseen data.

We can also try adding more features that may have a significant impact on car prices, such as the car's transmission type, number of cylinders, and safety features.