In [1]:
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

In [2]:
# Read ford.csv into a pandas DataFrame, print its (rows, columns) shape,
# and display the leading rows as the cell's output.
data = pd.read_csv(filepath_or_buffer='ford.csv')
print(data.shape)
data.head(n=5)

(17965, 9)


Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
0,Fiesta,2017,12000,Automatic,15944,Petrol,150,57.7,1.0
1,Focus,2018,14000,Manual,9083,Petrol,150,57.7,1.0
2,Focus,2017,13000,Manual,12456,Petrol,150,57.7,1.0
3,Fiesta,2019,17500,Manual,10460,Petrol,145,40.3,1.5
4,Fiesta,2019,16500,Automatic,1482,Petrol,145,48.7,1.0


In [3]:
# Count the missing values in each column (isna is the canonical name; isnull is its alias)
data.isna().sum()

model           0
year            0
price           0
transmission    0
mileage         0
fuelType        0
tax             0
mpg             0
engineSize      0
dtype: int64

In [4]:
# Encoding categorical data: replace each text column with integer codes.
# pd.factorize returns (codes, uniques); only the codes are kept, and they are
# assigned in order of first appearance within the column.
for column in ('model', 'transmission', 'fuelType'):
    data[column] = pd.factorize(data[column])[0]

In [5]:
# Preview the leading rows again to confirm the categorical columns now hold integer codes
data.head(n=5)

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
0,0,2017,12000,0,15944,0,150,57.7,1.0
1,1,2018,14000,1,9083,0,150,57.7,1.0
2,1,2017,13000,1,12456,0,150,57.7,1.0
3,0,2019,17500,1,10460,0,145,40.3,1.5
4,0,2019,16500,0,1482,0,145,48.7,1.0


In [6]:
# Splitting features and target column: 'price' is the value to predict,
# every remaining column is used as a feature.
Y = data.loc[:, 'price']
X = data.drop(columns=['price'])

In [7]:
# Hold out 10% of the rows for testing BEFORE fitting the scaler, so that the
# scaler's per-column min/max are learned from the training rows only and no
# information about the test rows leaks into preprocessing.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.1, random_state=0
)

# Normalization of data: fit the min-max scaling on the training split, then
# apply that same fitted transformation to the test split. Test values outside
# the range seen during training may therefore fall slightly outside [0, 1].
scaler = MinMaxScaler(copy=True, feature_range=(0, 1))
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [9]:
# Using random forest to train model: 50 trees, depth capped at 14, with a
# fixed seed so repeated runs build the same forest. fit() returns the fitted
# estimator itself, so construction and training are chained.
RandomForestRegressorModel = RandomForestRegressor(
    n_estimators=50,
    max_depth=14,
    random_state=10,
).fit(X_train, Y_train)

# score() on a regressor reports the R^2 coefficient of determination.
print(
    'Random Forest Regressor Train Score is : ',
    RandomForestRegressorModel.score(X_train, Y_train),
)
print(
    'Random Forest Regressor Test Score is : ',
    RandomForestRegressorModel.score(X_test, Y_test),
)

Random Forest Regressor Train Score is :  0.9735143403178541
Random Forest Regressor Test Score is :  0.9468630117112607


In [10]:
# Ordinary least-squares baseline.
# The former `normalize=True` argument is intentionally omitted: it was
# deprecated in scikit-learn 1.0 and removed in 1.2, so passing it raises a
# TypeError on current releases. With an intercept fitted, unregularised
# least-squares predictions do not depend on feature scaling, so dropping it
# leaves the reported scores unaffected.
LinearRegressionModel = LinearRegression(fit_intercept=True, copy_X=True, n_jobs=-1)
LinearRegressionModel.fit(X_train, Y_train)

# score() on a regressor reports the R^2 coefficient of determination.
print('Linear Regression Train Score is : ' , LinearRegressionModel.score(X_train, Y_train))
print('Linear Regression Test Score is : ' , LinearRegressionModel.score(X_test, Y_test))

# Predictions on the held-out rows, kept for the error metric computed in the
# model-evaluation cell.
Y_pred = LinearRegressionModel.predict(X_test)

Linear Regression Train Score is :  0.734289335217472
Linear Regression Test Score is :  0.7516991520804799


In [11]:
# Model evaluation: mean squared error of the linear-regression predictions
# (Y_pred) against the held-out test targets.
MSEValue = metrics.mean_squared_error(y_true=Y_test, y_pred=Y_pred)
print('Mean Squared Error Value is : ', MSEValue)

Mean Squared Error Value is :  6000742.384061105
