## Importing Data

In [1]:
   import numpy as np
   import pandas as pd
   import matplotlib.pyplot as pit
   import seaborn as sns; sns.set()
   import warnings
   warnings.filterwarnings("ignore")

## Loading data

In [2]:
train = pd.read_csv('Train (2).csv')
train.sample(10)

Unnamed: 0,VehicleID,Location,Maker,Model,Year,Colour,Amount (Million Naira),Type,Distance
2930,VHL14011,Abuja,Acura,TL,2009,Black,3.94,Nigerian Used,10351.0
1420,VHL17905,Lagos,Kia,Cerato,2011,Gray,1.46,Brand New,102387.0
5840,VHL18444,Lagos,Mercedes-Benz,M Class,2014,Black,12.8,Foreign Used,84570.0
1895,VHL17160,Lagos,Toyota,Highlander XLE 4x4 V6 (3.5L 6cyl 8A),2018,Gray,15.0,Foreign Used,63000.0
529,VHL14595,Abuja,Mercedes-Benz,GLK-Class 350 4MATIC,2012,Black,7.35,Foreign Used,
2518,VHL18524,Abuja,Hyundai,Genesis 5 RWD,2016,Black,27.0,Foreign Used,69000.0
1272,VHL17293,Lagos,BMW,523i,2006,Brown,2.81,Nigerian Used,291386.0
5301,VHL12691,Abuja,Acura,MDX,2015,Black,6.5,Foreign Used,38000.0
2171,VHL10114,Lagos,Toyota,Camry XLE V6 FWD,2020,Black,28.5,Brand New,
415,VHL15466,Ibadan,Toyota,Corolla,2003,Gold,1.15,Nigerian Used,99000.0


In [3]:
train.shape

(7205, 9)

In [4]:
rows, columns = train.shape
print(f'We have {rows} rows, and {columns} columns')

We have 7205 rows, and 9 columns


In [5]:
train.isna().sum()

VehicleID                    0
Location                     0
Maker                        0
Model                        0
Year                        21
Colour                       0
Amount (Million Naira)      17
Type                       197
Distance                  2360
dtype: int64

In [6]:
train.head()

Unnamed: 0,VehicleID,Location,Maker,Model,Year,Colour,Amount (Million Naira),Type,Distance
0,VHL12546,Abuja,Honda,Accord Coupe EX V-6,2011,Silver,2.2,Nigerian Used,
1,VHL18827,Ibadan,Hyundai,Sonata,2012,Silver,3.5,Nigerian Used,125000.0
2,VHL19499,Lagos,Lexus,RX 350,2010,Red,9.2,Foreign Used,110852.0
3,VHL17991,Abuja,Mercedes-Benz,GLE-Class,2017,Blue,22.8,Foreign Used,30000.0
4,VHL12170,Ibadan,Toyota,Highlander,2002,Red,2.6,Nigerian Used,125206.0


In [7]:
#Exploratory Data
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7205 entries, 0 to 7204
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   VehicleID               7205 non-null   object 
 1   Location                7205 non-null   object 
 2   Maker                   7205 non-null   object 
 3   Model                   7205 non-null   object 
 4   Year                    7184 non-null   object 
 5   Colour                  7205 non-null   object 
 6   Amount (Million Naira)  7188 non-null   float64
 7   Type                    7008 non-null   object 
 8   Distance                4845 non-null   object 
dtypes: float64(1), object(8)
memory usage: 506.7+ KB


In [55]:
#feature eng
train_red = train[['Location','Maker','Type','Model','Year','Amount (Million Naira)']]
train_red.head()

Unnamed: 0,Location,Maker,Type,Model,Year,Amount (Million Naira)
0,Abuja,Honda,Nigerian Used,Accord Coupe EX V-6,2011,2.2
1,Ibadan,Hyundai,Nigerian Used,Sonata,2012,3.5
2,Lagos,Lexus,Foreign Used,RX 350,2010,9.2
3,Abuja,Mercedes-Benz,Foreign Used,GLE-Class,2017,22.8
4,Ibadan,Toyota,Nigerian Used,Highlander,2002,2.6


In [56]:
#Those code drops the null values in a dataset
train_red.dropna(inplace=True)
train_red.isna().any()

Location                  False
Maker                     False
Type                      False
Model                     False
Year                      False
Amount (Million Naira)    False
dtype: bool

In [57]:
for column in train_red.columns:
    print(column,">>>>>",train_red[column].nunique())

Location >>>>> 3
Maker >>>>> 54
Type >>>>> 3
Model >>>>> 1180
Year >>>>> 33
Amount (Million Naira) >>>>> 679


In [58]:
#Data PreProcessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split,KFold
from sklearn.preprocessing import LabelEncoder,PolynomialFeatures
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

In [59]:
train_red.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6972 entries, 0 to 7204
Data columns (total 6 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Location                6972 non-null   object 
 1   Maker                   6972 non-null   object 
 2   Type                    6972 non-null   object 
 3   Model                   6972 non-null   object 
 4   Year                    6972 non-null   object 
 5   Amount (Million Naira)  6972 non-null   float64
dtypes: float64(1), object(5)
memory usage: 381.3+ KB


In [60]:
cat_col = ['Maker','Model','Type']

for col in cat_col:
        lab = LabelEncoder()
        lab.fit(train_red[col])
        train_red[col] = lab.transform(train_red[col])

In [61]:
train_red['Year'] = train_red['Year'].str.replace(',','')

In [62]:
locat = pd.get_dummies(train_red['Location'],drop_first = True)

In [63]:
all_data = pd.concat([train_red.drop(['Location'],axis=1),locat], axis=1).reset_index(drop = True)
all_data.sample(10)

Unnamed: 0,Maker,Type,Model,Year,Amount (Million Naira),Ibadan,Lagos
2845,28,1,402,2008,4.55,0,1
1449,51,1,365,2004,2.65,0,0
3963,15,1,92,2010,5.5,0,1
4623,15,1,1169,2019,44.0,0,1
2289,33,1,385,2010,5.8,0,0
3507,28,1,402,2008,4.6,0,1
3690,33,1,708,2009,5.9,0,0
5436,16,1,139,2020,13.5,0,0
3747,28,1,402,2007,4.5,0,1
6060,51,1,1049,1999,3.4,0,1


In [64]:
features = all_data.drop('Amount (Million Naira)',axis=1)
labels = train_red['Amount (Million Naira)']

In [65]:
random_state = 42
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=random_state)

In [66]:
#Modelling
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, BaggingRegressor, VotingRegressor
from sklearn.linear_model import LinearRegression,Lasso,Ridge

In [67]:
model = LinearRegression()
model.fit(X_train,y_train)
Accuracy = model.score(X_test,y_test)
print(f"The model's Accuracy is {Accuracy}")

The model's Accuracy is 0.2651544404025463


In [68]:
model2 = DecisionTreeRegressor()
model2.fit(X_train,y_train)
Accuracy = model2.score(X_test,y_test)
print(f"The model's Accuracy is {Accuracy}")

The model's Accuracy is 0.787579601773709


In [69]:
model3 = RandomForestRegressor()
model3.fit(X_train,y_train)
Accuracy = model3.score(X_test,y_test)
print(f"The model's Accuracy is {Accuracy}")

The model's Accuracy is 0.7727066303002809


In [70]:
model4 = ExtraTreesRegressor()
model4.fit(X_train,y_train)
Accuracy = model4.score(X_test,y_test)
print(f"The model's Accuracy is {Accuracy}")

The model's Accuracy is 0.7531625832095591


In [71]:
model5 = BaggingRegressor()
model5.fit(X_train,y_train)
Accuracy = model5.score(X_test,y_test)
print(f"The model's Accuracy is {Accuracy}")

The model's Accuracy is 0.7756679124665795


In [72]:
y_train

4029     5.50
6147     3.26
673     11.50
33       2.40
3172     3.35
        ...  
3891     6.00
5362     1.24
5397     2.25
5564     7.50
884     17.50
Name: Amount (Million Naira), Length: 5577, dtype: float64

In [73]:
#prediction
X_test.head()

Unnamed: 0,Maker,Type,Model,Year,Ibadan,Lagos
132,28,1,881,2010,1,0
5571,16,2,115,2003,0,1
2168,16,2,97,2010,0,0
4086,32,2,252,2014,0,0
4787,51,2,547,2010,0,1


In [75]:
train_red.head()

Unnamed: 0,Location,Maker,Type,Model,Year,Amount (Million Naira)
0,Abuja,16,2,116,2011,2.2
1,Ibadan,18,2,1019,2012,3.5
2,Lagos,28,1,881,2010,9.2
3,Abuja,33,1,490,2017,22.8
4,Ibadan,51,2,548,2002,2.6


In [79]:
pred = model2.predict(np.array([[16,2,116,2011,15,0]]))
print(pred)

[2.2]


In [80]:
#how to save model
import pickle #for small models
import joblib #for large models

In [81]:
with open('model.pkl','wb')as model_file:
    pickle.dump(model,model_file)

In [82]:
#loading model with pickle
with open('model.pkl','rb')as model_file:
    loaded_model = pickle.load(model_file)

In [None]:
#joblib
#dump(model,'model.joblib')
#loaded = load('model.joblib')