In [1]:
import pandas as pd

In [2]:
car_sales = pd.read_csv("car-sales-extended.csv")
car_sales.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043


In [3]:
X = car_sales.drop("Price",axis=1)
y = car_sales["Price"]
X.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors
0,Honda,White,35431,4
1,BMW,Blue,192714,5
2,Honda,White,84714,4
3,Toyota,White,154365,4
4,Nissan,Blue,181577,3


In [4]:
y.head()

0    15323
1    19943
2    28343
3    13434
4    14043
Name: Price, dtype: int64

# differentiate training and testing data

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
train_X,test_X,train_y,test_y = train_test_split(X,y,test_size=0.2)

In [7]:
train_X.shape,test_X.shape,train_y.shape,test_y.shape

((800, 4), (200, 4), (800,), (200,))

In [8]:
# Convert data into numbers

In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [10]:
colection = ['Make', 'Colour', 'Doors']
one_hot = OneHotEncoder()
transformer = ColumnTransformer([('one_hot',
                                 one_hot,
                                 colection)],
                               remainder = 'passthrough')
transformed_X = transformer.fit_transform(X)
pd.DataFrame(transformed_X).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,35431.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,192714.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,84714.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,154365.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,181577.0


In [11]:
dummy = pd.get_dummies(car_sales[['Make','Colour','Doors']])
dummy.head()

Unnamed: 0,Doors,Make_BMW,Make_Honda,Make_Nissan,Make_Toyota,Colour_Black,Colour_Blue,Colour_Green,Colour_Red,Colour_White
0,4,0,1,0,0,0,0,0,0,1
1,5,1,0,0,0,0,1,0,0,0
2,4,0,1,0,0,0,0,0,0,1
3,4,0,0,0,1,0,0,0,0,1
4,3,0,0,1,0,0,1,0,0,0


In [12]:
#Built model
train_X,test_X,train_y,test_y = train_test_split(transformed_X,y,test_size=0.2)

In [13]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_estimators=100)
model.fit(train_X,train_y)

RandomForestRegressor()

In [14]:
model.score(test_X,test_y)

0.14386489127179325

In [15]:
model.score(train_X,train_y)

0.9003381338008585

# Fill missing data with sklearn

In [16]:
car_miss = pd.read_csv("car-sales-extended-missing-data.csv")
car_miss.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [17]:
car_miss.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [18]:
car_miss.dropna(subset=["Price"],inplace=True)

In [19]:
car_miss.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [20]:
X = car_miss.drop('Price',axis=1)
y=car_miss['Price']

# Split data into train and test

In [21]:
from sklearn.model_selection import train_test_split
train_X,test_X,train_y,test_y = train_test_split(X,y,test_size=0.2)

# import sklearn library

In [22]:

from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [23]:
#Create imputers
cat_imputer = SimpleImputer(strategy='constant',fill_value="missing")
door_imputer = SimpleImputer(strategy='constant',fill_value=4)
odometer_imputer = SimpleImputer(strategy='mean')

In [24]:
#create features
cat_features = ["Make","Colour"]
door_features = ["Doors"]
odometer_features = ["Odometer (KM)"]

In [25]:
#Create imputer
imputer = ColumnTransformer([
('cat_imputer',cat_imputer,cat_features),
('door_imputer',door_imputer,door_features),
('odometer_imputer',odometer_imputer,odometer_features)],
remainder="passthrough")


In [26]:
#Apply
X_train = imputer.fit_transform(train_X)
X_test = imputer.fit_transform(test_X)
X_test = pd.DataFrame(X_test,columns=['Make',"Colour","Doors","Odometer (KM)"])

# Convert string into numbers


In [27]:
X_train = pd.DataFrame(X_train,columns=['Make',"Colour","Doors","Odometer (KM)"])
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

one_hot = OneHotEncoder()
feature = ['Make','Colour','Doors']
transformor = ColumnTransformer([('one_hot',one_hot,feature)],
                               remainder='passthrough')

X_train = transformor.fit_transform(X_train)
X_test = transformor.fit_transform(X_test)

# Built model

In [28]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()
model.fit(X_train,train_y)
model.score(X_test,test_y)

0.23005780160939027

In [29]:
model.score(X_train,train_y)

0.87737879445849

In [30]:
car_sales = pd.read_csv('car-sales-extended.csv')
car_sales.head(2)

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943


In [31]:
car_sales.isna().sum()

Make             0
Colour           0
Odometer (KM)    0
Doors            0
Price            0
dtype: int64

In [32]:
len(car_sales['Make'])

1000

# Converting x and y

In [33]:
X = car_sales.drop("Price",axis=1)
y = car_sales["Price"]
#z = car_sales['Odometer (KM)']

# Spliting into train and test

In [34]:
from sklearn.model_selection import train_test_split
train_X,test_X,train_y,test_y = train_test_split(X,y,test_size=0.2)

# Converting train_x and test_X into numbers

In [35]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

change = ["Make","Colour","Doors"]
one_hot = OneHotEncoder()

transformer = ColumnTransformer([("one_hot",one_hot,change)],
                               remainder = 'passthrough')

X_train = transformer.fit_transform(train_X)
X_test = transformer.fit_transform(test_X)
#X_test = pd.DataFrame(X_test)
#X_train = pd.DataFrame(X_train,columns = ['Make',"Colour","Doors"])
#X_train

# Training model

In [36]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()
model.fit(X_train,train_y)

RandomForestRegressor()

In [37]:
model.score(X_test,test_y)

0.3379284330702039

In [38]:
car_sales.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043


#  Get data for prediction

In [39]:
data = [['BMW','Blue',192714,'5'],
        ['Honda','White',35431,'4'],
        ['Nissan','Black',192714,'3'],
         ['Toyota','Green',192714,'3'],
         ['Nissan','Red',192714,'3']]
pridict = pd.DataFrame(data,columns=['Make','Colour','Odometer (KM)',"Doors"])
pridict

Unnamed: 0,Make,Colour,Odometer (KM),Doors
0,BMW,Blue,192714,5
1,Honda,White,35431,4
2,Nissan,Black,192714,3
3,Toyota,Green,192714,3
4,Nissan,Red,192714,3


# Change it into numbers

In [40]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
change = ["Make","Colour","Doors"]
one_hot = OneHotEncoder()

transformer = ColumnTransformer([("one_hot",one_hot,change)],
                               remainder = 'passthrough')

predict = transformer.fit_transform(pridict)

In [42]:
model.predict(predict)

array([28897.95, 15866.35, 11288.83, 13157.12, 13049.11])