# Data Manipulation and Cleaning

In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

In [2]:
## Make sure all columns are numerical
extended_car_sales_mising_data = r"C:\Users\cos_9\PycharmProjects\machine_learning_and_data_science_bootcamp\resources\car-sales-extended-missing-data.csv"
missing_car_sales = r"C:\Users\cos_9\PycharmProjects\machine_learning_and_data_science_bootcamp\resources\car-sales-missing-data.csv"
extended_car_sales = r"C:\Users\cos_9\PycharmProjects\machine_learning_and_data_science_bootcamp\resources\car-sales-extended.csv"
car_sales = pd.read_csv(extended_car_sales)

car_sales.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043


In [3]:
len(car_sales)

1000

In [4]:
car_sales.dtypes

Make             object
Colour           object
Odometer (KM)     int64
Doors             int64
Price             int64
dtype: object

In [5]:
# Split the data into X and y

X = car_sales.drop("Price", axis=1)

y = car_sales.Price

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [7]:
model = RandomForestRegressor()

model.fit(X_train, y_train)

ValueError: could not convert string to float: 'Nissan'

#### NOTE: 

Our machine learning model cannot deal with strings so it outputs an error. Hence we need to convert those strings into some sort of numerical value. That will allow our Regressor to use the columns as features in our prediction algorithm

In [8]:
# Turn categories into numbers

categorical_features = ["Make", "Colour", "Doors"]

one_hot = OneHotEncoder()

transformer = ColumnTransformer([("one_hot", one_hot, categorical_features)], remainder="passthrough")

transformed_X = transformer.fit_transform(X)


In [9]:
pd.DataFrame(transformed_X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,35431.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,192714.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,84714.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,154365.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,181577.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,35820.0
996,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,155144.0
997,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,66604.0
998,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,215883.0


In [10]:
dummies = pd.get_dummies(car_sales[["Make", "Colour", "Doors"]])

dummies

Unnamed: 0,Doors,Make_BMW,Make_Honda,Make_Nissan,Make_Toyota,Colour_Black,Colour_Blue,Colour_Green,Colour_Red,Colour_White
0,4,0,1,0,0,0,0,0,0,1
1,5,1,0,0,0,0,1,0,0,0
2,4,0,1,0,0,0,0,0,0,1
3,4,0,0,0,1,0,0,0,0,1
4,3,0,0,1,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...
995,4,0,0,0,1,1,0,0,0,0
996,3,0,0,1,0,0,0,0,0,1
997,4,0,0,1,0,0,1,0,0,0
998,4,0,1,0,0,0,0,0,0,1


In [11]:
print(X.shape, y.shape, transformed_X.shape)

(1000, 4) (1000,) (1000, 13)


In [12]:
# Refit the model

np.random.seed(42)

X_train, X_test, y_train, y_test = train_test_split(transformed_X, y, test_size=0.2)

model.fit(X_train, y_train)

model.score(X_test, y_test)

0.3235867221569877

### What if there were missing values?

1. Fill them with some value (imputation)
2. Drop them

In [21]:
car_sales_missing = pd.read_csv(extended_car_sales_mising_data)

car_sales_missing.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [22]:
X = car_sales_missing.drop("Price", axis=1)

y = car_sales_missing.Price

In [23]:
# Try convert stuff into numbers

categorical_features = ["Make", "Colour", "Doors"]

one_hot = OneHotEncoder(handle_unknown="error")

transformer = ColumnTransformer([("one_hot", one_hot, categorical_features)], remainder="passthrough",)

transformed_X = transformer.fit_transform(X)
transformed_X

<1000x16 sparse matrix of type '<class 'numpy.float64'>'
	with 4000 stored elements in Compressed Sparse Row format>

In [24]:
# Filling the enans

car_sales_missing["Make"].fillna("missing", inplace=True)

car_sales_missing["Colour"].fillna("missing", inplace=True)

car_sales_missing["Odometer (KM)"].fillna(car_sales_missing["Odometer (KM)"].mean(), inplace=True)

car_sales_missing["Doors"].fillna(4, inplace=True)

car_sales_missing.isna().sum()

Make              0
Colour            0
Odometer (KM)     0
Doors             0
Price            50
dtype: int64

In [26]:
# Remove columns with null price value

car_sales_missing.dropna(inplace=True)

In [27]:
X = car_sales_missing.drop("Price", axis=1)

y = car_sales_missing.Price

In [29]:
# Try convert stuff into numbers

categorical_features = ["Make", "Colour", "Doors"]

one_hot = OneHotEncoder(handle_unknown="error")

transformer = ColumnTransformer([("one_hot", one_hot, categorical_features)], remainder="passthrough",)

transformed_X = transformer.fit_transform(car_sales_missing)
transformed_X

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        3.54310e+04, 1.53230e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        1.92714e+05, 1.99430e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        8.47140e+04, 2.83430e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 0.00000e+00,
        6.66040e+04, 3.15700e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        2.15883e+05, 4.00100e+03],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        2.48360e+05, 1.27320e+04]])

In [30]:
car_sales_missing = pd.read_csv(extended_car_sales_mising_data)

car_sales_missing.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [36]:
car_sales_missing.dropna(subset=["Price"], inplace=True)

X = car_sales_missing.drop("Price", axis=1)

y = car_sales_missing.Price

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [39]:
cat_imputer = SimpleImputer(strategy="constant", fill_value="missing")
door_imputer = SimpleImputer(strategy="constant", fill_value=4)
num_imputer = SimpleImputer(strategy="mean")

cat_columns = ["Make", "Colour"]

door_feature = ["Doors"]

num_feature = ["Odometer (KM)"]

imputer = ColumnTransformer([
    ("cat_imputer", cat_imputer, cat_columns),
    ("door_imputer", door_imputer, door_feature),
    ("num_imputer", num_imputer, num_feature)
])

filled_X_train = imputer.fit_transform(X_train)
filled_X_test = imputer.fit_transform(X_test)

print(filled_X_train, filled_X_test, sep="\n")

[['Toyota' 'Black' 4.0 86696.0]
 ['Toyota' 'Black' 4.0 20714.0]
 ['Toyota' 'White' 4.0 134415.0]
 ...
 ['Honda' 'Blue' 4.0 224900.0]
 ['missing' 'White' 3.0 19482.0]
 ['Honda' 'Blue' 4.0 199833.0]]
[['Toyota' 'Red' 4.0 133195.0]
 ['BMW' 'White' 5.0 37513.0]
 ['Toyota' 'White' 4.0 28809.0]
 ['Nissan' 'White' 4.0 82429.0]
 ['Nissan' 'White' 4.0 234161.0]
 ['BMW' 'Blue' 5.0 50363.0]
 ['Toyota' 'White' 4.0 193629.0]
 ['Nissan' 'White' 4.0 127386.0]
 ['Nissan' 'Blue' 4.0 224410.0]
 ['Honda' 'White' 4.0 132164.01162790696]
 ['Toyota' 'Blue' 4.0 95317.0]
 ['Toyota' 'Blue' 4.0 135080.0]
 ['Toyota' 'Black' 4.0 58773.0]
 ['Nissan' 'Blue' 4.0 228192.0]
 ['Honda' 'White' 4.0 166028.0]
 ['BMW' 'White' 3.0 242935.0]
 ['Toyota' 'Blue' 4.0 243969.0]
 ['Toyota' 'White' 4.0 95725.0]
 ['Nissan' 'Blue' 4.0 189182.0]
 ['BMW' 'Blue' 3.0 142189.0]
 ['Nissan' 'White' 4.0 64362.0]
 ['Toyota' 'Green' 4.0 88519.0]
 ['Toyota' 'Blue' 4.0 61337.0]
 ['Honda' 'Blue' 4.0 43981.0]
 ['Nissan' 'White' 4.0 82726.0]
 ['Toy

In [41]:
X_train_df = pd.DataFrame(filled_X_train, columns= cat_columns + door_feature + num_feature)
X_test_df = pd.DataFrame(filled_X_test, columns= cat_columns + door_feature + num_feature)

In [46]:
print(X_train_df, X_test_df, sep="\n")
print(X_train_df.isna().sum(), X_test_df.isna().sum(), sep="\n")

        Make Colour Doors  Odometer (KM)
0     Toyota  Black   4.0        86696.0
1     Toyota  Black   4.0        20714.0
2     Toyota  White   4.0       134415.0
3     Toyota  Green   4.0  130710.230137
4      Honda  White   4.0       145850.0
..       ...    ...   ...            ...
755   Nissan   Blue   4.0       111256.0
756    Honda  White   4.0       246079.0
757    Honda   Blue   4.0       224900.0
758  missing  White   3.0        19482.0
759    Honda   Blue   4.0       199833.0

[760 rows x 4 columns]
        Make   Colour Doors Odometer (KM)
0     Toyota      Red   4.0      133195.0
1        BMW    White   5.0       37513.0
2     Toyota    White   4.0       28809.0
3     Nissan    White   4.0       82429.0
4     Nissan    White   4.0      234161.0
..       ...      ...   ...           ...
185  missing    White   4.0      128072.0
186   Toyota    White   4.0      188338.0
187   Toyota      Red   4.0      241987.0
188    Honda  missing   4.0      150582.0
189   Nissan    White 

In [49]:
# Turn categories into numbers

transformed_X_train = transformer.fit_transform(X_train_df)

transformed_X_test = transformer.fit_transform(X_test_df)

print(transformed_X_train.toarray(), transformed_X_test.toarray(), sep="\n")

[[0.00000e+00 0.00000e+00 0.00000e+00 ... 1.00000e+00 0.00000e+00
  8.66960e+04]
 [0.00000e+00 0.00000e+00 0.00000e+00 ... 1.00000e+00 0.00000e+00
  2.07140e+04]
 [0.00000e+00 0.00000e+00 0.00000e+00 ... 1.00000e+00 0.00000e+00
  1.34415e+05]
 ...
 [0.00000e+00 1.00000e+00 0.00000e+00 ... 1.00000e+00 0.00000e+00
  2.24900e+05]
 [0.00000e+00 0.00000e+00 0.00000e+00 ... 0.00000e+00 0.00000e+00
  1.94820e+04]
 [0.00000e+00 1.00000e+00 0.00000e+00 ... 1.00000e+00 0.00000e+00
  1.99833e+05]]
[[0.00000e+00 0.00000e+00 0.00000e+00 ... 1.00000e+00 0.00000e+00
  1.33195e+05]
 [1.00000e+00 0.00000e+00 0.00000e+00 ... 0.00000e+00 1.00000e+00
  3.75130e+04]
 [0.00000e+00 0.00000e+00 0.00000e+00 ... 1.00000e+00 0.00000e+00
  2.88090e+04]
 ...
 [0.00000e+00 0.00000e+00 0.00000e+00 ... 1.00000e+00 0.00000e+00
  2.41987e+05]
 [0.00000e+00 1.00000e+00 0.00000e+00 ... 1.00000e+00 0.00000e+00
  1.50582e+05]
 [0.00000e+00 0.00000e+00 1.00000e+00 ... 1.00000e+00 0.00000e+00
  1.29188e+05]]


In [51]:
model = RandomForestRegressor()

model.fit(transformed_X_train, y_train)

model.score(transformed_X_test, y_test)

0.0997050593231118