# Section 1: Cleaning the Data

In [1]:
# Imports
import pandas as pd
from IPython.display import display

Load the dataset from a CSV file into a DataFrame. Drop the attributes that are irrelevant to the analysis, then remove records with impossible or ambiguous values.

In [2]:
# Load the data file
ford_raw = pd.read_csv("./ford.csv")

# Drop unnecessary columns
df = ford_raw.drop(columns=["tax"])

# Latest registration year accepted as plausible; later years are treated as
# data-entry errors.
MAX_VALID_YEAR = 2022

# Each mask flags the rows that one cleaning rule rejects.
# Rule 1: impossible years
impossible_year = df.year > MAX_VALID_YEAR

# Rule 2: cars must have a valid engine size if they are Diesel or Petrol
missing_engine_size = (df.engineSize == 0.0) & df.fuelType.isin(["Diesel", "Petrol"])

# Rule 3: records with ambiguous fuel type
ambiguous_fuel = df.fuelType == "Other"

# Keep only the rows that no rule rejects. Boolean filtering removes rows
# explicitly; the previous `drop(index=..., axis=1)` calls only worked because
# pandas ignores `axis` whenever `index=` is supplied. Index labels of the
# surviving rows are preserved, exactly as before.
df = df.loc[~(impossible_year | missing_engine_size | ambiguous_fuel)]

# NOTE(review): the recorded `unique()` output shows model names with a leading
# space (' Fiesta'); they are left as-is here - confirm whether they should be
# stripped.

display(df.head())



Unnamed: 0,model,year,price,transmission,mileage,fuelType,mpg,engineSize
0,Fiesta,2017,12000,Automatic,15944,Petrol,57.7,1.0
1,Focus,2018,14000,Manual,9083,Petrol,57.7,1.0
2,Focus,2017,13000,Manual,12456,Petrol,57.7,1.0
3,Fiesta,2019,17500,Manual,10460,Petrol,40.3,1.5
4,Fiesta,2019,16500,Automatic,1482,Petrol,48.7,1.0


In [4]:
# Collect the distinct values of each categorical-looking column in one pass.
models, fuelTypes, transmissions, engineSizes = (
    df[column].unique()
    for column in ("model", "fuelType", "transmission", "engineSize")
)

# Heading/value pairs, flattened into a single print call so the spacing of the
# report is exactly what `print` produces for alternating positional arguments.
report_sections = (
    ("Models\n", models),
    ("\n\nFuel Types\n", fuelTypes),
    ("\n\nTransmissions\n", transmissions),
    ("\n\nEngine Sizes\n", engineSizes),
)
print(*(part for section in report_sections for part in section))

# Number of distinct target classes.
print("\nnumber of models: ", len(models))

Models
 [' Fiesta' ' Focus' ' Puma' ' Kuga' ' EcoSport' ' C-MAX' ' Mondeo' ' Ka+'
 ' Tourneo Custom' ' S-MAX' ' B-MAX' ' Edge' ' Tourneo Connect'
 ' Grand C-MAX' ' KA' ' Galaxy' ' Mustang' ' Grand Tourneo Connect'
 ' Fusion' ' Ranger' ' Streetka' ' Escort' ' Transit Tourneo'] 

Fuel Types
 ['Petrol' 'Diesel' 'Hybrid' 'Electric'] 

Transmissions
 ['Automatic' 'Manual' 'Semi-Auto'] 

Engine Sizes
 [1.  1.5 1.6 1.2 2.  1.1 2.3 1.4 5.  2.2 2.5 1.8 1.3 3.2 0.  1.7]

number of models:  23


### Convert categorical data to numeric format
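The target feature is the car model. One-hot encode the remaining categorical features (`transmission` and `fuelType`) with `pandas.get_dummies()`, so that each category becomes its own 0/1 indicator column. Leave the target feature unchanged.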

In [23]:
# List the dtype of every column. In the recorded output the `object` entries
# (model, transmission, fuelType) are the string-valued columns that are not
# yet numeric.
print(df.dtypes)

model            object
year              int64
price             int64
transmission     object
mileage           int64
fuelType         object
mpg             float64
engineSize      float64
dtype: object


In [24]:
# One-hot encode the two categorical predictors; `model` is the target and is
# left untouched.
# The dtype is pinned because pandas >= 2.0 returns boolean indicator columns by
# default, whereas this notebook expects 0/1 integers (older pandas produced
# uint8, which is what the recorded output shows).
transmissionNumeric = pd.get_dummies(df.transmission, dtype="uint8")
fuelTypeNumeric = pd.get_dummies(df.fuelType, dtype="uint8")

# Swap the original string columns for their indicator columns in one step.
# Column order is unchanged: remaining features, transmission indicators, then
# fuel-type indicators.
df = pd.concat(
    [
        df.drop(columns=["transmission", "fuelType"]),
        transmissionNumeric,
        fuelTypeNumeric,
    ],
    axis=1,
)
display(df.head())
# Optional export of the cleaned frame (a later cell reads "clean_Ford.csv"):
# df.to_csv("clean_Ford.csv", index=False)

Unnamed: 0,model,year,price,mileage,mpg,engineSize,Automatic,Manual,Semi-Auto,Diesel,Electric,Hybrid,Petrol
0,Fiesta,2017,12000,15944,57.7,1.0,1,0,0,0,0,0,1
1,Focus,2018,14000,9083,57.7,1.0,0,1,0,0,0,0,1
2,Focus,2017,13000,12456,57.7,1.0,0,1,0,0,0,0,1
3,Fiesta,2019,17500,10460,40.3,1.5,0,1,0,0,0,0,1
4,Fiesta,2019,16500,1482,48.7,1.0,1,0,0,0,0,0,1


In [25]:
# Reload the cleaned data from its on-disk copy.
# The statement that writes "clean_Ford.csv" is commented out in the notebook,
# so on a fresh checkout the file may not exist. In that case, persist the
# cleaned frame built by the cells above and read it back, so that `df` has the
# same shape (fresh integer index, CSV-inferred dtypes) on both paths.
# NOTE(review): when the file already exists it is loaded as-is and may be
# stale relative to the cleaning cells above - confirm that this is intended.
try:
    df = pd.read_csv("clean_Ford.csv")
except FileNotFoundError:
    df.to_csv("clean_Ford.csv", index=False)
    df = pd.read_csv("clean_Ford.csv")

In [26]:
# Target vector: the car model label of every record.
y = df["model"]

# Feature matrix: every column except the target.
X = df.drop(columns=["model"])

# Optional exports of the split, disabled by default:
# X.to_csv("X.csv", index=False)
# y.to_csv("y.csv", index=False)
