In [1]:
# Imports
import pandas as pd
from IPython.display import display

# Cleaning and analyzing the data

Load the dataset from a file into a dataframe, then drop any attributes that are irrelevant to the analysis.

In [2]:
# Read the data file and discard the unnecessary column in one step
df = pd.read_csv("./ford.csv").drop(columns=["tax"])

# Quick sanity check: dimensions first, then the first few rows
print(df.shape)
display(df.head())

(17965, 8)


Unnamed: 0,model,year,price,transmission,mileage,fuelType,mpg,engineSize
0,Fiesta,2017,12000,Automatic,15944,Petrol,57.7,1.0
1,Focus,2018,14000,Manual,9083,Petrol,57.7,1.0
2,Focus,2017,13000,Manual,12456,Petrol,57.7,1.0
3,Fiesta,2019,17500,Manual,10460,Petrol,40.3,1.5
4,Fiesta,2019,16500,Automatic,1482,Petrol,48.7,1.0


In [3]:
# Collect the distinct values of each categorical column
models = df["model"].unique()
fuelTypes = df["fuelType"].unique()
transmissions = df["transmission"].unique()

# NOTE(review): the printed model names carry a leading space (e.g. ' Fiesta');
# confirm whether later cells rely on that before stripping it.
print(
    "Models\n", models,
    "\n\nFuel Types\n", fuelTypes,
    "\n\nTransmissions\n", transmissions,
)

Models
 [' Fiesta' ' Focus' ' Puma' ' Kuga' ' EcoSport' ' C-MAX' ' Mondeo' ' Ka+'
 ' Tourneo Custom' ' S-MAX' ' B-MAX' ' Edge' ' Tourneo Connect'
 ' Grand C-MAX' ' KA' ' Galaxy' ' Mustang' ' Grand Tourneo Connect'
 ' Fusion' ' Ranger' ' Streetka' ' Escort' ' Transit Tourneo'] 

Fuel Types
 ['Petrol' 'Diesel' 'Hybrid' 'Electric' 'Other'] 

Transmissions
 ['Automatic' 'Manual' 'Semi-Auto']


# Convert categorical data to numeric format
The target feature is the car model. Use `sklearn.preprocessing.LabelEncoder()` to encode the target as integer labels. One-hot encode the other categorical features (one 0/1 column per category) using `pandas.get_dummies()`.

In [4]:
# List every column's dtype; the object-typed columns are the non-numeric ones
print(df.dtypes)

model            object
year              int64
price             int64
transmission     object
mileage           int64
fuelType         object
mpg             float64
engineSize      float64
dtype: object


In [5]:
# One-hot encode the two categorical features, replacing each original column
# with one indicator column per category.
# dtype is pinned to uint8 (the default before pandas 2.0) because pandas >= 2.0
# returns bool indicators by default, which would put True/False instead of 0/1
# in the dataframe and in the exported CSV.
transmissionNumeric = pd.get_dummies(df.transmission, dtype="uint8")
df = pd.concat([df.drop("transmission", axis=1), transmissionNumeric], axis=1)
fuelTypeNumeric = pd.get_dummies(df.fuelType, dtype="uint8")
df = pd.concat([df.drop("fuelType", axis=1), fuelTypeNumeric], axis=1)
display(df.head())

# Save the encoded dataframe, leaving out the index column
df.to_csv("clean1_Ford.csv", index=False)

Unnamed: 0,model,year,price,mileage,mpg,engineSize,Automatic,Manual,Semi-Auto,Diesel,Electric,Hybrid,Other,Petrol
0,Fiesta,2017,12000,15944,57.7,1.0,1,0,0,0,0,0,0,1
1,Focus,2018,14000,9083,57.7,1.0,0,1,0,0,0,0,0,1
2,Focus,2017,13000,12456,57.7,1.0,0,1,0,0,0,0,0,1
3,Fiesta,2019,17500,10460,40.3,1.5,0,1,0,0,0,0,0,1
4,Fiesta,2019,16500,1482,48.7,1.0,1,0,0,0,0,0,0,1


# Drop attributes or records to fine-tune the KNN model

In [6]:
# Columns removed from the feature set; comment or uncomment entries to
# experiment with different feature combinations.
columns_to_drop = [
    "year",  # poor predictor
    # "price",
    "mileage",
    # "mpg",
    "engineSize",
    # *fuelTypes,  # questionable
    # *transmissions,  # poor predictor
]
df = df.drop(columns=columns_to_drop)

display(df.head())

Unnamed: 0,model,price,mpg,Automatic,Manual,Semi-Auto,Diesel,Electric,Hybrid,Other,Petrol
0,Fiesta,12000,57.7,1,0,0,0,0,0,0,1
1,Focus,14000,57.7,0,1,0,0,0,0,0,1
2,Focus,13000,57.7,0,1,0,0,0,0,0,1
3,Fiesta,17500,40.3,0,1,0,0,0,0,0,1
4,Fiesta,16500,48.7,1,0,0,0,0,0,0,1
