In [1]:
import numpy as np
import pandas as pd

In [2]:
np.random.seed(42)

## Getting the data

In [30]:
def import_data():
    file_names = ['audi', 'bmw', 'ford', 'hyundi', 'merc', 'skoda', 'toyota', 'vauxhall', 'vw']
    maker_names = ['Audi', 'BMW', 'Ford', 'Hyundai', 'Mercedes-Benz', 'Skoda', 'Toyota', 'Vauxhall', 'Volkswagen']
    df = pd.DataFrame(columns=['model', 'year', 'price', 'transmission', 'mileage', 'fuelType', 'tax', 'mpg', 'engineSize', 'make'])
    for i in range(len(file_names)):
        path = 'data/' + file_names[i] + '.csv'
        data = pd.read_csv(path)
        data['make'] = maker_names[i]
        if file_names[i] == 'hyundi':
            data.rename(columns={'tax(£)': 'tax'}, inplace=True)
        df = df.append(data, ignore_index=True)
    return df

In [37]:
auto = import_data()
auto.sample(10)

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,make
45678,A Class,2018,16195,Automatic,22000,Petrol,145,52.3,1.6,Mercedes-Benz
16967,5 Series,2019,31460,Automatic,1200,Diesel,145,62.8,2.0,BMW
85640,Golf,2019,16995,Semi-Auto,11989,Diesel,150,54.3,1.6,Volkswagen
32516,Focus,2016,9200,Manual,36500,Petrol,20,61.4,1.0,Ford
92207,Polo,2018,11591,Manual,22000,Petrol,150,58.9,1.0,Volkswagen
696,Q8,2019,83995,Semi-Auto,5978,Diesel,150,30.7,4.0,Audi
79975,Mokka X,2019,12689,Manual,12705,Petrol,145,39.2,1.4,Vauxhall
13969,5 Series,2020,27990,Semi-Auto,105,Hybrid,145,49.6,2.0,BMW
16938,M4,2020,59999,Semi-Auto,3000,Petrol,150,28.5,3.0,BMW
33455,EcoSport,2016,10195,Manual,14368,Diesel,30,64.2,1.5,Ford


In [38]:
auto.shape

(99187, 10)

We have 99,187 data points with 10 features

In [39]:
auto.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99187 entries, 0 to 99186
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   model         99187 non-null  object 
 1   year          99187 non-null  object 
 2   price         99187 non-null  object 
 3   transmission  99187 non-null  object 
 4   mileage       99187 non-null  object 
 5   fuelType      99187 non-null  object 
 6   tax           99187 non-null  object 
 7   mpg           99187 non-null  float64
 8   engineSize    99187 non-null  float64
 9   make          99187 non-null  object 
dtypes: float64(2), object(8)
memory usage: 7.6+ MB


Luckily, we don't have any null values. But, some of the numerical features <strong>(year, price, mileage, tax)</strong> are recorded as an object which we don't want. So, we need to transform them into int64 data type

In [42]:
auto = auto.astype({'year': 'int64',
                   'price': 'int64',
                   'mileage': 'int64',
                   'tax': 'int64'})
auto.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99187 entries, 0 to 99186
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   model         99187 non-null  object 
 1   year          99187 non-null  int64  
 2   price         99187 non-null  int64  
 3   transmission  99187 non-null  object 
 4   mileage       99187 non-null  int64  
 5   fuelType      99187 non-null  object 
 6   tax           99187 non-null  int64  
 7   mpg           99187 non-null  float64
 8   engineSize    99187 non-null  float64
 9   make          99187 non-null  object 
dtypes: float64(2), int64(4), object(4)
memory usage: 7.6+ MB


In [43]:
auto.sample(5)

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,make
90320,Passat,2017,9891,Manual,122000,Diesel,125,60.1,2.0,Volkswagen
38910,Focus,2016,8200,Automatic,47021,Petrol,125,51.4,1.0,Ford
19764,M4,2020,45488,Automatic,10,Petrol,150,34.0,3.0,BMW
6670,A5,2018,31050,Semi-Auto,17671,Petrol,145,36.7,3.0,Audi
21818,B-MAX,2017,11298,Semi-Auto,1397,Petrol,165,44.1,1.6,Ford


In [44]:
auto.describe()

Unnamed: 0,year,price,mileage,tax,mpg,engineSize
count,99187.0,99187.0,99187.0,99187.0,99187.0,99187.0
mean,2017.087723,16805.347656,23058.914213,120.299838,55.166825,1.66328
std,2.123934,9866.773417,21148.523721,63.150926,16.138522,0.557646
min,1970.0,450.0,1.0,0.0,0.3,0.0
25%,2016.0,9999.0,7425.0,125.0,47.1,1.2
50%,2017.0,14495.0,17460.0,145.0,54.3,1.6
75%,2019.0,20870.0,32339.0,145.0,62.8,2.0
max,2060.0,159999.0,323000.0,580.0,470.8,6.6


There are some odd numbers on the table above. We should check them further