# Vehicle Sales Analysis

## This analysis aims to analyse vehicle sales data based on three broad categories

- Condition of Use
- Body type of the vehicle
- Vehicle listing

### Importing the necessary libraries

In [1]:
import pandas as pd
import numpy as np
import glob

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use = "ggplot"
import seaborn as sns
import missingno as msno
plt.rcParams['figure.figsize'] = (10, 7) # For making the matplotlib plots the same size
sns.set(rc={'figure.figsize': (15, 7)})  # For making the seaborn plots the same size
pd.set_option('display.float_format', lambda x: '%.3f' % x) #prevent python from printing exponents

import warnings
warnings.filterwarnings('ignore')

### Importing the data

A quick glance through all the files at the same time

In [2]:
from pathlib import Path  # local import keeps this cell runnable on its own

path = r'Datasets' # csv files location on my PC
# glob makes no ordering guarantee (it depends on the file system), and later
# cells index csv_files by position, so sort to get a stable order.
csv_files = sorted(glob.glob(path + "/*.csv"))
all_files = []

# loop over the list of csv files
for file in csv_files:

    # read the csv file (semicolon-separated, comma as decimal mark)
    df = pd.read_csv(file, header = 0, delimiter=";", decimal = ",", index_col=False)
    all_files.append(df) # keep every loaded DataFrame, in the same order as csv_files

    # print the location and filename
    print(f'Location:{file}')
    # Path(...).name strips the directory with the running OS's separator,
    # instead of assuming a Windows backslash.
    print('File Name:', Path(file).name)
    print(f'Shape/Size: {df.shape}')

    # print the content
    print('Content:')
    display(df)
    print()

Location:Datasets\bodytype.csv
File Name: bodytype.csv
Shape/Size: (15, 4)
Content:


Unnamed: 0,id,title,description,sailthru_tag
0,2,Saloons,Saloon vehicles,saloons
1,3,Hatchbacks,Hatchback Vehicles,hatchbacks
2,4,4 Wheel Drives & SUVs,4 Wheel Drives & SUVs,suvs
3,5,Station Wagons,Station Wagons,station wagons
4,6,Pickups,Pickups,pickups
5,7,Motorbikes,Motorbikes,motorbikes
6,8,Convertibles,Convertibles,convertibles
7,9,"Buses, Taxis and Vans","Buses, Taxis and Vans",vans
8,10,Trucks,Trucks,trucks
9,11,Machinery and Tractors,Machinery and Tractors,tractors



Location:Datasets\categories.csv
File Name: categories.csv
Shape/Size: (15920, 4)
Content:


Unnamed: 0,id,title,slug,price_guide
0,1,Alfa Romeo,alfa-romeo,0
1,2,Aston Martin,aston-martin,0
2,3,Audi,audi,0
3,4,Bentley,bentley,0
4,5,BMW,bmw,0
...,...,...,...,...
15915,16132,VS150,vespa-vs150,0
15916,16133,208 D,mercedes-benz-208-d,0
15917,16134,208DA,mercedes-benz-208da,0
15918,16135,Truck,jmc-truck,0



Location:Datasets\condition.csv
File Name: condition.csv
Shape/Size: (3, 3)
Content:


Unnamed: 0,id,title,description
0,1,Brand New,Brand New
1,2,Foreign Used,Foreign Used
2,3,Locally Used,Locally Used



Location:Datasets\listing.csv
File Name: listing.csv
Shape/Size: (775199, 5)
Content:


Unnamed: 0,id,old_id,title,location_id,listing_status_id
0,586520,1,Toyota Avalon,2,2
1,586521,2,Toyota Camry,2,2
2,586522,3,Toyota HiAce,2,2
3,586523,4,Honda Accord,2,2
4,586524,5,Man ERF,2,2
...,...,...,...,...,...
775194,2234858,0,Toyota RAV4 Limited,2,5
775195,2234859,0,Land Rover Range Rover Sport 4.4 V8,2,5
775196,2234860,0,Toyota Camry SE,2,5
775197,2234861,0,Lexus ES 350,2,5



Location:Datasets\trueprices.csv
File Name: trueprices.csv
Shape/Size: (195265, 11)
Content:


Unnamed: 0,id,make_id,model_id,series_id,is_verified_dealer,price,year_of_manufacture,domain_id,listing_id,condition_type_id,body_type_id
0,1,19,809,5540,0,400000,1997,6,1596322,3,2
1,2,57,1706,10782,0,1700000,2009,6,1596328,3,2
2,3,13339,13812,0,1,3300000,2010,6,1596330,3,2
3,4,30,1051,15359,0,2300000,2003,6,1596332,2,2
4,6,57,1698,10714,0,3700000,2008,6,1596337,2,2
...,...,...,...,...,...,...,...,...,...,...,...
195260,628346,29,13743,14321,1,22000000,2014,6,2234706,2,4
195261,628347,35,1184,0,1,6200000,2011,6,2234758,2,2
195262,628348,38,1242,0,1,3800000,2008,6,2234764,3,4
195263,628349,19,817,0,1,3800000,2014,6,2234765,3,4





### Data Preparation for Analysis

#### Data Quality/Modelling Check

The `categories`, `listing` and `trueprices` tables look really interesting and could pose data modelling issues. <br>
The first thing to try and resolve here is the data quality issue that may cause problems with modelling the data in a database.

In [3]:
def _read_table(name):
    """Read ``<path>/<name>.csv`` with the parsing options used for every table.

    Parameters
    ----------
    name : str
        File name without the ``.csv`` extension (e.g. ``"listing"``).

    Returns
    -------
    pandas.DataFrame
        The parsed table (semicolon-delimited, comma as decimal mark).
    """
    return pd.read_csv(f"{path}/{name}.csv", header = 0, delimiter=";", decimal = ",", index_col=False)


# Load each table by its file name rather than by its position in csv_files, so
# the right file is read even if the glob order or the folder contents change.
bodytype = _read_table("bodytype")
categories = _read_table("categories")
condition = _read_table("condition")
listing = _read_table("listing")
trueprices = _read_table("trueprices")

In [4]:
# Collect the five loaded DataFrames (the tables themselves, not their names)
# in one list so the quality checks can iterate over them.
all_dfs = [bodytype, categories, condition, listing, trueprices]

Assign a `name` attribute to each table; it will come in handy later when printing which table a result belongs to.

In [5]:
# Tag every DataFrame with its table name in `attrs` so that reporting code can
# print which table it is looking at.
for _label, _frame in (
    ('bodytype', bodytype),
    ('categories', categories),
    ('condition', condition),
    ('listing', listing),
    ('trueprices', trueprices),
):
    _frame.attrs['name'] = _label

Check for duplicate entries in the `id` columns of each of the dataframes

In [6]:
def check_table_uniqueness(q):
    """Report whether the ``id`` column of ``q`` holds one distinct value per row.

    Prints the table shape, the number of distinct ``id`` values, and a verdict
    line that includes ``q.attrs['name']``. Nothing is returned.

    Parameters
    ----------
    q : pandas.DataFrame
        Table with an ``id`` column and a ``'name'`` entry in ``q.attrs``.
    """
    print(f"Shape/Size of table: {q.shape}")

    distinct_ids = q.id.nunique()
    print(f"Number of unique category ids: {distinct_ids}")

    # As many distinct ids as rows means no id value is repeated.
    has_duplicates = q.shape[0] != distinct_ids
    if has_duplicates:
        print(f"There are duplicate entries in the table: {q.attrs['name']}")
    else:
        print(f"There are no duplicate values in the tablename: {q.attrs['name']}")

In [7]:
# Run the id-uniqueness report on every table, leaving blank lines between reports.
for df in all_dfs:
    check_table_uniqueness(df)
    print(end='\n\n')

Shape/Size of table: (15, 4)
Number of unique category ids: 15
There are no duplicate values in the tablename: bodytype


Shape/Size of table: (15920, 4)
Number of unique category ids: 15920
There are no duplicate values in the tablename: categories


Shape/Size of table: (3, 3)
Number of unique category ids: 3
There are no duplicate values in the tablename: condition


Shape/Size of table: (775199, 5)
Number of unique category ids: 775199
There are no duplicate values in the tablename: listing


Shape/Size of table: (195265, 11)
Number of unique category ids: 195265
There are no duplicate values in the tablename: trueprices




Checking for missing values

In [8]:
def missing_no(q):
    """Print the number of missing (NaN/None) values in each column of ``q``.

    Parameters
    ----------
    q : pandas.DataFrame
        Table to inspect.
    """
    null_counts = q.isna().sum()
    print(null_counts)

In [9]:
# check for missing values

for df in all_dfs:
    print(f"{df.attrs['name']}")
    print(f"Shape/Size of table: {df.shape}")
    print(f"Number of unique category ids: {df.id.nunique()}")
    # Reuse the helper defined in the previous cell instead of repeating its body.
    missing_no(df)
    print('\n')


bodytype
Shape/Size of table: (15, 4)
Number of unique category ids: 15
id              0
title           0
description     0
sailthru_tag    0
dtype: int64


categories
Shape/Size of table: (15920, 4)
Number of unique category ids: 15920
id             0
title          6
slug           0
price_guide    0
dtype: int64


condition
Shape/Size of table: (3, 3)
Number of unique category ids: 3
id             0
title          0
description    0
dtype: int64


listing
Shape/Size of table: (775199, 5)
Number of unique category ids: 775199
id                      0
old_id                  0
title                7607
location_id             0
listing_status_id       0
dtype: int64


trueprices
Shape/Size of table: (195265, 11)
Number of unique category ids: 195265
id                     0
make_id                0
model_id               0
series_id              0
is_verified_dealer     0
price                  0
year_of_manufacture    0
domain_id              0
listing_id             0
condition

In [10]:
# merging the listing table with the trueprices table

# Rebind rather than rename in place: the DataFrame stored in `all_dfs` keeps its
# `id` column, so the uniqueness / missing-value cells can still be re-run.
listing = listing.rename(columns={'id':'listing_id', 'title':'listing_title'})
trueprices_extended = trueprices.merge(
    listing[['listing_title', 'listing_id']],
    how='left',
    on='listing_id',
    # Raise instead of silently duplicating price rows if a listing_id ever
    # appears more than once in the listing table.
    validate='many_to_one',
)
print(trueprices_extended.shape)
trueprices_extended.head()

(195265, 12)


Unnamed: 0,id,make_id,model_id,series_id,is_verified_dealer,price,year_of_manufacture,domain_id,listing_id,condition_type_id,body_type_id,listing_title
0,1,19,809,5540,0,400000,1997,6,1596322,3,2,Honda Accord EX V6 Automatic
1,2,57,1706,10782,0,1700000,2009,6,1596328,3,2,Toyota Corolla Corolla
2,3,13339,13812,0,1,3300000,2010,6,1596330,3,2,Acura TL
3,4,30,1051,15359,0,2300000,2003,6,1596332,2,2,Lexus ES 300
4,6,57,1698,10714,0,3700000,2008,6,1596337,2,2,Toyota Camry XLE Automatic


In [11]:
# merging the bodytype table with the trueprices table

# Rebind rather than rename in place: the DataFrame stored in `all_dfs` keeps its
# `id` column, so the uniqueness / missing-value cells can still be re-run.
bodytype = bodytype.rename(columns={'id':'body_type_id', 'title':'bodytype_title'})
trueprices_extended = trueprices_extended.merge(
    bodytype[['bodytype_title', 'body_type_id']],
    how='left',
    on='body_type_id',
    # Raise instead of silently duplicating price rows if a body_type_id ever
    # appears more than once in the bodytype table.
    validate='many_to_one',
)
print(trueprices_extended.shape)
trueprices_extended.head()

(195265, 13)


Unnamed: 0,id,make_id,model_id,series_id,is_verified_dealer,price,year_of_manufacture,domain_id,listing_id,condition_type_id,body_type_id,listing_title,bodytype_title
0,1,19,809,5540,0,400000,1997,6,1596322,3,2,Honda Accord EX V6 Automatic,Saloons
1,2,57,1706,10782,0,1700000,2009,6,1596328,3,2,Toyota Corolla Corolla,Saloons
2,3,13339,13812,0,1,3300000,2010,6,1596330,3,2,Acura TL,Saloons
3,4,30,1051,15359,0,2300000,2003,6,1596332,2,2,Lexus ES 300,Saloons
4,6,57,1698,10714,0,3700000,2008,6,1596337,2,2,Toyota Camry XLE Automatic,Saloons


In [12]:
# merging the condition table with the trueprices table

# Rebind rather than rename in place: the DataFrame stored in `all_dfs` keeps its
# `id` column, so the uniqueness / missing-value cells can still be re-run.
condition = condition.rename(columns={'id':'condition_type_id', 'title':'condition_title'})
trueprices_extended = trueprices_extended.merge(
    condition[['condition_title', 'condition_type_id']],
    how='left',
    on='condition_type_id',
    # Raise instead of silently duplicating price rows if a condition_type_id
    # ever appears more than once in the condition table.
    validate='many_to_one',
)
print(trueprices_extended.shape)
trueprices_extended.head()

(195265, 14)


Unnamed: 0,id,make_id,model_id,series_id,is_verified_dealer,price,year_of_manufacture,domain_id,listing_id,condition_type_id,body_type_id,listing_title,bodytype_title,condition_title
0,1,19,809,5540,0,400000,1997,6,1596322,3,2,Honda Accord EX V6 Automatic,Saloons,Locally Used
1,2,57,1706,10782,0,1700000,2009,6,1596328,3,2,Toyota Corolla Corolla,Saloons,Locally Used
2,3,13339,13812,0,1,3300000,2010,6,1596330,3,2,Acura TL,Saloons,Locally Used
3,4,30,1051,15359,0,2300000,2003,6,1596332,2,2,Lexus ES 300,Saloons,Foreign Used
4,6,57,1698,10714,0,3700000,2008,6,1596337,2,2,Toyota Camry XLE Automatic,Saloons,Foreign Used


In [19]:
from pathlib import Path  # local import keeps this cell runnable on its own

# Print each table name, i.e. the file name without its directory and extension.
# Path(...).stem avoids hard-coding the length of the "Datasets" prefix.
for file in csv_files:
    print(Path(file).stem)

bodytype
categories
condition
listing
trueprices
