In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df_one = pd.read_csv('Resources/Car details v3.csv')
df_two = pd.read_csv('Resources/CAR DETAILS FROM CAR DEKHO.csv')

In [3]:
def compareCounts(df_one, df_two, colName):
    print(f"df_one {colName} value counts:")
    print(df_one[colName].value_counts())
    print("\n")
    print(f"df_two {colName} value counts: ")
    print(df_two[colName].value_counts())

## Drop unneeded columns

In [4]:
df_one.dtypes

name              object
year               int64
selling_price      int64
km_driven          int64
fuel              object
seller_type       object
transmission      object
owner             object
mileage           object
engine            object
max_power         object
torque            object
seats            float64
dtype: object

In [5]:
df_one = df_one.drop(['mileage','engine','max_power','torque','seats'], axis=1)
df_one.dtypes

name             object
year              int64
selling_price     int64
km_driven         int64
fuel             object
seller_type      object
transmission     object
owner            object
dtype: object

In [6]:
df_two.dtypes

name             object
year              int64
selling_price     int64
km_driven         int64
fuel             object
seller_type      object
transmission     object
owner            object
dtype: object

## Fuel Type Cleanup

In [7]:
compareCounts(df_one, df_two, "fuel")

df_one fuel value counts:
Diesel    4402
Petrol    3631
CNG         57
LPG         38
Name: fuel, dtype: int64


df_two fuel value counts: 
Diesel      2153
Petrol      2123
CNG           40
LPG           23
Electric       1
Name: fuel, dtype: int64


In [8]:
df_one["fuel"] = df_one["fuel"].replace({"CNG" : "Other", "LPG" : "Other"})
df_two["fuel"] = df_two["fuel"].replace({"CNG" : "Other", "LPG" : "Other", "Electric" : "Other"})

In [9]:
compareCounts(df_one, df_two, "fuel")

df_one fuel value counts:
Diesel    4402
Petrol    3631
Other       95
Name: fuel, dtype: int64


df_two fuel value counts: 
Diesel    2153
Petrol    2123
Other       64
Name: fuel, dtype: int64


## Seller Type Cleanup

In [10]:
compareCounts(df_one, df_two, "seller_type")

df_one seller_type value counts:
Individual          6766
Dealer              1126
Trustmark Dealer     236
Name: seller_type, dtype: int64


df_two seller_type value counts: 
Individual          3244
Dealer               994
Trustmark Dealer     102
Name: seller_type, dtype: int64


## Transmission Cleanup

In [11]:
compareCounts(df_one, df_two, "transmission")

df_one transmission value counts:
Manual       7078
Automatic    1050
Name: transmission, dtype: int64


df_two transmission value counts: 
Manual       3892
Automatic     448
Name: transmission, dtype: int64


## Owner cleanup

In [12]:
compareCounts(df_one, df_two, "owner")

df_one owner value counts:
First Owner             5289
Second Owner            2105
Third Owner              555
Fourth & Above Owner     174
Test Drive Car             5
Name: owner, dtype: int64


df_two owner value counts: 
First Owner             2832
Second Owner            1106
Third Owner              304
Fourth & Above Owner      81
Test Drive Car            17
Name: owner, dtype: int64


In [13]:
# Drop records that have "Test Drive Car" as owner
df_one = df_one[~df_one["owner"].isin(['Test Drive Car'])]
df_two = df_two[~df_two["owner"].isin(['Test Drive Car'])]

## Sanity Check Numerical Columns

In [14]:
print(df_one["year"].describe(), "\n")
print(df_two["year"].describe())

count    8123.000000
mean     2013.800813
std         4.043437
min      1983.000000
25%      2011.000000
50%      2015.000000
75%      2017.000000
max      2020.000000
Name: year, dtype: float64 

count    4323.000000
mean     2013.065464
std         4.203865
min      1992.000000
25%      2011.000000
50%      2014.000000
75%      2016.000000
max      2020.000000
Name: year, dtype: float64


In [15]:
pd.set_option('display.float_format', lambda x: '%.2f' % x)
print(df_one["selling_price"].describe(), "\n")
print(df_two["selling_price"].describe())

count       8123.00
mean      635953.99
std       799128.43
min        29999.00
25%       254499.50
50%       450000.00
75%       675000.00
max     10000000.00
Name: selling_price, dtype: float64 

count      4323.00
mean     502357.05
std      578794.36
min       20000.00
25%      202999.00
50%      350000.00
75%      600000.00
max     8900000.00
Name: selling_price, dtype: float64


In [16]:
print(df_one["km_driven"].describe(), "\n")
print(df_two["km_driven"].describe())

count      8123.00
mean      69853.48
std       56551.02
min           1.00
25%       35000.00
50%       60000.00
75%       98000.00
max     2360457.00
Name: km_driven, dtype: float64 

count     4323.00
mean     66459.83
std      46570.97
min          1.00
25%      35000.00
50%      60000.00
75%      90000.00
max     806599.00
Name: km_driven, dtype: float64


## Check for NA values

In [17]:
print("Total NA values in df_one: ", df_one.isnull().sum().sum())
print("Total NA values in df_two: ", df_two.isnull().sum().sum())

Total NA values in df_one:  0
Total NA values in df_two:  0


## Combine Prepared Data

In [18]:
master_data = pd.concat([df_one, df_two])
print(master_data.dtypes)
print("Total records: ", master_data["name"].count())
master_data.head()

name             object
year              int64
selling_price     int64
km_driven         int64
fuel             object
seller_type      object
transmission     object
owner            object
dtype: object
Total records:  12446


Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner


## Feature Extraction: Manfacturer

In [19]:
# Extract first word from the field to attempt to identify Make of vehicle
import re
pattern = "([^\s]+)"
master_names = master_data["name"].str.extract(pattern)
master_names.value_counts()

Maruti           3728
Hyundai          2236
Mahindra         1137
Tata             1095
Honda             716
Toyota            694
Ford              622
Chevrolet         418
Renault           373
Volkswagen        291
Skoda             173
BMW               159
Nissan            145
Datsun            102
Audi               97
Mercedes-Benz      89
Fiat               84
Jaguar             77
Volvo              71
Lexus              34
Jeep               34
Mitsubishi         20
Land               11
Ambassador          8
Force               7
Isuzu               6
Kia                 5
MG                  5
Daewoo              4
OpelCorsa           2
Opel                1
Peugeot             1
Ashok               1
dtype: int64

In [20]:
master_data["manufacturer"] = master_names
master_data.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,manufacturer
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,Maruti
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,Skoda
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,Honda
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,Hyundai
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,Maruti


In [21]:
# Bin manufacturers with less than 20 records in the data as "Other"
low_count_makes = master_names.value_counts()[master_names.value_counts() < 20].index.to_flat_index().to_list()
# Above gives a list of tuples, which will need to be converted to a plain list for use in editing binned records
low_count_makes = [item for i in low_count_makes for item in i]
low_count_makes

['Land',
 'Ambassador',
 'Force',
 'Isuzu',
 'Kia',
 'MG',
 'Daewoo',
 'OpelCorsa',
 'Opel',
 'Peugeot',
 'Ashok']

In [22]:
master_data["manufacturer"][master_data["manufacturer"].isin(low_count_makes)] = "Other"
master_data["manufacturer"].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Maruti           3728
Hyundai          2236
Mahindra         1137
Tata             1095
Honda             716
Toyota            694
Ford              622
Chevrolet         418
Renault           373
Volkswagen        291
Skoda             173
BMW               159
Nissan            145
Datsun            102
Audi               97
Mercedes-Benz      89
Fiat               84
Jaguar             77
Volvo              71
Other              51
Lexus              34
Jeep               34
Mitsubishi         20
Name: manufacturer, dtype: int64

## Save Output

In [23]:
master_data.to_csv("Resources/master_data.csv")