In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import psycopg2
from config import db_password
from sqlalchemy import create_engine

In [2]:
# Create the connection string using the password from the config.py file
db_string = f"postgres+psycopg2://postgres:{db_password}@127.0.0.1:5432/Car_Dekho_Prediction"

In [3]:
# Create the database engine
engine = create_engine(db_string)

In [4]:
# Connect to PostgreSQL server
dbConnection = engine.connect()

In [5]:
# Read data from PostgreSQL CarDetails database table and load into a DataFrame instance
df_two = pd.read_sql("select * from \"CarDetails\"", dbConnection);
pd.set_option('display.expand_frame_repr', False);

In [6]:
df_two.dtypes

index             int64
name             object
year              int64
selling_price     int64
km_driven         int64
fuel             object
seller_type      object
transmission     object
owner            object
dtype: object

In [7]:
# Read data from PostgreSQL CarEngineInfo, CarYear, UserInfo database tables and load into a DataFrame instance
df_one = pd.read_sql("""select distinct "CarEngineInfo"."name", "CarYear"."year", "CarEngineInfo"."selling_price", "CarEngineInfo"."km_driven", "CarEngineInfo"."fuel", "UserInfo"."seller_type", "UserInfo"."transmission", "UserInfo"."owner", "CarEngineInfo"."mileage", "CarEngineInfo"."engine", "CarEngineInfo"."max_power", "CarEngineInfo"."torque", "CarEngineInfo"."seats" from "CarEngineInfo" INNER JOIN "CarYear" ON "CarEngineInfo"."name" = "CarYear"."name" INNER JOIN "UserInfo" ON "UserInfo"."name" = "CarEngineInfo"."name" """, dbConnection);
pd.set_option('display.expand_frame_repr', False);

In [8]:
df_one.dtypes

name              object
year               int64
selling_price      int64
km_driven          int64
fuel              object
seller_type       object
transmission      object
owner             object
mileage           object
engine            object
max_power         object
torque            object
seats            float64
dtype: object

In [9]:
def compareCounts(df_one, df_two, colName):
    print(f"df_one {colName} value counts:")
    print(df_one[colName].value_counts())
    print("\n")
    print(f"df_two {colName} value counts: ")
    print(df_two[colName].value_counts())

## Drop unneeded columns

In [10]:
df_one = df_one.drop(['mileage','engine','max_power','torque','seats'], axis=1)
df_one.dtypes

name             object
year              int64
selling_price     int64
km_driven         int64
fuel             object
seller_type      object
transmission     object
owner            object
dtype: object

In [11]:
df_two = df_two.drop(['index'], axis=1)
df_two.dtypes

name             object
year              int64
selling_price     int64
km_driven         int64
fuel             object
seller_type      object
transmission     object
owner            object
dtype: object

## Fuel Type Cleanup

In [12]:
compareCounts(df_one, df_two, "fuel")

df_one fuel value counts:
Petrol    42814
Diesel    41423
CNG         530
LPG         230
Name: fuel, dtype: int64


df_two fuel value counts: 
Diesel      2153
Petrol      2123
CNG           40
LPG           23
Electric       1
Name: fuel, dtype: int64


In [13]:
df_one["fuel"] = df_one["fuel"].replace({"CNG" : "Other", "LPG" : "Other"})
df_two["fuel"] = df_two["fuel"].replace({"CNG" : "Other", "LPG" : "Other", "Electric" : "Other"})

In [14]:
compareCounts(df_one, df_two, "fuel")

df_one fuel value counts:
Petrol    42814
Diesel    41423
Other       760
Name: fuel, dtype: int64


df_two fuel value counts: 
Diesel    2153
Petrol    2123
Other       64
Name: fuel, dtype: int64


## Seller Type Cleanup

In [15]:
compareCounts(df_one, df_two, "seller_type")

df_one seller_type value counts:
Individual          67540
Dealer              16561
Trustmark Dealer      896
Name: seller_type, dtype: int64


df_two seller_type value counts: 
Individual          3244
Dealer               994
Trustmark Dealer     102
Name: seller_type, dtype: int64


## Transmission Cleanup

In [16]:
compareCounts(df_one, df_two, "transmission")

df_one transmission value counts:
Manual       83233
Automatic     1764
Name: transmission, dtype: int64


df_two transmission value counts: 
Manual       3892
Automatic     448
Name: transmission, dtype: int64


## Owner cleanup

In [17]:
compareCounts(df_one, df_two, "owner")

df_one owner value counts:
First Owner             35087
Second Owner            27254
Third Owner             14525
Fourth & Above Owner     8126
Test Drive Car              5
Name: owner, dtype: int64


df_two owner value counts: 
First Owner             2832
Second Owner            1106
Third Owner              304
Fourth & Above Owner      81
Test Drive Car            17
Name: owner, dtype: int64


In [18]:
# Drop records that have "Test Drive Car" as owner
df_one = df_one[~df_one["owner"].isin(['Test Drive Car'])]
df_two = df_two[~df_two["owner"].isin(['Test Drive Car'])]

## Sanity Check Numerical Columns

In [19]:
print(df_one["year"].describe(), "\n")
print(df_two["year"].describe())

count    84992.000000
mean      2012.599374
std          4.485532
min       1983.000000
25%       2010.000000
50%       2013.000000
75%       2016.000000
max       2020.000000
Name: year, dtype: float64 

count    4323.000000
mean     2013.065464
std         4.203865
min      1992.000000
25%      2011.000000
50%      2014.000000
75%      2016.000000
max      2020.000000
Name: year, dtype: float64


In [20]:
pd.set_option('display.float_format', lambda x: '%.2f' % x)
print(df_one["selling_price"].describe(), "\n")
print(df_two["selling_price"].describe())

count      84992.00
mean      393614.67
std       299061.89
min        29999.00
25%       200000.00
50%       335000.00
75%       530000.00
max     10000000.00
Name: selling_price, dtype: float64 

count      4323.00
mean     502357.05
std      578794.36
min       20000.00
25%      202999.00
50%      350000.00
75%      600000.00
max     8900000.00
Name: selling_price, dtype: float64


In [21]:
print(df_one["km_driven"].describe(), "\n")
print(df_two["km_driven"].describe())

count     84992.00
mean      77091.61
std       59964.02
min           1.00
25%       43000.00
50%       70000.00
75%      100000.00
max     2360457.00
Name: km_driven, dtype: float64 

count     4323.00
mean     66459.83
std      46570.97
min          1.00
25%      35000.00
50%      60000.00
75%      90000.00
max     806599.00
Name: km_driven, dtype: float64


## Check for NA values

In [22]:
print("Total NA values in df_one: ", df_one.isnull().sum().sum())
print("Total NA values in df_two: ", df_two.isnull().sum().sum())

Total NA values in df_one:  0
Total NA values in df_two:  0


## Combine Prepared Data

In [23]:
master_data = pd.concat([df_one, df_two])
print(master_data.dtypes)
print("Total records: ", master_data["name"].count())
master_data.head()

name             object
year              int64
selling_price     int64
km_driven         int64
fuel             object
seller_type      object
transmission     object
owner            object
dtype: object
Total records:  89315


Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Ambassador CLASSIC 1500 DSL AC,2000,75000,90000,Diesel,Individual,Manual,Second Owner
1,Ambassador Classic 2000 DSZ AC PS,1994,99000,100000,Diesel,Individual,Manual,Second Owner
2,Ambassador Grand 1500 DSZ BSIII,2008,122000,60000,Diesel,Individual,Manual,Second Owner
3,Ambassador Grand 2000 DSZ PW CL,2008,200000,80000,Diesel,Individual,Manual,Third Owner
4,Ashok Leyland Stile LE,2013,300000,200000,Diesel,Individual,Manual,Second Owner


## Feature Extraction: Manfacturer

In [24]:
# Extract first word from the field to attempt to identify Make of vehicle
import re
pattern = "([^\s]+)"
master_names = master_data["name"].str.extract(pattern)
master_names.value_counts()

Maruti           47678
Hyundai          14916
Mahindra          5723
Tata              4561
Toyota            3417
Ford              3060
Honda             2487
Chevrolet         2479
Renault           2089
Volkswagen        1343
Nissan             379
Datsun             214
Skoda              196
Fiat               173
Audi               161
BMW                151
Mercedes-Benz      108
Jeep                42
Mitsubishi          38
Jaguar              22
Volvo               19
Force               11
Kia                 10
Land                 8
Ambassador           8
Daewoo               6
Isuzu                5
MG                   5
OpelCorsa            2
Lexus                1
Opel                 1
Peugeot              1
Ashok                1
dtype: int64

In [25]:
master_data["manufacturer"] = master_names
master_data.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,manufacturer
0,Ambassador CLASSIC 1500 DSL AC,2000,75000,90000,Diesel,Individual,Manual,Second Owner,Ambassador
1,Ambassador Classic 2000 DSZ AC PS,1994,99000,100000,Diesel,Individual,Manual,Second Owner,Ambassador
2,Ambassador Grand 1500 DSZ BSIII,2008,122000,60000,Diesel,Individual,Manual,Second Owner,Ambassador
3,Ambassador Grand 2000 DSZ PW CL,2008,200000,80000,Diesel,Individual,Manual,Third Owner,Ambassador
4,Ashok Leyland Stile LE,2013,300000,200000,Diesel,Individual,Manual,Second Owner,Ashok


In [26]:
# Bin manufacturers with less than 20 records in the data as "Other"
low_count_makes = master_names.value_counts()[master_names.value_counts() < 20].index.to_flat_index().to_list()
# Above gives a list of tuples, which will need to be converted to a plain list for use in editing binned records
low_count_makes = [item for i in low_count_makes for item in i]
low_count_makes

['Volvo',
 'Force',
 'Kia',
 'Land',
 'Ambassador',
 'Daewoo',
 'Isuzu',
 'MG',
 'OpelCorsa',
 'Lexus',
 'Opel',
 'Peugeot',
 'Ashok']

In [27]:
master_data["manufacturer"][master_data["manufacturer"].isin(low_count_makes)] = "Other"
master_data["manufacturer"].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Maruti           47678
Hyundai          14916
Mahindra          5723
Tata              4561
Toyota            3417
Ford              3060
Honda             2487
Chevrolet         2479
Renault           2089
Volkswagen        1343
Nissan             379
Datsun             214
Skoda              196
Fiat               173
Audi               161
BMW                151
Mercedes-Benz      108
Other               78
Jeep                42
Mitsubishi          38
Jaguar              22
Name: manufacturer, dtype: int64

In [29]:
# Bin manufacturers by continent
asian_makes = ["Hyundai", "Honda", "Toyota", "Nissan", "Datsun", "Lexus", "Mitsubishi"]
indian_makes = ["Maruti", "Mahindra", "Tata"]
european_makes = ["Renault", "Volkswagen", "Skoda", "BMW", "Audi", "Mercedez-Benz", "Fiat", "Jaguar", "Volvo"]
american_makes = ["Ford", "Chevrolet", "Jeep"]

master_data["made_in"] = master_data["manufacturer"]

master_data["made_in"][master_data["manufacturer"].isin(asian_makes)] = "Asia"
master_data["made_in"][master_data["manufacturer"].isin(indian_makes)] = "India"
master_data["made_in"][master_data["manufacturer"].isin(european_makes)] = "Europe"
master_data["made_in"][master_data["manufacturer"].isin(american_makes)] = "America"
master_data["made_in"][master_data["manufacturer"].isin(["Other"])] = "Unknown"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is 

## Feature Extraction: Price Bins

In [30]:
def getBin(record):
    if record["selling_price"] < 200000:
        return "20,000 - 200,000"
    elif record["selling_price"] < 400000:
        return "200,000 - 400,000"
    elif record["selling_price"] < 600000:
        return "400,000 - 600,000"
    else:
        return "600,000+"

In [31]:
master_data["price_group"] = master_data.apply(getBin, axis=1)

In [32]:
master_data.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,manufacturer,made_in,price_group
0,Ambassador CLASSIC 1500 DSL AC,2000,75000,90000,Diesel,Individual,Manual,Second Owner,Other,Unknown,"20,000 - 200,000"
1,Ambassador Classic 2000 DSZ AC PS,1994,99000,100000,Diesel,Individual,Manual,Second Owner,Other,Unknown,"20,000 - 200,000"
2,Ambassador Grand 1500 DSZ BSIII,2008,122000,60000,Diesel,Individual,Manual,Second Owner,Other,Unknown,"20,000 - 200,000"
3,Ambassador Grand 2000 DSZ PW CL,2008,200000,80000,Diesel,Individual,Manual,Third Owner,Other,Unknown,"200,000 - 400,000"
4,Ashok Leyland Stile LE,2013,300000,200000,Diesel,Individual,Manual,Second Owner,Other,Unknown,"200,000 - 400,000"


## Save Output

In [None]:
master_data.to_csv("../Resources/master_data.csv")

In [33]:
# Close the database connection
dbConnection.close();