# Preprocessing Testing

In [47]:
import pandas as pd
import os
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder


DATA_PATH = 'data/'

In [48]:
def load_housing_data(housing_path=DATA_PATH):
  csv_path = os.path.join(housing_path, "project_data.csv")
  return pd.read_csv(csv_path)

In [49]:
vehicles = load_housing_data()
vehicles.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Individual,Manual,Second Owner


In [50]:
# All 'objects' are strings in our case
vehicles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4340 entries, 0 to 4339
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name           4340 non-null   object
 1   year           4340 non-null   int64 
 2   selling_price  4340 non-null   int64 
 3   km_driven      4340 non-null   int64 
 4   fuel           4340 non-null   object
 5   seller_type    4340 non-null   object
 6   transmission   4340 non-null   object
 7   owner          4340 non-null   object
dtypes: int64(3), object(5)
memory usage: 271.4+ KB


## Looking for Correlations

In [51]:
# Determines the correlation between numerical datapoints
corr_matric = vehicles.corr()


ValueError: could not convert string to float: 'Maruti 800 AC'

In [None]:
corr_matric['selling_price'].sort_values(ascending=False)

NameError: name 'corr_matric' is not defined

In [None]:
from pandas.plotting import scatter_matrix

attributes = ["selling_price", "year", "km_driven"]
scatter_matrix(vehicles[attributes], figsize=(12, 8))
# save_fig("scatter_matrix_plot")

In [None]:
vehicles.plot(kind="scatter", x="year", y="selling_price",
             alpha=0.2)
plt.axis([1990, 2022, 0, 3000000])
plt.show()

In [None]:
# Removing 'noise' aka instances that have a price greater than  1,750,000
# NOTE This applies to all data moving forward
vehicles.drop(vehicles[vehicles['selling_price'] >= 1750000].index, inplace = True)

vehicles.plot(kind="scatter", x="year", y="selling_price",
             alpha=0.2)
plt.axis([1990, 2022, 0, 3000000])
plt.show()

# ^^ This is the data for the first polynomial model ^^

# Encoding data
### Data that requires encoding:
name,	
fuel,
seller_type,
transmission,
owner,

## Encoding `owner` data

In [None]:
vehicles['owner'].value_counts()

owner
First Owner             2832
Second Owner            1106
Third Owner              304
Fourth & Above Owner      81
Test Drive Car            17
Name: count, dtype: int64

In [None]:
# Note these values are not actually integers
print('Actual data type = ', type(vehicles['owner'][0]))

Actual data type =  <class 'str'>


In [None]:
current_owner_strings = ['First Owner', 'Second Owner', 'Third Owner', 'Fourth & Above Owner', 'Test Drive Car']
new_strings = ['1', '2', '3', '4', '5']
vehicles['owner'] = vehicles['owner'].replace(current_owner_strings, new_strings)

# Convert strings to integers:
vehicles['owner'] = vehicles['owner'].astype('int64')

# Removing 'Test Drive Car' since it's NOT numerically related to other owner types
vehicles.drop(vehicles[vehicles['owner'] == 5].index, inplace = True)

# Show new changes
vehicles['owner'].value_counts()


owner
1    2832
2    1106
3     304
4      81
Name: count, dtype: int64

## Encoding `fuel` data

In [None]:
vehicles['fuel'].value_counts()

In [None]:
# Drop the electric car since we only have one instance and it will bias our models performance
vehicles.drop(vehicles[vehicles['fuel'] == 'Electric'].index, inplace = True)

In [None]:
fuel_dummies = pd.get_dummies(vehicles['fuel'])

vehicles = pd.concat([vehicles, fuel_dummies], axis=1)

# Drop the fuel column:
vehicles = vehicles.drop('fuel', axis=1)

vehicles.head()

Unnamed: 0,name,year,selling_price,km_driven,seller_type,transmission,owner,CNG,Diesel,Electric,LPG,Petrol
0,Maruti 800 AC,2007,60000,70000,Individual,Manual,First Owner,False,False,False,False,True
1,Maruti Wagon R LXI Minor,2007,135000,50000,Individual,Manual,First Owner,False,False,False,False,True
2,Hyundai Verna 1.6 SX,2012,600000,100000,Individual,Manual,First Owner,False,True,False,False,False
3,Datsun RediGO T Option,2017,250000,46000,Individual,Manual,First Owner,False,False,False,False,True
4,Honda Amaze VX i-DTEC,2014,450000,141000,Individual,Manual,Second Owner,False,True,False,False,False


## Encoding `seller_type`

In [None]:
vehicles['seller_type'].value_counts()

In [None]:
seller_type_dummies = pd.get_dummies(vehicles['seller_type'])

vehicles = pd.concat([vehicles, seller_type_dummies], axis=1)

# Drop the seller_type column:
vehicles = vehicles.drop('seller_type', axis=1)

vehicles.head()

## Encoding `transmission`

In [None]:
vehicles['transmission'].value_counts()

In [None]:
transmission_dummies = pd.get_dummies(vehicles['transmission'])

vehicles = pd.concat([vehicles, transmission_dummies], axis=1)

# Drop the transmission column:
vehicles = vehicles.drop('transmission', axis=1)

vehicles.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,owner,Automatic,Manual
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,First Owner,False,True
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,First Owner,False,True
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,First Owner,False,True
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,First Owner,False,True
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Individual,Second Owner,False,True


## Word embeddings will be used to encode `name` attribute, They can be dropped when performing predictions with regression models 