# Data Pre-processing

In [1]:
# Import libraries

import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sb
from scipy import stats

In [2]:
# Relative directory that every CSV in this notebook is read from and written to.
data_dir = 'data'

In [3]:
# Load the vehicle listings file from the data directory.
df = pd.read_csv(os.path.join(data_dir, 'vehicles_concat.csv'))

# Bare expression so the notebook renders the loaded frame as this cell's output.
df

Unnamed: 0,price,model,condition,fuel,odometer,transmission,drive,size,type,paint_color
0,24895.0,jeep-cherokee,unspecified,gas,6973.0,automatic,4wd,unspecified,SUV,white
1,12999.0,nissan-altima,unspecified,gas,63810.0,unspecified,fwd,unspecified,sedan,unspecified
2,12799.0,hyundai-elantra,unspecified,gas,80210.0,automatic,unspecified,unspecified,sedan,silver
3,17850.0,ford-f150,unspecified,gas,76393.0,automatic,rwd,unspecified,unspecified,white
4,24150.0,dodge-challenger,unspecified,gas,74540.0,automatic,rwd,unspecified,coupe,white
5,9699.0,ford-fusion,unspecified,gas,98254.0,automatic,fwd,unspecified,sedan,white
6,9699.0,kia-optima,unspecified,gas,81847.0,automatic,unspecified,unspecified,sedan,brown
7,14999.0,jeep-cherokee,unspecified,gas,48384.0,automatic,4wd,unspecified,SUV,black
8,25499.0,gmc-sierra,unspecified,gas,103099.0,automatic,4wd,unspecified,pickup,black
9,20998.0,volkswagen-tiguan,unspecified,gas,6553.0,automatic,fwd,unspecified,SUV,white


In [4]:
def one_hot_encode(df, feature):
    """Append one-hot indicator columns for ``feature`` to ``df``.

    One indicator column is created per distinct value found in
    ``df[feature]``, named ``<feature>_<value>``. The original ``feature``
    column is kept in the result; this function does not remove it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to encode. It is not modified.
    feature : str
        Name of the column in ``df`` to encode.

    Returns
    -------
    pandas.DataFrame
        A new frame holding all columns of ``df`` followed by the indicator
        columns, each stored as unsigned 8-bit integers (0 or 1).
    """
    # Pin an integer dtype explicitly: pandas >= 2.0 defaults to bool, which
    # would be serialized as True/False when the frame is written to CSV
    # instead of 0/1. uint8 matches the pre-2.0 default.
    dummies = pd.get_dummies(df[feature], prefix=feature, dtype=np.uint8)

    return pd.concat([df, dummies], axis=1)

In [5]:
# Columns that are replaced by one-hot indicator columns.
features = ['model', 'transmission', 'paint_color', 'fuel', 'type', 'drive', 'size', 'condition']

# Start from a copy of the loaded frame and encode one column per pass.
encoded_df = df.copy()
for feature in features:
    # Append the indicator columns, then discard the source column right away.
    encoded_df = one_hot_encode(encoded_df, feature).drop(columns=feature)

# Bare expression so the notebook renders the encoded frame.
encoded_df

Unnamed: 0,price,odometer,model_acura-ilx,model_acura-mdx,model_acura-rdx,model_acura-tlx,model_audi-a3,model_audi-a4,model_audi-a5,model_audi-a6,...,size_mid-size,size_sub-compact,size_unspecified,condition_excellent,condition_fair,condition_good,condition_like new,condition_new,condition_salvage,condition_unspecified
0,24895.0,6973.0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
1,12999.0,63810.0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
2,12799.0,80210.0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
3,17850.0,76393.0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
4,24150.0,74540.0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
5,9699.0,98254.0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
6,9699.0,81847.0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
7,14999.0,48384.0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
8,25499.0,103099.0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
9,20998.0,6553.0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1


In [6]:
# Persist the encoded frame; index=False keeps the row index out of the file.
encoded_df.to_csv(os.path.join(data_dir, 'vehicles_encoded.csv'), index=False)

In [7]:
# Render the encoded frame (the same object that was written to vehicles_encoded.csv).
encoded_df

Unnamed: 0,price,odometer,model_acura-ilx,model_acura-mdx,model_acura-rdx,model_acura-tlx,model_audi-a3,model_audi-a4,model_audi-a5,model_audi-a6,...,size_mid-size,size_sub-compact,size_unspecified,condition_excellent,condition_fair,condition_good,condition_like new,condition_new,condition_salvage,condition_unspecified
0,24895.0,6973.0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
1,12999.0,63810.0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
2,12799.0,80210.0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
3,17850.0,76393.0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
4,24150.0,74540.0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
5,9699.0,98254.0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
6,9699.0,81847.0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
7,14999.0,48384.0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
8,25499.0,103099.0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
9,20998.0,6553.0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1


In [8]:
from sklearn.model_selection import train_test_split

# Fixed seed so both shuffles, and therefore every file written from these
# splits, come out identical on each Restart & Run All.
SPLIT_SEED = 42

# Target is kept as a single-column DataFrame; every other column is a feature.
y = pd.DataFrame(encoded_df['price'])
X = encoded_df.drop(columns='price')

# Hold out 25% of all rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=SPLIT_SEED
)

# Take one third of the remaining 75% as validation, which leaves
# roughly 50% train / 25% validation / 25% test overall.
X_train, X_validation, y_train, y_validation = train_test_split(
    X_train, y_train, test_size=1/3, random_state=SPLIT_SEED
)

print(X_train.shape)
print(X_validation.shape)
print(X_test.shape)

(53440, 300)
(26721, 300)
(26721, 300)


In [9]:
def _write_headerless(frame, filename):
    """Write ``frame`` to ``data_dir/filename`` with no header row and no index column."""
    frame.to_csv(os.path.join(data_dir, filename), header=False, index=False)


# Test features and test labels are stored in two separate files.
_write_headerless(X_test, 'test.csv')
_write_headerless(y_test, 'test_y.csv')

# Train and validation files carry the price label as the first column,
# followed by the feature columns.
_write_headerless(pd.concat([y_train, X_train], axis=1), 'train.csv')
_write_headerless(pd.concat([y_validation, X_validation], axis=1), 'validation.csv')