## Load packages

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

## Load data

In [2]:
# Read cars.csv into a DataFrame; the path is relative, so the file is
# expected to sit in the notebook's working directory.
data = pd.read_csv('cars.csv')

## Drop bogus data features

In [3]:
# Drop 'location_region' because its data is bogus, and the ten "feature_n"
# columns (feature_0 ... feature_9) because it is unclear what they convey.
# A single drop call removes them all in one pass instead of copying the
# frame once per column, and it fails before changing anything if one of the
# columns is missing.
unused_columns = ['location_region'] + ['feature_' + str(i) for i in range(10)]
data = data.drop(columns=unused_columns)

## Separate out features that need to be transformed to numeric

In [4]:
# Sort the columns into boolean and string features by looking at the Python
# type of each column's first value. Columns of any other type are reported
# but not added to either list.
bool_features = []
strg_features = []
features = data.columns
for feature in features:
    # .iloc[0] reads the first row by position, so this keeps working when
    # the index has no label 0 (e.g. after rows were filtered or re-indexed),
    # where the label lookup data[feature][0] would raise KeyError.
    first_value_type = type(data[feature].iloc[0])
    print('Feature = {}, Type = {}'.format(feature, first_value_type))
    if first_value_type == np.bool_:
        bool_features.append(feature)
    elif first_value_type == str:
        strg_features.append(feature)

Feature = manufacturer_name, Type = <class 'str'>
Feature = model_name, Type = <class 'str'>
Feature = transmission, Type = <class 'str'>
Feature = color, Type = <class 'str'>
Feature = odometer_value, Type = <class 'numpy.int64'>
Feature = year_produced, Type = <class 'numpy.int64'>
Feature = engine_fuel, Type = <class 'str'>
Feature = engine_has_gas, Type = <class 'numpy.bool_'>
Feature = engine_type, Type = <class 'str'>
Feature = engine_capacity, Type = <class 'numpy.float64'>
Feature = body_type, Type = <class 'str'>
Feature = has_warranty, Type = <class 'numpy.bool_'>
Feature = state, Type = <class 'str'>
Feature = drivetrain, Type = <class 'str'>
Feature = price_usd, Type = <class 'numpy.float64'>
Feature = is_exchangeable, Type = <class 'numpy.bool_'>
Feature = number_of_photos, Type = <class 'numpy.int64'>
Feature = up_counter, Type = <class 'numpy.int64'>
Feature = duration_listed, Type = <class 'numpy.int64'>


## Convert boolean data features to numeric 1/0
*Boolean data can be converted to 1 for True or 0 for False to encode it numerically for downstream machine learning.*

In [5]:
# Cast every boolean feature to integers in one step (True -> 1, False -> 0).
# Writing 1/0 into a bool column through .loc relies on pandas implicitly
# upcasting the column to hold the new values, which recent pandas versions
# deprecate (warning, then error) and which can leave an object-dtype column
# instead of a numeric one. astype gives a proper int64 column.
data = data.astype({bool_feature: 'int64' for bool_feature in bool_features})

## One-hot-encode string data features
*Features that have string data (non-boolean) with more than 2 possible values need to be one-hot-encoded so that downstream machine learning algorithms can read the data properly. You can't just convert the different string values to integers (1, 2, 3, ... 81, ...) because machine learning algorithms will ascribe greater value to larger integers when there should be no such ordering. For these data features, all values are equally valuable; they're just different and should be encoded as such.*

In [6]:
print()
# One-hot frames are collected here and attached with a single concat after
# the loop, rather than re-copying the growing frame once per feature.
one_hot_frames = []
for strg_feature in strg_features:
    # Print current feature's unique values
    print('Feature {}, Unique values = {}'.format(strg_feature, len(data[strg_feature].unique())))
    # One-hot-encode current categorical feature. The encoder is left at its
    # default (sparse) output and densified with .toarray(); this avoids the
    # `sparse=` keyword, which newer scikit-learn renamed to `sparse_output=`
    # and then removed, so the cell runs on either side of that change.
    one_hot_encoder = OneHotEncoder()
    cur_vector = data[strg_feature].to_numpy().reshape(-1, 1)
    cur_one_hot = one_hot_encoder.fit_transform(cur_vector).toarray()
    # Create column headings for one-hot-encoded categorical feature
    cols_to_add = [strg_feature + '_' + str(i) for i in range(cur_one_hot.shape[1])]
    # Reuse data's index so the concat below lines rows up one-to-one; a
    # default RangeIndex would misalign (and introduce NaN rows) whenever
    # data's index is anything else.
    one_hot_frames.append(pd.DataFrame(cur_one_hot, columns=cols_to_add, index=data.index))
# Drop the categorical features and append their one-hot-encoded versions,
# in the same order the features were processed.
data = pd.concat([data.drop(columns=strg_features)] + one_hot_frames, axis=1)

Feature manufacturer_name, Unique values = 55
Feature model_name, Unique values = 1118
Feature transmission, Unique values = 2
Feature color, Unique values = 12
Feature engine_fuel, Unique values = 6
Feature engine_type, Unique values = 3
Feature body_type, Unique values = 12
Feature state, Unique values = 3
Feature drivetrain, Unique values = 3


In [8]:
# Report the transformed frame's dimensions as "rows x columns"
print('Data size = {} x {}'.format(*data.shape))

Data size = 38531 x 1224


In [10]:
# Display the first rows of the transformed data as the cell's output
# (head() with no argument returns the first five rows).
data.head()

Unnamed: 0,odometer_value,year_produced,engine_has_gas,engine_capacity,has_warranty,price_usd,is_exchangeable,number_of_photos,up_counter,duration_listed,...,body_type_8,body_type_9,body_type_10,body_type_11,state_0,state_1,state_2,drivetrain_0,drivetrain_1,drivetrain_2
0,190000,2010,0,2.5,0,10900.0,0,9,13,16,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1,290000,2002,0,3.0,0,5000.0,1,12,54,83,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
2,402000,2001,0,2.5,0,2800.0,1,4,72,151,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
3,10000,1999,0,3.0,0,9999.0,1,9,42,86,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
4,280000,2001,0,2.5,0,2134.11,1,14,7,7,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
