In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the vehicles dataset from the working directory. low_memory=False makes
# pandas parse the file in one pass instead of in chunks, so each column gets a
# single consistently inferred dtype.
raw = pd.read_csv('vehicles.csv', low_memory=False)
# Bare last expression: the notebook displays the dtype of every column.
raw.dtypes

barrels08     float64
barrelsA08    float64
charge120     float64
charge240     float64
city08          int64
               ...   
modifiedOn     object
startStop      object
phevCity        int64
phevHwy         int64
phevComb        int64
Length: 84, dtype: object

In [3]:
def sort_features(df : pd.DataFrame) -> None:
    """Print a per-column data-quality report for ``df`` to stdout.

    For every column the report shows the column name, its dtype, and the count
    and percentage of missing (NaN/NA) values. Numeric columns additionally get
    a "nulls" figure: the number of entries that are approximately 0 or
    approximately -1 (as judged by ``np.isclose``), with its percentage.

    Despite the name, nothing is sorted and ``df`` is not modified.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to summarise. Column labels are expected to be unique.

    Returns
    -------
    None
    """
    n = df.shape[0]

    def percent(count) -> float:
        # A frame with zero rows would otherwise divide by zero and print "nan%".
        return (count / n) * 100 if n else 0.0

    for column in df.columns:
        feature = df[column]
        dtype = feature.dtype
        missing = feature.isna().sum()
        header = f'[{column}]\ndtype: {str(dtype):13}missing: {missing:<7}{percent(missing):5.1f}%'

        # Only numeric columns can be compared against 0 / -1. Testing for
        # "numeric" rather than "not object" also skips category, datetime and
        # pandas string dtypes, which would make np.isclose raise.
        if not pd.api.types.is_numeric_dtype(dtype):
            print(f'{header}\n')
            continue

        # Convert to a plain float array so nullable extension dtypes (Int64,
        # Float64, boolean) work too; missing entries become NaN, which is never
        # close to 0 or -1 and therefore is not counted.
        values = feature.to_numpy(dtype=float, na_value=np.nan)
        # Keep the column as the second argument: np.isclose scales its relative
        # tolerance by the second operand.
        zeros = np.isclose(0.0, values).sum() + np.isclose(-1.0, values).sum()
        print(f'{header}\t nulls: {zeros:<7}{percent(zeros):5.1f}%\n')

# Print the per-column dtype / missing-value / zero-or-minus-one report for the raw frame.
sort_features(raw)

[barrels08]
dtype: float64      missing: 0        0.0%	 nulls: 0        0.0%

[barrelsA08]
dtype: float64      missing: 0        0.0%	 nulls: 46302   96.1%

[charge120]
dtype: float64      missing: 0        0.0%	 nulls: 48202  100.0%

[charge240]
dtype: float64      missing: 0        0.0%	 nulls: 46952   97.4%

[city08]
dtype: int64        missing: 0        0.0%	 nulls: 0        0.0%

[city08U]
dtype: float64      missing: 0        0.0%	 nulls: 29656   61.5%

[cityA08]
dtype: int64        missing: 0        0.0%	 nulls: 46302   96.1%

[cityA08U]
dtype: float64      missing: 0        0.0%	 nulls: 46862   97.2%

[cityCD]
dtype: float64      missing: 0        0.0%	 nulls: 48142   99.9%

[cityE]
dtype: float64      missing: 0        0.0%	 nulls: 46898   97.3%

[cityUF]
dtype: float64      missing: 0        0.0%	 nulls: 47860   99.3%

[co2]
dtype: int64        missing: 0        0.0%	 nulls: 32901   68.3%

[co2A]
dtype: int64        missing: 0        0.0%	 nulls: 47505   98.6%

[co2TailpipeAG

In [4]:
# Columns in which more than 50% of the values are 0 or -1.
# NOTE(review): the report above also shows co2 (68.3%) and co2A (98.6%) over
# that threshold, yet they are not listed here - confirm that is intentional.
drop_nulls = [
    'barrelsA08', 'charge120', 'charge240', 'city08U',
    'cityA08', 'cityA08U', 'cityCD', 'cityE',
    'cityUF', 'co2TailpipeAGpm', 'comb08U', 'combA08',
    'combA08U', 'combE', 'combinedCD', 'combinedUF',
    'feScore', 'ghgScore', 'ghgScoreA', 'fuelCostA08',
    'highway08U', 'highwayA08', 'highwayA08U', 'highwayCD',
    'highwayE', 'highwayUF', 'hlv', 'hpv',
    'lv2', 'lv4', 'phevBlended', 'pv2',
    'pv4', 'range', 'rangeCity', 'rangeCityA',
    'rangeHwy', 'rangeHwyA', 'UCityA', 'UHighwayA',
    'charge240b', 'phevCity', 'phevHwy', 'phevComb',
]

# Columns removed because of missing (NA) values.
drop_missing = [
    'guzzler', 'trans_dscr', 'tCharger', 'sCharger',
    'atvType', 'fuelType2', 'rangeA', 'mfrCode',
    'c240Dscr', 'c240bDscr', 'startStop',
]

# Columns removed because they hold metadata or are otherwise irrelevant.
drop_misc = [
    'createdOn', 'modifiedOn', 'youSaveSpend',
    'baseModel', 'engId', 'id',
]

In [5]:
# Remove every column named in the three drop lists in one call. DataFrame.drop
# returns a new frame, so `raw` keeps all of its columns.
clean = raw.drop(columns=[*drop_nulls, *drop_missing, *drop_misc])
# Bare last expression: display the dtypes of the columns that remain.
clean.dtypes

barrels08         float64
city08              int64
co2                 int64
co2A                int64
co2TailpipeGpm    float64
comb08              int64
cylinders         float64
displ             float64
drive              object
eng_dscr           object
fuelCost08          int64
fuelType           object
fuelType1          object
highway08           int64
make               object
model              object
mpgData            object
trany              object
UCity             float64
UHighway          float64
VClass             object
year                int64
evMotor            object
dtype: object

In [6]:
# Display the distinct VClass labels together with how many rows carry each one.
np.unique(clean['VClass'], return_counts=True)

(array(['Compact Cars', 'Large Cars', 'Midsize Cars',
        'Midsize Station Wagons', 'Midsize-Large Station Wagons',
        'Minicompact Cars', 'Minivan - 2WD', 'Minivan - 4WD',
        'Small Pickup Trucks', 'Small Pickup Trucks 2WD',
        'Small Pickup Trucks 4WD', 'Small Sport Utility Vehicle 2WD',
        'Small Sport Utility Vehicle 4WD', 'Small Station Wagons',
        'Special Purpose Vehicle', 'Special Purpose Vehicle 2WD',
        'Special Purpose Vehicle 4WD', 'Special Purpose Vehicles',
        'Special Purpose Vehicles/2wd', 'Special Purpose Vehicles/4wd',
        'Sport Utility Vehicle - 2WD', 'Sport Utility Vehicle - 4WD',
        'Standard Pickup Trucks', 'Standard Pickup Trucks 2WD',
        'Standard Pickup Trucks 4WD', 'Standard Pickup Trucks/2wd',
        'Standard Sport Utility Vehicle 2WD',
        'Standard Sport Utility Vehicle 4WD', 'Subcompact Cars',
        'Two Seaters', 'Vans', 'Vans Passenger', 'Vans, Cargo Type',
        'Vans, Passenger Type'], dty

In [7]:
# Lookup from each VClass label in the data to a coarser vehicle category.
# It is written as category -> labels and inverted by the comprehension, so
# each category name appears once and its member labels sit next to each other.
vclass_map = {
    label: category
    for category, labels in {
        'Compact': (
            'Minicompact Cars',
            'Subcompact Cars',
            'Compact Cars',
            'Small Station Wagons',
        ),
        'Midsize': (
            'Midsize Cars',
            'Midsize Station Wagons',
            'Midsize-Large Station Wagons',
        ),
        'Compact SUV': (
            'Small Sport Utility Vehicle 2WD',
            'Small Sport Utility Vehicle 4WD',
        ),
        'Midsize SUV': (
            'Sport Utility Vehicle - 2WD',
            'Sport Utility Vehicle - 4WD',
            'Standard Sport Utility Vehicle 2WD',
            'Standard Sport Utility Vehicle 4WD',
        ),
        'Special Purpose': (
            'Special Purpose Vehicle',
            'Special Purpose Vehicle 2WD',
            'Special Purpose Vehicle 4WD',
            'Special Purpose Vehicles',
            'Special Purpose Vehicles/2wd',
            'Special Purpose Vehicles/4wd',
        ),
        'Small Pickup': (
            'Small Pickup Trucks',
            'Small Pickup Trucks 2WD',
            'Small Pickup Trucks 4WD',
        ),
        'Standard Pickup': (
            'Standard Pickup Trucks',
            'Standard Pickup Trucks 2WD',
            'Standard Pickup Trucks 4WD',
            'Standard Pickup Trucks/2wd',
        ),
        'Fullsize': (
            'Large Cars',
        ),
        'Minivan': (
            'Minivan - 2WD',
            'Minivan - 4WD',
        ),
        'Van': (
            'Vans',
            'Vans Passenger',
            'Vans, Cargo Type',
            'Vans, Passenger Type',
        ),
        'Two Seater': (
            'Two Seaters',
        ),
    }.items()
    for label in labels
}

In [8]:
# Preview the consolidated categories and their row counts. The mapped Series is
# not assigned, so `clean` is left unchanged by this cell.
# NOTE(review): Series.map yields NaN for any label missing from vclass_map -
# verify every VClass value is covered before storing the result.
np.unique(clean['VClass'].map(vclass_map), return_counts=True)

(array(['Compact', 'Compact SUV', 'Fullsize', 'Midsize', 'Midsize SUV',
        'Minivan', 'Small Pickup', 'Special Purpose', 'Standard Pickup',
        'Two Seater', 'Van'], dtype=object),
 array([15561,  2992,  2680,  7007,  5862,   460,  1366,  2553,  5367,
         2450,  1904], dtype=int64))