In [4]:
import pandas as pd
import numpy as np

In [2]:
# Initial load of the data set using pandas' default parsing options.
auto = pd.read_csv('Auto.csv')

In [3]:
# Show the loaded frame as the cell output.
auto

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino
...,...,...,...,...,...,...,...,...,...
392,27.0,4,140.0,86,2790,15.6,82,1,ford mustang gl
393,44.0,4,97.0,52,2130,24.6,82,2,vw pickup
394,32.0,4,135.0,84,2295,11.6,82,1,dodge rampage
395,28.0,4,120.0,79,2625,18.6,82,1,ford ranger


In [6]:
# Inspect the distinct values of the horsepower column to see how it was parsed.
np.unique(auto['horsepower'])

array(['100', '102', '103', '105', '107', '108', '110', '112', '113',
       '115', '116', '120', '122', '125', '129', '130', '132', '133',
       '135', '137', '138', '139', '140', '142', '145', '148', '149',
       '150', '152', '153', '155', '158', '160', '165', '167', '170',
       '175', '180', '190', '193', '198', '200', '208', '210', '215',
       '220', '225', '230', '46', '48', '49', '52', '53', '54', '58',
       '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '70',
       '71', '72', '74', '75', '76', '77', '78', '79', '80', '81', '82',
       '83', '84', '85', '86', '87', '88', '89', '90', '91', '92', '93',
       '94', '95', '96', '97', '98', '?'], dtype=object)

In [7]:
# Reload the CSV, this time telling pandas to parse '?' entries as missing
# values (NaN). This rebinds `auto` to the newly read frame.
auto = pd.read_csv('Auto.csv', na_values=['?'])

In [8]:
# Distinct horsepower values after the reload with na_values=['?'].
np.unique(auto['horsepower'])

array([ 46.,  48.,  49.,  52.,  53.,  54.,  58.,  60.,  61.,  62.,  63.,
        64.,  65.,  66.,  67.,  68.,  69.,  70.,  71.,  72.,  74.,  75.,
        76.,  77.,  78.,  79.,  80.,  81.,  82.,  83.,  84.,  85.,  86.,
        87.,  88.,  89.,  90.,  91.,  92.,  93.,  94.,  95.,  96.,  97.,
        98., 100., 102., 103., 105., 107., 108., 110., 112., 113., 115.,
       116., 120., 122., 125., 129., 130., 132., 133., 135., 137., 138.,
       139., 140., 142., 145., 148., 149., 150., 152., 153., 155., 158.,
       160., 165., 167., 170., 175., 180., 190., 193., 198., 200., 208.,
       210., 215., 220., 225., 230.,  nan])

In [9]:
# (rows, columns) of the reloaded frame, before rows with missing values are removed.
auto.shape

(397, 9)

In [10]:
# Keep only the rows that contain no missing value; the result gets its own
# name so `auto` still refers to the full frame.
auto_new = auto.dropna()
# (rows, columns) after dropping the incomplete rows.
auto_new.shape

(392, 9)

In [12]:
# Preview the first 10 rows of the cleaned frame.
auto_new.head(10)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,1,ford torino
5,15.0,8,429.0,198.0,4341,10.0,70,1,ford galaxie 500
6,14.0,8,454.0,220.0,4354,9.0,70,1,chevrolet impala
7,14.0,8,440.0,215.0,4312,8.5,70,1,plymouth fury iii
8,14.0,8,455.0,225.0,4425,10.0,70,1,pontiac catalina
9,15.0,8,390.0,190.0,3850,8.5,70,1,amc ambassador dpl


## Quantitative predictors
- mpg (miles per gallon)
- displacement
- horsepower
- weight
- acceleration
- year

## Qualitative predictors
- name
- origin (takes one of the values 1, 2 or 3)

## Quantitative or Qualitative
- cylinders (takes the values 3, 4, 5, 6 or 8): numeric, but can also be treated as categorical
- origin (takes the values 1, 2 or 3): numerically coded, but can be treated as categorical

In [15]:
# Distinct cylinder counts present in the cleaned data.
np.unique(auto_new['cylinders'])

array([3, 4, 5, 6, 8], dtype=int64)

## Range of each quantitative predictor

In [52]:
def min_max_col(table, predictor):
    """Return the smallest and largest value of one column.

    Parameters
    ----------
    table
        Container indexed as ``table[predictor]``, e.g. a pandas DataFrame.
    predictor
        Key (column name) selecting the values to summarise.

    Returns
    -------
    tuple
        ``(minimum, maximum)`` as computed by ``np.min`` and ``np.max``.
    """
    column = table[predictor]
    lowest = np.min(column)
    highest = np.max(column)
    return lowest, highest

# Report the (min, max) of each numeric column, one aligned line per column.
for predictor in ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'year', 'origin']:
    min_p, max_p = min_max_col(auto_new, predictor)
    label = f'{predictor.title()} range:'
    span = f'({min_p}, {max_p})'
    # Underscore-pad the span on the left; the pad width shrinks as the column
    # name grows, so label + span has the same total width on every line.
    print(label + span.rjust(30 - len(predictor), '_'))

Mpg range:________________(9.0, 46.6)
Cylinders range:_______________(3, 8)
Displacement range:_____(68.0, 455.0)
Horsepower range:_______(46.0, 230.0)
Weight range:____________(1613, 5140)
Acceleration range:_______(8.0, 24.8)
Year range:__________________(70, 82)
Origin range:__________________(1, 3)


## Mean and Standard deviation of quantitative predictors

In [58]:
# Mean and standard deviation of each numeric column, rounded to two decimals.
# np.std is called with its default ddof=0, i.e. the population standard
# deviation (divides by N rather than N - 1).
for predictor in ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'year', 'origin']:
    column = auto_new[predictor]
    print(f'{predictor.title()}: Mean={np.mean(column):.2f}, Std={np.std(column):.2f}')

Mpg: Mean=23.45, Std=7.80
Cylinders: Mean=5.47, Std=1.70
Displacement: Mean=194.41, Std=104.51
Horsepower: Mean=104.47, Std=38.44
Weight: Mean=2977.58, Std=848.32
Acceleration: Mean=15.54, Std=2.76
Year: Mean=75.98, Std=3.68
Origin: Mean=1.58, Std=0.80
