In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
# Let pandas print up to 210 rows and 27 columns before truncating output.
pd.set_option("display.max_rows", 210, "display.max_columns", 27)

# Seed NumPy's global random generator so any random draws are repeatable.
np.random.seed(5)

In [2]:
# Load the raw data from auto.csv in the working directory.
# Several numeric columns (bore, stroke, horsepower, peak-rpm) contain a '?'
# placeholder, as the value_counts() outputs further down show, so they are
# read as object dtype and cleaned in the cells that follow.
df = pd.read_csv("auto.csv")

In [3]:
# Summarise the raw frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
symboling            205 non-null int64
normalized-losses    205 non-null int64
make                 205 non-null object
fuel-type            205 non-null object
aspiration           205 non-null object
num-of-doors         205 non-null object
body-style           205 non-null object
drive-wheels         205 non-null object
engine-location      205 non-null object
wheel-base           205 non-null float64
length               205 non-null float64
width                205 non-null float64
height               205 non-null float64
curb-weight          205 non-null int64
engine-type          205 non-null object
num-of-cylinders     205 non-null object
engine-size          205 non-null int64
fuel-system          205 non-null object
bore                 205 non-null object
stroke               205 non-null object
compression-ratio    205 non-null float64
horsepower           205 non-null

In [4]:
# Preview the first five rows of the raw data.
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,width,height,curb-weight,engine-type,num-of-cylinders,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,100,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,100,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,100,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,171.2,65.5,52.4,2823,ohcv,six,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,54.3,2337,ohc,four,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,54.3,2824,ohc,five,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [5]:
# (rows, columns) of the loaded frame.
df.shape

(205, 26)

### Need to convert object-dtype columns that hold numeric data to numeric types

In [6]:
# List the dtype of every column to spot numeric data stored as object.
df.dtypes

symboling              int64
normalized-losses      int64
make                  object
fuel-type             object
aspiration            object
num-of-doors          object
body-style            object
drive-wheels          object
engine-location       object
wheel-base           float64
length               float64
width                float64
height               float64
curb-weight            int64
engine-type           object
num-of-cylinders      object
engine-size            int64
fuel-system           object
bore                  object
stroke                object
compression-ratio    float64
horsepower            object
peak-rpm              object
city-mpg               int64
highway-mpg            int64
price                  int64
dtype: object

In [7]:
# Inspect the distinct values: cylinder counts are spelled out as words.
df['num-of-cylinders'].value_counts()

four      159
six        24
five       11
eight       5
two         4
three       1
twelve      1
Name: num-of-cylinders, dtype: int64

In [8]:
# Convert the spelled-out cylinder counts ('four', 'six', ...) to integers.
mask1 = {'four':4,'six':6,'five':5,'eight':8,'two':2,'twelve':12,'three':3}
# Assign the result back to the column instead of calling
# replace(..., inplace=True) on df['num-of-cylinders']: that chained in-place
# call operates on a temporary Series and leaves df unchanged under pandas
# Copy-on-Write. The explicit astype pins the integer dtype rather than
# relying on replace() to downcast the object column, and raises if a value
# has no entry in mask1 instead of silently leaving strings behind.
df['num-of-cylinders'] = df['num-of-cylinders'].replace(mask1).astype('int64')

In [9]:
# Re-check the distribution after the word-to-integer replacement.
df['num-of-cylinders'].value_counts()

4     159
6      24
5      11
8       5
2       4
12      1
3       1
Name: num-of-cylinders, dtype: int64

In [10]:
# Inspect bore values; the output shows a '?' placeholder among the numbers.
df['bore'].value_counts()

3.62    23
3.19    20
3.15    15
3.03    12
2.97    12
3.46     9
3.43     8
3.31     8
3.78     8
3.27     7
2.91     7
3.54     6
3.58     6
3.39     6
3.05     6
3.01     5
3.70     5
?        4
3.35     4
3.74     3
3.59     3
3.17     3
3.50     2
3.47     2
3.80     2
3.94     2
3.33     2
3.63     2
3.13     2
3.24     2
3.61     1
2.92     1
3.08     1
3.34     1
2.68     1
2.99     1
2.54     1
3.76     1
3.60     1
Name: bore, dtype: int64

In [11]:
# Replace the '?' placeholder in bore with a numeric fill value.
# NOTE(review): 2.07 is a hard-coded fill that lies below every bore value
# listed by value_counts() (smallest is 2.54) - confirm this is intended
# rather than NaN or a mean/median imputation.
mask2 = {'?':2.07}
# Assign back instead of using replace(..., inplace=True) on df['bore']: the
# chained in-place call modifies a temporary Series and does not update df
# under pandas Copy-on-Write.
df['bore'] = df['bore'].replace(mask2)

In [12]:
# Verify that '?' is gone from bore and the fill value appears instead.
df['bore'].value_counts()

3.62    23
3.19    20
3.15    15
3.03    12
2.97    12
3.46     9
3.31     8
3.43     8
3.78     8
3.27     7
2.91     7
3.58     6
3.05     6
3.54     6
3.39     6
3.70     5
3.01     5
3.35     4
2.07     4
3.59     3
3.17     3
3.74     3
3.13     2
3.63     2
3.94     2
3.80     2
3.50     2
3.33     2
3.47     2
3.24     2
2.92     1
3.61     1
3.08     1
3.34     1
2.68     1
2.99     1
2.54     1
3.76     1
3.60     1
Name: bore, dtype: int64

In [13]:
# Cast the cleaned bore values to 64-bit floats.
df['bore'] = df['bore'].astype(np.float64)

In [14]:
# Check the dtype of bore after the cast.
df['bore'].dtype

dtype('float64')

In [15]:
# Inspect stroke values; the output shows a '?' placeholder among the numbers.
df['stroke'].value_counts()

3.40    20
3.03    14
3.15    14
3.23    14
3.39    13
2.64    11
3.35     9
3.29     9
3.46     8
3.11     6
3.50     6
3.41     6
3.58     6
3.07     6
3.27     6
3.19     6
3.52     5
3.64     5
3.54     4
3.47     4
3.86     4
?        4
3.90     3
2.90     3
4.17     2
3.08     2
2.19     2
2.68     2
3.10     2
2.80     2
2.76     1
3.21     1
3.16     1
2.07     1
2.36     1
2.87     1
3.12     1
Name: stroke, dtype: int64

In [16]:
# Replace the '?' placeholder in stroke with a numeric fill value.
# NOTE(review): 2.08 is a hard-coded fill near the bottom of the observed
# stroke range - confirm this is intended rather than NaN or a mean/median
# imputation.
mask3 = {'?':2.08}
# Assign back instead of using replace(..., inplace=True) on df['stroke']:
# the chained in-place call modifies a temporary Series and does not update
# df under pandas Copy-on-Write.
df['stroke'] = df['stroke'].replace(mask3)

In [17]:
# Verify that '?' is gone from stroke and the fill value appears instead.
df['stroke'].value_counts()

3.40    20
3.03    14
3.15    14
3.23    14
3.39    13
2.64    11
3.35     9
3.29     9
3.46     8
3.07     6
3.11     6
3.50     6
3.41     6
3.58     6
3.27     6
3.19     6
3.52     5
3.64     5
3.86     4
3.47     4
3.54     4
2.08     4
3.90     3
2.90     3
3.10     2
2.68     2
2.19     2
3.08     2
2.80     2
4.17     2
2.87     1
3.12     1
2.07     1
3.16     1
3.21     1
2.76     1
2.36     1
Name: stroke, dtype: int64

In [18]:
# Cast the cleaned stroke values to 64-bit floats.
df['stroke'] = df['stroke'].astype(np.float64)

In [19]:
# Confirm that stroke, the column converted in the previous cell, is now
# float64. The original cell re-checked 'bore' here, so the stroke
# conversion was never actually verified.
df['stroke'].dtype

dtype('float64')

In [20]:
# Inspect horsepower values; the output shows a '?' placeholder among the numbers.
df['horsepower'].value_counts()

68     19
70     11
69     10
116     9
110     8
95      7
114     6
101     6
62      6
160     6
88      6
97      5
145     5
102     5
84      5
76      5
82      5
92      4
111     4
123     4
86      4
207     3
121     3
152     3
182     3
90      3
73      3
85      3
?       2
52      2
162     2
155     2
176     2
161     2
94      2
56      2
112     2
100     2
184     2
156     2
58      1
175     1
288     1
143     1
140     1
48      1
262     1
78      1
115     1
72      1
134     1
135     1
142     1
200     1
55      1
120     1
64      1
60      1
106     1
154     1
Name: horsepower, dtype: int64

In [21]:
# Replace the '?' placeholder in horsepower with a numeric fill value.
# NOTE(review): 50 is a hard-coded fill near the bottom of the observed
# horsepower range - confirm this is intended rather than NaN or a
# mean/median imputation.
mask4 = {'?':50}
# Assign back instead of using replace(..., inplace=True) on
# df['horsepower']: the chained in-place call modifies a temporary Series and
# does not update df under pandas Copy-on-Write.
df['horsepower'] = df['horsepower'].replace(mask4)

In [22]:
# Cast the cleaned horsepower values to 64-bit floats.
df['horsepower'] = df['horsepower'].astype(np.float64)

In [23]:
# Check the dtype of horsepower after the cast.
df['horsepower'].dtype

dtype('float64')

In [24]:
# Inspect peak-rpm values; the output shows a '?' placeholder among the numbers.
df['peak-rpm'].value_counts()

5500    37
4800    36
5000    27
5200    23
5400    13
6000     9
4500     7
5800     7
5250     7
4200     5
4150     5
4750     4
4350     4
5900     3
4250     3
5100     3
4400     3
?        2
6600     2
5750     1
5300     1
5600     1
4900     1
4650     1
Name: peak-rpm, dtype: int64

In [25]:
# Replace the '?' placeholder in peak-rpm with a numeric fill value.
# NOTE(review): 4300 is a hard-coded fill that does not occur among the
# observed peak-rpm values - confirm this is intended rather than NaN or a
# mean/median imputation.
mask5 = {'?':4300}
# Assign back instead of using replace(..., inplace=True) on df['peak-rpm']:
# the chained in-place call modifies a temporary Series and does not update
# df under pandas Copy-on-Write.
df['peak-rpm'] = df['peak-rpm'].replace(mask5)

In [26]:
# Cast the cleaned peak-rpm values to 64-bit floats.
df['peak-rpm'] = df['peak-rpm'].astype(np.float64)

In [27]:
# Check the dtype of peak-rpm after the cast.
df['peak-rpm'].dtype

dtype('float64')

In [28]:
# Re-inspect the frame after cleaning; the converted columns should now
# report numeric dtypes instead of object.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
symboling            205 non-null int64
normalized-losses    205 non-null int64
make                 205 non-null object
fuel-type            205 non-null object
aspiration           205 non-null object
num-of-doors         205 non-null object
body-style           205 non-null object
drive-wheels         205 non-null object
engine-location      205 non-null object
wheel-base           205 non-null float64
length               205 non-null float64
width                205 non-null float64
height               205 non-null float64
curb-weight          205 non-null int64
engine-type          205 non-null object
num-of-cylinders     205 non-null int64
engine-size          205 non-null int64
fuel-system          205 non-null object
bore                 205 non-null float64
stroke               205 non-null float64
compression-ratio    205 non-null float64
horsepower           205 non-nul

In [29]:
# Save the cleaned frame as train.csv (without the index column).
# The export is currently disabled; uncomment the line below to write the file.
#df.to_csv("train.csv",index=False)