In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
%matplotlib inline

In [2]:
mpg_df = pd.read_csv('car-mpg.csv')

In [3]:
mpg_df.head(15)

Unnamed: 0,mpg,cyl,disp,hp,wt,acc,yr,origin,car_type,car_name
0,18.0,8,307.0,130,3504,12.0,70,1,0,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,0,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,0,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,0,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,0,ford torino
5,15.0,8,429.0,198,4341,10.0,70,1,0,ford galaxie 500
6,14.0,8,454.0,220,4354,9.0,70,1,0,chevrolet impala
7,14.0,8,440.0,215,4312,8.5,70,1,0,plymouth fury iii
8,14.0,8,455.0,225,4425,10.0,70,1,0,pontiac catalina
9,15.0,8,390.0,190,3850,8.5,70,1,0,amc ambassador dpl


In [4]:
mpg_df.drop('car_name', axis=1, inplace=True)

In [5]:
mpg_df

Unnamed: 0,mpg,cyl,disp,hp,wt,acc,yr,origin,car_type
0,18.0,8,307.0,130,3504,12.0,70,1,0
1,15.0,8,350.0,165,3693,11.5,70,1,0
2,18.0,8,318.0,150,3436,11.0,70,1,0
3,16.0,8,304.0,150,3433,12.0,70,1,0
4,17.0,8,302.0,140,3449,10.5,70,1,0
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86,2790,15.6,82,1,1
394,44.0,4,97.0,52,2130,24.6,82,2,1
395,32.0,4,135.0,84,2295,11.6,82,1,1
396,28.0,4,120.0,79,2625,18.6,82,1,1


In [6]:
mpg_df["origin"] = mpg_df['origin'].replace({1:'america', 2:'europe', 3:'asia'})

In [7]:
mpg_df

Unnamed: 0,mpg,cyl,disp,hp,wt,acc,yr,origin,car_type
0,18.0,8,307.0,130,3504,12.0,70,america,0
1,15.0,8,350.0,165,3693,11.5,70,america,0
2,18.0,8,318.0,150,3436,11.0,70,america,0
3,16.0,8,304.0,150,3433,12.0,70,america,0
4,17.0,8,302.0,140,3449,10.5,70,america,0
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86,2790,15.6,82,america,1
394,44.0,4,97.0,52,2130,24.6,82,europe,1
395,32.0,4,135.0,84,2295,11.6,82,america,1
396,28.0,4,120.0,79,2625,18.6,82,america,1


In [8]:
mpg_df = pd.get_dummies(mpg_df, columns=['origin'])

In [9]:
mpg_df

Unnamed: 0,mpg,cyl,disp,hp,wt,acc,yr,car_type,origin_america,origin_asia,origin_europe
0,18.0,8,307.0,130,3504,12.0,70,0,1,0,0
1,15.0,8,350.0,165,3693,11.5,70,0,1,0,0
2,18.0,8,318.0,150,3436,11.0,70,0,1,0,0
3,16.0,8,304.0,150,3433,12.0,70,0,1,0,0
4,17.0,8,302.0,140,3449,10.5,70,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86,2790,15.6,82,1,1,0,0
394,44.0,4,97.0,52,2130,24.6,82,1,0,0,1
395,32.0,4,135.0,84,2295,11.6,82,1,1,0,0
396,28.0,4,120.0,79,2625,18.6,82,1,1,0,0


In [10]:
mpg_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
mpg,398.0,23.514573,7.815984,9.0,17.5,23.0,29.0,46.6
cyl,398.0,5.454774,1.701004,3.0,4.0,4.0,8.0,8.0
disp,398.0,193.425879,104.269838,68.0,104.25,148.5,262.0,455.0
wt,398.0,2970.424623,846.841774,1613.0,2223.75,2803.5,3608.0,5140.0
acc,398.0,15.56809,2.757689,8.0,13.825,15.5,17.175,24.8
yr,398.0,76.01005,3.697627,70.0,73.0,76.0,79.0,82.0
car_type,398.0,0.530151,0.499718,0.0,0.0,1.0,1.0,1.0
origin_america,398.0,0.625628,0.484569,0.0,0.0,1.0,1.0,1.0
origin_asia,398.0,0.198492,0.399367,0.0,0.0,0.0,0.0,1.0
origin_europe,398.0,0.175879,0.381197,0.0,0.0,0.0,0.0,1.0


In [11]:
temp = pd.DataFrame(mpg_df.hp.str.isdigit())

In [12]:
temp['hp'].value_counts()

True     392
False      6
Name: hp, dtype: int64

In [13]:
temp[temp['hp']==False]

Unnamed: 0,hp
32,False
126,False
330,False
336,False
354,False
374,False


In [14]:
mpg_df['hp'].iloc[[1, 32, 126, 330, 336, 354, 374]]

1      165
32       ?
126      ?
330      ?
336      ?
354      ?
374      ?
Name: hp, dtype: object

In [15]:
mpg_df = mpg_df.replace('?', np.nan)

In [16]:
mpg_df[mpg_df.isnull().any(axis=1)]

Unnamed: 0,mpg,cyl,disp,hp,wt,acc,yr,car_type,origin_america,origin_asia,origin_europe
32,25.0,4,98.0,,2046,19.0,71,1,1,0,0
126,21.0,6,200.0,,2875,17.0,74,0,1,0,0
330,40.9,4,85.0,,1835,17.3,80,1,0,0,1
336,23.6,4,140.0,,2905,14.3,80,1,1,0,0
354,34.5,4,100.0,,2320,15.8,81,1,0,0,1
374,23.0,4,151.0,,3035,20.5,82,1,1,0,0


In [17]:
mpg_df.median()

mpg                 23.0
cyl                  4.0
disp               148.5
hp                  93.5
wt                2803.5
acc                 15.5
yr                  76.0
car_type             1.0
origin_america       1.0
origin_asia          0.0
origin_europe        0.0
dtype: float64

In [18]:
# mpg_df.fillna(mpg_df.median(), inplace=True)

mpg_df = mpg_df.apply(lambda x:x.fillna(x.median(), axis=0))

In [19]:
mpg_df.dtypes

mpg               float64
cyl                 int64
disp              float64
hp                 object
wt                  int64
acc               float64
yr                  int64
car_type            int64
origin_america      uint8
origin_asia         uint8
origin_europe       uint8
dtype: object

In [20]:
mpg_df['hp'] = mpg_df['hp'].astype('float64')

In [None]:
mpg_df_attr