The goal of the project is to build a model in order to predict a car's consumption fuel given the features from the [Auto MPG Data Set](https://archive.ics.uci.edu/ml/datasets/auto+mpg).
---



## 0- Environment preparation & Import libraries

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import pickle
import plotly.express as px
import plotly.figure_factory as ff

from sklearn.linear_model import LinearRegression
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
%cd /content/gdrive/My Drive/Data Science/Auto MPG/data

/content/gdrive/My Drive/Data Science/Auto MPG/data


## 1- Data Exploration and Visualization

In [3]:
columns = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model year', 'origin', 'car name']
auto_df = pd.read_csv('auto-mpg.data', delim_whitespace=True, names=columns)

In [4]:
auto_df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino


In [5]:
auto_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object 
 4   weight        398 non-null    float64
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   car name      398 non-null    object 
dtypes: float64(4), int64(3), object(2)
memory usage: 28.1+ KB


In [6]:
auto_df['horsepower'].value_counts()

150.0    22
90.00    20
88.00    19
110.0    18
100.0    17
         ..
158.0     1
148.0     1
103.0     1
230.0     1
91.00     1
Name: horsepower, Length: 94, dtype: int64

In [7]:
auto_df.isin(['?']).any()

mpg             False
cylinders       False
displacement    False
horsepower       True
weight          False
acceleration    False
model year      False
origin          False
car name        False
dtype: bool

In [8]:
# replace '?' with nan
auto_df.loc[auto_df['horsepower'] == '?', 'horsepower'] = np.nan

In [9]:
auto_df.isna().sum()

mpg             0
cylinders       0
displacement    0
horsepower      6
weight          0
acceleration    0
model year      0
origin          0
car name        0
dtype: int64

In [10]:
# do a multivariate imputation for the feature 'horsepower'
imp = IterativeImputer(max_iter=10, random_state=0)

In [11]:
auto_df.iloc[:, :-1] = imp.fit_transform(auto_df.iloc[:, :-1])

In [12]:
auto_df.isna().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model year      0
origin          0
car name        0
dtype: int64

In [13]:
auto_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    float64
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    float64
 4   weight        398 non-null    float64
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    float64
 7   origin        398 non-null    float64
 8   car name      398 non-null    object 
dtypes: float64(8), object(1)
memory usage: 28.1+ KB


In [14]:
# cast the columns 'cylinders', 'model year' and 'origin' as 'int'
auto_df['cylinders'] = auto_df['cylinders'].astype('int')
auto_df['model year'] = auto_df['model year'].astype('int')
auto_df['origin'] = auto_df['origin'].astype('int')

In [15]:
auto_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    float64
 4   weight        398 non-null    float64
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   car name      398 non-null    object 
dtypes: float64(5), int64(3), object(1)
memory usage: 28.1+ KB


In [16]:
auto_df['origin'].value_counts()

1    249
3     79
2     70
Name: origin, dtype: int64

In [17]:
auto_df.describe()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
count,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0
mean,23.514573,5.454774,193.425879,104.049278,2970.424623,15.56809,76.01005,1.572864
std,7.815984,1.701004,104.269838,38.399473,846.841774,2.757689,3.697627,0.802055
min,9.0,3.0,68.0,46.0,1613.0,8.0,70.0,1.0
25%,17.5,4.0,104.25,75.0,2223.75,13.825,73.0,1.0
50%,23.0,4.0,148.5,92.5,2803.5,15.5,76.0,1.0
75%,29.0,8.0,262.0,125.0,3608.0,17.175,79.0,2.0
max,46.6,8.0,455.0,230.0,5140.0,24.8,82.0,3.0


In [18]:
corr_df = auto_df[['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'origin']].corr()

In [19]:
fig1 = ff.create_annotated_heatmap(z = corr_df.values, x=list(corr_df.columns), \
                                   y=list(corr_df.columns), annotation_text=np.round(corr_df.values, 2), \
                                   colorscale='Viridis', showscale=True)
update1 = fig1.update_layout(title = 'Heatmap of the correlation matrix of Auto MPG dataset', \
                            xaxis={'side': 'bottom'})
fig1.show()

In [20]:
fig2 = px.scatter_matrix(auto_df, dimensions=['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'origin'], \
                         title = 'Scatterplot of the features mpg, cylinders, displacement, weight, acceleration, origin', height=1000)
fig2.show()

## 2- Data preprocessing

In [22]:
data = auto_df.copy()

In [23]:
data.columns

Index(['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
       'acceleration', 'model year', 'origin', 'car name'],
      dtype='object')

In [24]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    float64
 4   weight        398 non-null    float64
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   car name      398 non-null    object 
dtypes: float64(5), int64(3), object(1)
memory usage: 28.1+ KB


In [25]:
data.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino


In [26]:
# scaling the columns displacement, horsepower, weight, acceleration
sc = StandardScaler()
data[['displacement', 'horsepower', 'weight', 'acceleration']] = \
sc.fit_transform(data[['displacement', 'horsepower', 'weight', 'acceleration']])

In [27]:
data.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,1.090604,0.67666,0.63087,-1.295498,70,1,chevrolet chevelle malibu
1,15.0,8,1.503514,1.589278,0.854333,-1.477038,70,1,buick skylark 320
2,18.0,8,1.196232,1.198156,0.55047,-1.658577,70,1,plymouth satellite
3,16.0,8,1.061796,1.198156,0.546923,-1.295498,70,1,amc rebel sst
4,17.0,8,1.042591,0.937408,0.565841,-1.840117,70,1,ford torino


In [28]:
data['model year'].value_counts()

73    40
78    36
76    34
82    31
75    30
81    29
80    29
79    29
70    29
77    28
72    28
71    28
74    27
Name: model year, dtype: int64

In [30]:
# get dummies of the column 'model year'
data = pd.get_dummies(data, prefix='model_year', columns=['model year'], drop_first=True)

In [29]:
# get dummies of the column 'origin'
data = pd.get_dummies(data, prefix='origin', columns=['origin'], drop_first=True)

In [31]:
data.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,car name,origin_2,origin_3,model_year_71,model_year_72,model_year_73,model_year_74,model_year_75,model_year_76,model_year_77,model_year_78,model_year_79,model_year_80,model_year_81,model_year_82
0,18.0,8,1.090604,0.67666,0.63087,-1.295498,chevrolet chevelle malibu,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,15.0,8,1.503514,1.589278,0.854333,-1.477038,buick skylark 320,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,18.0,8,1.196232,1.198156,0.55047,-1.658577,plymouth satellite,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,16.0,8,1.061796,1.198156,0.546923,-1.295498,amc rebel sst,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,17.0,8,1.042591,0.937408,0.565841,-1.840117,ford torino,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [32]:
data.shape

(398, 21)

In [33]:
# split the data
X_train, X_test, y_train, y_test = train_test_split(data.iloc[:, 1:], data.iloc[:, 0], \
                                                    random_state=2)

In [34]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((298, 20), (100, 20), (298,), (100,))

In [35]:
# drop the column 'car name'
X_train.drop('car name', axis=1, inplace=True)
X_test.drop('car name', axis=1, inplace=True)

## 3- Prediction model with Linear Regression

In [36]:
# build a prediction model
lr = LinearRegression()
lr.fit(X_train, y_train)
y_train_pred_lr = lr.predict(X_train)
y_test_pred_lr = lr.predict(X_test)

In [37]:
lr.score(X_train, y_train), lr.score(X_test, y_test)

(0.8368904886707419, 0.8930851345849047)

In [38]:
!pwd

/content/gdrive/My Drive/Data Science/Auto MPG/data


In [39]:
# save model
pickle.dump(lr, open('../auto_mpg_model.pkl', 'wb'))

In [40]:
loaded_model = pickle.load(open('../auto_mpg_model.pkl', 'rb'))

In [113]:
# test loaded model
loaded_model.score(X_train, y_train), loaded_model.score(X_test, y_test)

(0.8368904886707419, 0.8930851345849047)

In [74]:
# save scaler
pickle.dump(sc, open('../auto_mpg_scaler.pkl', 'wb'))

In [75]:
loaded_scaler = pickle.load(open('../auto_mpg_scaler.pkl', 'rb'))

In [None]:
#### test loaded scaler

In [95]:
data2 = auto_df.copy()

In [96]:
data2.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino


In [97]:
data2.shape

(398, 9)

In [98]:
# get dummies of the column 'model year'
data2 = pd.get_dummies(data2, prefix='model_year', columns=['model year'], drop_first=True)

In [99]:
# get dummies of the column 'origin'
data2 = pd.get_dummies(data2, prefix='origin', columns=['origin'], drop_first=True)

In [100]:
data2.shape

(398, 21)

In [101]:
data3 = data2.copy()

In [102]:
data2[['displacement', 'horsepower', 'weight', 'acceleration']] = \
sc.transform(data2[['displacement', 'horsepower', 'weight', 'acceleration']])

In [103]:
data2.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,car name,model_year_71,model_year_72,model_year_73,model_year_74,model_year_75,model_year_76,model_year_77,model_year_78,model_year_79,model_year_80,model_year_81,model_year_82,origin_2,origin_3
0,18.0,8,1.090604,0.67666,0.63087,-1.295498,chevrolet chevelle malibu,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,15.0,8,1.503514,1.589278,0.854333,-1.477038,buick skylark 320,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,18.0,8,1.196232,1.198156,0.55047,-1.658577,plymouth satellite,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,16.0,8,1.061796,1.198156,0.546923,-1.295498,amc rebel sst,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,17.0,8,1.042591,0.937408,0.565841,-1.840117,ford torino,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [104]:
data3.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,car name,model_year_71,model_year_72,model_year_73,model_year_74,model_year_75,model_year_76,model_year_77,model_year_78,model_year_79,model_year_80,model_year_81,model_year_82,origin_2,origin_3
0,18.0,8,307.0,130.0,3504.0,12.0,chevrolet chevelle malibu,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,15.0,8,350.0,165.0,3693.0,11.5,buick skylark 320,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,18.0,8,318.0,150.0,3436.0,11.0,plymouth satellite,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,16.0,8,304.0,150.0,3433.0,12.0,amc rebel sst,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,17.0,8,302.0,140.0,3449.0,10.5,ford torino,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [105]:
data3[['displacement', 'horsepower', 'weight', 'acceleration']] = \
loaded_scaler.transform(data3[['displacement', 'horsepower', 'weight', 'acceleration']])

In [106]:
data3.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,car name,model_year_71,model_year_72,model_year_73,model_year_74,model_year_75,model_year_76,model_year_77,model_year_78,model_year_79,model_year_80,model_year_81,model_year_82,origin_2,origin_3
0,18.0,8,1.090604,0.67666,0.63087,-1.295498,chevrolet chevelle malibu,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,15.0,8,1.503514,1.589278,0.854333,-1.477038,buick skylark 320,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,18.0,8,1.196232,1.198156,0.55047,-1.658577,plymouth satellite,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,16.0,8,1.061796,1.198156,0.546923,-1.295498,amc rebel sst,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,17.0,8,1.042591,0.937408,0.565841,-1.840117,ford torino,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [107]:
data2.drop('car name', axis=1, inplace=True)
data3.drop('car name', axis=1, inplace=True)

In [108]:
data2.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year_71,model_year_72,model_year_73,model_year_74,model_year_75,model_year_76,model_year_77,model_year_78,model_year_79,model_year_80,model_year_81,model_year_82,origin_2,origin_3
0,18.0,8,1.090604,0.67666,0.63087,-1.295498,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,15.0,8,1.503514,1.589278,0.854333,-1.477038,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,18.0,8,1.196232,1.198156,0.55047,-1.658577,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,16.0,8,1.061796,1.198156,0.546923,-1.295498,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,17.0,8,1.042591,0.937408,0.565841,-1.840117,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [109]:
data3.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year_71,model_year_72,model_year_73,model_year_74,model_year_75,model_year_76,model_year_77,model_year_78,model_year_79,model_year_80,model_year_81,model_year_82,origin_2,origin_3
0,18.0,8,1.090604,0.67666,0.63087,-1.295498,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,15.0,8,1.503514,1.589278,0.854333,-1.477038,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,18.0,8,1.196232,1.198156,0.55047,-1.658577,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,16.0,8,1.061796,1.198156,0.546923,-1.295498,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,17.0,8,1.042591,0.937408,0.565841,-1.840117,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [110]:
data2.shape, data3.shape

((398, 20), (398, 20))

In [111]:
X2 = data2.iloc[:, 1:]
y2 = data2.iloc[:, 0]

X3 = data3.iloc[:, 1:]
y3 = data3.iloc[:, 0]

In [116]:
loaded_model.score(X2, y2), loaded_model.score(X3, y3)

(0.6948936404923207, 0.6948936404923207)