1. load all the libraries
2. read the dataset
3. select the feature (independent) and the target (dependent)
4. split the dataset into train and test
5. create the model and fit it with training data
6. predict the target for test data
7. evaluate the model
8. visualize the model
9. save the model
10. load the model

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Location of the raw automobile CSV used throughout this notebook.
automobile_data_url = 'https://raw.githubusercontent.com/digipodium/Datasets/main/regression/automobile.csv'

# index_col=0: use the file's first column as the row index instead of a data column.
df = pd.read_csv(
    filepath_or_buffer=automobile_data_url,
    index_col=0,
)

In [3]:
# Show every column when a frame is displayed (None removes the column limit).
pd.options.display.max_columns = None

In [4]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,width,height,curb-weight,engine-type,num-of-cylinders,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,171.2,65.5,52.4,2823,ohcv,six,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,54.3,2337,ohc,four,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,54.3,2824,ohc,five,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [5]:
# Let's create a simple linear regression model on horsepower vs. price

In [6]:
# Preprocessing: the raw file marks missing values with '?'. This cell turns
# them into NaN, reports what is missing, imputes the gaps and encodes the
# columns listed in `binary_cols` as numbers.
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder

numerical_col = [
    'normalized-losses', 'wheel-base', 'length', 'width', 'height',
    'curb-weight', 'engine-size', 'bore', 'stroke', 'compression-ratio',
    'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price',
]

categorical_col = [
    'symboling', 'make', 'fuel-type', 'aspiration', 'num-of-doors', 'body-style',
    'drive-wheels', 'engine-location', 'engine-type', 'num-of-cylinders', 'fuel-system',
]

# Replace every '?' placeholder with NaN. Rebinding (instead of inplace=True)
# keeps the statement chain-friendly and leaves the loaded object untouched.
df = df.replace('?', np.nan)

# A column that contained '?' was read as strings. Convert the numerical
# columns explicitly so their dtype does not depend on scikit-learn's implicit
# string-to-float coercion; this raises if a value is not a valid number.
df[numerical_col] = df[numerical_col].apply(pd.to_numeric)

# Show the missing-value counts. They are printed because only the last
# expression of a cell is displayed automatically.
print('Missing values per numerical column:')
print(df[numerical_col].isnull().sum())
print('Missing values per categorical column:')
print(df[categorical_col].isnull().sum())

num_col_with_na = ['normalized-losses', 'bore', 'stroke', 'horsepower', 'peak-rpm']
cat_col_with_na = ['num-of-doors']

# The two lists above are maintained by hand, so report any column that has
# missing values but will not be imputed below.
# NOTE(review): 'price' is not in num_col_with_na; if it shows up here, decide
# how to handle those rows (e.g. drop them) before fitting a model - confirm.
not_imputed = [
    col for col in df.columns[df.isnull().any()]
    if col not in num_col_with_na + cat_col_with_na
]
if not_imputed:
    print('Columns with missing values that are NOT imputed:', not_imputed)

# NOTE(review): the imputers and the encoder are fitted on the full frame,
# i.e. before any train/test split; fit them on the training split only if
# leakage matters for the evaluation.
num_imp = SimpleImputer()  # default strategy: fill with the column mean
cat_imp = SimpleImputer(strategy='most_frequent')

df[num_col_with_na] = num_imp.fit_transform(df[num_col_with_na])
df[cat_col_with_na] = cat_imp.fit_transform(df[cat_col_with_na])

# Encode the listed columns as ordinal codes (stored as floats: 0.0, 1.0, ...).
binary_cols = ['fuel-type', 'aspiration', 'num-of-doors', 'engine-location']
ordinal_enc = OrdinalEncoder()

df[binary_cols] = ordinal_enc.fit_transform(df[binary_cols])
df[categorical_col]

Unnamed: 0,symboling,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,engine-type,num-of-cylinders,fuel-system
0,3,alfa-romero,1.0,0.0,1.0,convertible,rwd,0.0,dohc,four,mpfi
1,3,alfa-romero,1.0,0.0,1.0,convertible,rwd,0.0,dohc,four,mpfi
2,1,alfa-romero,1.0,0.0,1.0,hatchback,rwd,0.0,ohcv,six,mpfi
3,2,audi,1.0,0.0,0.0,sedan,fwd,0.0,ohc,four,mpfi
4,2,audi,1.0,0.0,0.0,sedan,4wd,0.0,ohc,five,mpfi
...,...,...,...,...,...,...,...,...,...,...,...
200,-1,volvo,1.0,0.0,0.0,sedan,rwd,0.0,ohc,four,mpfi
201,-1,volvo,1.0,1.0,0.0,sedan,rwd,0.0,ohc,four,mpfi
202,-1,volvo,1.0,0.0,0.0,sedan,rwd,0.0,ohcv,six,mpfi
203,-1,volvo,0.0,1.0,0.0,sedan,rwd,0.0,ohc,six,idi
