In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


from sklearn import preprocessing
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score

In [2]:
# Read the raw car mileage table; the path is resolved relative to the working directory.
csv_path = "car-mpg.csv"
data = pd.read_csv(csv_path)

In [3]:
# Preview the first rows of the freshly loaded frame to check the columns and values.
data.head()

Unnamed: 0,mpg,cyl,disp,hp,wt,acc,yr,origin,car_type,car_name
0,18.0,8,307.0,130,3504,12.0,70,1,0,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,0,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,0,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,0,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,0,ford torino


In [4]:
# Prepare the raw frame for modelling:
#   * drop car_name, which is not used as a predictor of mileage per gallon (mpg)
#   * turn the numeric origin codes (1, 2, 3) into region labels, then one-hot encode them
#   * mark the "?" placeholders as NaN
# Missing values are filled in a later cell, once hp has been converted to numeric.
origin_labels = {1: "america", 2: "europe", 3: "asia"}

data = (
    data
    .drop(columns=["car_name"])
    .assign(origin=lambda frame: frame["origin"].replace(origin_labels))
    .pipe(pd.get_dummies, columns=["origin"])
    .replace("?", np.nan)
)

In [5]:
# Check dtypes and non-null counts after cleaning; hp has not been converted to numeric yet,
# so it is expected to still show as object dtype with some missing entries.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   mpg             398 non-null    float64
 1   cyl             398 non-null    int64  
 2   disp            398 non-null    float64
 3   hp              392 non-null    object 
 4   wt              398 non-null    int64  
 5   acc             398 non-null    float64
 6   yr              398 non-null    int64  
 7   car_type        398 non-null    int64  
 8   origin_america  398 non-null    bool   
 9   origin_asia     398 non-null    bool   
 10  origin_europe   398 non-null    bool   
dtypes: bool(3), float64(3), int64(4), object(1)
memory usage: 26.2+ KB


In [6]:
# Count missing values per column to see which columns need imputation.
data.isna().sum()


mpg               0
cyl               0
disp              0
hp                6
wt                0
acc               0
yr                0
car_type          0
origin_america    0
origin_asia       0
origin_europe     0
dtype: int64

In [7]:
# Convert hp from strings to numbers, then fill its gaps with the column median.
hp_numeric = pd.to_numeric(data["hp"])
data["hp"] = hp_numeric.fillna(hp_numeric.median())

In [8]:
# Re-check the frame after the hp conversion: hp should now be numeric with no missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   mpg             398 non-null    float64
 1   cyl             398 non-null    int64  
 2   disp            398 non-null    float64
 3   hp              398 non-null    float64
 4   wt              398 non-null    int64  
 5   acc             398 non-null    float64
 6   yr              398 non-null    int64  
 7   car_type        398 non-null    int64  
 8   origin_america  398 non-null    bool   
 9   origin_asia     398 non-null    bool   
 10  origin_europe   398 non-null    bool   
dtypes: bool(3), float64(4), int64(4)
memory usage: 26.2 KB


In [9]:
# Separate the predictors from the target column.
target_col = "mpg"
X = data.drop(columns=[target_col])  # independent variables
Y = data[[target_col]]  # dependent variable, kept as a one-column DataFrame

In [14]:
# Standardise the predictors and the target.
# Note: the scaling statistics are computed over every row of X and Y.
def _scale_frame(frame):
    """Return a standardised version of ``frame`` as a DataFrame with the same column names."""
    scaled_values = preprocessing.scale(frame)
    return pd.DataFrame(scaled_values, columns=frame.columns)


X_s = _scale_frame(X)
Y_s = _scale_frame(Y)

    

In [13]:
# Preview the standardised target values.
# NOTE(review): this cell's execution count (13) is lower than the scaling cell's (14) above it,
# so the shown output may predate the latest scaling run — re-run top to bottom to confirm.
Y_s.head()

Unnamed: 0,mpg
0,-0.706439
1,-1.090751
2,-0.706439
3,-0.962647
4,-0.834543


In [16]:
# Preview the standardised predictors, including the one-hot encoded origin columns.
X_s.head()

Unnamed: 0,cyl,disp,hp,wt,acc,yr,car_type,origin_america,origin_asia,origin_europe
0,1.498191,1.090604,0.673118,0.63087,-1.295498,-1.627426,-1.062235,0.773559,-0.497643,-0.461968
1,1.498191,1.503514,1.589958,0.854333,-1.477038,-1.627426,-1.062235,0.773559,-0.497643,-0.461968
2,1.498191,1.196232,1.197027,0.55047,-1.658577,-1.627426,-1.062235,0.773559,-0.497643,-0.461968
3,1.498191,1.061796,1.197027,0.546923,-1.295498,-1.627426,-1.062235,0.773559,-0.497643,-0.461968
4,1.498191,1.042591,0.935072,0.565841,-1.840117,-1.627426,-1.062235,0.773559,-0.497643,-0.461968


In [19]:
# Split the *unscaled* data first and fit the scalers on the training rows only, so that
# no mean/variance information from the held-out rows leaks into the training features.
# The same random_state and train_size select the same rows as splitting a pre-scaled frame.
x_train_raw, x_test_raw, y_train_raw, y_test_raw = train_test_split(
    X, Y, train_size=0.7, random_state=1
)

x_scaler = preprocessing.StandardScaler().fit(x_train_raw)
y_scaler = preprocessing.StandardScaler().fit(y_train_raw)


def _scaled_like(scaler, frame):
    """Apply a fitted scaler to ``frame`` and keep its column names and row index."""
    return pd.DataFrame(scaler.transform(frame), columns=frame.columns, index=frame.index)


# Both partitions are transformed with the statistics learned from the training partition.
x_train = _scaled_like(x_scaler, x_train_raw)
x_test = _scaled_like(x_scaler, x_test_raw)
y_train = _scaled_like(y_scaler, y_train_raw)
y_test = _scaled_like(y_scaler, y_test_raw)
x_train.shape

(278, 10)