### We will construct a linear model that explains the relationship between a car's mpg (miles per gallon, i.e. mileage) and its other attributes

### step1 : Import Libraries

In [30]:
import pandas as pd
import numpy as np

In [3]:
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
import seaborn as sns

In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split  #sklearn package's randomized data splitting function

### Step2 : Load and review data

In [53]:
# Read the auto-mpg dataset from 'auto-mpg.csv' (path is relative to the working directory).
car_df = pd.read_csv('auto-mpg.csv')

In [7]:
# (rows, columns) of the freshly loaded frame.
car_df.shape

(398, 9)

In [8]:
# Peek at 5 random rows. Of the 9 columns, mpg is the dependent variable (the one we want
# to explain) and the other 8 are independent variables.
car_df.sample(5)

Unnamed: 0,mpg,cylinders,displacement,hp,weight,acceleration,year,origin,car name
333,32.7,6,168.0,132,2910,11.4,80,3,datsun 280-zx
114,26.0,4,98.0,90,2265,15.5,73,2,fiat 124 sport coupe
68,13.0,8,350.0,155,4502,13.5,72,1,buick lesabre custom
135,18.0,6,225.0,105,3613,16.5,74,1,plymouth satellite sebring
358,31.6,4,120.0,74,2635,18.3,81,3,mazda 626


In [9]:
# The car name is not needed for explaining a car's mileage, so remove that column.
car_df = car_df.drop(columns=['car name'])
car_df.head()

Unnamed: 0,mpg,cylinders,displacement,hp,weight,acceleration,year,origin
0,18.0,8,307.0,130,3504,12.0,70,1
1,15.0,8,350.0,165,3693,11.5,70,1
2,18.0,8,318.0,150,3436,11.0,70,1
3,16.0,8,304.0,150,3433,12.0,70,1
4,17.0,8,302.0,140,3449,10.5,70,1


In [10]:
# 'origin' denotes a place but is stored as a numeric code. Swap the codes for
# region names (whether to do so is to be agreed with the team).
car_df = car_df.assign(
    origin=car_df['origin'].replace({1: 'america', 2: 'europe', 3: 'asia'})
)
car_df.head()

Unnamed: 0,mpg,cylinders,displacement,hp,weight,acceleration,year,origin
0,18.0,8,307.0,130,3504,12.0,70,america
1,15.0,8,350.0,165,3693,11.5,70,america
2,18.0,8,318.0,150,3436,11.0,70,america
3,16.0,8,304.0,150,3433,12.0,70,america
4,17.0,8,302.0,140,3449,10.5,70,america


In [11]:
# 'origin' is now text, so turn it back into numbers with one-hot encoding:
# each category gets its own binary indicator column holding 1 or 0
# (origin_america, origin_asia, origin_europe).
# NOTE(review): all three indicator columns are kept, so together they always sum to 1;
# consider dropping one of them if that redundancy matters for the model - TODO confirm.
car_df = pd.get_dummies(data=car_df, columns=['origin'])
car_df.sample(5)

Unnamed: 0,mpg,cylinders,displacement,hp,weight,acceleration,year,origin_america,origin_asia,origin_europe
127,19.0,6,232.0,100,2901,16.0,74,1,0,0
345,35.1,4,81.0,60,1760,16.1,81,0,1,0
283,20.2,6,232.0,90,3265,18.2,79,1,0,0
51,30.0,4,79.0,70,2074,19.5,71,0,0,1
389,22.0,6,232.0,112,2835,14.7,82,1,0,0


### Dealing with missing values

In [12]:
# Summary statistics. 'hp' is absent from the result because it is not being recognised as a number.
car_df.describe()

Unnamed: 0,mpg,cylinders,displacement,weight,acceleration,year,origin_america,origin_asia,origin_europe
count,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0
mean,23.514573,5.454774,193.425879,2970.424623,15.56809,76.01005,0.625628,0.198492,0.175879
std,7.815984,1.701004,104.269838,846.841774,2.757689,3.697627,0.484569,0.399367,0.381197
min,9.0,3.0,68.0,1613.0,8.0,70.0,0.0,0.0,0.0
25%,17.5,4.0,104.25,2223.75,13.825,73.0,0.0,0.0,0.0
50%,23.0,4.0,148.5,2803.5,15.5,76.0,1.0,0.0,0.0
75%,29.0,8.0,262.0,3608.0,17.175,79.0,1.0,0.0,0.0
max,46.6,8.0,455.0,5140.0,24.8,82.0,1.0,1.0,1.0


In [13]:
# include = 'all' also summarises non-numeric columns, so 'hp' now shows up
# (with unique / top / freq instead of mean / std).
car_df.describe(include = 'all')

Unnamed: 0,mpg,cylinders,displacement,hp,weight,acceleration,year,origin_america,origin_asia,origin_europe
count,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0
unique,,,,94.0,,,,,,
top,,,,150.0,,,,,,
freq,,,,22.0,,,,,,
mean,23.514573,5.454774,193.425879,,2970.424623,15.56809,76.01005,0.625628,0.198492,0.175879
std,7.815984,1.701004,104.269838,,846.841774,2.757689,3.697627,0.484569,0.399367,0.381197
min,9.0,3.0,68.0,,1613.0,8.0,70.0,0.0,0.0,0.0
25%,17.5,4.0,104.25,,2223.75,13.825,73.0,0.0,0.0,0.0
50%,23.0,4.0,148.5,,2803.5,15.5,76.0,1.0,0.0,0.0
75%,29.0,8.0,262.0,,3608.0,17.175,79.0,1.0,0.0,0.0


In [14]:
# 'hp' has dtype object: its missing entries are recorded as the text '?', so the
# column holds strings rather than numbers.
car_df.dtypes

mpg               float64
cylinders           int64
displacement      float64
hp                 object
weight              int64
acceleration      float64
year                int64
origin_america      uint8
origin_asia         uint8
origin_europe       uint8
dtype: object

In [15]:
# 'hp' reports 0 missing values even though some are missing, because a '?' placeholder
# sits in place of each missing value and '?' is not counted as null.
car_df.isnull().sum()


mpg               0
cylinders         0
displacement      0
hp                0
weight            0
acceleration      0
year              0
origin_america    0
origin_asia       0
origin_europe     0
dtype: int64

In [16]:
# Per row: True when the 'hp' string is made up of digits only, False otherwise.
hpIsDigit = car_df['hp'].str.isdigit().to_frame()

# Show the observations flagged False, i.e. those whose 'hp' is the non-digit '?'.
car_df[hpIsDigit['hp'].eq(False)]

Unnamed: 0,mpg,cylinders,displacement,hp,weight,acceleration,year,origin_america,origin_asia,origin_europe
32,25.0,4,98.0,?,2046,19.0,71,1,0,0
126,21.0,6,200.0,?,2875,17.0,74,1,0,0
330,40.9,4,85.0,?,1835,17.3,80,0,0,1
336,23.6,4,140.0,?,2905,14.3,80,1,0,0
354,34.5,4,100.0,?,2320,15.8,81,0,0,1
374,23.0,4,151.0,?,3035,20.5,82,1,0,0


In [17]:
# Convert 'hp' to a real numeric column. errors='coerce' turns every entry that cannot
# be parsed as a number (here the '?' placeholder) into NaN (Not a Number).
# Why not just replace('?', np.nan)? That would leave 'hp' as an object column of digit
# strings, and the median-based imputation further down would then depend on pandas
# implicitly casting strings to floats, which newer pandas versions refuse to do.
# With a float column the median is computed on actual numbers, and the later
# astype('float64') step simply becomes a harmless no-op.
car_df['hp'] = pd.to_numeric(car_df['hp'], errors='coerce')

# Re-display the rows that held '?': their 'hp' is now NaN.
car_df[hpIsDigit['hp']==False]

Unnamed: 0,mpg,cylinders,displacement,hp,weight,acceleration,year,origin_america,origin_asia,origin_europe
32,25.0,4,98.0,,2046,19.0,71,1,0,0
126,21.0,6,200.0,,2875,17.0,74,1,0,0
330,40.9,4,85.0,,1835,17.3,80,0,0,1
336,23.6,4,140.0,,2905,14.3,80,1,0,0
354,34.5,4,100.0,,2320,15.8,81,0,0,1
374,23.0,4,151.0,,3035,20.5,82,1,0,0


##### There are various ways to handle missing data: drop the rows, replace the missing values with the median, etc. Out of 398 rows, 6 rows have NaN in 'hp'.

In [18]:
# Median of every column; the 'hp' median is the value used to fill its missing entries below.
car_df.median()

mpg                 23.0
cylinders            4.0
displacement       148.5
hp                  93.5
weight            2803.5
acceleration        15.5
year                76.0
origin_america       1.0
origin_asia          0.0
origin_europe        0.0
dtype: float64

In [19]:
# Missing values are imputed with the median of the column they belong to.

def MedianFiller(x):
    """Return x with its missing entries replaced by the median of x."""
    return x.fillna(x.median())

# axis = 0 hands each column to MedianFiller in turn.
car_df = car_df.apply(MedianFiller, axis = 0)

# The rows that used to hold '?' now show the median in 'hp' instead of NaN.
car_df[hpIsDigit['hp']==False]

Unnamed: 0,mpg,cylinders,displacement,hp,weight,acceleration,year,origin_america,origin_asia,origin_europe
32,25.0,4,98.0,93.5,2046,19.0,71,1,0,0
126,21.0,6,200.0,93.5,2875,17.0,74,1,0,0
330,40.9,4,85.0,93.5,1835,17.3,80,0,0,1
336,23.6,4,140.0,93.5,2905,14.3,80,1,0,0
354,34.5,4,100.0,93.5,2320,15.8,81,0,0,1
374,23.0,4,151.0,93.5,3035,20.5,82,1,0,0


In [20]:
# Store 'hp' as float64, then list every column's dtype and non-null count.
car_df = car_df.astype({'hp': 'float64'})
car_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   mpg             398 non-null    float64
 1   cylinders       398 non-null    int64  
 2   displacement    398 non-null    float64
 3   hp              398 non-null    float64
 4   weight          398 non-null    int64  
 5   acceleration    398 non-null    float64
 6   year            398 non-null    int64  
 7   origin_america  398 non-null    uint8  
 8   origin_asia     398 non-null    uint8  
 9   origin_europe   398 non-null    uint8  
dtypes: float64(4), int64(3), uint8(3)
memory usage: 23.1 KB


### Bivariate Plot

In [21]:
# sns.pairplot(car_df, diag_kind = 'kde')     #sometimes its difficult to infer from plotting all the plot at once.

In [22]:
# car_df_attr = car_df.iloc[:, 0:7]         #we have only considered first 7 columns.
# sns.pairplot(car_df_attr, diag_kind = 'kde')

In [23]:
# Observation: the relationship between mpg and the other attributes is not really linear.
# However, the plots indicate that a linear model would still capture quite a bit of useful information / pattern.

### Split Data

In [24]:
# Declare the inputs for the linear model: x (independent variables) and y (dependent variable).

# Independent variables: every attribute except mpg.
x = car_df.drop(columns=['mpg'])

# Dependent variable: mpg, kept as a one-column DataFrame.
y = car_df.loc[:, ['mpg']]

In [25]:
# Split x and y into training and test sets in a 70:30 ratio.

x_train, x_test , y_train, y_test = train_test_split(x, y, test_size = 0.30, random_state = 1)

# random_state = 1 fixes the seed of the random shuffle, so every run yields the same
# training and test rows and therefore the same scores. Keep it for reproducible results.

### Fit Linear Model

In [26]:
# Instantiate the linear model and fit it on the training data only;
# the test data is held back for scoring.
reg_model = LinearRegression().fit(x_train, y_train)
reg_model

LinearRegression()

In [27]:
# R^2 score on the training data (81.41%).
reg_model.score(x_train, y_train)

0.8141025501610559

In [29]:
# R^2 score on the held-out test data (84.33%). It is close to the training score,
# which suggests the model is not overfitting on this particular split.
reg_model.score(x_test, y_test)

0.8433135132808826

In [55]:
from sklearn import linear_model
from sklearn.preprocessing import PolynomialFeatures


# degree = 2 with interaction_only = True adds a bias column plus the pairwise products
# of distinct features (x_i * x_j); pure powers such as x_i**2 are not generated.
poly = PolynomialFeatures(degree = 2, interaction_only = True)

# Fit the transformer on the training data only and reuse the fitted transformer for
# the test data. The test set must never be used for fitting: calling transform()
# guarantees the test features come from exactly the mapping learnt on the training set.
x_train2 = poly.fit_transform(x_train)
x_test2 = poly.transform(x_test)

# Linear regression on the expanded (interaction-term) feature set.
poly_clf = LinearRegression()
poly_clf.fit(x_train2, y_train)

# R^2 score on the training data.
poly_clf.score(x_train2, y_train)

0.9015333049476941

In [56]:
# R^2 score of the interaction-term model on the held-out test data.
poly_clf.score(x_test2, y_test)

0.8647737718423605

In [47]:
car_df.corr()

# Correlation between the dependent variable (mpg) and each independent variable should be high,
# while correlation among the independent variables themselves should be low.
# Here a few independent variables are highly correlated with one another; this should be reduced as much as possible.
# One way is to drop one of those variables, but that may not be ideal in all cases - it depends on the problem.


Unnamed: 0,mpg,cylinders,displacement,hp,weight,acceleration,year,origin_america,origin_asia,origin_europe
mpg,1.0,-0.775396,-0.804203,-0.773453,-0.831741,0.420289,0.579267,-0.568192,0.442174,0.259022
cylinders,-0.775396,1.0,0.950721,0.841284,0.896017,-0.505419,-0.348746,0.604351,-0.396479,-0.352861
displacement,-0.804203,0.950721,1.0,0.895778,0.932824,-0.543684,-0.370164,0.651407,-0.433505,-0.373886
hp,-0.773453,0.841284,0.895778,1.0,0.862442,-0.68659,-0.413733,0.485418,-0.318972,-0.282877
weight,-0.831741,0.896017,0.932824,0.862442,1.0,-0.417457,-0.306564,0.598398,-0.440817,-0.298843
acceleration,0.420289,-0.505419,-0.543684,-0.68659,-0.417457,1.0,0.288137,-0.250806,0.109144,0.204473
year,0.579267,-0.348746,-0.370164,-0.413733,-0.306564,0.288137,1.0,-0.139883,0.193101,-0.024489
origin_america,-0.568192,0.604351,0.651407,0.485418,0.598398,-0.250806,-0.139883,1.0,-0.643317,-0.597198
origin_asia,0.442174,-0.396479,-0.433505,-0.318972,-0.440817,0.109144,0.193101,-0.643317,1.0,-0.229895
origin_europe,0.259022,-0.352861,-0.373886,-0.282877,-0.298843,0.204473,-0.024489,-0.597198,-0.229895,1.0
