In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides any warnings raised by pandas or
# scikit-learn in later cells — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

%matplotlib inline



In [2]:
# Read the CO2-emissions CSV (path is relative to the notebook's working directory).
data = pd.read_csv(filepath_or_buffer='CO2_Emissions_Canada.csv')

# Preview the first ten rows to check the columns loaded as expected.
data.head(n=10)

Unnamed: 0,Make,Model,Vehicle Class,Engine Size(L),Cylinders,Transmission,Fuel Type,Fuel Consumption City (L/100 km),Fuel Consumption Hwy (L/100 km),Fuel Consumption Comb (L/100 km),Fuel Consumption Comb (mpg),CO2 Emissions(g/km)
0,ACURA,ILX,COMPACT,2.0,4,AS5,Z,9.9,6.7,8.5,33,196
1,ACURA,ILX,COMPACT,2.4,4,M6,Z,11.2,7.7,9.6,29,221
2,ACURA,ILX HYBRID,COMPACT,1.5,4,AV7,Z,6.0,5.8,5.9,48,136
3,ACURA,MDX 4WD,SUV - SMALL,3.5,6,AS6,Z,12.7,9.1,11.1,25,255
4,ACURA,RDX AWD,SUV - SMALL,3.5,6,AS6,Z,12.1,8.7,10.6,27,244
5,ACURA,RLX,MID-SIZE,3.5,6,AS6,Z,11.9,7.7,10.0,28,230
6,ACURA,TL,MID-SIZE,3.5,6,AS6,Z,11.8,8.1,10.1,28,232
7,ACURA,TL AWD,MID-SIZE,3.7,6,AS6,Z,12.8,9.0,11.1,25,255
8,ACURA,TL AWD,MID-SIZE,3.7,6,M6,Z,13.4,9.5,11.6,24,267
9,ACURA,TSX,COMPACT,2.4,4,AS5,Z,10.6,7.5,9.2,31,212


In [3]:
# Drop exact duplicate rows. The result is rebound to `data` instead of mutating
# the frame with inplace=True, so re-running this cell is harmless and the call
# can be chained with further transformations.
data = data.drop_duplicates()

# Sanity check: count of rows still flagged as duplicates (expected to be 0).
data.duplicated().sum()

0

In [4]:
# Predictor columns: three categorical descriptors followed by three numeric ones.
cols = ['Vehicle Class', 'Transmission','Fuel Type', 'Cylinders', 'Engine Size(L)', 'Fuel Consumption Comb (L/100 km)']

# Take an explicit copy of the selection. X has columns overwritten later in the
# notebook; assigning into a plain slice of `data` raises SettingWithCopyWarning
# and leaves it ambiguous whether `data` itself is modified.
X = data[cols].copy()

# Target is kept as a single-column DataFrame (note the double brackets).
Y = data[['CO2 Emissions(g/km)']]

X.head()

Unnamed: 0,Vehicle Class,Transmission,Fuel Type,Cylinders,Engine Size(L),Fuel Consumption Comb (L/100 km)
0,COMPACT,AS5,Z,4,2.0,8.5
1,COMPACT,M6,Z,4,2.4,9.6
2,COMPACT,AV7,Z,4,1.5,5.9
3,SUV - SMALL,AS6,Z,6,3.5,11.1
4,SUV - SMALL,AS6,Z,6,3.5,10.6


In [5]:
from sklearn.preprocessing import OrdinalEncoder

# Categorical predictors whose string labels are replaced by ordinal codes.
cols = ['Vehicle Class', 'Transmission', 'Fuel Type']

# A single encoder is fitted on all three columns; its output overwrites them in X.
# NOTE(review): the encoder is fitted on every row, i.e. before the train/test split.
oc = OrdinalEncoder()
encoded_categories = oc.fit_transform(X[cols])
X[cols] = encoded_categories

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed random_state keeps the split
# identical from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=2,
)

In [7]:
from sklearn.linear_model import LinearRegression

# Baseline model: plain (unregularized) linear regression on the training split.
reg = LinearRegression()
reg.fit(X=x_train, y=y_train)

In [8]:
# Predictions of the baseline linear model on the held-out test features.
y_pred = reg.predict(X=x_test)

# Regularized Regression Models
- Used to control overfitting
- Works by applying a penalty on the cost function
<br>

- 1. Lasso (L1 Regularization)
- 2. Ridge (L2 Regularization)
- 3. ElasticNet (L1 + L2 Regularization)

<img align="left" src="regularization.png" style="width:400px; height:150px" />

In [17]:
from sklearn.linear_model import Lasso

# Linear regression with an L1 penalty; alpha is the regularization parameter.
regressor = Lasso(alpha=1)
regressor.fit(x_train, y_train)

# Score the fitted model on both splits so the two values can be compared.
train_score = regressor.score(x_train, y_train)
test_score = regressor.score(x_test, y_test)
print('Training set score : ', train_score)
print('Testing set score  : ', test_score)

Training set score :  0.902672738114896
Testing set score  :  0.9027236505162947


In [18]:
from sklearn.linear_model import Ridge

# Linear regression with an L2 penalty; alpha is the regularization parameter.
# Note: this rebinds `regressor`, replacing whichever model it held before.
regressor = Ridge(alpha=1)
regressor.fit(x_train, y_train)

# Score the fitted model on both splits so the two values can be compared.
train_score = regressor.score(x_train, y_train)
test_score = regressor.score(x_test, y_test)
print('Training set score : ', train_score)
print('Testing set score  : ', test_score)

Training set score :  0.9032624563324023
Testing set score  :  0.9011729942228526


In [19]:
from sklearn.linear_model import ElasticNet

# Linear regression combining L1 and L2 penalties. Only alpha is set here; the
# L1/L2 mix is left at the library default.
# Note: this rebinds `regressor`, replacing whichever model it held before.
regressor = ElasticNet(alpha=1)
regressor.fit(x_train, y_train)

# Score the fitted model on both splits so the two values can be compared.
train_score = regressor.score(x_train, y_train)
test_score = regressor.score(x_test, y_test)
print('Training set score : ', train_score)
print('Testing set score  : ', test_score)

Training set score :  0.8966101304213263
Testing set score  :  0.9006610986984465


---
# ====================================================
### Concepts covered
- Data preprocessing
- Simple Linear regression
- Multiple regression
- Polynomial regression
- Regression evaluation metrics
- Underfitting and overfitting
- Bias-variance tradeoff
- Regularization and regularized regression models
    - Lasso (L1)
    - Ridge (L2)
    - ElasticNet (L1 + L2)
- K Nearest Neighbors