In [16]:
%matplotlib inline

# Numerical libraries
import numpy as np   

# Import Linear Regression machine learning library
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score

# to handle data in form of rows and columns 
import pandas as pd    

# importing plotting libraries
import matplotlib.pyplot as plt   

#importing seaborn for statistical plots
import seaborn as sns

In [17]:
# Load the raw data. '?' is the file's missing-value marker (presumably only in 'hp',
# the one column that needed an explicit float cast), so parse it as NaN at read time
# and let pandas infer a numeric dtype instead of leaving a column of strings.
mpg_df = pd.read_csv("car-mpg.csv", na_values="?")
mpg_df = mpg_df.drop('car_name', axis=1)
mpg_df['origin'] = mpg_df['origin'].replace({1: 'america', 2: 'europe', 3: 'asia'})

# **************** IMPORTANT ****************
# One-hot encode 'origin' into origin_america / origin_asia / origin_europe.
# All three dummies are kept here on purpose; a later cell removes one of them.
mpg_df = pd.get_dummies(mpg_df, columns=['origin'])

# Fill every missing numeric value with its column median.
# NOTE(review): the medians are taken over all rows, i.e. before any train/test split.
mpg_df = mpg_df.fillna(mpg_df.median(numeric_only=True))
mpg_df['hp'] = mpg_df['hp'].astype('float64')

In [18]:
# Print each column's dtype and non-null count to confirm no missing values remain after cleaning
mpg_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 11 columns):
mpg               398 non-null float64
cyl               398 non-null int64
disp              398 non-null float64
hp                398 non-null float64
wt                398 non-null int64
acc               398 non-null float64
yr                398 non-null int64
car_type          398 non-null int64
origin_america    398 non-null int64
origin_asia       398 non-null int64
origin_europe     398 non-null int64
dtypes: float64(4), int64(7)
memory usage: 34.3 KB


# separate independent and dependent variables

In [19]:
# Target: 'mpg' kept as a single-column DataFrame (the double brackets preserve the 2-D shape)
y = mpg_df[['mpg']]

# Predictors: every remaining column of the cleaned frame
X = mpg_df.drop(columns=['mpg'])


In [20]:
from sklearn.preprocessing import StandardScaler

# Standardise every predictor, then wrap the resulting array back into a
# DataFrame so the column names survive.
# NOTE(review): the scaler is fitted on all rows of X, i.e. before the train/test split
# in the following cell, so test-set statistics influence the scaling. Consider fitting
# on the training rows only.
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

In [21]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.30,
    random_state=1,
)

# fit a simple linear model

In [22]:
# Ordinary least squares on the scaled training split
regression_model = LinearRegression()
regression_model.fit(X_train, y_train)

print("Intercept: ", regression_model.intercept_)
# Label each coefficient from X_train.columns -- the columns the model was fitted on --
# rather than X.columns, so names and coefficients stay aligned (and the loop cannot run
# past the end of coef_) if X_train has had columns removed when this cell is re-run.
# coef_ is 2-D because y is a single-column DataFrame; row 0 holds the 'mpg' coefficients.
for col_name, coef in zip(X_train.columns, regression_model.coef_[0]):
    print("The coefficient for {} is {}".format(col_name, coef))

Intercept:  [23.66510774]
The coefficient for cyl is 2.5059518049385026
The coefficient for disp is 2.5357082860560514
The coefficient for hp is -1.7889335736325294
The coefficient for wt is -5.551819873098727
The coefficient for acc is 0.11485734803440747
The coefficient for yr is 2.9318465482116087
The coefficient for car_type is 2.977869737601945
The coefficient for origin_america is -0.583295529016598
The coefficient for origin_asia is 0.34749313804322646
The coefficient for origin_europe is 0.3774164680868858


In [23]:
# R^2 on the training split, then on the held-out test split (one value per line)
print(
    regression_model.score(X_train, y_train),
    regression_model.score(X_test, y_test),
    sep="\n",
)


0.8343770256960538
0.8513421387780067


## Iteration 2 - Linear regression 
### Understand Rule for Dummy Variable Regression
### Remove 1 Dummy variable


In [24]:

# 'origin_europe' is linearly dependent on the other two origin dummies, so remove it
# from both splits. errors='ignore' makes the cell idempotent: re-running it once the
# column is already gone is a no-op instead of a KeyError.
X_train = X_train.drop(columns=['origin_europe'], errors='ignore')
X_test = X_test.drop(columns=['origin_europe'], errors='ignore')

In [25]:
# Refit ordinary least squares on the current training columns
regression_model = LinearRegression().fit(X_train, y_train)

print("Intercept: ", regression_model.intercept_)
# coef_ is 2-D (one row per target); row 0 pairs one-to-one with the training columns
for col_name, coef in zip(X_train.columns, regression_model.coef_[0]):
    print("The coefficient for {} is {}".format(col_name, coef))

Intercept:  [23.66510774]
The coefficient for cyl is 2.5059518049384977
The coefficient for disp is 2.535708286056056
The coefficient for hp is -1.788933573632531
The coefficient for wt is -5.551819873098724
The coefficient for acc is 0.11485734803440689
The coefficient for yr is 2.931846548211609
The coefficient for car_type is 2.9778697376019405
The coefficient for origin_america is -1.0630595900778563
The coefficient for origin_asia is -0.04791335024147009


In [26]:
# R^2 on the training split, then on the held-out test split (one value per line)
print(
    regression_model.score(X_train, y_train),
    regression_model.score(X_test, y_test),
    sep="\n",
)

0.8343770256960538
0.8513421387780066


## IMPORTANT NOTE
#### The above results show that dropping one dummy variable leaves the fit unchanged: a categorical feature with k levels needs only k-1 dummy columns
#### This can be achieved directly with the following code (drop_first=True)
**----------------------------------------------------------------------------------------------**

mpg_df = pd.get_dummies(mpg_df, columns=['origin'], drop_first=True)

**----------------------------------------------------------------------------------------------**

# PCA

In [27]:
from sklearn.decomposition import PCA

# No n_components given, so all principal components are retained
pca = PCA()

# Learn the component axes from the training split and project it onto them
X_train_proj = pca.fit_transform(X_train)

# Share of total variance captured by each component
explained_variance = pca.explained_variance_ratio_

# Project the test split onto the axes learned from the training split
X_test_proj = pca.transform(X_test)

explained_variance

array([0.61449005, 0.14120175, 0.09578941, 0.07096353, 0.03542753,
       0.02509771, 0.00720165, 0.00707072, 0.00275764])

In [31]:
# (rows, columns) of the training matrix; the column count is the maximum number of principal components
X_train.shape

(278, 9)

### Decide how many components to choose

In [32]:
from sklearn.decomposition import PCA

# Keep only the first 8 principal components
pca = PCA(n_components=8)

# Learn the component axes from the training split and project it onto them
X_train_proj = pca.fit_transform(X_train)

# Share of total variance captured by each retained component
explained_variance = pca.explained_variance_ratio_

# Project the test split onto the axes learned from the training split
X_test_proj = pca.transform(X_test)

explained_variance

array([0.61449005, 0.14120175, 0.09578941, 0.07096353, 0.03542753,
       0.02509771, 0.00720165, 0.00707072])

In [33]:
# Wrap the projected training data in a DataFrame so each component gets an integer column label
X_train_proj = pd.DataFrame(X_train_proj)

# Ordinary least squares on the principal-component scores
regression_model = LinearRegression().fit(X_train_proj, y_train)

print("Intercept: ", regression_model.intercept_)
# coef_ is 2-D (one row per target); row 0 pairs one-to-one with the component columns
for col_name, coef in zip(X_train_proj.columns, regression_model.coef_[0]):
    print("The coefficient for {} is {}".format(col_name, coef))

Intercept:  [23.60071942]
The coefficient for 0 is -2.822003696489196
The coefficient for 1 is -0.3760626399887572
The coefficient for 2 is -1.94203868601019
The coefficient for 3 is -1.3733788609007944
The coefficient for 4 is -0.29974259627863464
The coefficient for 5 is 1.5363691596046132
The coefficient for 6 is -5.797952574442316
The coefficient for 7 is 3.5980794190850576


In [34]:
# R^2 on the projected training data, then on the projected test data (one value per line)
print(
    regression_model.score(X_train_proj, y_train),
    regression_model.score(X_test_proj, y_test),
    sep="\n",
)


0.8333611049281529
0.8479020242530155
