In [None]:
%matplotlib inline

# Numerical libraries
import numpy as np   

# Import Linear Regression machine learning library
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

from sklearn.metrics import r2_score

# to handle data in form of rows and columns 
import pandas as pd    

# importing plotting libraries
import matplotlib.pyplot as plt   

#importing seaborn for statistical plots
import seaborn as sns

In [None]:
# Load the raw car data; the free-text car name is not used as a predictor.
mpg_df = pd.read_csv("car-mpg.csv")
mpg_df = mpg_df.drop('car_name', axis=1)

# Turn the numeric origin codes into labels, then one-hot encode them.
mpg_df['origin'] = mpg_df['origin'].replace({1: 'america', 2: 'europe', 3: 'asia'})
mpg_df = pd.get_dummies(mpg_df, columns=['origin'])

# '?' is treated as a missing-value placeholder. Convert 'hp' to a numeric
# dtype *before* imputing: presumably the '?' entries make pandas read the
# column as strings (TODO confirm against the CSV), and recent pandas raises
# TypeError when asked for the median of a string column.
mpg_df = mpg_df.replace('?', np.nan)
mpg_df['hp'] = pd.to_numeric(mpg_df['hp']).astype('float64')

# Fill any remaining gaps in each column with that column's median.
mpg_df = mpg_df.apply(lambda x: x.fillna(x.median()), axis=0)

In [None]:
# Print each column's dtype and non-null count for the cleaned frame.
mpg_df.info()

# separate independent and dependent variables

In [None]:
# Predictors: every column of the cleaned frame except the target 'mpg'.
X = mpg_df.drop(columns=['mpg'])

# Target: 'mpg' kept as a one-column DataFrame (list selector), not a Series.
y = mpg_df.loc[:, ['mpg']]


In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every predictor and re-wrap the resulting array in a DataFrame
# so the original column names are kept.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics influence the scaling - consider fitting on the
# training split only.
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)


In [None]:

from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.30,
    random_state=1,
)

# fit a simple linear model

In [None]:
# Fit an ordinary least-squares model on the training split.
regression_model = LinearRegression()
regression_model.fit(X_train, y_train)

print("Intercept: ", regression_model.intercept_)
# Take the labels from X_train - the frame the model was actually fitted on -
# rather than from X, so names and weights always line up even if X_train's
# columns differ from X's when this cell is (re-)run.
# y_train is a one-column DataFrame, so coef_ is indexed [0] to get the row
# of per-feature weights.
for col_name, coef in zip(X_train.columns, regression_model.coef_[0]):
    print("The coefficient for {} is {}".format(col_name, coef))

In [None]:
# Model score on the training split, then on the held-out test split (one per line).
print(
    regression_model.score(X_train, y_train),
    regression_model.score(X_test, y_test),
    sep="\n",
)


## Iteration 2 - Linear regression 
### Understand Rule for Dummy Variable Regression
### Remove 1 Dummy variable


In [None]:

# Remove one of the origin dummy columns from both splits.
# Note: re-running this cell raises KeyError because the column is already gone.
X_train = X_train.drop(columns=['origin_europe'])
X_test = X_test.drop(columns=['origin_europe'])

In [None]:
# Refit the linear model on the reduced feature set.
regression_model = LinearRegression()
regression_model.fit(X_train, y_train)

print("Intercept: ", regression_model.intercept_)
# coef_ is indexed [0] because the target is a one-column DataFrame; walk the
# feature names and their weights side by side.
for col_name, coef in zip(X_train.columns, regression_model.coef_[0]):
    print("The coefficient for {} is {}".format(col_name, coef))

In [None]:
# Model score on the training split, then on the held-out test split (one per line).
print(
    regression_model.score(X_train, y_train),
    regression_model.score(X_test, y_test),
    sep="\n",
)

## IMPORTANT NOTE
#### The above results show that one fewer dummy variable than the number of categories should be defined
#### This can be achieved using the following code (drop_first=True)
**----------------------------------------------------------------------------------------------**

mpg_df = pd.get_dummies(mpg_df, columns=['origin'], drop_first=True)

**----------------------------------------------------------------------------------------------**

# PCA

In [None]:
from sklearn.decomposition import PCA

# No n_components given, so PCA uses its default number of components.
pca = PCA()

# Fit the projection on the training split and apply the same fitted
# projection to the test split.
X_train_proj = pca.fit_transform(X_train)
explained_variance = pca.explained_variance_ratio_
X_test_proj = pca.transform(X_test)

# Last expression: display the per-component explained-variance ratios.
explained_variance

### Decide how many components to choose

In [None]:
from sklearn.decomposition import PCA

# Re-run PCA keeping only the first 8 components.
pca = PCA(n_components=8)

# Fit on the training split, then project the test split with the same
# fitted components.
X_train_proj = pca.fit_transform(X_train)
explained_variance = pca.explained_variance_ratio_
X_test_proj = pca.transform(X_test)

# Last expression: display the explained-variance ratio of each kept component.
explained_variance

In [None]:
# Wrap the projected training data in a DataFrame so the components get
# integer column labels (0, 1, ...) that can be printed next to their weights.
X_train_proj = pd.DataFrame(X_train_proj)

# Fit the linear model on the principal components instead of the raw features.
regression_model = LinearRegression()
regression_model.fit(X_train_proj, y_train)

print("Intercept: ", regression_model.intercept_)
# coef_ is indexed [0] because the target is a one-column DataFrame.
for col_name, coef in zip(X_train_proj.columns, regression_model.coef_[0]):
    print("The coefficient for {} is {}".format(col_name, coef))

In [None]:
# Model score on the projected training data, then on the projected test data (one per line).
print(
    regression_model.score(X_train_proj, y_train),
    regression_model.score(X_test_proj, y_test),
    sep="\n",
)
