# Multiple Linear Regression

## Import relevant libraries

In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from sklearn.linear_model import LinearRegression

## Loading Data

In [None]:
# Load the car listings CSV into a DataFrame and preview its first five rows.
raw_data = pd.read_csv(filepath_or_buffer='car-data.csv')
raw_data.head(n=5)

In [None]:
# Show the loaded DataFrame (a bare expression is rendered as the cell output in a notebook).
raw_data

## Explore Descriptive Statistics

In [None]:
# Summary statistics for every column; include='all' also covers the non-numeric columns.
raw_data.describe(include='all')

## Missing Values

In [None]:
# Count the missing entries in each column.
raw_data.isna().sum()

In [None]:
# Discard every row that has at least one missing value.
data_no_mv = raw_data.dropna(axis='index')

In [None]:
# Summary statistics after the rows with missing values have been dropped.
data_no_mv.describe(include='all')

## Distribution

In [None]:
# Histogram of Price with a KDE overlay, normalised to a density.
# sns.distplot is deprecated and scheduled for removal; histplot is its replacement.
sns.histplot(data_no_mv['Price'], kde=True, stat='density')

## Outliers

In [None]:
# 95th percentile of Price, computed on the rows without missing values.
q = data_no_mv['Price'].quantile(q=0.95)
q

In [None]:
# Keep only the rows whose Price is strictly below the threshold q.
data_1 = data_no_mv.loc[data_no_mv['Price'] < q]

In [None]:
# Price distribution after the upper tail has been removed.
# sns.distplot is deprecated and scheduled for removal; histplot is its replacement.
sns.histplot(data_1['Price'], kde=True, stat='density')

In [None]:
# Summary statistics of all columns after the Price filter.
data_1.describe(include='all')

## Wrong Values

In [None]:
# Distribution of EngineV before any filtering of that column.
# sns.distplot is deprecated and scheduled for removal; histplot is its replacement.
sns.histplot(data_1['EngineV'], kde=True, stat='density')

In [None]:
# Keep only rows with EngineV strictly below 6.5.
# NOTE(review): presumably larger values are data-entry errors (section is titled "Wrong Values") - confirm.
data_2 = data_1.loc[data_1['EngineV'] < 6.5]

In [None]:
# EngineV distribution after dropping the values of 6.5 and above.
# sns.distplot is deprecated and scheduled for removal; histplot is its replacement.
sns.histplot(data_2['EngineV'], kde=True, stat='density')

## Final Data Preparation

In [None]:
# Summary statistics with default settings (no include='all' here, unlike the earlier cells).
data_2.describe()

In [None]:
# Distribution of Mileage before trimming its upper tail.
# sns.distplot is deprecated and scheduled for removal; histplot is its replacement.
sns.histplot(data_2['Mileage'], kde=True, stat='density')

In [None]:
# Drop the rows at or above the 99th percentile of Mileage, then re-plot the distribution.
q = data_2['Mileage'].quantile(0.99)
data_3 = data_2[data_2['Mileage'] < q]
# sns.distplot is deprecated and scheduled for removal; histplot is its replacement.
sns.histplot(data_3['Mileage'], kde=True, stat='density')

In [None]:
# Distribution of Year before trimming its lower tail.
# sns.distplot is deprecated and scheduled for removal; histplot is its replacement.
sns.histplot(data_3['Year'], kde=True, stat='density')

In [None]:
# Drop the rows at or below the 1st percentile of Year, then re-plot the distribution.
q = data_3['Year'].quantile(0.01)
data_4 = data_3[data_3['Year'] > q]
# sns.distplot is deprecated and scheduled for removal; histplot is its replacement.
sns.histplot(data_4['Year'], kde=True, stat='density')

In [None]:
# Renumber the rows 0..n-1 after the filtering steps; drop=True discards the old index
# instead of keeping it as a column.
data_cleaned = data_4.reset_index(drop=True)
data_cleaned.describe(include='all')

In [None]:
# Remove the Model column, then summarise the remaining columns.
data_cleaned = data_cleaned.drop(columns=['Model'])
data_cleaned.describe(include='all')

# Checking OLS Assumptions

### Assumption: Linearity

In [None]:
# Scatter plot of Price (vertical axis) against Year (horizontal axis).
plt.scatter(x=data_cleaned['Year'], y=data_cleaned['Price'])

In [None]:
# Three side-by-side scatter plots of Price against Year, EngineV and Mileage,
# sharing one vertical axis.
f, (ax1, ax2, ax3) = plt.subplots(nrows=1, ncols=3, sharey=True, figsize=(15, 3))
for axis, feature, title in (
    (ax1, 'Year', 'PRICE AND YEAR'),
    (ax2, 'EngineV', 'PRICE AND ENGINE'),
    (ax3, 'Mileage', 'PRICE AND MILEAGE'),
):
    axis.scatter(data_cleaned[feature], data_cleaned['Price'])
    axis.set_title(title)
plt.show()

In [None]:
# The linearity assumption of OLS is violated for Price.
# Fix: apply a log transformation and store the natural log of Price as a new column.
log_price = np.log(data_cleaned['Price'])
data_cleaned['log_price'] = log_price
# Display the frame including the new log_price column.
data_cleaned

In [None]:
# Same three panels as for Price, this time with log_price on the shared vertical axis.
f, (ax1, ax2, ax3) = plt.subplots(nrows=1, ncols=3, sharey=True, figsize=(15, 3))
for axis, feature, title in (
    (ax1, 'Year', 'LOG PRICE AND YEAR'),
    (ax2, 'EngineV', 'LOG PRICE AND ENGINE'),
    (ax3, 'Mileage', 'LOG PRICE AND MILEAGE'),
):
    axis.scatter(data_cleaned[feature], data_cleaned['log_price'])
    axis.set_title(title)
plt.show()

In [None]:
# Remove the Price column from the frame.
data_cleaned = data_cleaned.drop(columns=['Price'])

### Assumption: No Multicollinearity

In [None]:
# List the current column names as an array.
data_cleaned.columns.values

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Numeric predictors to check for multicollinearity.
variables = data_cleaned[['Mileage', 'Year', 'EngineV']]

# variance_inflation_factor does not add an intercept itself, so the design
# matrix has to carry its own constant column. Without it, features with a
# large mean (such as Year) receive a heavily inflated VIF.
design = sm.add_constant(variables)

vif = pd.DataFrame()

# Column 0 of `design` is the constant; report VIFs for the real features only.
vif["VIF"] = [variance_inflation_factor(design.values, i) for i in range(1, design.shape[1])]

vif["Features"] = variables.columns

In [None]:
# Show the table of variance inflation factors per feature.
vif

In [None]:
# Remove the Year column and keep the result under a new name.
data_no_multicollinearity = data_cleaned.drop(columns=['Year'])

In [None]:
# Summary statistics of all columns after Year has been removed.
data_no_multicollinearity.describe(include='all')

# Creating Dummy Variables

In [None]:
# One-hot encode the categorical columns; drop_first=True omits the first level of each
# category so the dummy columns are not perfectly collinear.
data_with_dummies = pd.get_dummies(data_no_multicollinearity, drop_first=True)

In [None]:
# Preview the first rows of the encoded frame.
data_with_dummies.head()

In [None]:
# List the column names of the encoded frame as an array.
data_with_dummies.columns.values

In [None]:
# Desired column order: target first, then numeric features, then the dummy
# columns grouped by the categorical variable they encode.
cols = (
    ['log_price', 'Mileage', 'EngineV']
    + ['Brand_BMW', 'Brand_Mercedes-Benz', 'Brand_Mitsubishi',
       'Brand_Renault', 'Brand_Toyota', 'Brand_Volkswagen']
    + ['Body_hatch', 'Body_other', 'Body_sedan', 'Body_vagon', 'Body_van']
    + ['Engine Type_Gas', 'Engine Type_Other', 'Engine Type_Petrol']
    + ['Registration_yes']
)

In [None]:
# Select the columns in the order given by `cols`, then preview the result.
data_preprocessed = data_with_dummies.loc[:, cols]
data_preprocessed.head()

# Linear Regression Model

In [None]:
# The dependent variable is log_price; every other column is a predictor.
targets = data_preprocessed['log_price']
inputs = data_preprocessed.drop(columns=['log_price'])

### Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn per-column scaling statistics from the inputs (fit returns the scaler itself),
# then apply them to obtain the standardised inputs.
scaler = StandardScaler().fit(inputs)
inputs_scaled = scaler.transform(inputs)

### Train Test Split (= Detect Overfitting)

In [None]:
from sklearn.model_selection import train_test_split

# Split the UNSCALED inputs: fitting the scaler on all rows before splitting
# leaks test-set statistics into the training data. The split depends only on
# the row count and random_state, so the same rows land in each partition.
x_train, x_test, y_train, y_test = train_test_split(inputs, targets, test_size=0.2, random_state=365)

# Re-fit the existing scaler on the training rows only, then apply that
# transformation to both partitions.
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

### Create the Regression

In [None]:
# Ordinary least squares model, fitted on the training partition.
reg = LinearRegression()
reg.fit(X=x_train, y=y_train)

In [None]:
# Predictions of the fitted model on the training inputs.
y_hat = reg.predict(x_train)

In [None]:
# Training targets against predictions, with both axes limited to the same 6-13 range.
plt.scatter(x=y_train, y=y_hat)
plt.xlabel('Targets (y_train)')
plt.ylabel('Predictions (y_hat)')
plt.xlim(left=6, right=13)
plt.ylim(bottom=6, top=13)
plt.show()

In [None]:
# Distribution of the training residuals (target minus prediction).
# sns.distplot is deprecated and scheduled for removal; histplot is its replacement.
sns.histplot(y_train - y_hat, kde=True, stat='density')
plt.title("Residuals PDF")

In [None]:
# R-squared of the model on the training partition.
reg.score(x_train, y_train)

In [None]:
# Fitted intercept (bias) of the regression.
reg.intercept_

In [None]:
# Fitted coefficients (weights), one per input column.
reg.coef_

In [None]:
# Pair every input column name with its fitted weight in one table.
reg_summary = pd.DataFrame({'Features': inputs.columns, 'Weights': reg.coef_})
reg_summary

# Testing

In [None]:
# Predictions of the fitted model on the held-out test inputs.
y_hat_test = reg.predict(x_test)

In [None]:
# Test targets against predictions; alpha=0.2 makes the points semi-transparent
# so dense regions show up darker.
plt.scatter(x=y_test, y=y_hat_test, alpha=0.2)
plt.xlabel('Targets (y_test)')
plt.ylabel('Predictions (y_hat_test)')
plt.xlim(left=6, right=13)
plt.ylim(bottom=6, top=13)
plt.show()

In [None]:
# Undo the log transform on the test predictions and store them in a new frame.
df_pf = pd.DataFrame({'Predictions': np.exp(y_hat_test)})
df_pf.head()

In [None]:
# y_test keeps the shuffled row labels it had before the split, whereas df_pf
# has a fresh 0..n-1 index. Assigning the Series directly aligns on labels and
# fills the non-matching rows with NaN, so assign the raw values positionally.
df_pf['Targets'] = np.exp(y_test.values)
df_pf

In [None]:
# Replace the shuffled row labels left over from the split with a fresh 0..n-1 index
# so that y_test lines up with df_pf.
y_test = y_test.reset_index(drop=True)
y_test.head()

In [None]:
# Undo the log transform on the test targets and store them next to the predictions.
# NOTE: this assignment aligns on the index, so it relies on y_test having been re-indexed to 0..n-1.
df_pf['Targets'] = np.exp(y_test)
df_pf

In [None]:
# Residual = actual target minus predicted value, in the original units.
df_pf['Residuals'] = df_pf['Targets'].sub(df_pf['Predictions'])
df_pf

In [None]:
# Absolute residual expressed as a percentage of the actual target.
df_pf['Difference%'] = (df_pf['Residuals'] / df_pf['Targets'] * 100).abs()
df_pf

In [None]:
# Summary statistics of predictions, targets, residuals and percentage differences.
df_pf.describe()