In [1]:
# Ordinary Least Squares Regression (no regularization)
# Load pandas and report which version this notebook runs against.
import pandas as pd
print(pd.__version__)

0.23.0


In [2]:
# Load the automobile dataset from CSV using the pure-Python parser.
# NOTE(review): the path below is an absolute, machine-specific location;
# it must be adjusted before running this notebook elsewhere.
auto_data = pd.read_csv(
    'C:/Users/chait/Desktop/Machine Learning/Datasets/automobile.csv',
    engine='python',
)
auto_data.head(5)

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [3]:
# Turn every '?' placeholder into NaN so pandas treats it as missing
import numpy as np
auto_data = auto_data.replace(to_replace='?', value=np.nan)
auto_data.head(5)

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [None]:
# Cast the columns holding numbers to a numeric dtype; values that cannot
# be parsed become NaN (errors='coerce').
# NOTE(review): only 'price' and 'horsepower' are converted here - confirm
# no other numeric column was read in as text.
for column in ('price', 'horsepower'):
    auto_data[column] = pd.to_numeric(auto_data[column], errors='coerce')

In [None]:
# Remove the 'normalized-losses' feature, which is not used for modelling
auto_data = auto_data.drop(columns=['normalized-losses'])
auto_data.head(5)

In [None]:
# Map categorical data with numeric significance to numeric values.
# Every word maps to an int; a string value such as '5' would leave the
# column with mixed str/int entries.
cylinders_dict = {'two': 2, 'three': 3, 'four': 4, 'five': 5, 'six': 6,
                  'seven': 7, 'eight': 8, 'twelve': 12}
# Assign the result back instead of calling replace(..., inplace=True) on the
# selected column, so the update does not rely on that selection being a view.
# pd.to_numeric guarantees a numeric dtype and raises if an unmapped word
# is still present in the column.
auto_data['num-of-cylinders'] = pd.to_numeric(
    auto_data['num-of-cylinders'].replace(cylinders_dict))
auto_data['num-of-cylinders'].head()

In [None]:
# One-hot encode the remaining categorical features
categorical_columns = [
    'make',
    'fuel-type',
    'aspiration',
    'num-of-doors',
    'body-style',
    'drive-wheels',
    'engine-location',
    'engine-type',
    'fuel-system',
]
auto_data = pd.get_dummies(auto_data, columns=categorical_columns)
auto_data.head(5)

In [None]:
# Drop every row that still contains a missing value
auto_data = auto_data.dropna(axis=0, how='any')
# Sanity check: list any rows that still hold a NaN (expected to be empty
# right after dropna)
has_missing = auto_data.isnull().any(axis=1)
auto_data[has_missing]

In [4]:
# Hold out 20% of the rows as a test set (fixed seed for repeatability)
from sklearn.model_selection import train_test_split

# Target is the price column; features are everything else
Y = auto_data['price']
X = auto_data.drop(columns=['price'])

x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0)

In [None]:
# Fit a plain linear regression on the training split and show its
# score on that same training data
from sklearn.linear_model import LinearRegression

linear_model = LinearRegression().fit(x_train, y_train)
linear_model.score(x_train, y_train)

In [None]:
# Run the fitted model on the held-out test features
y_predict = linear_model.predict(X=x_test)

In [None]:
# Label each fitted coefficient with its feature name.
# coef_ follows the column order of the frame passed to fit() (x_train), so
# the index must be x_train.columns as-is; sorting the labels before pairing
# would attach coefficients to the wrong features. Sort the finished Series
# instead to keep an alphabetical listing.
coef = pd.Series(linear_model.coef_, index=x_train.columns).sort_index()
print(coef)

In [None]:
# Score the test-set predictions: R2, mean squared error and its square root
import math
from sklearn.metrics import mean_squared_error, r2_score

r2 = r2_score(y_true=y_test, y_pred=y_predict)
mse = mean_squared_error(y_true=y_test, y_pred=y_predict)
rmse = math.sqrt(mse)
print('R2 Score-->', r2,
      '\nMean Squared Error-->', mse,
      '\nRoot Mean Squared Error-->', rmse)

In [None]:
#Visualize Predicted and Actual data
%pylab inline
pylab.rcParams['figure.figsize'] = (15,6)
plt.plot(y_predict, label = 'Predicted')
plt.plot(y_test.values, label ='Actual')
plt.ylabel('Price')
plt.legend()
plt.show()