<a href="https://colab.research.google.com/github/Foursteps-tech/Machine_Learning/blob/main/Regression/Car_Price_Prediction_LR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Clone the repository that holds the dataset; the CSV read later in this
# notebook lives under Machine_Learning/Regression/ inside this checkout.
!git clone https://github.com/Foursteps-tech/Machine_Learning.git

In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import linear_model
from sklearn.linear_model import LinearRegression

# Load the car-price dataset from the cloned repository into a DataFrame.
DATA_PATH = "Machine_Learning/Regression/CarPrice_Assignment.csv"
cars = pd.read_csv(DATA_PATH)

In [None]:
# summary of the dataset: 205 rows, 26 columns, no null values
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
cars.info()

In [None]:
# preview the first rows to see the column layout and typical values
cars.head()

In [None]:
# symboling is a risk rating: -2 (least risky) up to +3 (most risky).
# Most cars are rated 0, 1 or 2.
symboling_levels = cars['symboling'].astype('category')
symboling_levels.value_counts()

In [None]:
# aspiration: property of an (internal combustion) engine that says whether
# oxygen is taken in at standard atmospheric pressure ("std") or forced in
# under pressure by a turbocharger ("turbo")
aspiration_levels = cars['aspiration'].astype('category')
aspiration_levels.value_counts()

In [None]:
# drivewheel: front-wheel, rear-wheel or four-wheel drive
drivewheel_levels = cars['drivewheel'].astype('category')
drivewheel_levels.value_counts()

In [None]:
# wheelbase: distance between the centres of the front and rear wheels
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# draws the same histogram-plus-curve picture.
sns.histplot(cars['wheelbase'], kde=True, stat="density")
plt.show()

In [None]:
# curbweight: weight of the car without occupants or baggage
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# draws the same histogram-plus-curve picture.
sns.histplot(cars['curbweight'], kde=True, stat="density")
plt.show()

In [None]:
# stroke: the distance travelled by the piston in each cycle
# (a length, not the volume of the engine)
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# draws the same histogram-plus-curve picture.
sns.histplot(cars['stroke'], kde=True, stat="density")
plt.show()

In [None]:
# compression ratio: ratio of the volume of the compression chamber
# at its largest capacity to its least capacity
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# draws the same histogram-plus-curve picture.
sns.histplot(cars['compressionratio'], kde=True, stat="density")
plt.show()

In [None]:
# target variable: price of the car
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# draws the same histogram-plus-curve picture.
sns.histplot(cars['price'], kde=True, stat="density")
plt.show()

In [None]:
# keep only the float64 and integer columns for the correlation analysis
numeric_dtypes = ['float64', 'int']
cars_numeric = cars.select_dtypes(include=numeric_dtypes)
cars_numeric.head()

In [None]:
# remove symboling and car_ID from the numeric frame
columns_to_remove = ['symboling', 'car_ID']
cars_numeric = cars_numeric.drop(columns=columns_to_remove)
cars_numeric.head()

In [54]:
# pairwise Pearson correlations between all remaining numeric columns
cor = cars_numeric.corr(method='pearson')
cor

Unnamed: 0,wheelbase,carlength,carwidth,carheight,curbweight,enginesize,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
wheelbase,1.0,0.874587,0.795144,0.589435,0.776386,0.569329,0.48875,0.160959,0.249786,0.353294,-0.360469,-0.470414,-0.544082,0.577816
carlength,0.874587,1.0,0.841118,0.491029,0.877728,0.68336,0.606454,0.129533,0.158414,0.552623,-0.287242,-0.670909,-0.704662,0.68292
carwidth,0.795144,0.841118,1.0,0.27921,0.867032,0.735433,0.55915,0.182942,0.181129,0.640732,-0.220012,-0.642704,-0.677218,0.759325
carheight,0.589435,0.491029,0.27921,1.0,0.295572,0.067149,0.171071,-0.055307,0.261214,-0.108802,-0.320411,-0.04864,-0.107358,0.119336
curbweight,0.776386,0.877728,0.867032,0.295572,1.0,0.850594,0.64848,0.16879,0.151362,0.750739,-0.266243,-0.757414,-0.797465,0.835305
enginesize,0.569329,0.68336,0.735433,0.067149,0.850594,1.0,0.583774,0.203129,0.028971,0.809769,-0.24466,-0.653658,-0.67747,0.874145
boreratio,0.48875,0.606454,0.55915,0.171071,0.64848,0.583774,1.0,-0.055909,0.005197,0.573677,-0.254976,-0.584532,-0.587012,0.553173
stroke,0.160959,0.129533,0.182942,-0.055307,0.16879,0.203129,-0.055909,1.0,0.18611,0.08094,-0.067964,-0.042145,-0.043931,0.079443
compressionratio,0.249786,0.158414,0.181129,0.261214,0.151362,0.028971,0.005197,0.18611,1.0,-0.204326,-0.435741,0.324701,0.265201,0.067984
horsepower,0.353294,0.552623,0.640732,-0.108802,0.750739,0.809769,0.573677,0.08094,-0.204326,1.0,0.131073,-0.801456,-0.770544,0.808139


In [None]:
# draw the correlation matrix as an annotated heatmap on a 16x8 inch canvas
fig, ax = plt.subplots(figsize=(16, 8))
sns.heatmap(cor, cmap="YlGnBu", annot=True, ax=ax)
plt.show()

In [None]:
# variable formats: list every column with its dtype and non-null count
cars.info()

In [None]:
# store symboling as object dtype so it is treated as a categorical
# variable rather than as a number
symboling_as_object = cars['symboling'].astype('object')
cars['symboling'] = symboling_as_object
cars.info()

In [None]:
# CarName: look at the first 30 entries
cars['CarName'].head(30)

In [None]:
# the company name is the text before the first space in CarName
carnames = cars['CarName'].map(lambda full_name: full_name.partition(" ")[0])
carnames.head(30)

In [26]:
# New column car_company: the first space-separated token of CarName
cars['car_company'] = cars['CarName'].map(lambda full_name: full_name.split(" ", 1)[0])

In [None]:
# frequency of every extracted company name
company_levels = cars['car_company'].astype('category')
company_levels.value_counts()

In [28]:
# replacing misspelled car_company names with their canonical spelling
spelling_fixes = {
    "vw": 'volkswagen',
    "vokswagen": 'volkswagen',
    "porcshce": 'porsche',
    "toyouta": 'toyota',
    "Nissan": 'nissan',
    "maxda": 'mazda',
}

# one targeted in-place assignment per misspelling; no canonical name is
# itself a key, so the order of the fixes does not matter
for misspelt, canonical in spelling_fixes.items():
    cars.loc[cars['car_company'] == misspelt, 'car_company'] = canonical

In [None]:
# re-check the company counts after the spelling clean-up
cars['car_company'].astype('category').value_counts()

In [30]:
# CarName is no longer needed now that car_company has been derived from it
cars = cars.drop(columns='CarName')

In [None]:
# confirm the column list after adding car_company and dropping CarName
cars.info()

In [55]:
# separate the predictors (X) from the target (y)
feature_columns = [
    'symboling', 'fueltype', 'aspiration', 'doornumber', 'carbody',
    'drivewheel', 'enginelocation', 'wheelbase', 'carlength', 'carwidth',
    'carheight', 'curbweight', 'enginetype', 'cylindernumber', 'enginesize',
    'fuelsystem', 'boreratio', 'stroke', 'compressionratio', 'horsepower',
    'peakrpm', 'citympg', 'highwaympg', 'car_company',
]
X = cars.loc[:, feature_columns]

# the price column is the regression target
y = cars['price']

In [56]:
# Optional: uncomment the line below to drop "stroke" from the features;
# do this only if you want to increase the accuracy.
#X = X.drop("stroke", axis=1)

In [None]:
# creating dummy variables for categorical variables:
# first collect the object-dtype predictors that need encoding
cars_categorical = X.select_dtypes(include='object')
cars_categorical.head()

In [None]:
# one-hot encode the categorical columns, dropping the first level of each
cars_dummies = pd.get_dummies(data=cars_categorical, drop_first=True)
cars_dummies.head()

In [59]:
# remove the original categorical columns from X
X = X.drop(columns=cars_categorical.columns)


In [60]:
# append the dummy columns to the numeric predictors, side by side
X = pd.concat((X, cars_dummies), axis='columns')

In [None]:
# inspect the assembled design matrix (numeric predictors plus dummy columns)
X

In [None]:
# scaling the features
from sklearn.preprocessing import scale

# scale() returns a plain numpy array, so the frame has to be rebuilt.
# Passing columns= and index= to the constructor restores both in one step;
# the previous version put back only the column names and silently reset
# the row index.
# NOTE(review): scaling is done on the full X before the train/test split,
# so the test rows contribute to the mean/std used here. Consider fitting a
# StandardScaler on the training split only.
cols = X.columns
X = pd.DataFrame(scale(X), columns=cols, index=X.index)
X.columns

In [63]:
# split into train and test: 70% / 30%, with a fixed seed for reproducibility
from sklearn.model_selection import train_test_split

split_parts = train_test_split(
    X, y,
    train_size=0.7,
    test_size=0.3,
    random_state=100,
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Building the first model with all the features:
# fit() returns the estimator itself, so create and fit in one expression
lm = LinearRegression().fit(X_train, y_train)

# show the fitted estimator as the cell output
lm

In [None]:
# score of the full-feature model on the training data
lm.score(X_train,y_train)

In [None]:
# score of the full-feature model on the held-out test data
lm.score(X_test,y_test)

In [None]:
from sklearn.metrics import r2_score

# predictions of the full-feature model on the held-out test rows
y_pred = lm.predict(X_test)

# R^2 of those predictions against the true test prices
test_r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(test_r2)

In [45]:
from sklearn.preprocessing import PolynomialFeatures

In [46]:
# degree-2 expansion restricted to interaction terms (interaction_only=True)
poly = PolynomialFeatures(degree=2, interaction_only=True)

# fit the transformer on the training data only ...
X_train2 = poly.fit_transform(X_train)

# ... and apply that same fitted transformer to the test data. Calling
# fit_transform here would re-fit the transformer on the test set.
X_test2 = poly.transform(X_test)

In [None]:
# number of rows in the expanded training matrix
X_train2.shape[0]

In [50]:
# second linear model, to be trained on the interaction-expanded features
nlm = LinearRegression()

In [None]:
# fit the second model on the expanded training matrix
nlm.fit(X_train2, y_train)

In [52]:
# predictions of the second model on the expanded test matrix
y_pred1 = nlm.predict(X_test2)

In [None]:
# training score of the interaction model
train_score_poly = nlm.score(X_train2, y_train)
print(train_score_poly)

# R^2 of the interaction model's predictions on the test data
test_r2_poly = r2_score(y_true=y_test, y_pred=y_pred1)
print(test_r2_poly)