<a href="https://colab.research.google.com/github/deepanrajm/machine_learning/blob/master/Regression/Car_Price_Prediction_LR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Clone the repository that holds the dataset read in the next cell
# (the CSV path there is relative to this checkout).
!git clone https://github.com/deepanrajm/machine_learning.git

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import linear_model
from sklearn.linear_model import LinearRegression

# Read the car-price dataset from the cloned repository. The path is relative,
# so it only resolves when the `machine_learning` checkout sits in the
# notebook's working directory.
cars = pd.read_csv("machine_learning/Regression/CarPrice_Assignment.csv")

In [None]:
# Summary of the dataset: 205 rows, 26 columns, no null values.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None" to the output.
cars.info()

In [None]:
# Preview the first rows to check that the columns were parsed as expected.
cars.head()

In [None]:
# symboling: risk rating from -2 (least risky) to +3 (most risky).
# Most cars are rated 0, 1 or 2.
# Cast to category so the counts are reported per rating level.
cars['symboling'].astype('category').value_counts()

In [6]:
# aspiration: an (internal combustion) engine property showing whether the
# oxygen intake is at standard (atmospheric) pressure or pressurised through
# turbocharging. The cell counts how many cars fall into each category.

cars['aspiration'].astype('category').value_counts()

aspiration
std      168
turbo     37
Name: count, dtype: int64

In [None]:
# drivewheel: front-wheel, rear-wheel or four-wheel drive
cars['drivewheel'].astype('category').value_counts()

In [None]:
# wheelbase: distance between the centres of the front and rear wheels.
# sns.distplot is deprecated; histplot with kde=True and stat="density" is its
# documented replacement (density-scaled histogram with a KDE curve overlaid).
sns.histplot(cars['wheelbase'], kde=True, stat="density")
plt.show()

In [None]:
# curbweight: weight of the car without occupants or baggage.
# sns.distplot is deprecated; histplot with kde=True and stat="density" is its
# documented replacement (density-scaled histogram with a KDE curve overlaid).
sns.histplot(cars['curbweight'], kde=True, stat="density")
plt.show()

In [None]:
# stroke: the distance travelled by the piston in each cycle.
# sns.distplot is deprecated; histplot with kde=True and stat="density" is its
# documented replacement (density-scaled histogram with a KDE curve overlaid).
sns.histplot(cars['stroke'], kde=True, stat="density")
plt.show()

In [None]:
# compression ratio: ratio of the volume of the compression chamber at its
# largest capacity to its least capacity.
# sns.distplot is deprecated; histplot with kde=True and stat="density" is its
# documented replacement (density-scaled histogram with a KDE curve overlaid).
sns.histplot(cars['compressionratio'], kde=True, stat="density")
plt.show()

In [None]:
# Target variable: price of the car.
# sns.distplot is deprecated; histplot with kde=True and stat="density" is its
# documented replacement (density-scaled histogram with a KDE curve overlaid).
sns.histplot(cars['price'], kde=True, stat="density")
plt.show()

In [None]:
# Keep only the numeric columns for the correlation analysis.
# 'number' matches every integer and float width. The 'int' alias used before
# maps to the platform's default integer, which can be 32-bit (e.g. on Windows
# with NumPy 1.x) and would then silently leave out the int64 columns.
cars_numeric = cars.select_dtypes(include='number')
cars_numeric.head()

In [None]:
# Leave symboling and car_ID out of the numeric frame: symboling is converted
# to a categorical variable further down, and car_ID is presumably just a row
# identifier (verify against the data dictionary).
cars_numeric = cars_numeric.drop(columns=['symboling', 'car_ID'])
cars_numeric.head()

In [None]:
# Pairwise correlation matrix of the numeric columns (kept in `cor` for the
# heatmap) and displayed as the cell output.
cor = cars_numeric.corr()
cor

In [None]:
# Correlation heatmap: a wide 16x8 canvas so the annotated coefficients
# (annot=True) stay readable.
fig, ax = plt.subplots(figsize=(16, 8))
sns.heatmap(cor, cmap="YlGnBu", annot=True, ax=ax)
plt.show()

In [None]:
# Column dtypes before any conversion (symboling is still numeric here).
cars.info()

In [None]:
# Treat symboling as categorical: stored as object dtype, it is later picked up
# together with the other non-numeric columns and dummy-encoded.
cars['symboling'] = cars['symboling'].astype('object')
# Re-print the dtypes to confirm the conversion.
cars.info()

In [None]:
# CarName: look at the first 30 entries
cars['CarName'].head(30)

In [None]:
# The manufacturer is the first space-separated token of CarName.
carnames = cars['CarName'].map(lambda full_name: full_name.split(" ")[0])
carnames.head(30)

In [21]:
# Add car_company: the first space-separated token of each CarName.
cars['car_company'] = cars['CarName'].map(lambda full_name: full_name.split(" ")[0])

In [None]:
# List every distinct company name with its frequency; this is where the
# misspelled variants corrected in the next step show up.
cars['car_company'].astype('category').value_counts()

In [23]:
# Normalise misspelled / inconsistently cased car_company values.
# Keys are the variants found in the data, values the canonical spelling.
company_corrections = {
    "vw": "volkswagen",
    "vokswagen": "volkswagen",
    "porcshce": "porsche",
    "toyouta": "toyota",
    "Nissan": "nissan",
    "maxda": "mazda",
}

# Exact-match replacement; names not listed above are left untouched.
cars['car_company'] = cars['car_company'].replace(company_corrections)

In [None]:
# Re-count the company names to confirm the misspelled variants are gone.
cars['car_company'].astype('category').value_counts()

In [25]:
# CarName is no longer needed now that car_company has been derived from it.
cars = cars.drop(columns='CarName')

In [None]:
# Check the remaining columns and dtypes after dropping CarName.
cars.info()

In [27]:
# Separate predictors (X) from the target (y = price).
# The predictor list is spelled out explicitly, so any column not named here
# is excluded from the model.
feature_columns = [
    'symboling', 'fueltype', 'aspiration', 'doornumber',
    'carbody', 'drivewheel', 'enginelocation', 'wheelbase', 'carlength',
    'carwidth', 'carheight', 'curbweight', 'enginetype', 'cylindernumber',
    'enginesize', 'fuelsystem', 'boreratio', 'stroke', 'compressionratio',
    'horsepower', 'peakrpm', 'citympg', 'highwaympg',
    'car_company',
]
X = cars.loc[:, feature_columns]

y = cars['price']

In [28]:
# Optional experiment: uncomment the next line to drop "stroke" and check whether the test score improves.
#X = X.drop("stroke", axis=1)

In [None]:
# Creating dummy variables for categorical variables.

# Subset all categorical (i.e. non-numeric) predictors. Selecting by
# exclude='number' instead of include=['object'] also catches text columns that
# pandas stores with its dedicated string dtype rather than object, so no
# categorical column is left behind in X for the scaling step.
cars_categorical = X.select_dtypes(exclude='number')
cars_categorical.head()

In [None]:
# One-hot encode the categorical columns. drop_first=True omits the first
# level of each variable, giving k-1 indicator columns for k levels.
cars_dummies = pd.get_dummies(cars_categorical, drop_first=True)
cars_dummies.head()

In [31]:
# Remove the raw categorical columns from X; their dummy-encoded versions are
# added back in the next step.
X = X.drop(columns=cars_categorical.columns)


In [32]:
# Append the dummy columns to the numeric predictors (column-wise, axis=1).
# NOTE: re-running this cell appends the dummy columns a second time.
X = pd.concat([X, cars_dummies], axis=1)

In [None]:
# Display the assembled design matrix (numeric predictors + dummy columns).
X

In [None]:
# Standardise every predictor column.
from sklearn.preprocessing import scale

# scale() hands back a plain NumPy array, so the column labels are captured
# first and re-attached when the DataFrame is rebuilt.
# NOTE(review): scaling is applied to the full dataset before the train/test
# split further down, so the test rows contribute to the scaling statistics.
cols = X.columns
X = pd.DataFrame(scale(X), columns=cols)
X.columns

In [35]:
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split identical between runs.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, train_size=0.7, random_state=100
)

In [None]:
# First model: ordinary least squares on all features.
# fit() returns the estimator itself, so it can be chained onto construction.
lm = LinearRegression().fit(X_train, y_train)
lm

In [None]:
# R^2 of the baseline model on the training split.
lm.score(X_train,y_train)

In [None]:
# R^2 of the baseline model on the held-out test split.
lm.score(X_test,y_test)

In [None]:
# Test-set R^2 computed from explicit predictions.
from sklearn.metrics import r2_score

y_pred = lm.predict(X_test)
print(r2_score(y_test, y_pred))

In [40]:
from sklearn.preprocessing import PolynomialFeatures

In [41]:
# Expand the features with pairwise interaction terms
# (interaction_only=True: products of distinct features, no squared terms).
poly = PolynomialFeatures(degree=2, interaction_only=True)
X_train2 = poly.fit_transform(X_train)
# Reuse the transformer fitted on the training data. Calling fit_transform on
# the test set re-fits the transformer on held-out data, which a fitted
# preprocessing step must never do.
X_test2 = poly.transform(X_test)

In [None]:
# Length of the expanded training matrix along its first axis.
len(X_train2)

In [43]:
# Second linear model, to be trained on the interaction-expanded features.
nlm = LinearRegression()

In [None]:
# Fit on the interaction-expanded training features.
nlm.fit(X_train2, y_train)

In [45]:
# Predict prices for the interaction-expanded test features.
y_pred1 = nlm.predict(X_test2)

In [None]:
# Training R^2 of the interaction model ...
print(nlm.score(X_train2, y_train))

# ... followed by its test R^2; compare the two to judge overfitting.
# r2_score comes from the sklearn.metrics import in an earlier cell.
print(r2_score(y_true=y_test, y_pred=y_pred1))

In [52]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import f_regression
from matplotlib import pyplot
def select_features(X_train, y_train, X_test, score_func=f_regression, k='all'):
    """Fit a univariate feature selector on the training data and apply it to both splits.

    Parameters
    ----------
    X_train, y_train
        Training features and target; the selector is fitted on these only.
    X_test
        Held-out features, reduced with the selector fitted on the training data.
    score_func : callable, default f_regression
        Scoring function handed to SelectKBest.
    k : int or 'all', default 'all'
        Number of top-scoring features to keep; 'all' keeps every feature and
        is useful for just inspecting the scores.

    Returns
    -------
    (X_train_fs, X_test_fs, fs)
        The transformed training and test features and the fitted selector
        (its ``scores_`` attribute holds the per-feature scores).
    """
    fs = SelectKBest(score_func=score_func, k=k)
    fs.fit(X_train, y_train)
    return fs.transform(X_train), fs.transform(X_test), fs

In [None]:
# Score every feature with the selector and list the scores by feature position.
X_train_fs, X_test_fs, fs = select_features(X_train, y_train, X_test)

for i, score in enumerate(fs.scores_):
    print('Feature %d: %f' % (i, score))

# Bar chart of the same scores, one bar per feature position.
pyplot.bar(list(range(len(fs.scores_))), fs.scores_)
pyplot.show()

In [49]:

from functools import partial


def select_features(X_train, y_train, X_test, score_func=None, k='all',
                    random_state=100):
    """Fit a mutual-information feature selector on the training data and apply it to both splits.

    Parameters
    ----------
    X_train, y_train
        Training features and target; the selector is fitted on these only.
    X_test
        Held-out features, reduced with the selector fitted on the training data.
    score_func : callable, optional
        Scoring function handed to SelectKBest. Defaults to
        mutual_info_regression seeded with ``random_state``.
    k : int or 'all', default 'all'
        Number of top-scoring features to keep; 'all' keeps every feature.
    random_state : int, default 100
        Seed for the default scorer. mutual_info_regression adds random noise
        to continuous variables, so without a seed the scores change between
        runs. Ignored when ``score_func`` is given.

    Returns
    -------
    (X_train_fs, X_test_fs, fs)
        The transformed training and test features and the fitted selector.
    """
    if score_func is None:
        score_func = partial(mutual_info_regression, random_state=random_state)
    fs = SelectKBest(score_func=score_func, k=k)
    fs.fit(X_train, y_train)
    return fs.transform(X_train), fs.transform(X_test), fs

In [None]:
# Score every feature with the selector and list the scores by feature position.
X_train_fs, X_test_fs, fs = select_features(X_train, y_train, X_test)

for i, score in enumerate(fs.scores_):
    print('Feature %d: %f' % (i, score))

# Bar chart of the same scores, one bar per feature position.
pyplot.bar(list(range(len(fs.scores_))), fs.scores_)
pyplot.show()

In [54]:
def select_features(X_train, y_train, X_test, score_func=f_regression, k=30):
    """Keep the k best-scoring features, fitting the selector on the training data only.

    Parameters
    ----------
    X_train, y_train
        Training features and target; the selector is fitted on these only.
    X_test
        Held-out features, reduced with the selector fitted on the training data.
    score_func : callable, default f_regression
        Scoring function handed to SelectKBest.
    k : int or 'all', default 30
        Number of top-scoring features to keep.

    Returns
    -------
    (X_train_fs, X_test_fs, fs)
        The reduced training and test features and the fitted selector.
    """
    fs = SelectKBest(score_func=score_func, k=k)
    fs.fit(X_train, y_train)
    return fs.transform(X_train), fs.transform(X_test), fs

In [None]:
# Reduce both splits to the selected features, then fit and evaluate a linear
# model on that reduced feature set.
X_train_fs, X_test_fs, fs = select_features(X_train, y_train, X_test)

model = LinearRegression().fit(X_train_fs, y_train)
yhat = model.predict(X_test_fs)

# Test-set R^2 of the reduced model.
print(r2_score(y_test, yhat))



In [56]:
from functools import partial


def select_features(X_train, y_train, X_test, score_func=None, k=30,
                    random_state=100):
    """Keep the k features with the highest mutual information, fitting on the training data only.

    Parameters
    ----------
    X_train, y_train
        Training features and target; the selector is fitted on these only.
    X_test
        Held-out features, reduced with the selector fitted on the training data.
    score_func : callable, optional
        Scoring function handed to SelectKBest. Defaults to
        mutual_info_regression seeded with ``random_state``.
    k : int or 'all', default 30
        Number of top-scoring features to keep.
    random_state : int, default 100
        Seed for the default scorer. mutual_info_regression adds random noise
        to continuous variables, so without a seed the set of selected
        features (and the downstream R^2) can change between runs. Ignored
        when ``score_func`` is given.

    Returns
    -------
    (X_train_fs, X_test_fs, fs)
        The reduced training and test features and the fitted selector.
    """
    if score_func is None:
        score_func = partial(mutual_info_regression, random_state=random_state)
    fs = SelectKBest(score_func=score_func, k=k)
    fs.fit(X_train, y_train)
    return fs.transform(X_train), fs.transform(X_test), fs

In [None]:
# Reduce both splits to the selected features, then fit and evaluate a linear
# model on that reduced feature set.
X_train_fs, X_test_fs, fs = select_features(X_train, y_train, X_test)

model = LinearRegression().fit(X_train_fs, y_train)
yhat = model.predict(X_test_fs)

# Test-set R^2 of the reduced model.
print(r2_score(y_test, yhat))