In [None]:
# Load the libraries used throughout the notebook and configure display options.
import pandas as pd
# Raise pandas' display limits so long/wide frames are shown with up to 500 rows
# and 500 columns in cell output.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
# Suppresses every warning for the rest of the session (deprecation warnings included).
warnings.filterwarnings('ignore')
import plotly
import plotly.offline as py
from plotly.offline import iplot, init_notebook_mode
# First of two identical initialisation calls: `py` below is an alias of plotly.offline.
plotly.offline.init_notebook_mode(connected=True) 
import plotly.graph_objs as go
import plotly.express as px
py.init_notebook_mode(connected = True) # Required to use plotly offline in jupyter notebook
from scipy.stats import skew
from scipy import stats
# NOTE(review): `scipy.stats.stats` looks like a deprecated module path in recent SciPy;
# pearsonr is presumably also importable from `scipy.stats` directly - confirm and switch.
from scipy.stats.stats import pearsonr

from scipy.stats import norm

In [None]:
# Load the used-car listings from disk; the CSV is decoded as latin-1.
df = pd.read_csv(
    'used_cars.csv',
    encoding='latin-1',
)
df.head()

In [None]:
# Remove the 'vin' column from the working frame.
df = df.drop(columns=['vin'])

In [None]:
# Replace missing values in the descriptive columns with an explicit 'unknown' level.
# The fill is assigned back to the frame instead of using a chained
# `df[col].fillna(..., inplace=True)`, which is not guaranteed to modify `df`
# under pandas Copy-on-Write. 'color' now gets the same 'unknown' spelling as
# every other column (it was previously filled with the typo 'unkown').
unknown_fill_cols = [
    'color', 'condition', 'cylinders', 'transmission', 'type',
    'size', 'title', 'make', 'drive',
]
df[unknown_fill_cols] = df[unknown_fill_cols].fillna('unknown')

# A missing price becomes 0 so the column can be stored as integers.
df['price'] = df['price'].fillna(0).astype(int)

In [None]:
# Keep listings priced between 1,000 and 200,000 (inclusive) whose odometer
# reading is above 10,000; each step narrows the previous frame.
new_df = df.loc[df['price'].ge(1000)]
price_df = new_df.loc[new_df['price'].le(200000)]
rec_df = price_df.loc[price_df['odometer'].gt(10000)]
rec_df.shape

In [None]:
# Summary statistics of the odometer column for the filtered listings.
rec_df['odometer'].describe()

In [None]:
# Count and percentage of missing values per column, limited to columns that
# contain at least one NaN and shown with the highest percentage first.
n_rows = len(rec_df)
missing = rec_df.isna().sum()
missing = missing.loc[missing > 0]
missing_percent = missing / n_rows * 100
df_missing = pd.DataFrame(
    [missing, missing_percent],
    index=['total', 'missing percent'],
).T
df_missing.sort_values(by='missing percent', ascending=False)

In [None]:
# Discard every row that still has a missing value in any column.
rec_df = rec_df.dropna(axis=0, how='any')
rec_df.shape

In [None]:
# Inspect the column dtypes of the frame after rows with missing values were dropped.
rec_df.dtypes

In [None]:
# Store the odometer reading as integers and hold the year as a generic object column.
rec_df = rec_df.astype({'odometer': int, 'year': object})

In [None]:
# Sub-frame made of the object-dtype columns only.
categori = ['object']
categorical = rec_df.select_dtypes(
    include=categori,
)

In [None]:
# Preview the first rows of the cleaned frame.
rec_df.head()

In [None]:
def _lower_if_str(cell):
    """Return ``cell`` lower-cased when it is a string, otherwise return it unchanged."""
    return cell.lower() if isinstance(cell, str) else cell


# Lower-case every string cell so labels that differ only by letter case end up equal.
# DataFrame.applymap is deprecated in favour of DataFrame.map (pandas 2.1+); use
# whichever element-wise method the installed pandas provides.
_elementwise = rec_df.map if hasattr(pd.DataFrame, 'map') else rec_df.applymap
rec_df = _elementwise(_lower_if_str)

In [None]:
#histogram of SalePrice to see the distribution 
fig, (ax1, ax2) = plt.subplots(1,2,figsize=(14,4))
sns.distplot(rec_df['price'], ax = ax1)
ax1.set_ylabel('Frequency')
ax1.set_title('Car Price Distribution')
#QQ-plot
stats.probplot(rec_df['price'], plot=plt)
plt.show()

In [None]:
# Distribution of the log-transformed price, compared with a fitted normal.
pricey_log = np.log1p(rec_df['price'])
(mu, sigma) = norm.fit(pricey_log)
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 4))
sns.distplot(pricey_log, ax=ax1)
# Draw the fitted normal density explicitly so the legend entry describes a real
# curve, and attach the legend to the histogram axes rather than the (still empty)
# right-hand axes that plt.legend() would target.
density_grid = np.linspace(pricey_log.min(), pricey_log.max(), 200)
ax1.plot(
    density_grid,
    norm.pdf(density_grid, mu, sigma),
    color='black',
    # Raw string: '\m' and '\s' are not valid escape sequences in a normal literal.
    label=r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma),
)
ax1.legend(loc='best')
ax1.set_ylabel('Frequency')
ax1.set_title('Log Car Price Distribution')
# Q-Q plot of the log price on the right-hand axes.
stats.probplot(pricey_log, plot=ax2)
plt.show()

In [None]:
# Add natural-log versions of the price and of the odometer reading (offset by 10).
rec_df = rec_df.assign(
    price_log=np.log(rec_df['price']),
    odometer_log=np.log(rec_df['odometer'] + 10),
)

In [None]:
#histogram of the car mileage to see the distribution 
fig, (ax1, ax2) = plt.subplots(1,2,figsize=(14,4))
sns.distplot(rec_df['odometer'], ax = ax1)
ax1.set_xlabel('Mileage')
ax1.set_ylabel('Frequency')
ax1.set_title('Car Mileage Distribution')
#QQ-plot
stats.probplot(rec_df['odometer'], plot=plt)
plt.show()

In [None]:
# Distribution of the log-transformed odometer reading, compared with a fitted normal.
mileage_log = rec_df['odometer_log']
(mu, sigma) = norm.fit(mileage_log)
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 4))
sns.distplot(mileage_log, ax=ax1)
# Draw the fitted normal density explicitly so the legend entry describes a real
# curve, and attach the legend to the histogram axes rather than the (still empty)
# right-hand axes that plt.legend() would target.
density_grid = np.linspace(mileage_log.min(), mileage_log.max(), 200)
ax1.plot(
    density_grid,
    norm.pdf(density_grid, mu, sigma),
    color='black',
    # Raw string: '\m' and '\s' are not valid escape sequences in a normal literal.
    label=r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma),
)
ax1.legend(loc='best')
ax1.set_xlabel('Mileage')
ax1.set_ylabel('Frequency')
ax1.set_title('Log Car Mileage Distribution')
# Q-Q plot of the log mileage on the right-hand axes.
stats.probplot(mileage_log, plot=ax2)
plt.show()

In [None]:
# Inspect the column dtypes again, now that the log columns have been added.
rec_df.dtypes

In [None]:
# Data Exploration

In [None]:
# How expensive are the cars?
# Headline figures (age relative to 2020, mileage, price range and centre),
# followed by a histogram of the price.
prices = rec_df.price

age_line = 'The average age of the cars is {:,.0f} years old.'
print(age_line.format(2020 - rec_df.year.mean()))

mileage_line = 'The average mileage of the cars is {:,.0f} miles.'
print(mileage_line.format(rec_df.odometer.mean()))

range_line = 'The cheapest car is for ${:,.0f} dollars and the most expensive is for ${:,.0f}.'
print(range_line.format(prices.min(), prices.max()))

centre_line = 'The average car price is ${:,.0f}, while median is ${:,.0f}.'
print(centre_line.format(prices.mean(), prices.median()))

print('-' * 30)
prices.hist(bins=75, rwidth=.8, figsize=(14, 4))
plt.title('How expensive are the cars?')
plt.show()

In [None]:
# Object-dtype columns of rec_df, without 'make' and 'manufacturer'.
categori = ['object']
categorical = (
    rec_df
    .select_dtypes(include=categori)
    .drop(columns=['make', 'manufacturer'])
)

In [None]:
# One count plot per categorical column on a 3-column grid.
# The row count is derived from the column count of the grid (ceil(n / 3)); the
# previous round(n / 4) could create fewer axes than there are columns, silently
# leaving some columns unplotted.
n_grid_cols = 3
n_grid_rows = max(1, int(np.ceil(len(categorical.columns) / n_grid_cols)))
fig, axes = plt.subplots(n_grid_rows, n_grid_cols, figsize=(20, 15), squeeze=False)

for i, ax in enumerate(axes.ravel()):
    if i < len(categorical.columns):
        sns.countplot(x=categorical.columns[i], alpha=0.7, data=categorical, ax=ax,palette='Blues_d')
        # Rotate after drawing so the rotation applies to the category labels.
        ax.tick_params(axis='x', labelrotation=45)
    else:
        # Grid cells without a column to show are hidden instead of left empty.
        ax.set_visible(False)

fig.tight_layout()

In [None]:
# Ten most frequent values of 'manufacturer' and of 'make', side by side.
fig, (ax_manufacturer, ax_make) = plt.subplots(1, 2, figsize=(15, 5))

# value_counts() already sorts by descending frequency, so head(10) is the top ten;
# the extra sort_values() before nlargest(10) was redundant.
rec_df['manufacturer'].value_counts().head(10).plot(kind='bar', ax=ax_manufacturer)
# Title now names the column that is actually plotted (it previously read 'model').
ax_manufacturer.set_title('manufacturer')

rec_df['make'].value_counts().head(10).plot(kind='bar', ax=ax_make)
ax_make.set_title('make')

# Apply the same vertical tick labels to both panels.
for ax in (ax_manufacturer, ax_make):
    ax.tick_params(axis='x', labelrotation=90)

In [None]:
# Preview the first rows of rec_df before the box plots.
rec_df.head()

In [None]:
# Box plots of price against each categorical feature, one facet per feature.
# The frame is melted to long form: one row per (price, variable, value).
f = pd.melt(rec_df, id_vars=['price'], value_vars=sorted(categorical.columns))
# FacetGrid's `size` argument was renamed to `height`; current seaborn releases
# no longer accept `size`.
g = sns.FacetGrid(f, col='variable', col_wrap=3, sharex=False, sharey=False, height=4)
g = g.map(sns.boxplot, 'value', 'price')
# Vertical tick labels keep long category names readable.
for ax in g.axes.flat:
    plt.setp(ax.get_xticklabels(), rotation=90)
g.fig.tight_layout()
plt.show()

In [None]:
# (Re)compute the natural-log price and the natural-log of odometer + 10.
log_price = np.log(rec_df['price'])
log_mileage = np.log(rec_df['odometer'] + 10)
rec_df['price_log'] = log_price
rec_df['odometer_log'] = log_mileage

In [None]:
# Preview the first rows, including the price_log and odometer_log columns.
rec_df.head()

In [None]:
# Histograms of the two log-scale columns, side by side.
fig, (ax_price, ax_odometer) = plt.subplots(1, 2, figsize=(15, 4))
rec_df['price_log'].hist(ax=ax_price)
ax_price.set_title('price')
rec_df['odometer_log'].hist(ax=ax_odometer)
ax_odometer.set_title('odometer')
plt.show()

In [None]:
# Inspect the column dtypes right before the frame is one-hot encoded.
rec_df.dtypes

In [None]:
# One-hot encode rec_df with pandas' default column selection, dropping the
# first level of every encoded column, then list the resulting column names.
df_final = pd.get_dummies(
    data=rec_df,
    drop_first=True,
)
print(df_final.columns)

In [None]:
# Preview the first rows of the one-hot encoded frame.
df_final.head()

In [None]:
# 'price' is deliberately kept in df_final: the feature-matrix cell drops it from X
# itself and the evaluation cell reads df_final['price'].
# The earlier call df_final.drop(['price','']) looked both labels up on the row
# index (axis defaults to 0) and raised KeyError, stopping a Run-All at this cell.
df_final.shape

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error as mae

# Target: the log price. Features: every other column except the raw price and the
# raw odometer reading, standardised column by column.
y = df_final['price_log']
feature_frame = df_final.drop(columns=['price', 'price_log', 'odometer'])
scaler = StandardScaler()
# NOTE(review): the scaler is fitted on all rows, before the train/test split.
X = scaler.fit_transform(feature_frame)

In [None]:
# Hold out 25% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [None]:
# Fit a random forest (seeded, otherwise default settings) on the training split
# and predict the held-out rows.
rf_model = RandomForestRegressor(random_state=1).fit(X_train, y_train)
pred = rf_model.predict(X_test)

In [None]:
# y_test and pred are natural-log prices (the target is df_final['price_log']), so
# the plain MAE is in log units and cannot be compared with a mean price in dollars.
# Report it, then map both sides back with exp() to get an error in dollars that is
# comparable with the mean price.
print('MAE (log price): {:.4f}'.format(mae(y_test, pred)))
print('MAE (dollars): {:,.0f}'.format(mae(np.exp(y_test), np.exp(pred))))
print('Mean price (dollars): {:,.0f}'.format(df_final['price'].mean()))

In [None]:
# Score of the fitted forest on the held-out split (scikit-learn regressors report R^2 here).
rf_model.score(X_test,y_test)