In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from category_encoders import OneHotEncoder # transformer for non numeric value
import seaborn as sns
from sklearn.linear_model import Ridge # Train model
from sklearn.pipeline import make_pipeline # Pipeline
from sklearn.impute import SimpleImputer  # Impute missing values
from sklearn.metrics import mean_absolute_error # Evaluate model
from sklearn.utils.validation import check_is_fitted # check for if data fitted to model
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory



# Import

In [2]:
# Read the raw dataset from ford.csv into a DataFrame.
csv_path = "ford.csv"
df = pd.read_csv(csv_path)

# Quick overview: (rows, columns), then per-column dtypes and non-null counts,
# and finally the first rows as the cell's displayed output.
print(df.shape)
df.info()
df.head()

# Evaluate


In [3]:
# One of our main features is mileage.
# Show its summary statistics to get a first impression of the value range.
df["mileage"].describe()

There is a big difference between the mean and the median mileage: the minimum is as low as 1, while the maximum runs into five digits.  
For good predictions we have to remove these extreme values from both ends by trimming the outliers.

In [4]:
# Remove outliers from the numeric data -- step 1: mileage.
# Start by looking at how the mileage values are distributed.
fig, ax = plt.subplots()
ax.hist(df["mileage"])
ax.set_xlabel("Mileage")
ax.set_ylabel("Frequency")
ax.set_title("Histogram: Mileage");

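Most of the data is concentrated on the left (low-mileage) side of the histogram, with a long tail toward high mileage, i.e. the distribution is right-skewed.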

In [5]:
# Horizontal boxplot of mileage to make potential outliers visible.
fig, ax = plt.subplots()
ax.boxplot(df["mileage"], vert=False)
ax.set_xlabel("Mileage")
ax.set_title("Boxplot: Mileage");

Looking at the boxplot, we can see that there are outliers.

In [6]:
# Trim mileage outliers: keep only the rows whose mileage lies between the
# 5th and the 90th percentile (`between` includes both bounds).
low, high = df["mileage"].quantile([0.05, 0.90])
mask_ma = df["mileage"].between(low, high)
# Take an explicit copy so that `df` is an independent frame rather than a
# slice of the unfiltered data; later column assignments on `df` would
# otherwise trigger pandas' SettingWithCopyWarning.
df = df[mask_ma].copy()

In [7]:
# Horizontal boxplot of mileage for the current contents of `df`.
fig, ax = plt.subplots()
ax.boxplot(df["mileage"], vert=False)
ax.set_xlabel("Mileage")
ax.set_title("Boxplot: Mileage");

In [8]:
# Drop columns with more than half of their values missing.
# First step: count the missing values in every column.
df.isnull().sum()

There are no null values.

In [9]:
# Drop columns with low or high cardinality from the categorical data.
# Year of production is also a categorical feature, so cast it to `object`
# to have it counted together with the other non-numeric columns.
# `assign` returns a new frame instead of writing into `df`, which avoids the
# SettingWithCopyWarning pandas raises when a column is assigned on a frame
# that was produced by filtering another frame.
df = df.assign(year=df["year"].astype("object"))

# Number of distinct values in each categorical (object) column.
df.select_dtypes("object").nunique()


In [10]:
# Remove the "transmission" and "fuelType" columns; they are not used as features.
# Re-binding `df` instead of using `inplace=True` keeps the step chainable and
# avoids mutating a frame that was derived by filtering another one.
df = df.drop(columns=["transmission", "fuelType"])

In [11]:
# Drop columns with multicollinearity.
# Only feature-to-feature correlations are of interest here, so the target
# column ("price") is left out before computing the correlation matrix.
numeric_features = df.select_dtypes("number").drop(columns="price")
corr = numeric_features.corr()
corr

We can get an idea of which columns are multicollinear by looking at the table above, but it is usually easier and more effective to do this with a visualisation.

In [12]:
# Heatmap of the feature correlation matrix `corr`.
# The trailing semicolon keeps the notebook from printing the return value.
sns.heatmap(corr);

Looking at this visualisation, or at the `corr` table above, we can say that there is no multicollinearity among our feature columns.

Done! It looks like we're going to use five features (model, year, mileage, mpg and engineSize) to predict the price.

In [13]:
# Preview the first rows of the frame that will be used for modelling.
df.head()

# Split Data

In [14]:
# Columns used as model inputs and the column to predict.
feature = ["model", "year", "mileage", "mpg", "engineSize"]
target = "price"

X = df[feature]
y = df[target]

# Split data into training (80%) and testing (20%) sets.
# A fixed `random_state` makes the shuffle -- and therefore every metric
# reported below -- reproducible across "Restart & Run All".
X_train, X_test, y_train, y_test = train_test_split(
    X, y, shuffle=True, train_size=0.8, random_state=42
)

# Build Model

## Baseline Model

In [15]:
# Baseline: always predict the mean price of the training set.
y_mean = y_train.mean()
y_pred_baseline = [y_mean] * len(y_train)

# The built-in round() is used instead of the `.round()` method so the cell
# works whether the value is a NumPy scalar or a plain Python float
# (plain floats have no `.round()` method).
mae_baseline = mean_absolute_error(y_train, y_pred_baseline)

print("Mean car price:", round(y_mean, 2))
print("Mean Absolute Error for Baseline Model:", round(mae_baseline, 2))

Our trained model should achieve a lower error than this baseline; only then can we say that its performance is good.

## Iterate

In [16]:
# Pipeline: create each step separately, then chain them in order.
encoder = OneHotEncoder(use_cat_names=True)  # encode the non-numerical values
imputer = SimpleImputer()  # impute missing values (default settings)
regressor = Ridge()  # linear-model predictor
model = make_pipeline(encoder, imputer, regressor)

# Fit the whole pipeline on the training data.
model.fit(X_train, y_train)

In [17]:
# Check that the Ridge step of the pipeline has been fitted;
# check_is_fitted raises an error if it has not and is silent otherwise.
check_is_fitted(model["ridge"])

In [18]:
# Predict for training data.
# The predictions are wrapped in a Series that reuses X_train's index, so every
# predicted value stays aligned with its row in y_train. A default RangeIndex
# would silently misalign any index-based comparison with y_train.
y_pred_train = pd.Series(model.predict(X_train), index=X_train.index)

In [19]:
# Evaluate on the training data.
# Built-in round() works for both NumPy scalars and plain Python floats,
# whereas the `.round()` method only exists on NumPy scalars.
mae_train = mean_absolute_error(y_train, y_pred_train)
print("Mean Absolute Error for training model:", round(mae_train, 2))

The Mean Absolute Error of the trained model is much lower than that of the baseline model, which means our model performs well on the training data.

# Test Model

Let's have a look at the test set and see whether the model generalizes to unseen data.

In [20]:
# Predict for the held-out test data; reuse X_test's index so each prediction
# stays aligned with its row in y_test.
y_pred_test = pd.Series(model.predict(X_test), index=X_test.index)

# Evaluate. Built-in round() works for both NumPy scalars and plain Python
# floats, whereas the `.round()` method only exists on NumPy scalars.
mae_test = mean_absolute_error(y_test, y_pred_test)
print("Mean Absolute Error for Test Model:", round(mae_test, 2))

The Mean Absolute Error on the test data is much lower than that of the baseline model and very similar to the training error, so the model generalizes well.