## Importing libraries

In [33]:
import joblib
import numpy as np
import pandas as pd


from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.metrics import r2_score,make_scorer


## Data Ingestion

In [5]:
df = pd.read_csv('car-details.csv',usecols=["company",
                                            "year",
                                            "owner",
                                            "fuel",
                                            "km_driven",
                                            "mileage_mpg",
                                            "engine_cc",
                                            "seats",
                                            "selling_price"])
df.head(3)

  return method()
  return method()


Unnamed: 0,company,year,owner,fuel,km_driven,mileage_mpg,engine_cc,seats,selling_price
0,Maruti,2014,First,Diesel,145500,55.0,1248.0,5.0,450000
1,Skoda,2014,Second,Diesel,120000,49.7,1498.0,5.0,370000
2,Honda,2006,Third,Petrol,140000,41.6,1497.0,5.0,158000


## Analyzing The Data

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6926 entries, 0 to 6925
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   company        6926 non-null   object 
 1   year           6926 non-null   int64  
 2   owner          6926 non-null   object 
 3   fuel           6926 non-null   object 
 4   km_driven      6926 non-null   int64  
 5   mileage_mpg    6718 non-null   float64
 6   engine_cc      6718 non-null   float64
 7   seats          6718 non-null   float64
 8   selling_price  6926 non-null   int64  
dtypes: float64(3), int64(3), object(3)
memory usage: 487.1+ KB


In [15]:
df.isnull().sum()

company            0
year               0
owner              0
fuel               0
km_driven          0
mileage_mpg      208
engine_cc        208
seats            208
selling_price      0
dtype: int64

## Splitting the data

In [16]:
X = df.drop(columns=['selling_price'])
y = df['selling_price'].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=28,shuffle = True)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(5194, 8)
(1732, 8)
(5194,)
(1732,)


## Data preprocessing

In [17]:
num_cols = X_train.select_dtypes(exclude = ['object']).columns.to_list()
cat_cols = [col for col in X_train.columns if col not in num_cols]

print(num_cols)
print(cat_cols)

['year', 'km_driven', 'mileage_mpg', 'engine_cc', 'seats']
['company', 'owner', 'fuel']


## Create Preprocessing pipelines

In [18]:
num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')

scaler = StandardScaler()
ohe = OneHotEncoder(handle_unknown='ignore')

num_pipe = Pipeline([('num_imputer',num_imputer),
                     ('scaler',scaler)])

cat_pipe = Pipeline([('cat_imputer',cat_imputer),
                    ('ohe',ohe)])

preprocessor = ColumnTransformer([('num_pipe',num_pipe,num_cols),
                                 ('cat_pipe',cat_pipe,cat_cols)])



## Model Training

In [29]:
regressor = RandomForestRegressor(random_state=28,max_depth=5)
model = Pipeline(steps=[('preprocessor',preprocessor),
                       ('regressor',regressor)])

model.fit(X_train,y_train)

## Evaluate Model Based on Adj. R2 score

In [30]:
def adj_r2(y_true,y_pred,**kwargs):
    n,p = kwargs['data'].shape   # here it fetches the value associated with keyword 'data'
    r2 = r2_score(y_true,y_pred)
    
    adj_r2 = 1-((1-r2)*(n-1)/(n-p-1))
    
    return adj_r2

In [31]:
train_r2 = adj_r2(y_train,model.predict(X_train),data = X_train)
test_r2 = adj_r2(y_test,model.predict(X_test),data = X_test)

print(train_r2)
print(test_r2)

0.8286446678869135
0.7917116004035043


So, we see that the model is not overfitting when max_depth is limited to 5.

## Create custom scorer compatible with sklearn

In [32]:
adj_r2_scorer = make_scorer(adj_r2,response_method= 'predict',greater_is_better=True,data = X_train)

## Cross validation Scoring of our model

In [40]:
cv_scores = cross_val_score(model,
                            X_train,
                            y_train,
                            scoring=adj_r2_scorer,
                            cv = 4,
                            n_jobs= -1)

In [41]:
np.mean(cv_scores)

0.7013156154556666

In [42]:
cv_scores

array([0.68079097, 0.59394098, 0.79617879, 0.73435172])

## Saving a joblib model

In [48]:
joblib.dump(model,'carprice_model.joblib')

['carprice_model.joblib']

## Loading and Making predictions from model file

In [49]:
loaded_model = joblib.load('carprice_model.joblib')
loaded_model

In [50]:
df.sample(3)

  return method()


Unnamed: 0,company,year,owner,fuel,km_driven,mileage_mpg,engine_cc,seats,selling_price
2565,Renault,2016,First,Diesel,120000,46.7,1461.0,5.0,645000
5072,Hyundai,2014,First,Petrol,28800,43.72,1197.0,5.0,484999
4806,Honda,2006,Third,Petrol,178500,41.6,1497.0,5.0,200000


In [51]:
X_new = pd.DataFrame(data = dict(
                                company = ['Honda'],
                                year = [2006],
                                owner = ['Third'],
                                fuel = ['Petrol'],
                                km_driven = [178500],
                                mileage_mpg = [41.60],
                                engine_cc = [1497.0],
                                seats = [5.0]))
X_new

  return method()
  return method()


Unnamed: 0,company,year,owner,fuel,km_driven,mileage_mpg,engine_cc,seats
0,Honda,2006,Third,Petrol,178500,41.6,1497.0,5.0


In [54]:
predicted_price = loaded_model.predict(X_new)[0]

In [55]:
predicted_price

136616.2468518013

In [64]:
X_new = pd.DataFrame(data = dict(
                                company = ['Honda'],
                                year = [2008],
                                owner = ['First'],
                                fuel = ['Petrol'],
                                km_driven = [1000],
                                mileage_mpg = [41.60],
                                engine_cc = [1497.0],
                                seats = [5.0]))

predicted_price = loaded_model.predict(X_new)[0]
predicted_price

159594.60390843457

In [67]:
print('Car of given specification will cost Rs.{}'.format(np.round(predicted_price,-3)))

Car of given specification will cost Rs.160000.0
