# Sklearn Workshop 3: Car Price Prediction (Regression)


In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Step 1: ETL

In [34]:
df = pd.read_csv("data/car.csv")

In [35]:
# df.describe()  #no missing values, seems like....
# df.dtypes
# df['selling_price'].value_counts()  #no need to check imbalanced because this is a regression problem
df.head()

#?Note: We don't need the year.....because year can be linearly correlated with price but with no meaning due to inflation anyway
#?      Some missing values in seats?
#?      Wow, so many object.....we need encoding :-(
#?      We don't need name too....because name won't be related to price.....
#?      Oh...we have to fix the mileage and remove kmpl....
#?      We have to remove CC in engine
#?      We have to remove bhp
#?      Based on domain expert, we shall drop torque (ok?)

#selling_price:  our target

#continuous: km_driven, mileage, engine, max_power, torque
#discrete:   seats, fuel, seller_type, transmission, owner, 

#seats can be both continuous or discrete...up to you

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,190Nm@ 2000rpm,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,250Nm@ 1500-2500rpm,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,"12.7@ 2,700(kgm@ rpm)",5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,22.4 kgm at 1750-2750rpm,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,"11.5@ 4,500(kgm@ rpm)",5.0


### 1.1  Fix the columns (extract the meaning from the text)

In [36]:
df['owner'].value_counts()

First Owner             5289
Second Owner            2105
Third Owner              555
Fourth & Above Owner     174
Test Drive Car             5
Name: owner, dtype: int64

In [37]:
df['owner'] = df['owner'].map({'First Owner': 1, 'Second Owner': 2, 'Third Owner': 3, 'Fourth & Above Owner': 4, 'Test Drive Car': 5})

In [38]:
df['owner'].value_counts()

1    5289
2    2105
3     555
4     174
5       5
Name: owner, dtype: int64

In [39]:
df.fuel.value_counts()

Diesel    4402
Petrol    3631
CNG         57
LPG         38
Name: fuel, dtype: int64

In [40]:
#let's delete all rows with CNG and LPG
#because CNG and LPG use a different mileage system, i.e., km/kg which is different from kmpl for Diesel and Petrol
condCNG = df.fuel == 'CNG'
condLPG = df.fuel == 'LPG'
condCNG_index = np.where(condCNG | condLPG)  #retrieve the index
df.drop(condCNG_index[0], axis=0, inplace=True)

In [41]:
df.fuel.value_counts()

Diesel    4402
Petrol    3631
Name: fuel, dtype: int64

In [42]:
df.mileage = df.mileage.str.split(" ").str[0]

In [43]:
df.engine = df.engine.str.split(" ").str[0]

In [44]:
df.max_power = df.max_power.str.split(" ").str[0]

In [45]:
df = df.rename(columns={'name': 'brand'})

In [46]:
df.brand = df.brand.str.split(" ").str[0]  #a scientist choice, no right no wrong....

In [47]:
df = df.drop(columns=['torque'])

In [48]:
df.head()

Unnamed: 0,brand,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats
0,Maruti,2014,450000,145500,Diesel,Individual,Manual,1,23.4,1248,74.0,5.0
1,Skoda,2014,370000,120000,Diesel,Individual,Manual,2,21.14,1498,103.52,5.0
2,Honda,2006,158000,140000,Petrol,Individual,Manual,3,17.7,1497,78.0,5.0
3,Hyundai,2010,225000,127000,Diesel,Individual,Manual,1,23.0,1396,90.0,5.0
4,Maruti,2007,130000,120000,Petrol,Individual,Manual,1,16.1,1298,88.2,5.0


In [49]:
df.mileage = df.mileage.astype('float')
df.engine  =  df.engine.astype('float')
df.max_power = df.max_power.astype('float')

In [50]:
df.dtypes

brand             object
year               int64
selling_price      int64
km_driven          int64
fuel              object
seller_type       object
transmission      object
owner              int64
mileage          float64
engine           float64
max_power        float64
seats            float64
dtype: object

### Step 2: EDA

In [51]:
df.head(1)

Unnamed: 0,brand,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats
0,Maruti,2014,450000,145500,Diesel,Individual,Manual,1,23.4,1248.0,74.0,5.0


In [52]:
dis_col = ['year', 'brand', 'fuel', 'seller_type', 'transmission', 'owner', 'seats']
con_col = ['km_driven', 'mileage', 'engine', 'max_power']

In [53]:
#plot the categorical columns
# for col in dis_col:
#     sns.barplot(x = df[col], y = df['selling_price'])  #since our y is continuous, we put x as our categories, and use bar plot
#     plt.xticks(rotation=90)
#     plt.show()

#? We learn that:  brand, fuel, seller_type, transmission, owner*, seats
#? We have to check the owner, and maybe fix the outliers...........
#? So we deleted the test drive, so now we can use owner as another nice feature, because 1 owner is most expensive....

In [54]:
#we found out there are only five test drive cars...and they are "ridicously expensive"
#also we are not interested in predicting price of test_drive cars
#so let's delete them
cond = df.owner != 5
df = df[cond]

#another way
# cond = df.owner == 5
# index = np.where(cond)
# df = df.drop(index)

In [55]:
df.owner.value_counts()

1    5238
2    2073
3     547
4     170
Name: owner, dtype: int64

In [56]:
# for col in con_col:
#     sns.scatterplot(x=df[col], y=df['selling_price'])  #use scatter plot because both are continuous
#     plt.show()
    
#? We learn that:  year, km_driven, engine, max_power

In [57]:
# plt.figure(figsize=(20, 10))
# sns.heatmap(df.corr(), annot=True)

### Conclusion:

We will use:   **year, km_driven, engine, max_power, brand, fuel, seller_type, transmission, owner, seats**

### Step 3: Splitting

In [58]:
X = df[ ['year', 'km_driven', 'engine', 'max_power', 'brand', 'fuel', 'seller_type', 'transmission', 'owner', 'seats'] ]
y = df['selling_price']

assert X.ndim == 2
assert y.ndim == 1

X.shape, y.shape

((8028, 10), (8028,))

In [59]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=999)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((5619, 10), (2409, 10), (5619,), (2409,))

### Step 4: Preprocessing

##### 4.1 Filling missing values

In [60]:
#double check that no missing values....
# X_train.isna().sum()     #fix engine, max_power, seats
# X_test.isna().sum()      #fix engine, max_power, seats
# y_train.isna().sum()   #no need to fix y_train
# y_test.isna().sum()    #no need to fix y_test

In [61]:
#since engine, max_power are continuous, we check whether to replace with mean() or median()
#for seats, we can replace with ratio or majority.
#note that we replace the testing set USING training statistics.

# sns.displot(X_train.max_power)
# X_train.max_power.mean(), X_train.max_power.median()  

#! median for engine, max_power

In [62]:
X_train['max_power'].fillna(X_train['max_power'].median(), inplace=True)
X_train['engine'].fillna(X_train['engine'].median(), inplace=True)

assert X_train['max_power'].isna().sum() == 0 
assert X_train['engine'].isna().sum() == 0 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['max_power'].fillna(X_train['max_power'].median(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['engine'].fillna(X_train['engine'].median(), inplace=True)


In [63]:
X_test['max_power'].fillna(X_test['max_power'].median(), inplace=True)
X_test['engine'].fillna(X_test['engine'].median(), inplace=True)

assert X_test['max_power'].isna().sum() == 0 
assert X_test['engine'].isna().sum() == 0 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['max_power'].fillna(X_test['max_power'].median(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['engine'].fillna(X_test['engine'].median(), inplace=True)


In [65]:
ratio = X_train.seats.value_counts(normalize=True)

#1. find how many are missing
cond          = X_train.seats.isna()
missing       = X_train.seats.isna().sum()
missing_index = X_train[cond].index

#np.where(cond) gives you the implicit index

#2. define the series
series = pd.Series(np.random.choice(list(ratio.index), p=list(ratio), 
                            size=missing), index = missing_index)

#3. fill the number with this series
X_train.seats.fillna(series, inplace=True)

# print(X_train.seats.value_counts(normalize=True))

X_train.seats.isna().sum()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train.seats.fillna(series, inplace=True)


0

In [66]:
#1. find how many are missing
cond          = X_test.seats.isna()
missing       = X_test.seats.isna().sum()
missing_index = X_test[cond].index

#np.where(cond) gives you the implicit index

#2. define the series
series = pd.Series(np.random.choice(list(ratio.index), p=list(ratio), 
                            size=missing), index = missing_index)

#3. fill the number with this series
X_test.seats.fillna(series, inplace=True)

# print(X_test.seats.value_counts(normalize=True))

X_test.seats.isna().sum()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test.seats.fillna(series, inplace=True)


0

In [67]:
X_train.isna().sum()
X_test.isna().sum()

year            0
km_driven       0
engine          0
max_power       0
brand           0
fuel            0
seller_type     0
transmission    0
owner           0
seats           0
dtype: int64

##### 4.2 Standardization

In [68]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
col_to_scale = ['km_driven', 'engine', 'max_power']

X_train.loc[:, col_to_scale] = sc.fit_transform(X_train.loc[:, col_to_scale]) #we scale all features, because all our features are continuous
X_test.loc[:,  col_to_scale] = sc.transform(X_test.loc[:, col_to_scale])

#we did not transform y_test or y_train

#after standardize, the mean should be zero; the std should be 1
for feature in col_to_scale:
    assert np.isclose(X_train.loc[:, feature].mean(), 0, atol = 0.0001)  #cannot == 0 because is near 0 not 0
    assert np.isclose(X_train.loc[:, feature].std(),  1, atol = 0.0001)
    #we don't need to assert for X_test, because we use training statistics to transform X_test, so it won't be mean0 std1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train.loc[:, col_to_scale] = sc.fit_transform(X_train.loc[:, col_to_scale]) #we scale all features, because all our features are continuous
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test.loc[:,  col_to_scale] = sc.transform(X_test.loc[:, col_to_scale])


##### 4.3 Encoding

In [69]:
X_train.head()

Unnamed: 0,year,km_driven,engine,max_power,brand,fuel,seller_type,transmission,owner,seats
6474,2017,-0.752937,-0.915487,-0.694423,Renault,Petrol,Individual,Manual,1,5.0
112,2013,1.33265,0.25865,1.006721,Hyundai,Diesel,Individual,Manual,2,5.0
5365,2015,0.915533,0.014961,-0.212643,Renault,Diesel,Individual,Manual,1,8.0
1606,2012,0.331569,1.460982,0.82548,Mahindra,Diesel,Individual,Manual,3,8.0
846,2016,-0.83636,-0.514709,-0.264262,Mahindra,Petrol,Individual,Manual,1,6.0


In [70]:
len(X_train.fuel.unique())  #label encode
X_train['fuel'] = X_train['fuel'].map({'Petrol': 0, 'Diesel': 1})
X_test['fuel']  = X_test['fuel'].map({'Petrol': 0, 'Diesel': 1})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['fuel'] = X_train['fuel'].map({'Petrol': 0, 'Diesel': 1})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['fuel']  = X_test['fuel'].map({'Petrol': 0, 'Diesel': 1})


In [71]:
len(X_train.brand.unique())  #one-hot encode (get dummy)
X_train = pd.get_dummies(X_train, columns=['brand'], drop_first=True)
X_test  = pd.get_dummies(X_test, columns=['brand'], drop_first=True)

In [72]:
len(X_train.seller_type.unique())  #one-hot encode (get dummy)
X_train = pd.get_dummies(X_train, columns=['seller_type'], drop_first=True)
X_test  = pd.get_dummies(X_test, columns=['seller_type'], drop_first=True)

In [73]:
len(X_train.transmission.unique())  #label encode
X_train['transmission'] = X_train['transmission'].map({'Manual': 0, 'Automatic': 1})
X_test['transmission'] = X_test['transmission'].map({'Manual': 0, 'Automatic': 1})

In [74]:
X_train.head()

Unnamed: 0,year,km_driven,engine,max_power,fuel,transmission,owner,seats,brand_Audi,brand_BMW,...,brand_Opel,brand_Peugeot,brand_Renault,brand_Skoda,brand_Tata,brand_Toyota,brand_Volkswagen,brand_Volvo,seller_type_Individual,seller_type_Trustmark Dealer
6474,2017,-0.752937,-0.915487,-0.694423,0,0,1,5.0,0,0,...,0,0,1,0,0,0,0,0,1,0
112,2013,1.33265,0.25865,1.006721,1,0,2,5.0,0,0,...,0,0,0,0,0,0,0,0,1,0
5365,2015,0.915533,0.014961,-0.212643,1,0,1,8.0,0,0,...,0,0,1,0,0,0,0,0,1,0
1606,2012,0.331569,1.460982,0.82548,1,0,3,8.0,0,0,...,0,0,0,0,0,0,0,0,1,0
846,2016,-0.83636,-0.514709,-0.264262,0,0,1,6.0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [75]:
X_test.head()

Unnamed: 0,year,km_driven,engine,max_power,fuel,transmission,owner,seats,brand_Audi,brand_BMW,...,brand_Mitsubishi,brand_Nissan,brand_Renault,brand_Skoda,brand_Tata,brand_Toyota,brand_Volkswagen,brand_Volvo,seller_type_Individual,seller_type_Trustmark Dealer
6798,2011,-0.002125,-0.115946,-0.608391,1,0,2,5.0,0,0,...,0,0,0,0,1,0,0,0,1,0
4850,2014,0.665262,-0.414012,-0.070403,1,0,1,7.0,0,0,...,0,0,0,0,0,0,0,0,1,0
4495,2011,0.031244,-0.414012,-0.493681,1,0,1,5.0,0,0,...,0,0,0,0,0,0,0,0,1,0
1706,2014,0.164722,1.460982,1.399028,1,0,2,7.0,0,0,...,0,0,0,0,0,0,0,0,1,0
6889,2013,-0.335819,0.276775,0.862761,0,0,3,5.0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [76]:
stop here

SyntaxError: invalid syntax (<ipython-input-76-a96ba3aab008>, line 1)

### Step 5: Modeling

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

#models
lr, nb, sv, rf, gb = LogisticRegression(), GaussianNB(), SVC(), RandomForestClassifier(), GradientBoostingClassifier()

models = [lr, nb, sv, rf, gb]
names  = ["lr", "nb", "sv", "rf", "gb"]

for idx, model in enumerate(models):
    score = cross_val_score(model, X_train, y_train, n_jobs=-1) #no need cv and scoring because their default is ok in our case
    print(f"{names[idx]} - Mean: {score.mean()}; Std: {score.std()}")

In [None]:
from sklearn.model_selection import GridSearchCV

model = LogisticRegression(random_state=999)  #<----this is the model I choose, after cross validation

param_grid = dict()
param_grid['solver'] = ['newton-cg', 'lbfgs', 'liblinear']  #this is listed in the sklearn website
#add more parameters here
#param_grid[parameter] = list of parameters to search

#refit means it will pick the best model, and fit again, so it means grid is already the best model after this line
grid = GridSearchCV(model, param_grid, refit=True, return_train_score=True)
#scoring = f1, recall, precision, accuracy

#fit the grid, which will basically do cross validation across all combinatiosn, here we only have 3 comb
grid.fit(X_train, y_train)  #remember to use only training set here....

#print the best parameters and accuracy
print(grid.best_params_)
print(grid.best_score_)
# print(grid.cv_results_)   #hide this for aesthetic

#this score is cross-validation score, basically the accuracy/precision/etc on the validation set

#?Note:  our train score is around 0.95, 0.95, 0.82;  the val score is 0.95, 0.96, 0.82
#?       so no overfitting....because they are close
#?Note:  if we have overfitting, we need to check many things, e.g., 
#?       - choose simpler model
#?       - help the model choose better features
#?       - collect more high quality data, and more data....
#?       - or maybe your data has no pattern!! :-) the model is just learning pattern of noises......

### Step 6: Testing

In [None]:
pred_y = grid.predict(X_test)

#if your website needs probability, you can use 
#proba or log_proba are same, log_proba just make the value more scaled....
#?  Note:  some algorithm has no predict_proba() so please check
    #pred_y_prob = grid.predict_proba(X_test)
#or
    #pred_y_logprob = grid.predict_log_proba(X_test)

In [None]:
pred_y

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

print(classification_report(y_test, pred_y))
#? Note:  we can use all four metrics to understand
#? Accuracy:  0.95 - we can use it because our data is balanced
#? Precision: 0.95 - look at this metric if you want to prioritize lowering FP
#? Recall:    0.95 - look at this metirc if you want to prioritize lowering FN
#? F1-score:  0.95 - look at this metric if you want to prioritize lowering both FP and FN

#?macro-avg average all classes
#?weighted-avg average all classes * their fraction of sample sizes

#?if your data is balanced, both avg will be the same (anyway)

In [None]:
cm  = confusion_matrix(y_test, pred_y)
cmp = ConfusionMatrixDisplay(cm, display_labels=[0, 1, 2, 3])

cmp.plot()

#? Note:  0 is often confused as 1
#?        1 is confused as 2
#?        2 is confused as 3
#?  This makes sense because close price range may be easily confused.......i guess (i don't know)

In [None]:
#let's check which sample like to get wrong, especially "0 is often confused as 1"
conda = pred_y == 0
condb = y_test == 1

X_test[conda & condb]  
#so i will leave at that....but if you are the domain expert, you can check here....
 

### Step 7: Feature Importance

In [None]:
grid.best_estimator_.coef_ #(n_classes, n_features)
#n_classes means 0, 1, 2, 3
#if we have only two classes, it will be (1, n_features), because it can 0 or 1

In [None]:
feature_importance = pd.DataFrame(['ram', 'battery_power', 'px_height', 'px_width'], 
                                  columns=['features'])
feature_importance["0"] = grid.best_estimator_.coef_[0]
feature_importance["1"] = grid.best_estimator_.coef_[1]
feature_importance["2"] = grid.best_estimator_.coef_[2]
feature_importance["3"] = grid.best_estimator_.coef_[3]

for _class in ['0', '1', '2', '3']:  
    feature_importance = feature_importance.sort_values(by = [_class], ascending=True)
    feature_importance.plot.barh(x='features', y=_class)
    plt.show()
    
#? What we learn:
#? Class 0:  Ram is most important;  Ram negatively impact the price_range of 0
#? Class 1:  Ram is most important;  Ram negatively impact the price_range of 1
#? Class 2:  Ram is most important;  Ram positively impact the price_range of 2
#? Class 3:  Ram is most important;  Ram positively impact the price_range of 3

#? Overall, all importances point out that ram > battery_power > px_width > px_height

### Step 8: Saving the models

In [None]:
#please help me save the model here
import pickle

# save the model
filename = 'mobile_price.pkl' # pkl does not matter, you can do .everything
pickle.dump(grid, open(filename,'wb'))

# Load the model
loaded_grid=pickle.load(open(filename,'rb'))

# try predict X_test
loaded_grid.predict(X_test)

# if you have new data, then you fit again....but using loaded_grid
# which is a process of training more.....once you have more data....

# or another way is
# put all the dataset together, and train like it is new
    #this is possible ONLY if your dataset is not that big......


### For real world prediction

In [None]:
#for actual use? to predict some future data.....
some_data = np.array([ [1, 2, 3, 4], [2, 3, 4, 5] ])

#standardize
some_data = sc.transform(some_data)

#predict
pred = grid.predict(some_data)

pred #both samples have price_range of 3 (predicted, NOT actual, because we don't have actual)

In [None]:
stock = np.array([ [1, 1.01, 0.99, 0.80, 0.5, 0.9, 1.1, 1.2]])

#window size = 3
#predict the next 2 days

#X_1 = 1, 1.01, 0.99
#y_1 = 0.80, 0.5

#X_2 = 1.01, 0.99, 0.80
#y_2 = 0.5, 0.9

#X_3 = 0.99, 0.80, 0.5
#y_3 = 0.9, 1.1

#so on.....
#this will be your training set!!

# for in range(, , window_size):
#     X = 
#     y = 