In [127]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge

In [3]:
'''Reading the data from the link provided'''

data = pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv')

In [4]:
data.head() 

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11914 entries, 0 to 11913
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Make               11914 non-null  object 
 1   Model              11914 non-null  object 
 2   Year               11914 non-null  int64  
 3   Engine Fuel Type   11911 non-null  object 
 4   Engine HP          11845 non-null  float64
 5   Engine Cylinders   11884 non-null  float64
 6   Transmission Type  11914 non-null  object 
 7   Driven_Wheels      11914 non-null  object 
 8   Number of Doors    11908 non-null  float64
 9   Market Category    8172 non-null   object 
 10  Vehicle Size       11914 non-null  object 
 11  Vehicle Style      11914 non-null  object 
 12  highway MPG        11914 non-null  int64  
 13  city mpg           11914 non-null  int64  
 14  Popularity         11914 non-null  int64  
 15  MSRP               11914 non-null  int64  
dtypes: float64(3), int64(5

In [6]:
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Year,11914.0,2010.384338,7.57974,1990.0,2007.0,2015.0,2016.0,2017.0
Engine HP,11845.0,249.38607,109.19187,55.0,170.0,227.0,300.0,1001.0
Engine Cylinders,11884.0,5.628829,1.780559,0.0,4.0,6.0,6.0,16.0
Number of Doors,11908.0,3.436093,0.881315,2.0,2.0,4.0,4.0,4.0
highway MPG,11914.0,26.637485,8.863001,12.0,22.0,26.0,30.0,354.0
city mpg,11914.0,19.733255,8.987798,7.0,16.0,18.0,22.0,137.0
Popularity,11914.0,1554.911197,1441.855347,2.0,549.0,1385.0,2009.0,5657.0
MSRP,11914.0,40594.737032,60109.103604,2000.0,21000.0,29995.0,42231.25,2065902.0


In [7]:
data.isna().sum()

Make                    0
Model                   0
Year                    0
Engine Fuel Type        3
Engine HP              69
Engine Cylinders       30
Transmission Type       0
Driven_Wheels           0
Number of Doors         6
Market Category      3742
Vehicle Size            0
Vehicle Style           0
highway MPG             0
city mpg                0
Popularity              0
MSRP                    0
dtype: int64

In [10]:
## Features to be used Make,Model,Year,Engine HP,Engine Cylinders,Transmission Type,Vehicle Style,highway MPG,city mpg,MSRP

data = data[['Make','Model','Year','Engine HP','Engine Cylinders','Transmission Type','Vehicle Style','highway MPG','city mpg','MSRP']]

'''Transforming the column names to lower case and replacing the space with _'''

data.columns = data.columns.str.lower().str.replace(' ','_')

In [11]:
data

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,msrp
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500
...,...,...,...,...,...,...,...,...,...,...
11909,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,46120
11910,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,56670
11911,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,50620
11912,Acura,ZDX,2013,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,50920


In [12]:
data.isna().sum()

make                  0
model                 0
year                  0
engine_hp            69
engine_cylinders     30
transmission_type     0
vehicle_style         0
highway_mpg           0
city_mpg              0
msrp                  0
dtype: int64

In [13]:
'''Filling the null values with 0'''

data.fillna(0,inplace=True)

In [14]:
data.isna().sum()

make                 0
model                0
year                 0
engine_hp            0
engine_cylinders     0
transmission_type    0
vehicle_style        0
highway_mpg          0
city_mpg             0
msrp                 0
dtype: int64

In [15]:
'''Renaming the column name MSRP to price'''

data.rename(columns={'msrp':'price'},inplace=True)

In [16]:
data.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500


In [17]:
data.transmission_type.value_counts()

AUTOMATIC           8266
MANUAL              2935
AUTOMATED_MANUAL     626
DIRECT_DRIVE          68
UNKNOWN               19
Name: transmission_type, dtype: int64

In [18]:
data.transmission_type.mode()

0    AUTOMATIC
Name: transmission_type, dtype: object

## Answer 1: AUTOMATIC with 8266 entries

In [19]:
data.dtypes

make                  object
model                 object
year                   int64
engine_hp            float64
engine_cylinders     float64
transmission_type     object
vehicle_style         object
highway_mpg            int64
city_mpg               int64
price                  int64
dtype: object

In [22]:
'''Correlation matrix between the numerical features'''

data.corr().round(3)

Unnamed: 0,year,engine_hp,engine_cylinders,highway_mpg,city_mpg,price
year,1.0,0.339,-0.041,0.258,0.198,0.228
engine_hp,0.339,1.0,0.775,-0.416,-0.425,0.65
engine_cylinders,-0.041,0.775,1.0,-0.615,-0.587,0.526
highway_mpg,0.258,-0.416,-0.615,1.0,0.887,-0.16
city_mpg,0.198,-0.425,-0.587,0.887,1.0,-0.158
price,0.228,0.65,0.526,-0.16,-0.158,1.0


## Answer 2: city_mpg & highway_mpg has highest correlation with 0.887

In [72]:
data['above_average'] = (data.price >= data.price.mean()).astype('int')

### Converting above_average to categorical variable

data.above_average = data.above_average.astype('category')

## Dropping the price column

data.drop('price',axis=1,inplace=True)

In [73]:
data.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,above_average
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,1
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,1
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,0
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,0
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,0


In [74]:
data.above_average.value_counts()

0    8645
1    3269
Name: above_average, dtype: int64

In [75]:
data.dtypes

make                   object
model                  object
year                    int64
engine_hp             float64
engine_cylinders      float64
transmission_type      object
vehicle_style          object
highway_mpg             int64
city_mpg                int64
above_average        category
dtype: object

In [76]:
### Splitting the data into train, val and test sets with 60%, 20% and 20% respectively

df_full_train,df_test = train_test_split(data,test_size=0.2,random_state=42)

df_train,df_val = train_test_split(df_full_train,test_size=0.25,random_state=42)

In [77]:
len(df_train), len(df_val), len(df_test)

(7148, 2383, 2383)

In [78]:
y_train = df_train.above_average.values
y_val = df_val.above_average.values
y_test = df_test.above_average.values

del df_train['above_average']
del df_val['above_average']
del df_test['above_average']

In [82]:
### Extracting categotical and numerical features names and storing them in separate lists

categorical = ['make','model','transmission_type','vehicle_style']
numerical = ['year','engine_hp','highway_mpg','city_mpg','engine_cylinders']

In [83]:
### Using the df_train set only calculating the mutual information score between above_average and other categorical variables


def mutual_info_churn_score(series):
    return mutual_info_score(series,y_train)

mi = df_train[categorical].apply(mutual_info_churn_score)
mi.sort_values(ascending=False).round(2)


model                0.46
make                 0.24
vehicle_style        0.08
transmission_type    0.02
dtype: float64

## Answer 3: transmission_type has the weakest mutual information score with above_average (0.02) meaning it tells very less about the predictor and vice versa

In [84]:
### One hot encoding the categorical variables

train_dicts = df_train[categorical + numerical].to_dict(orient='records')

dv = DictVectorizer(sparse=False)

X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[categorical + numerical].to_dict(orient='records')

X_val = dv.transform(val_dicts)

In [85]:
## Fitting a logistic regression model using the training data

model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)

model.fit(X_train,y_train)

## Calculating the accuracy score on the validation set

y_pred = model.predict_proba(X_val)[:,1]

above_average_decision = (y_pred >= 0.5)

accuracy = (y_val == above_average_decision).mean()

In [86]:
print(f'Accuracy score on the validation set is {accuracy}')

Accuracy score on the validation set is 0.9345362987830466


## The accuracy on the validation set was 0.934, closest to 0.95 in the options

In [105]:
### Let's find the least useful feature using the feature elimination technique. We will remove one feature at a time and calculate the accuracy score on the validation set. Train a model with all these features (using the same parameters as in Q4).
### Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
### For each feature, calculate the difference between the original accuracy and the accuracy without the feature.

feature_list = list(df_train.columns)

def train(df_train,y_train, feature_list=feature_list):
    dicts = df_train[feature_list].to_dict(orient='records')
    
    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(dicts)
    
    model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
    model.fit(X_train,y_train)
    
    return dv,model

def predict(df,dv,model,feature_list=feature_list):
    dicts = df[feature_list].to_dict(orient='records')
    
    X = dv.transform(dicts)
    y_pred = model.predict_proba(X)[:,1]
    
    return y_pred

dv,model = train(df_train,y_train, feature_list=feature_list)
y_pred = predict(df_val,dv,model, feature_list=feature_list)

original_accuracy = (y_val == (y_pred >= 0.5)).mean()

print(f'Original accuracy is {original_accuracy}')

diff = []

for feature in feature_list:
    df_train_copy = df_train.copy()
    del df_train_copy[feature]

    feature_sub_list = list(df_train_copy.columns)
    
    dv,model = train(df_train_copy,y_train, feature_sub_list)
    y_pred = predict(df_val,dv,model,feature_sub_list)
    
    accuracy = (y_val == (y_pred >= 0.5)).mean()
    diff.append(original_accuracy - accuracy)

diff = abs(pd.Series(diff,index=feature_list))

diff.sort_values(ascending=False)

Original accuracy is 0.9345362987830466


year                 0.012170
engine_cylinders     0.011750
city_mpg             0.011750
make                 0.011330
engine_hp            0.010911
transmission_type    0.010491
model                0.010491
highway_mpg          0.006714
vehicle_style        0.006295
dtype: float64

## Answer 5: As we can see from the output above, transmission_type has the smallest differnece with 0.010491

In [109]:
### Prepping the initial data set for linear regression and fitting ridge regression model

data_init = pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv')

In [110]:
data_init

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11909,Acura,ZDX,2012,premium unleaded (required),300.0,6.0,AUTOMATIC,all wheel drive,4.0,"Crossover,Hatchback,Luxury",Midsize,4dr Hatchback,23,16,204,46120
11910,Acura,ZDX,2012,premium unleaded (required),300.0,6.0,AUTOMATIC,all wheel drive,4.0,"Crossover,Hatchback,Luxury",Midsize,4dr Hatchback,23,16,204,56670
11911,Acura,ZDX,2012,premium unleaded (required),300.0,6.0,AUTOMATIC,all wheel drive,4.0,"Crossover,Hatchback,Luxury",Midsize,4dr Hatchback,23,16,204,50620
11912,Acura,ZDX,2013,premium unleaded (recommended),300.0,6.0,AUTOMATIC,all wheel drive,4.0,"Crossover,Hatchback,Luxury",Midsize,4dr Hatchback,23,16,204,50920


In [111]:
## Features to be used Make,Model,Year,Engine HP,Engine Cylinders,Transmission Type,Vehicle Style,highway MPG,city mpg,MSRP

data_init = data_init[['Make','Model','Year','Engine HP','Engine Cylinders','Transmission Type','Vehicle Style','highway MPG','city mpg','MSRP']]

'''Transforming the column names to lower case and replacing the space with _'''

data_init.columns = data_init.columns.str.lower().str.replace(' ','_')

In [112]:
data_init.isna().sum()

make                  0
model                 0
year                  0
engine_hp            69
engine_cylinders     30
transmission_type     0
vehicle_style         0
highway_mpg           0
city_mpg              0
msrp                  0
dtype: int64

In [113]:
'''Filling the null values with 0'''

data_init.fillna(0,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_init.fillna(0,inplace=True)


In [115]:
'''Renaming the column name MSRP to price'''

data_init.rename(columns={'msrp':'price'},inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_init.rename(columns={'msrp':'price'},inplace=True)


In [116]:
data_init

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500
...,...,...,...,...,...,...,...,...,...,...
11909,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,46120
11910,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,56670
11911,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,50620
11912,Acura,ZDX,2013,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,50920


In [117]:
## Splitting the data into train, val and test sets with 60%, 20% and 20% respectively

df_full_train_ridge,df_test_ridge = train_test_split(data_init,test_size=0.2,random_state=42)

df_train_ridge,df_val_ridge = train_test_split(df_full_train_ridge,test_size=0.25,random_state=42)

len(df_train_ridge), len(df_val_ridge), len(df_test_ridge)


df_train_ridge['price'] = np.log1p(df_train_ridge['price'])
df_val_ridge['price'] = np.log1p(df_val_ridge['price'])
df_test_ridge['price'] = np.log1p(df_test_ridge['price'])

y_train = df_train_ridge.price.values
y_val = df_val_ridge.price.values
y_test = df_test_ridge.price.values

del df_train_ridge['price']
del df_val_ridge['price']
del df_test_ridge['price']

In [121]:
feature_list_ridge = df_train_ridge.columns

In [122]:
### One hot encoding the categorical variables

train_dicts_ridge = df_train_ridge[feature_list_ridge].to_dict(orient='records')

dv = DictVectorizer(sparse=False)

X_train = dv.fit_transform(train_dicts_ridge)

val_dicts_ridge = df_val_ridge[feature_list_ridge].to_dict(orient='records')

X_val = dv.transform(val_dicts_ridge)

In [126]:
X_train.shape

(7148, 943)

In [129]:
## Fit the Ridge regression model on the training data with a solver 'sag'. Set the seed to 42. try the following values: [0, 0.01, 0.1, 1, 10] for the parameter alpha and calculate the RMSE on the validation set for each of the alpha values.

alphas = [0, 0.01, 0.1, 1, 10]

for alpha in alphas:
    model = Ridge(alpha=alpha, solver='sag', random_state=42)
    model.fit(X_train,y_train)
    
    y_pred = model.predict(X_val)
    
    rmse = np.sqrt(((y_val - y_pred)**2).mean())
    
    print(f'RMSE for alpha {alpha} is {rmse}')



RMSE for alpha 0 is 0.4867943132423886




RMSE for alpha 0.01 is 0.48679455192752674




RMSE for alpha 0.1 is 0.48679670001899733




RMSE for alpha 1 is 0.48681817454327364
RMSE for alpha 10 is 0.4870322832975124




## Alpha of 0 leads to the best RMSE, they are similar but 0 is the smallest thus I chose this

#### FIN ####