##### Data Preparation #####

In [19]:
# import librarires

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [20]:
# load the data

path = 'https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv'

data = pd.read_csv(path)

data = data[['Make','Model','Year','Engine HP','Engine Cylinders','Transmission Type','Vehicle Style','highway MPG','city mpg', 'MSRP']]

In [21]:
data.columns = data.columns.str.lower().str.replace(' ','_')

# change the values of all columns to lower case and replace spaces with underscores

for col in data.columns:
    if data[col].dtype == 'object':
        data[col] = data[col].str.lower().str.replace(' ','_')

data

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,msrp
0,bmw,1_series_m,2011,335.0,6.0,manual,coupe,26,19,46135
1,bmw,1_series,2011,300.0,6.0,manual,convertible,28,19,40650
2,bmw,1_series,2011,300.0,6.0,manual,coupe,28,20,36350
3,bmw,1_series,2011,230.0,6.0,manual,coupe,28,18,29450
4,bmw,1_series,2011,230.0,6.0,manual,convertible,28,18,34500
...,...,...,...,...,...,...,...,...,...,...
11909,acura,zdx,2012,300.0,6.0,automatic,4dr_hatchback,23,16,46120
11910,acura,zdx,2012,300.0,6.0,automatic,4dr_hatchback,23,16,56670
11911,acura,zdx,2012,300.0,6.0,automatic,4dr_hatchback,23,16,50620
11912,acura,zdx,2013,300.0,6.0,automatic,4dr_hatchback,23,16,50920


In [22]:
# fill all missing values with 0

for c in data.columns:
    data[c] = data[c].fillna(0)


# rename the column MSRP to price

data = data.rename(columns={'msrp':'price'})

In [23]:
numerical = ['year','engine_hp','engine_cylinders','highway_mpg','city_mpg', 'price']
numerical_no_price = ['year','engine_hp','engine_cylinders','highway_mpg','city_mpg']
categorical = ['make','model','transmission_type','vehicle_style']
target = 'price'

##### Question 1: What is the most frequent observation (mode) for the column transmission_type? #####

In [24]:
round(data['transmission_type'].value_counts(normalize=True)*100,2) 

transmission_type
automatic           69.38
manual              24.63
automated_manual     5.25
direct_drive         0.57
unknown              0.16
Name: proportion, dtype: float64

##### Question 2: What are the two features that have the biggest correlation in this dataset? #####

In [25]:
data[numerical]

Unnamed: 0,year,engine_hp,engine_cylinders,highway_mpg,city_mpg,price
0,2011,335.0,6.0,26,19,46135
1,2011,300.0,6.0,28,19,40650
2,2011,300.0,6.0,28,20,36350
3,2011,230.0,6.0,28,18,29450
4,2011,230.0,6.0,28,18,34500
...,...,...,...,...,...,...
11909,2012,300.0,6.0,23,16,46120
11910,2012,300.0,6.0,23,16,56670
11911,2012,300.0,6.0,23,16,50620
11912,2013,300.0,6.0,23,16,50920


In [26]:
'''
    engine_hp and year
    engine_hp and engine_cylinders
    highway_mpg and engine_cylinders
    highway_mpg and city_mpg

'''

correlation_df = data[numerical_no_price].corr().abs()

for i in range(len(correlation_df)):
    correlation_df.iloc[i,i] = 0

correlation_df.unstack().sort_values(ascending=False).drop_duplicates()

# highest is city_mpg and highway_mpg

highway_mpg       city_mpg            0.886829
engine_cylinders  engine_hp           0.774851
highway_mpg       engine_cylinders    0.614541
city_mpg          engine_cylinders    0.587306
                  engine_hp           0.424918
engine_hp         highway_mpg         0.415707
                  year                0.338714
year              highway_mpg         0.258240
                  city_mpg            0.198171
engine_cylinders  year                0.040708
year              year                0.000000
dtype: float64

In [27]:
# make price binary - above average price is 1 and below average price is 0

data['above_average'] = (data.price >= data.price.mean()).astype(int)

In [28]:
# split the data

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data[categorical + numerical_no_price], data.above_average, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

##### Question 3: Mutual information score between above_average and other categorical variables in our dataset #####

In [29]:
from sklearn.metrics import mutual_info_score

for col in categorical:
    print(col, round(mutual_info_score(X_train[col], y_train),2))

# transmission_type has the lowest score

make 0.24
model 0.46
transmission_type 0.02
vehicle_style 0.08


##### Question 4: Train a logistic regression. Accuracy? #####

In [30]:
'''
In summary, this code initializes a DictVectorizer, converts the training and validation data into a suitable format 
(list of dictionaries), and then transforms this data into numerical arrays suitable for machine learning.

'''

from sklearn.feature_extraction import DictVectorizer

dv = DictVectorizer(sparse=False)

train_dict = X_train[categorical + numerical_no_price].to_dict(orient='records')
X_train_hot = dv.fit_transform(train_dict)

val_dict = X_val[categorical + numerical_no_price].to_dict(orient='records')
X_val_hot = dv.transform(val_dict)

In [31]:
from sklearn.linear_model import LogisticRegression

def train_model(X,y, X_val, y_val):

    model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
    model.fit(X, y)
    y_pred_val = model.predict_proba(X_val)[:, 1]
    accuracy = ((y_pred_val >= 0.5).astype(int) == y_val).mean()
    return accuracy

In [32]:
original = train_model(X_train_hot, y_train, X_val_hot, y_val)
original

0.9471254720939991

##### Question 5: 
Find the least useful feature using the feature elimination technique. 

Train a model with all these features. exclude each feature from this set and train a model without it. 

Record the accuracy for each model. 

For each feature, calculate the difference between the original accuracy and the accuracy without the feature. #####

In [33]:
categorical + numerical_no_price

['make',
 'model',
 'transmission_type',
 'vehicle_style',
 'year',
 'engine_hp',
 'engine_cylinders',
 'highway_mpg',
 'city_mpg']

In [34]:
def least_useful(train, val):
    
    model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
       
    for i in range(train.shape[1]):
        col_name = train.columns[i]
        X_train_new = train.drop(col_name, axis=1)
        X_val_new = val.drop(col_name, axis=1)

        dv = DictVectorizer(sparse=False)
        
        train_dict = X_train_new.to_dict(orient='records')
        X_train_hot = dv.fit_transform(train_dict)
        val_dict = X_val_new.to_dict(orient='records')
        X_val_hot = dv.transform(val_dict)
                
        model.fit(X_train_hot, y_train)
        y_pred_val = model.predict_proba(X_val_hot)[:, 1]
        accuracy = ((y_pred_val >= 0.5).astype(int) == y_val).mean()
        
        if col_name == 'year' or col_name == 'engine_hp' or col_name == 'transmission_type' or col_name == 'city_mpg':
        
            print(col_name, round(accuracy-original,4))
    

least_useful(X_train, X_val)

transmission_type -0.0034
year 0.0004
engine_hp -0.0239
city_mpg -0.0147


In [35]:
original

# year

0.9471254720939991

##### Question 6: 

For this question, we'll see how to use a linear regression model from Scikit-Learn.

We'll need to use the original column price. Apply the logarithmic transformation to this column.

Fit the Ridge regression model on the training data with a solver 'sag'. Set the seed to 42.

This model also has a parameter alpha. Let's try the following values: [0, 0.01, 0.1, 1, 10].

Round your RMSE scores to 3 decimal digits.

In [50]:
import numpy as np

original_data = data.copy()
del original_data['above_average']

original_data['price'] = np.log1p(original_data['price'])

X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(original_data.drop('price', axis=1), original_data.price, test_size=0.2, random_state=42)
X_train_reg, X_val_reg, y_train_reg, y_val_reg = train_test_split(X_train_reg, y_train_reg, test_size=0.25, random_state=42)

dv = DictVectorizer(sparse=True)

train_reg_dict = X_train_reg.to_dict(orient='records')
X_train_reg = dv.fit_transform(train_reg_dict)

val_reg_dict = X_val_reg.to_dict(orient='records')
X_val_reg = dv.transform(val_reg_dict)

In [55]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

for r in [0,0.01,0.1,1,10]:
    ridge = Ridge(alpha=r, solver='sag', random_state=42)
    ridge.fit(X_train_reg, y_train_reg)
    y_pred = ridge.predict(X_val_reg)
    rmse = mean_squared_error(y_val_reg, y_pred, squared=False)
    print(r, round(rmse,3))

0 0.255
0.01 0.255
0.1 0.251
1 0.258
10 0.336
