In [332]:
# ! wget https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv

In [354]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score, accuracy_score, mean_squared_error
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction import DictVectorizer

In [334]:
df = pd.read_csv('data.csv')

df.columns = df.columns.str.replace(' ', '_').str.lower()
df.rename(columns={'msrp': 'price'}, inplace=True)

In [335]:
cols_to_use = ['make', 'model', 'year', 'engine_hp',
               'engine_cylinders', 'transmission_type',
               'vehicle_style', 'highway_mpg', 'city_mpg']
features = df[cols_to_use]
target = df.price

In [336]:
features.fillna(0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features.fillna(0, inplace=True)


## Question 1

In [337]:
features.transmission_type.mode()

0    AUTOMATIC
Name: transmission_type, dtype: object

## Question 2

In [338]:
num_cols = features.select_dtypes(include=np.number).columns.tolist()

corr = features[num_cols].corr()
corr.style.background_gradient(cmap='coolwarm').set_precision(2)

  corr.style.background_gradient(cmap='coolwarm').set_precision(2)


Unnamed: 0,year,engine_hp,engine_cylinders,highway_mpg,city_mpg
year,1.0,0.34,-0.04,0.26,0.2
engine_hp,0.34,1.0,0.77,-0.42,-0.42
engine_cylinders,-0.04,0.77,1.0,-0.61,-0.59
highway_mpg,0.26,-0.42,-0.61,1.0,0.89
city_mpg,0.2,-0.42,-0.59,0.89,1.0


In [339]:
def get_redundant_pairs(df):
    '''Get diagonal and lower triangular pairs of correlation matrix'''
    pairs_to_drop = set()
    cols = df.columns
    for i in range(0, df.shape[1]):
        for j in range(0, i+1):
            pairs_to_drop.add((cols[i], cols[j]))
    return pairs_to_drop

def get_top_abs_correlations(df, n=5):
    au_corr = df.corr().abs().unstack()
    labels_to_drop = get_redundant_pairs(df)
    au_corr = au_corr.drop(labels=labels_to_drop).sort_values(ascending=False)
    return au_corr[0:n]

print("Top Absolute Correlations")
print(get_top_abs_correlations(features[num_cols], 1))

Top Absolute Correlations
highway_mpg  city_mpg    0.886829
dtype: float64


## Question 3

In [340]:
price_avg = target.mean()
target_binary = np.where(target >= price_avg, 1 ,0)

In [341]:
np.random.seed(42)

x_train, x, y_train, y = train_test_split(features,target_binary,test_size=0.4,train_size=0.6)
x_test, x_val, y_test, y_val = train_test_split(x,y,test_size = 0.5,train_size =0.5)

In [342]:
categorical_cols = features.select_dtypes(include='object').columns.tolist()

In [343]:
for col in categorical_cols:
    print(col,round(mutual_info_score(y_train, x_train[col]), 2))

make 0.24
model 0.46
transmission_type 0.02
vehicle_style 0.08


## Question 4

In [344]:
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)

In [345]:
train_dict = x_train.to_dict(orient='records')

dv = DictVectorizer(sparse=False)
dv.fit(train_dict)
x_train_tr = dv.transform(train_dict)

In [346]:
model.fit(x_train_tr, y_train)

In [347]:
val_dict = x_val.to_dict(orient='records')
x_val_tr = dv.transform(val_dict)
y_pred = model.predict(x_val_tr)
acc = round(accuracy_score(y_val, y_pred), 2)
acc

0.95

## Question 5

In [351]:
feats = num_cols + categorical_cols

orig_score = acc

for c in feats:
    subset = feats.copy()
    subset.remove(c)

    train_dict = x_train[subset].to_dict(orient='records')

    dv = DictVectorizer(sparse=False)
    dv.fit(train_dict)

    x_train_tr = dv.transform(train_dict)

    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model.fit(x_train_tr, y_train)

    val_dict = x_val[subset].to_dict(orient='records')
    x_val_tr = dv.transform(val_dict)

    y_pred = model.predict(x_val_tr)

    score = accuracy_score(y_val, y_pred)
    print(c, round(orig_score - score, 3))

year 0.007
engine_hp 0.028
engine_cylinders 0.004
highway_mpg 0.015
city_mpg 0.009
make 0.025
model 0.031
transmission_type 0.004
vehicle_style 0.028


## Question 6

In [358]:
target_log = np.log1p(target)
np.random.seed(42)

x_train, x, y_train, y = train_test_split(features,target_log,test_size=0.4,train_size=0.6)
x_test, x_val, y_test, y_val = train_test_split(x,y,test_size = 0.5,train_size =0.5)

In [361]:
train_dict = x_train[categorical_cols + num_cols].to_dict(orient='records')
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)

x_train = dv.transform(train_dict)

val_dict = x_val[categorical_cols + num_cols].to_dict(orient='records')
x_val = dv.transform(val_dict)
for a in [0, 0.01, 0.1, 1, 10]:
    model = Ridge(alpha=a, solver="sag", random_state=42)
    model.fit(x_train, y_train)

    y_pred = model.predict(x_val)

    score = np.sqrt(mean_squared_error(y_val, y_pred))

    print(a, round(score, 3))



0 0.48




0.01 0.48




0.1 0.48




1 0.48
10 0.48


