In [1]:
import pandas as pd 
import numpy as np 
from matplotlib import pyplot as plt
import seaborn as sns 
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold

In [2]:
# load the data

file_url = "https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv"
data = pd.read_csv(file_url)
data.head()

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


In [3]:
# data preparpation
# select column and lower case their names
relevant_features=['Make', 'Model', 'Year', 'Engine HP', 'Engine Cylinders', 'Transmission Type', 'Vehicle Style','highway MPG', 'city mpg', 'MSRP']
data = data[relevant_features].copy()
data.columns = data.columns.str.replace(' ', '_').str.lower()

# fill missing values with 0
data = data.fillna(0).copy()

# rename 'msrp' to 'price'
data = data.rename(columns={'msrp':'price'}).copy()

# make price binary
df = data.copy()
df['above_average'] = (df.price > df.price.mean()).astype(int)
df = df.drop(['price'], axis=1)
df.head()

df.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,above_average
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,1
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,1
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,0
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,0
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,0


In [4]:
# split the data into train/val/test with 60%/20%/20%
df_full_train, df_test = train_test_split(df, test_size=round(len(df)*.2), random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=round(len(df)*.2), random_state=42)

In [5]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = df_train['above_average'].values
y_val = df_val['above_average'].values
y_test = df_test['above_average'].values

del df_train['above_average']
del df_val['above_average']
del df_test['above_average']

### Question 1

In [6]:
# distinguishing between numerical and categorical variables
cat = df_train.columns[df_train.dtypes=='object'].tolist()
num = df_train.columns[df_train.dtypes!='object'].tolist()

In [13]:
for c in num:
    auc = roc_auc_score(y_train, df_train[c])
    if auc < 0:
        auc = roc_auc_score(y_train, -df_train[c])
    print('%9s: %.3f' % (c, auc))

     year: 0.690
engine_hp: 0.913
engine_cylinders: 0.766
highway_mpg: 0.368
 city_mpg: 0.328


### Question 2

In [14]:
# one-hot encode features
def one_hot_encode(train_df, test_df, features):
    train_dict=train_df[features].to_dict(orient='records')
    val_dict=test_df[features].to_dict(orient='records')

    dv = DictVectorizer(sparse=False)
    dv.fit(train_dict)
        
    return dv.transform(train_dict), dv.transform(val_dict)

In [15]:
# one-hot encode categorical variables
features = cat+num
X_train, X_val = one_hot_encode(df_train, df_val, features)

In [21]:
# train logistic regression model
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000)
model.fit(X_train, y_train)

y_pred = model.predict_proba(X_val)[:, 1]
print(roc_auc_score(y_val, y_pred))

0.9843231931346762


### Question 3

### Question 4

### Question 5

In [None]:
####################################################

In [21]:
# train logistic regression model
accuracy = train_run_logistic_regression(X_train, y_train, X_val, y_val)
print(round(accuracy,2))

0.94


### Question 5

In [17]:
score_differences = []
for c in features:
    subset = features.copy()
    subset.remove(c)

    X_train, X_val = one_hot_encode(df_train, df_val, subset)
    score = train_run_logistic_regression(X_train, y_train, X_val, y_val)

    score_differences.append(np.abs(accuracy - score))

min(dict(zip(features, score_differences)))

'city_mpg'

### Question 6

In [18]:
data.price = np.log1p(data.price)

In [19]:
# split and prepare the data
df_train_full, df_test = train_test_split(data, test_size=round(len(data)*.2), random_state=42)
df_train, df_val = train_test_split(df_train_full, test_size=round(len(data)*.2), random_state=42)

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = df_train.price.values
y_val = df_val.price.values
y_test = df_test.price.values

del df_train['price']
del df_val['price']
del df_test['price']

In [20]:
def train_run_ridge_regression(X_train, y_train, X_test, y_test, a):
    model = Ridge(alpha=a, solver="sag", random_state=42)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    return np.sqrt(mean_squared_error(y_test, y_pred))

In [21]:
# ridge regression 
X_train, X_val = one_hot_encode(df_train, df_val, df_train.columns)

alphas = [0, 0.01, 0.1, 1, 10]
scores = [train_run_ridge_regression(X_train, y_train, X_val, y_val, a) for a in alphas]



In [22]:
min(dict(zip(list(map(str,alphas)), scores)))

'0'