In [49]:
import pandas as pd
import numpy as np
import wget
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [2]:
data_url = 'https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv'
filename = wget.download(data_url)

In [3]:
data_full = pd.read_csv('data.csv')

## Data Preparation

In [4]:
data = data_full[['Make','Model','Year','Engine HP','Engine Cylinders','Transmission Type',
   'Vehicle Style','highway MPG','city mpg','MSRP']]

In [5]:
data.columns = data.columns.str.replace(' ', '_').str.lower()
data.isnull().sum()

make                  0
model                 0
year                  0
engine_hp            69
engine_cylinders     30
transmission_type     0
vehicle_style         0
highway_mpg           0
city_mpg              0
msrp                  0
dtype: int64

In [6]:
df = data.copy()
df['engine_hp'] = df['engine_hp'].fillna(0)
df['engine_cylinders'] = df['engine_cylinders'].fillna(0)
df = df.rename(columns={"msrp": "price"})

## Q1

In [7]:
df.transmission_type.value_counts()

AUTOMATIC           8266
MANUAL              2935
AUTOMATED_MANUAL     626
DIRECT_DRIVE          68
UNKNOWN               19
Name: transmission_type, dtype: int64

## Q2

In [8]:
corr_matrix = df.corr().round(2)

correlation_pairs = []

# Iterate over the columns in the correlation matrix
for col1 in corr_matrix.columns:
    for col2 in corr_matrix.columns:
        if col1 != col2:  # Exclude self-correlations
            correlation = corr_matrix.loc[col1, col2]
            correlation_pairs.append((col1, col2, correlation))

# Sort the list of correlation pairs by absolute correlation value in descending order
correlation_pairs.sort(key=lambda x: abs(x[2]), reverse=True)

# Extract the top 5 pairs with the highest correlation
top_5_correlations = correlation_pairs[:5]

# Print the top 5 pairs and their correlations
for pair in top_5_correlations:
    print(f"Pair: {pair[0]} - {pair[1]}, Correlation: {pair[2]}")

Pair: highway_mpg - city_mpg, Correlation: 0.89
Pair: city_mpg - highway_mpg, Correlation: 0.89
Pair: engine_hp - engine_cylinders, Correlation: 0.77
Pair: engine_cylinders - engine_hp, Correlation: 0.77
Pair: engine_hp - price, Correlation: 0.65


## Q3

In [9]:
avg_price = df['price'].mean()
df['above_average'] = (df['price'] > avg_price).astype(int)

In [10]:
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=42)
y_train = df_train.above_average.values
y_val = df_val.above_average.values
y_test = df_test.above_average.values
del df_train['above_average']
del df_train['price']
del df_val['above_average']
del df_val['price']
del df_test['above_average']
del df_test['price']

In [11]:
len(df), len(df_train), len(df_val), len(df_test)

(11914, 7148, 2383, 2383)

In [12]:
categorical = ['make','model','transmission_type','vehicle_style']

In [13]:
def calculate_mi(series):
    return mutual_info_score(series, y_train)

round(df_train[categorical].apply(calculate_mi).sort_values(ascending=False),2)

model                0.46
make                 0.24
vehicle_style        0.08
transmission_type    0.02
dtype: float64

## Q4

In [14]:
train_dict = df_train.to_dict(orient='records')
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)
X_train = dv.transform(train_dict)
X_train.shape
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

LogisticRegression(C=10, max_iter=1000, random_state=42, solver='liblinear')

In [15]:
val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)
model.predict_proba(X_val)

array([[9.99692966e-01, 3.07033780e-04],
       [2.27974745e-03, 9.97720253e-01],
       [9.99930206e-01, 6.97938861e-05],
       ...,
       [9.99898957e-01, 1.01043039e-04],
       [8.98629570e-03, 9.91013704e-01],
       [9.08719100e-03, 9.90912809e-01]])

In [16]:
y_pred = model.predict_proba(X_val)[:, 1]

In [32]:
above_avg = y_pred > 0.5
baseline_acc = (y_val == above_avg).mean()
baseline_acc

0.946286193873269

## Q5

In [33]:
for feature in df_train.columns:
    temp_df_train = df_train.copy()
    temp_df_val = df_val.copy()
    
    temp_df_train = temp_df_train.drop(columns=[feature])
    temp_df_val = temp_df_val.drop(columns=[feature])
    
    temp_train_dict = temp_df_train.to_dict(orient='records')
    temp_val_dict = temp_df_val.to_dict(orient='records')
    
    temp_dv = DictVectorizer(sparse=False)
    temp_dv.fit(temp_train_dict)
    
    temp_X_train = temp_dv.transform(temp_train_dict)
    temp_X_val = temp_dv.transform(temp_val_dict)
      
    temp_model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
    temp_model.fit(temp_X_train, y_train)

    temp_y_pred = temp_model.predict_proba(temp_X_val)[:, 1]
    temp_above_avg = temp_y_pred > 0.5
    temp_acc = (y_val == temp_above_avg).mean()
    print("Without feature \'" + feature + "\' accuracy is "+ str(round(temp_acc,2)) + " yielding a delta of " + str(temp_acc - baseline_acc) + " from baseline")

Without feature 'make' accuracy is 0.95 yielding a delta of 0.002937473772555599 from baseline
Without feature 'model' accuracy is 0.92 yielding a delta of -0.022240872849349502 from baseline
Without feature 'year' accuracy is 0.95 yielding a delta of 0.0016785564414604215 from baseline
Without feature 'engine_hp' accuracy is 0.93 yielding a delta of -0.015946286193873282 from baseline
Without feature 'engine_cylinders' accuracy is 0.95 yielding a delta of 0.0008392782207301552 from baseline
Without feature 'transmission_type' accuracy is 0.95 yielding a delta of -0.0012589173310951773 from baseline
Without feature 'vehicle_style' accuracy is 0.94 yielding a delta of -0.003776751993285754 from baseline
Without feature 'highway_mpg' accuracy is 0.94 yielding a delta of -0.0020981955518254436 from baseline
Without feature 'city_mpg' accuracy is 0.95 yielding a delta of -0.0004196391103650221 from baseline


## Q6

In [53]:
del df_test['above_average']
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=42)

y_train_orig = df_train.price.values
y_val_orig = df_val.price.values
y_test_orig = df_test.price.values

y_train = np.log1p(df_train.price.values)
y_val = np.log1p(df_val.price.values)
y_test = np.log1p(df_test.price.values)
del df_train['price']
del df_val['price']
del df_test['price']

train_dict = df_train.to_dict(orient='records')
val_dict = df_val.to_dict(orient='records')

dv = DictVectorizer(sparse=False)
dv.fit(train_dict)

X_train = dv.transform(train_dict)
X_val = dv.transform(val_dict)

alphas = [0, 0.01, 0.1, 1, 10]

for alpha in alphas:
    model = Ridge(solver='sag', random_state=42, alpha=alpha)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    rmse = mean_squared_error(y_val, y_pred)
    print("For alpha: " + str(alpha) + " RMSE is: " + str(rmse))

For alpha: 0 RMSE is: 0.2417372235557587
For alpha: 0.01 RMSE is: 0.24173734384990983
For alpha: 0.1 RMSE is: 0.2417384264710566
For alpha: 1 RMSE is: 0.241749251183706
For alpha: 10 RMSE is: 0.24192618680843758
