# Homework for week03

> We'll keep working with the MSRP variable, and we'll transform it to a classification task.

> In this homework, we will use the Car price dataset


In [None]:
# uri = 'https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv'

In [None]:
# !wget $uri -O data-hmwk-3.csv

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### Data Ingestion

In [None]:
data = pd.read_csv('../data/data-hmwk-3.csv')

### Descriptive analytics - EDA

In [None]:
print(f'{data.shape =}')

In [None]:
data.columns = data.columns.str.replace(' ', '_').str.lower()

In [None]:
data.info()

In [None]:
data.head().T

In [None]:
data.columns

In [None]:
data.isnull().sum()

In [None]:
data.duplicated().sum()

In [None]:
data.columns = data.columns.str.replace('msrp', 'price')

In [None]:
features = ['make', 'model', 'year',  'engine_hp', 'engine_cylinders', 
       'transmission_type','vehicle_style',  'highway_mpg', 'city_mpg']
target = ['price']

In [None]:
# Split the selected columns by dtype. select_dtypes filters the sub-frame
# directly, so the result never depends on aligning a mask built from *all*
# of data's columns with the smaller set of columns being filtered.
categorical_columns = list(data[features].select_dtypes(include='object').columns)
numerical_columns = list(data[features + target].select_dtypes(exclude='object').columns)

In [None]:
data.describe().T

In [None]:
data[numerical_columns].plot()
plt.show()

In [None]:
data.plot.box(figsize=(12, 12), layout=(3,3), subplots=True, sharex=False, sharey=False)
plt.show()

In [None]:
from ydata_profiling import ProfileReport

# Options passed to ProfileReport, collected in one place for readability.
report_options = {
    "title": "Pandas Profiling Report",
    "config_file": "../config_default.yaml",
    "explorative": True,
}
profile = ProfileReport(data, **report_options)

# Show the report inside the notebook.
profile.to_notebook_iframe()
# Alternative: write the report to disk instead.
# profile.to_file("ProfileReport.html")

## Data Preparation

In [None]:
data.head()

In [None]:
data = data[features+target]

In [None]:
data.isnull().sum()

In [None]:
data = data.fillna(0)

In [None]:
data.isnull().sum()

## Question 1

q: What is the most frequent observation (mode) for the column `transmission_type`?

a: `AUTOMATIC`

In [None]:
data.transmission_type.mode()

## Question 2

Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features in the dataset.

Q: What are the two features that have the biggest correlation in this dataset?

A: `highway_mpg` and `city_mpg`

In [None]:
corr = data[numerical_columns].corr()

In [None]:
corr

In [None]:
import plotly.express as px

fig = px.imshow(corr, text_auto=".2f", aspect="auto", color_continuous_scale='RdBu_r')
fig.show()

### Make price binary

In [None]:
# Binarize the target: 1 when the price is strictly above the mean price, else 0.
mean_price = data['price'].mean()
# Vectorized comparison instead of a Python-level loop over every row.
data['above_average'] = (data['price'] > mean_price).astype(int)

In [None]:
mean_price

In [None]:
data.above_average.mean()

### Split the data


In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing, then take 25% of the remaining 80%
# (i.e. 20% of all rows) for validation, leaving 60% for training.
df_full_train, df_test = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)
df_train, df_val = train_test_split(
    df_full_train,
    test_size=0.25,
    random_state=42,
)

In [None]:
len(df_train), len(df_val), len(df_test)

In [None]:
sns.pairplot(df_full_train, hue="above_average",  diag_kind="hist")
plt.show()

In [None]:
# Give every split a fresh default index; drop=True discards the old index
# instead of keeping it as an extra column.
df_train, df_val, df_test = (
    frame.reset_index(drop=True) for frame in (df_train, df_val, df_test)
)

In [None]:
# Pull the binary target out of each split as a plain array.
y_train = df_train['above_average'].values
y_val = df_val['above_average'].values
y_test = df_test['above_average'].values

# Remove the target and the price it was derived from, so neither can be used
# as a feature.
leaky_columns = ['price', 'above_average']
for frame in (df_train, df_val, df_test):
    frame.drop(columns=leaky_columns, inplace=True)

In [None]:
y_test.shape

In [None]:
y_train.mean()

In [None]:
1 - y_train.mean()

## Question 3

- Calculate the mutual information score between above_average and other categorical variables in our dataset. Use the training set only.
- Round the scores to 2 decimals using round(score, 2).


Q: Which of these variables has the lowest mutual information score?

A: `transmission_type`

In [None]:
from sklearn.metrics import mutual_info_score

In [None]:
df_train[categorical_columns]

In [None]:
def mutual_info_target(series, target=None):
    """Return the mutual information between ``series`` and a target.

    Args:
        series: Column of labels to score.
        target: Labels to score against; must have the same length as
            ``series``. Defaults to ``df_full_train.above_average`` so that
            existing callers keep their behaviour.

    Returns:
        The score computed by ``mutual_info_score(series, target)``.
    """
    if target is None:
        target = df_full_train.above_average
    return mutual_info_score(series, target)

In [None]:
# Question 3 asks for the training set only, so score df_train against y_train
# (df_train no longer carries the above_average column at this point) and
# round every score to 2 decimals as requested.
mi = df_train[categorical_columns].apply(
    lambda column: round(mutual_info_score(column, y_train), 2)
)
mi.sort_values(ascending=False)

What does having the lowest `mi` score mean?



## Question 4

Q: What accuracy did you get?

A: `0.94`

In [None]:
from sklearn.feature_extraction import DictVectorizer

In [None]:
numerical_columns

In [None]:
# Drop the target from the numerical feature list. Rebuilding the list (rather
# than calling list.remove) keeps this cell safe to re-run: remove() raises
# ValueError once 'price' is already gone.
numerical_columns = [column for column in numerical_columns if column != 'price']

In [None]:
# Turn each split into a list of per-row dicts, fit the vectorizer on the
# training rows only, and reuse the fitted vectorizer for validation.
dv = DictVectorizer(sparse=False)
train_dict, val_dict = (
    frame.to_dict(orient='records') for frame in (df_train, df_val)
)
X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [None]:
y_pred = model.predict(X_val)
y_pred_proba = model.predict_proba(X_val)[:, 1]

Remember that `.predict_proba` returns two columns for each `X_val` observation:
- column `[0]`: the probability of the negative class (`above_average` = 0), and
- column `[1]`: the probability of the positive class (`above_average` = 1).

Thresholding column `[1]` at 0.5 (as done below) turns the probability into a class prediction.

That is why we are only interested in column `[1]`, the positive answer (the equivalent of "yes, churn" in the churn example).

In [None]:
print(model.predict_proba(X_val))

In [None]:
# Apply the 0.5 decision threshold by hand, then report accuracy as the share
# of validation rows where the thresholded prediction matches the label.
pred_decision = y_pred_proba >= 0.5
np.mean(y_val == pred_decision)

In [None]:
from sklearn.metrics import accuracy_score
round(accuracy_score(y_val, y_pred), 2)

## Question 5

Q: Which of the following features has the smallest difference?

A: `city_mpg`

> [!Note] 
> :memo: I got the feature with smallest difference as `vehicle_style` with 0.04% difference as opposed to `city_mpg` with 0.21% (next smallest difference), but selected the smallest out of the MCQ options provided. 
> 
> :warning: Is my logic wrong, since my resulting feature is not in the options listed?

#### From the top

In [None]:
from sklearn.metrics import mean_squared_error


# load the data
data = pd.read_csv('../data/data-hmwk-3.csv')

# identify features and target
features = [
    'make', 'model', 'year', 'engine_hp', 'engine_cylinders',
    'transmission_type', 'vehicle_style',
    'highway_mpg', 'city_mpg',
]
target = ['price']

# clean/prepare the data: snake_case the headers, then rename msrp -> price
data.columns = data.columns.str.replace(' ', '_').str.lower()
data.columns = data.columns.str.replace('msrp', 'price')

# use a subset for analysis
data = data[features + target]

# impute NaN/nulls with 0
data = data.fillna(0)

categoricals = list(data.dtypes[data.dtypes == 'object'].index)
numericals = list(data.dtypes[data.dtypes != 'object'].index)

# binarize target: 1 when the price is strictly above the mean price, else 0
# (vectorized comparison instead of a Python-level loop over every row)
mean_price = data['price'].mean()
data['above_average'] = (data['price'] > mean_price).astype(int)

# split the data to train/val/test sets with 60%/20%/20% distribution
# (25% of the remaining 80% is 20% of all rows)
df_full_train, df_test = train_test_split(data, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

# set the target arrays
y_train = df_train['above_average'].values
y_val = df_val['above_average'].values

# Remove the target and the price it was derived from. Rebinding to the result
# of drop() avoids mutating the split outputs in place, which pandas may flag
# with a SettingWithCopyWarning.
df_train = df_train.drop(columns=['price', 'above_average'])
df_val = df_val.drop(columns=['price', 'above_average'])

# perform OHE on categorical data (fit on train only, reuse for validation)
dv = DictVectorizer(sparse=False)

train_dict = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

# Train a model with all the features
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# Calculate the accuracy of the model on the validation set
y_pred = model.predict(X_val)
val_accuracy = model.score(X_val, y_val)
# `accuracy` stays rounded to 3 decimals because the next cell reads it.
accuracy = round(val_accuracy, 3)
# Print from the unrounded score so the two-decimal percentage is not derived
# from a value that was already rounded.
print(f'The accuracy of the model with all features is {val_accuracy:.2%}')

In [None]:
# Exclude each feature in turn, retrain without it, and compare the validation
# accuracy with the all-features model.
#
# The comparison uses unrounded accuracies: rounding each accuracy to 3
# decimals before subtracting hides any difference smaller than 0.1 percentage
# points and can change which feature appears to matter least.
# NOTE: relies on `model`, `X_val`, `y_train` and `y_val` still being the ones
# produced by the all-features cell.
baseline_accuracy = model.score(X_val, y_val)
accuracy_diffs = {}

for feature in df_train.columns:
    print(f'Removing {feature}')
    train_new = df_train.drop(columns=feature)
    val_new = df_val.drop(columns=feature)

    # Use loop-local names so the vectorizer and dicts of the all-features
    # model (`dv`, `val_dict`) are not overwritten.
    dv_new = DictVectorizer(sparse=False)
    X_train_new = dv_new.fit_transform(train_new.to_dict(orient='records'))
    X_val_new = dv_new.transform(val_new.to_dict(orient='records'))

    model_new = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
    model_new.fit(X_train_new, y_train)

    new_accuracy = model_new.score(X_val_new, y_val)
    diff = baseline_accuracy - new_accuracy
    accuracy_diffs[feature] = diff

    print(f'Number of features seen during fit = {model_new.n_features_in_}')
    print(f'old accuracy = {baseline_accuracy:.6f}')
    print(f'new_accuracy = {new_accuracy:.6f}')
    print(f'The difference in accuracy after excluding {feature} is {diff:.2%}')
    print()

# Rank by absolute value so an accuracy gain and an accuracy loss of the same
# size count equally.
least_impact = min(accuracy_diffs, key=lambda name: abs(accuracy_diffs[name]))
print(f'Smallest absolute difference: {least_impact} ({accuracy_diffs[least_impact]:+.4%})')


## Question 6

Q: Which of these alphas leads to the best RMSE on the validation set?

A: `0`


### From the top part 2, prepare_X_y()


In [None]:
def prepare_X_y(path='../data/data-hmwk-3.csv', random_state=42):
    """Load the car-price data and build train/validation sets for regression.

    The CSV headers are lower-cased with spaces replaced by underscores,
    ``msrp`` is renamed to ``price``, missing values are filled with 0, and
    the rows are split 60%/20%/20% into train/validation/test (the test part
    is not returned).

    Args:
        path: Location of the CSV file to read.
        random_state: Seed passed to both ``train_test_split`` calls.

    Returns:
        A tuple ``(X_train, y_train, X_val, y_val)`` where the ``X`` values
        are the ``DictVectorizer(sparse=True)`` encodings of the feature
        columns and the ``y`` values are ``log1p`` of the price.
    """
    # load the data
    data = pd.read_csv(path)

    # identify features and target
    features = [
        'make', 'model', 'year', 'engine_hp', 'engine_cylinders',
        'transmission_type', 'vehicle_style',
        'highway_mpg', 'city_mpg',
    ]
    target = ['price']

    # clean/prepare the data
    data.columns = data.columns.str.replace(' ', '_').str.lower()
    data.columns = data.columns.str.replace('msrp', 'price')

    # use a subset for analysis and impute NaN/nulls with 0
    data = data[features + target].fillna(0)

    # 60%/20%/20% split: 25% of the remaining 80% is 20% of all rows
    df_full_train, _ = train_test_split(data, test_size=0.2, random_state=random_state)
    df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=random_state)

    # log-transform the target
    y_train = np.log1p(df_train['price'].values)
    y_val = np.log1p(df_val['price'].values)

    # remove the target from the feature frames without mutating the split
    # outputs in place
    df_train = df_train.drop(columns=target)
    df_val = df_val.drop(columns=target)

    # perform OHE on categorical data; made sparse due to non-convergence otherwise
    dv = DictVectorizer(sparse=True)
    X_train = dv.fit_transform(df_train.to_dict(orient='records'))
    X_val = dv.transform(df_val.to_dict(orient='records'))

    return X_train, y_train, X_val, y_val


### Tuning `alpha`

In [None]:
from sklearn.linear_model import Ridge

# Fit a Ridge regression for each candidate alpha and record the validation RMSE.
rmse_scores = {}
alpha = [0, 0.01, 0.1, 1, 10]

X_train, y_train, X_val, y_val = prepare_X_y()

for num in alpha:
    model = Ridge(alpha=num, solver='sag', random_state=42)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    # RMSE = sqrt(MSE). Taking the root explicitly avoids the `squared=False`
    # argument, which newer scikit-learn releases deprecated and later removed.
    # float() keeps plain Python floats in rmse_scores.
    rmse = float(np.sqrt(mean_squared_error(y_val, y_pred)))
    print(f'for alpha = {num}, rmse = {rmse:.6f}')
    rmse_scores[num] = round(rmse, 6)


In [None]:
from pprint import pprint

pprint(rmse_scores, indent=5)