In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import mutual_info_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer

from IPython.display import display

In [2]:
df = pd.read_csv(r"C:\Users\axelb\Desktop\ml-zoomcamp\dataset_hmw3.csv")

In [3]:
features = ['Make', 'Model', 'Year', 'Engine HP', 'Engine Cylinders', 
           'Transmission Type', 'Vehicle Style', 'highway MPG', 'city mpg']

**data preparation**

In [4]:
dataset = df[features]

In [5]:
dataset.columns = dataset.columns.str.replace(' ', '_').str.lower()

In [6]:
dataset.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18


In [7]:
dataset.isnull().sum()

make                  0
model                 0
year                  0
engine_hp            69
engine_cylinders     30
transmission_type     0
vehicle_style         0
highway_mpg           0
city_mpg              0
dtype: int64

In [8]:
dataset = dataset.fillna(0)

In [9]:
dataset.isnull().sum()

make                 0
model                0
year                 0
engine_hp            0
engine_cylinders     0
transmission_type    0
vehicle_style        0
highway_mpg          0
city_mpg             0
dtype: int64

In [10]:
df = df.rename(columns={'MSRP': 'price'})

In [11]:
df.head()

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,price
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


In [12]:
dataset['price'] = df['price']
dataset.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500


**question 1**

In [13]:
dataset.transmission_type.value_counts()

transmission_type
AUTOMATIC           8266
MANUAL              2935
AUTOMATED_MANUAL     626
DIRECT_DRIVE          68
UNKNOWN               19
Name: count, dtype: int64

==> AUTOMATIC

**question 2**

In [14]:
dataset.dtypes

make                  object
model                 object
year                   int64
engine_hp            float64
engine_cylinders     float64
transmission_type     object
vehicle_style         object
highway_mpg            int64
city_mpg               int64
price                  int64
dtype: object

In [15]:
numerical = list(dataset.dtypes[(dataset.dtypes != 'object') & (dataset.dtypes.index != 'price')].index)

In [16]:
numerical

['year', 'engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg']

In [17]:
corrMatrix = dataset[numerical].corr().abs()

In [18]:
corrMatrix

Unnamed: 0,year,engine_hp,engine_cylinders,highway_mpg,city_mpg
year,1.0,0.338714,0.040708,0.25824,0.198171
engine_hp,0.338714,1.0,0.774851,0.415707,0.424918
engine_cylinders,0.040708,0.774851,1.0,0.614541,0.587306
highway_mpg,0.25824,0.415707,0.614541,1.0,0.886829
city_mpg,0.198171,0.424918,0.587306,0.886829,1.0


==> highway_mpg and city_mpg do have the biggest correlation

In [19]:
avg_price = dataset.price.mean()

In [20]:
dataset['above_average'] = (dataset.price > avg_price).astype(int)

In [21]:
dataset.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price,above_average
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135,1
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650,1
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350,0
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450,0
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500,0


In [22]:
df_full_train, df_test = train_test_split(dataset, test_size=0.2, random_state=42)

In [23]:
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [24]:
len(df_train), len(df_val), len(df_test)

(7148, 2383, 2383)

In [25]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [26]:
y_train = df_train.above_average
y_val = df_val.above_average
y_test = df_test.above_average

In [27]:
del df_train['above_average']
del df_val['above_average']
del df_test['above_average']

In [28]:
del df_train['price']
del df_val['price']
del df_test['price']

In [29]:
df_full_train = df_full_train.reset_index(drop=True)

**question 3**

In [30]:
def mutual_avg_info_score(category):
    return mutual_info_score(df_full_train.above_average, df_full_train[category])

In [31]:
categorical = list(dataset.dtypes[dataset.dtypes == 'object'].index)
categorical

['make', 'model', 'transmission_type', 'vehicle_style']

In [32]:
for category in categorical:
    print(category)
    mutual_info = mutual_avg_info_score(category)
    print(mutual_info)
    print()

make
0.2387236479073192

model
0.46099440756035703

transmission_type
0.020883575914971142

vehicle_style
0.08339022741593435



==> transmission_type does have the lowest mutual information score

**question 4**

In [33]:
dv = DictVectorizer(sparse=False)

In [34]:
train_dict = df_train[categorical + numerical].to_dict(orient='records')

In [35]:
X_train = dv.fit_transform(train_dict)

In [36]:
X_train.shape

(7148, 943)

In [37]:
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)

In [38]:
model.fit(X_train, y_train)

In [39]:
val_dict = df_val[categorical + numerical].to_dict(orient='records')

In [40]:
X_val = dv.transform(val_dict)

In [41]:
y_pred = model.predict_proba(X_val)[:, 1]

In [42]:
pred = (y_pred >= 0.5)
pred

array([False,  True, False, ..., False,  True,  True])

In [43]:
df_pred = pd.DataFrame()

In [44]:
df_pred['actual'] = y_val
df_pred['probability'] = y_pred
df_pred['prediction'] = pred.astype(int)

In [45]:
df_pred

Unnamed: 0,actual,probability,prediction
0,0,0.001006,0
1,1,0.996285,1
2,0,0.000157,0
3,0,0.245223,0
4,0,0.001523,0
...,...,...,...
2378,0,0.002326,0
2379,0,0.002003,0
2380,0,0.000334,0
2381,1,0.990554,1


In [46]:
df_pred['correct'] = df_pred['actual'] == df_pred['prediction']

In [47]:
df_pred

Unnamed: 0,actual,probability,prediction,correct
0,0,0.001006,0,True
1,1,0.996285,1,True
2,0,0.000157,0,True
3,0,0.245223,0,True
4,0,0.001523,0,True
...,...,...,...,...
2378,0,0.002326,0,True
2379,0,0.002003,0,True
2380,0,0.000334,0,True
2381,1,0.990554,1,True


In [48]:
original_accuracy = df_pred['correct'].mean()

In [50]:
round(original_accuracy, 2)

0.95

==> 0.95

**question 5**

In [51]:
# Excluding one categorical feature at a time

for c in range(len(categorical)):
    
    train_dict = df_train[categorical[:c] + categorical[c+1:] + numerical].to_dict(orient="records")
    val_dict = df_val[categorical[:c] + categorical[c+1:] + numerical].to_dict(orient="records")

    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(train_dict)
    X_val = dv.transform(val_dict)

    model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    y_pred = model.predict_proba(X_val)[:, 1]
    pred_decision = (y_pred >= 0.5)

    df_pred = pd.DataFrame()
    df_pred['probability'] = y_pred
    df_pred['prediction'] = pred_decision.astype(int)
    df_pred['actual'] = y_val
    df_pred['correct'] = df_pred['prediction'] == df_pred['actual']

    accuracy = df_pred['correct'].mean()
    accuracy_diff = original_accuracy - accuracy

    print("logistic regression model without %s feature:" % categorical[c])
    print(df_pred)
    print(accuracy)
    print(accuracy_diff)
    print()

logistic regression model without make feature:
      probability  prediction  actual  correct
0        0.001882           0       0     True
1        0.905083           1       1     True
2        0.010310           0       0     True
3        0.304443           0       0     True
4        0.006444           0       0     True
...           ...         ...     ...      ...
2378     0.021872           0       0     True
2379     0.028936           0       0     True
2380     0.000212           0       0     True
2381     0.987350           1       1     True
2382     0.947549           1       1     True

[2383 rows x 4 columns]
0.931598825010491
0.013428451531682817

logistic regression model without model feature:
      probability  prediction  actual  correct
0        0.007061           0       0     True
1        0.991239           1       1     True
2        0.001129           0       0     True
3        0.371022           0       0     True
4        0.005134           0       0  

In [52]:
# Excluding one numerical feature at a time

for c in range(len(numerical)):
    
    train_dict = df_train[categorical + numerical[:c] + numerical[c+1:]].to_dict(orient="records")
    val_dict = df_val[categorical + numerical[:c] + numerical[c+1:]].to_dict(orient="records")

    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(train_dict)
    X_val = dv.transform(val_dict)

    model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    y_pred = model.predict_proba(X_val)[:, 1]
    pred_decision = (y_pred >= 0.5)

    df_pred = pd.DataFrame()
    df_pred['probability'] = y_pred
    df_pred['prediction'] = pred_decision.astype(int)
    df_pred['actual'] = y_val
    df_pred['correct'] = df_pred['prediction'] == df_pred['actual']

    accuracy = df_pred['correct'].mean()
    accuracy_diff = original_accuracy - accuracy

    print("logistic regression model without %s feature:" % numerical[c])
    print(df_pred)
    print(accuracy)
    print(accuracy_diff)
    print()

logistic regression model without year feature:
      probability  prediction  actual  correct
0        0.000375           0       0     True
1        0.997037           1       1     True
2        0.000058           0       0     True
3        0.294140           0       0     True
4        0.000978           0       0     True
...           ...         ...     ...      ...
2378     0.001526           0       0     True
2379     0.001026           0       0     True
2380     0.000284           0       0     True
2381     0.996192           1       1     True
2382     0.994885           1       1     True

[2383 rows x 4 columns]
0.9479647503147294
-0.002937473772555599

logistic regression model without engine_hp feature:
      probability  prediction  actual  correct
0        0.000680           0       0     True
1        0.919983           1       1     True
2        0.001142           0       0     True
3        0.272696           0       0     True
4        0.014113           0    

==> the smallest difference is given when excluding 'year'

**question 6**

In [53]:
dataset['price'] = np.log1p(dataset['price'])

In [54]:
df_full_train, df_test = train_test_split(dataset, test_size=0.2, random_state=42)

In [55]:
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [56]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [57]:
y_train = df_train.price.values
y_val = df_val.price.values
y_test = df_test.price.values

In [58]:
from sklearn.linear_model import Ridge

In [59]:
train_dict = df_train[categorical + numerical].to_dict(orient="records")
val_dict = df_val[categorical + numerical].to_dict(orient="records")

In [60]:
dv = DictVectorizer(sparse=False)

In [61]:
X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)

In [62]:
def rmse(y, y_pred):
    se = (y - y_pred) ** 2
    mse = se.mean()
    return np.sqrt(mse)

In [66]:
for alpha in [0, 0.01, 0.1, 1, 10]:
    
    model = Ridge(alpha=alpha, solver='sag', max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_val)
    score = rmse(y_pred, y_val)
    rounded_score = round(score, 3)

    print("for alpha=%f" % alpha, rounded_score)
    print()



for alpha=0.000000 0.487





for alpha=0.010000 0.487





for alpha=0.100000 0.487





for alpha=1.000000 0.487

for alpha=10.000000 0.487



