# 0. Import packages and data

In [1]:
import pandas as pd
import numpy as np

In [2]:
!wget https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv

df = pd.read_csv('data.csv')

df.columns = df.columns.str.lower().str.replace(' ', '_')

string_columns = list(df.dtypes[df.dtypes == 'object'].index)

for col in string_columns:
    df[col] = df[col].str.lower().str.replace(' ', '_')

display(df.head())

--2021-09-12 20:04:08--  https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1475504 (1.4M) [text/plain]
Saving to: ‘data.csv.2’


2021-09-12 20:04:08 (23.4 MB/s) - ‘data.csv.2’ saved [1475504/1475504]



Unnamed: 0,make,model,year,engine_fuel_type,engine_hp,engine_cylinders,transmission_type,driven_wheels,number_of_doors,market_category,vehicle_size,vehicle_style,highway_mpg,city_mpg,popularity,msrp
0,bmw,1_series_m,2011,premium_unleaded_(required),335.0,6.0,manual,rear_wheel_drive,2.0,"factory_tuner,luxury,high-performance",compact,coupe,26,19,3916,46135
1,bmw,1_series,2011,premium_unleaded_(required),300.0,6.0,manual,rear_wheel_drive,2.0,"luxury,performance",compact,convertible,28,19,3916,40650
2,bmw,1_series,2011,premium_unleaded_(required),300.0,6.0,manual,rear_wheel_drive,2.0,"luxury,high-performance",compact,coupe,28,20,3916,36350
3,bmw,1_series,2011,premium_unleaded_(required),230.0,6.0,manual,rear_wheel_drive,2.0,"luxury,performance",compact,coupe,28,18,3916,29450
4,bmw,1_series,2011,premium_unleaded_(required),230.0,6.0,manual,rear_wheel_drive,2.0,luxury,compact,convertible,28,18,3916,34500


# 1. Numpy version

In [3]:
# Show which NumPy version is imported as `np` in this kernel.
print(np.__version__)

1.19.5


# 2. Pandas version

In [4]:
# Show which pandas version is imported as `pd` in this kernel.
print(pd.__version__)

1.1.5


# 3. Average price of BMW

In [5]:
# Mean of the msrp column over the rows whose make is 'bmw'
# (make values were lower-cased when the data was loaded).
bmw_mean = df.loc[df['make'] == 'bmw', 'msrp'].mean()

print(bmw_mean)

61546.76347305389


# 4. Number of missing values in "Engine HP" for 2015 and later

In [6]:
# Keep the rows with model year 2015 or later (the bound is inclusive).
df_after_2015 = df[df['year'].ge(2015)]

# Count the rows of that subset that have no engine_hp value.
print(df_after_2015['engine_hp'].isna().sum())

51


# 5. Does the mean change after filling missing values?

In [7]:
# Mean engine_hp of the 2015+ subset as it stands (missing values are
# skipped by Series.mean() by default).
mean_before = df_after_2015['engine_hp'].mean()

# Fill the gaps with that same mean, then take the mean again.
mean_after = df_after_2015['engine_hp'].fillna(mean_before).mean()

print(round(mean_before))
print(round(mean_after))

273
273


# 6. Sum of elements of the inverse matrix

In [8]:
# Rolls-Royce rows only, restricted to the three feature columns used below.
df_rr = df.loc[
    df['make'] == 'rolls-royce',
    ['engine_hp', 'engine_cylinders', 'highway_mpg'],
]

# Drop rows that repeat an earlier (engine_hp, engine_cylinders, highway_mpg) triple.
df_rr_no_duplicates = df_rr.drop_duplicates()

display(df_rr_no_duplicates)

Unnamed: 0,engine_hp,engine_cylinders,highway_mpg
2921,325.0,8.0,15
3505,563.0,12.0,19
5275,563.0,12.0,21
5279,563.0,12.0,20
7443,322.0,12.0,15
7553,453.0,12.0,19
11448,624.0,12.0,21


In [9]:
# Gram matrix X^T X of the de-duplicated Rolls-Royce feature rows.
X = df_rr_no_duplicates.to_numpy()
XT = X.T

# Let NumPy compute the matrix product instead of a hand-written triple
# loop: the result shape now follows X rather than a hard-coded 3x3
# accumulator, so the cell keeps working if the selected columns change.
XTX = XT @ X

print(XTX)

[[1.754801e+06 3.965600e+04 6.519600e+04]
 [3.965600e+04 9.280000e+02 1.500000e+03]
 [6.519600e+04 1.500000e+03 2.454000e+03]]


In [10]:
# Invert the Gram matrix, then add up its entries: each row is summed
# first and the row totals are summed afterwards.
XTX_inv = np.linalg.inv(XTX)

XTX_sum = sum(map(sum, XTX_inv))

print(XTX_sum)

0.03221232067748614


# 7. Normal equation

In [11]:
# Normal equation: w = (X^T X)^-1 X^T y.
# y holds one target value per row of X (7 values for the 7 rows above).
y = np.array([1000, 1100, 900, 1200, 1000, 850, 1300])

# Both products are delegated to NumPy's `@` operator instead of nested
# Python loops with hard-coded (3, 7) and (3,) result shapes, so the shapes
# are derived from the operands.
XTX_inv_XT = XTX_inv @ XT

w = XTX_inv_XT @ y

print(w)

[ 0.19989598 31.02612262 31.65378877]
