In [35]:
import pandas as pd
import numpy as np

In [36]:
# Show the pandas version this notebook was executed with.
pd.__version__

'2.3.2'

In [37]:
# Location of the car fuel-efficiency CSV (raw file hosted on GitHub).
data_url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv"

# Read the remote CSV into a DataFrame, then preview its first two rows.
dataset = pd.read_csv(filepath_or_buffer=data_url)
dataset.head(n=2)

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217


In [38]:
# List the column labels of the loaded dataset.
dataset.columns

Index(['engine_displacement', 'num_cylinders', 'horsepower', 'vehicle_weight',
       'acceleration', 'model_year', 'origin', 'fuel_type', 'drivetrain',
       'num_doors', 'fuel_efficiency_mpg'],
      dtype='object')

In [39]:
# Number of records: the row count is the first entry of the frame's shape.
dataset.shape[0]

9704

In [40]:
# Fuel types: how many records fall under each fuel_type value,
# most frequent first.
dataset["fuel_type"].value_counts()

fuel_type
Gasoline    4898
Diesel      4806
Name: count, dtype: int64

In [41]:
# How many columns in the dataset have missing values?
# This shows the number of missing entries per column; the columns with a
# non-zero count are the ones that contain missing values.
dataset.isna().sum()

engine_displacement      0
num_cylinders          482
horsepower             708
vehicle_weight           0
acceleration           930
model_year               0
origin                   0
fuel_type                0
drivetrain               0
num_doors              502
fuel_efficiency_mpg      0
dtype: int64

In [42]:
# What's the maximum fuel efficiency of cars from Asia?
# Select the Asia rows and the mpg column in a single .loc lookup.
dataset.loc[dataset["origin"] == "Asia", "fuel_efficiency_mpg"].max()

np.float64(23.759122836520497)

In [43]:
# Median of the horsepower column as currently stored in the dataset.
# Missing values are skipped (skipna=True is the pandas default).
median_horsepower_before = dataset["horsepower"].median(skipna=True)
median_horsepower_before

np.float64(149.0)

In [44]:
# Most frequent value of the same horsepower column.
# mode() returns a Series of the most frequent value(s); take the first entry.
most_freq_horsepower = dataset["horsepower"].mode().iloc[0]
most_freq_horsepower

np.float64(152.0)

In [45]:
# Replace the missing horsepower values with the most frequent value computed
# in the previous step.
# NOTE: this overwrites the column on `dataset` itself, so re-running the
# earlier "median before" cell after this one would see the filled values.
dataset["horsepower"] = dataset["horsepower"].fillna(value=most_freq_horsepower)

In [46]:
# Median of the horsepower column once again, now that the missing values
# have been filled.
median_horsepower_after = dataset["horsepower"].median(skipna=True)
median_horsepower_after

np.float64(152.0)

In [47]:
# Has it changed? Report the direction of the change in the median,
# or "No" when neither value is greater than the other.
print(
    "Yes, it increased" if median_horsepower_before < median_horsepower_after
    else "Yes, it decreased" if median_horsepower_before > median_horsepower_after
    else "No"
)

Yes, it increased


In [48]:
# Show the column labels again as a reference for the column selection below.
dataset.columns

Index(['engine_displacement', 'num_cylinders', 'horsepower', 'vehicle_weight',
       'acceleration', 'model_year', 'origin', 'fuel_type', 'drivetrain',
       'num_doors', 'fuel_efficiency_mpg'],
      dtype='object')

In [64]:
# Sum of weights
# Select all the cars from Asia.
dataset_asia = dataset.loc[dataset["origin"] == "Asia"]
# Keep only the vehicle_weight and model_year columns.
dataset_asia_weight_year = dataset_asia.loc[:, ["vehicle_weight", "model_year"]]
# Take the first 7 rows.
dataset_asia_weight_year_first_seven = dataset_asia_weight_year.iloc[:7]
# Get the underlying NumPy array. Let's call it X.
X = dataset_asia_weight_year_first_seven.to_numpy()
# XTX is the matrix-matrix product of the transpose of X with X.
XTX = np.dot(X.T, X)
# Invert XTX.
XTX_inv = np.linalg.inv(XTX)
# Array y with the values [1100, 1300, 800, 900, 1000, 1100, 1200].
y = np.array([1100, 1300, 800, 900, 1000, 1100, 1200])
# w = inverse(XTX) . X^T . y, evaluated left to right:
# first inverse(XTX) times X^T, then that product times y.
w = np.dot(np.dot(XTX_inv, X.T), y)
# What's the sum of all the elements of the result?
result = np.sum(w)
result

np.float64(0.5187709081074016)