# ML Zoomcamp - Week 1 Homework

## Setup

In [1]:
import pandas as pd
import numpy as np

## Q1. Pandas Version

In [2]:
# Q1: show the version string of the pandas build imported above as `pd`.
pd.__version__

'2.3.2'

## Load Data

In [3]:
# Read the car fuel-efficiency CSV directly from its URL (requires network access),
# then display (rows, columns) as a quick sanity check on the load.
DATA_URL = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv'
df = pd.read_csv(DATA_URL)
df.shape

(9704, 11)

In [4]:
# Preview the first five records to see the column layout and spot missing values.
df.head(n=5)

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


In [5]:
# List every column label so later cells can reference them by exact name.
print("Column names:", list(df.columns), sep="\n")

Column names:
['engine_displacement', 'num_cylinders', 'horsepower', 'vehicle_weight', 'acceleration', 'model_year', 'origin', 'fuel_type', 'drivetrain', 'num_doors', 'fuel_efficiency_mpg']


## Q2. Records Count

In [6]:
# Q2: number of records = size of the first axis of the frame.
df.shape[0]

9704

## Q3. Fuel Types

In [7]:
# Q3: count the distinct fuel types, ignoring missing entries.
len(df['fuel_type'].dropna().unique())

2

## Q4. Missing Values

In [8]:
# Q4: flag each column that contains at least one missing value, then count the flags.
df.isna().any().sum()

np.int64(4)

## Q5. Max Fuel Efficiency from Asia

In [9]:
# Q5: restrict the frame to rows whose origin is 'Asia' and take the
# largest fuel_efficiency_mpg among them.
asia_cars = df.loc[df['origin'].eq('Asia')]
max_efficiency = asia_cars['fuel_efficiency_mpg'].max()

# Print the labelled value, and also leave it as the cell's displayed result.
print(f"Max fuel efficiency from Asia: {max_efficiency}")
max_efficiency

Max fuel efficiency from Asia: 23.759122836520497


np.float64(23.759122836520497)

## Q6. Median Horsepower

In [10]:
# Q6, baseline: median horsepower with the missing entries left untouched.
original_median = df['horsepower'].median()
print(f"Original median: {original_median}")

# Most frequent horsepower value; the first entry of the mode result is used.
most_frequent = df['horsepower'].mode().iloc[0]
print(f"Most frequent: {most_frequent}")

# Impute on a new frame so `df` keeps its original missing values,
# then recompute the median on the filled column.
df_filled = df.assign(horsepower=df['horsepower'].fillna(most_frequent))
new_median = df_filled['horsepower'].median()
print(f"New median: {new_median}")

# Report whether the imputation moved the median.
print(f"Changed: {original_median != new_median}")

Original median: 149.0
Most frequent: 152.0
New median: 152.0
Changed: True


## Q7. Linear Algebra (Linear Regression)

In [11]:
# Q7: solve a tiny least-squares problem with the normal equation
#     w = (X^T X)^-1 X^T y
# on the first seven Asia rows, then report the sum of the weights.

# Steps 1-3: Asia rows only, two feature columns, first 7 records.
asia_subset = df.loc[df['origin'] == 'Asia', ['vehicle_weight', 'model_year']].head(7)
print("Selected data:")
print(asia_subset)

# Step 4: feature matrix as a plain NumPy array.
X = asia_subset.to_numpy()
print(f"\nX shape: {X.shape}")

# Step 5: Gram matrix X^T X.
XTX = np.matmul(X.T, X)
print(f"\nXTX:\n{XTX}")

# Step 6: inverse of the Gram matrix.
XTX_inv = np.linalg.inv(XTX)
print(f"\nXTX inverse:\n{XTX_inv}")

# Step 7: target vector given by the exercise (one value per selected row).
y = np.array([1100, 1300, 800, 900, 1000, 1100, 1200])
print(f"\ny: {y}")

# Step 8: weights; the products are grouped left to right, (XTX_inv @ X.T) @ y.
w = np.matmul(np.matmul(XTX_inv, X.T), y)
print(f"\nw: {w}")

# Step 9: add up the weight components.
w_sum = w.sum()
print(f"\nSum of w: {w_sum}")

Selected data:
    vehicle_weight  model_year
8      2714.219310        2016
12     2783.868974        2010
14     3582.687368        2007
20     2231.808142        2011
21     2659.431451        2016
34     2844.227534        2014
38     3761.994038        2019

X shape: (7, 2)

XTX:
[[62248334.33150761 41431216.5073268 ]
 [41431216.5073268  28373339.        ]]

XTX inverse:
[[ 5.71497081e-07 -8.34509443e-07]
 [-8.34509443e-07  1.25380877e-06]]

y: [1100 1300  800  900 1000 1100 1200]

w: [0.01386421 0.5049067 ]

Sum of w: 0.5187709081074006


## Summary of Answers