# ML Zoomcamp - Week 1 Homework

## Setup

In [None]:
import pandas as pd
import numpy as np

## Q1. Pandas Version

In [None]:
# Q1 answer: the version string of the pandas package imported above.
pd.__version__

## Load Data

In [None]:
# Read the car fuel-efficiency CSV directly from GitHub, then display
# the (rows, columns) tuple as the cell output.
DATA_URL = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv'
df = pd.read_csv(DATA_URL)
df.shape

In [None]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

In [None]:
# Show the column labels as a plain Python list, one print call,
# with the heading and the list on separate lines.
print("Column names:", df.columns.tolist(), sep="\n")

## Q2. Records Count

In [None]:
# Q2 answer: number of rows in the dataset.
df.shape[0]

## Q3. Fuel Types

In [None]:
# Q3 answer: how many distinct values the fuel_type column holds.
df.loc[:, 'fuel_type'].nunique()

## Q4. Missing Values

In [None]:
# Q4 answer: count the columns that contain at least one missing value.
df.isna().any().sum()

## Q5. Max Fuel Efficiency from Asia

In [None]:
# Q5: keep only the rows whose origin is 'Asia', then take the largest
# fuel_efficiency_mpg value among them.
asia_mask = df['origin'].eq('Asia')
asia_cars = df.loc[asia_mask]
max_efficiency = asia_cars.loc[:, 'fuel_efficiency_mpg'].max()
print(f"Max fuel efficiency from Asia: {max_efficiency}")
# Leave the value as the last expression so it is also shown as cell output.
max_efficiency

## Q6. Median Horsepower

In [None]:
# Q6: compare the horsepower median before and after filling the gaps
# with the most frequent horsepower value.
horsepower = df['horsepower']

# Median of the column as loaded
original_median = horsepower.median()
print(f"Original median: {original_median}")

# First entry of the mode result is used as the fill value
most_frequent = horsepower.mode().iloc[0]
print(f"Most frequent: {most_frequent}")

# Build a separate frame with the gaps filled; df itself is left untouched
df_filled = df.assign(horsepower=horsepower.fillna(most_frequent))
new_median = df_filled['horsepower'].median()
print(f"New median: {new_median}")

# Report whether imputation moved the median
print(f"Changed: {new_median != original_median}")

## Q7. Linear Algebra (Linear Regression)

In [None]:
# Q7: solve the normal equation w = (X^T X)^-1 X^T y on a small slice of the data.

# Steps 1-3: Asia-origin rows, two feature columns, first 7 rows
feature_cols = ['vehicle_weight', 'model_year']
asia_subset = df.loc[df['origin'] == 'Asia', feature_cols].head(7)
print("Selected data:")
print(asia_subset)

# Step 4: underlying NumPy array of the selection
X = asia_subset.to_numpy()
print(f"\nX shape: {X.shape}")

# Step 5: Gram matrix X^T X
XTX = np.matmul(X.T, X)
print(f"\nXTX:\n{XTX}")

# Step 6: explicit inverse of the Gram matrix
XTX_inv = np.linalg.inv(XTX)
print(f"\nXTX inverse:\n{XTX_inv}")

# Step 7: target vector given by the exercise
y = np.array([1100, 1300, 800, 900, 1000, 1100, 1200])
print(f"\ny: {y}")

# Step 8: weights, multiplying left to right: ((X^T X)^-1 X^T) y
pseudo_inverse = XTX_inv @ X.T
w = pseudo_inverse @ y
print(f"\nw: {w}")

# Step 9: add up the weight components
w_sum = w.sum()
print(f"\nSum of w: {w_sum}")

## Summary of Answers

In [None]:
# Recap of every homework answer in one place.
# Relies on names created by the earlier cells (df, original_median,
# new_median, w_sum), so those cells must have been executed first.
print("=== ML ZOOMCAMP WEEK 1 - HOMEWORK ANSWERS ===")
print("Execute all cells above first, then check answers:")
print("\nQ1. Pandas version:", pd.__version__)
print("Q2. Records count:", len(df))
print("Q3. Fuel types:", df['fuel_type'].nunique())
print("Q4. Missing values columns:", (df.isnull().sum() > 0).sum())
print("Q5. Max fuel efficiency (Asia):", df[df['origin'] == 'Asia']['fuel_efficiency_mpg'].max())
# Q6/Q7 now report the values computed in their own cells instead of a
# "Check output above" placeholder; both medians are shown so the direction
# of the change is visible as well.
print(
    "Q6. Horsepower median changed:",
    new_median != original_median,
    f"({original_median} -> {new_median})",
)
print("Q7. Sum of weights:", w_sum)