In [2]:
import pandas as pd
import numpy as np

Getting the required Data

Car Fuel Efficiency dataset: available at https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv

In your terminal (assuming the current directory is your project root), run this command: `wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv`

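Interacting with the data (the cell below reads it directly from the URL; the commented-out line reads the local copy downloaded with wget instead)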

In [4]:
# Load the Car Fuel Efficiency dataset straight from the raw GitHub URL (needs network access).
data = pd.read_csv("https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv")
# Offline alternative: read the local copy fetched with wget instead.
# data = pd.read_csv("car_fuel_efficiency.csv")

Question 1
What's the version of Pandas that you installed? Answer: 2.3.3

You can get the version information using the __version__ field:

In [6]:
# Show the version string of the imported pandas package.
pd.__version__


'2.3.3'

In [5]:
# Preview the first 5 rows (head() returns 5 rows when called without an argument).
data.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


Question 2
How many records are in the dataset? Answer: the dataset contains 9704 rows (records) and 11 columns.

In [7]:
# shape is a (number of rows, number of columns) tuple.
data.shape

(9704, 11)

Question 3
How many fuel types are presented in the dataset?

In [None]:
# Approach 1: materialise the distinct fuel types, then count them.
unique_fuel_types = data.fuel_type.unique()
print(f"Unique fuel types: {unique_fuel_types}")

num_unique_fuel_types = len(unique_fuel_types)
print(f"Number of unique fuel types: {num_unique_fuel_types}")

# Approach 2: ask pandas for the count directly.
# Note: nunique() ignores NaN by default, whereas unique() would include it.
num_unique_fuel_types_via_nunique = data.fuel_type.nunique()
print(f"Number of unique fuel types (via nunique): {num_unique_fuel_types_via_nunique}")

# Bare last expression so the notebook also shows the count as the cell's output.
data["fuel_type"].nunique()

Unique fuel types: ['Gasoline' 'Diesel']
Number of unique fuel types: 2
Number of unique fuel types (via nunique): 2


Computation Cost Comparison

| Method       | What it does internally                                                   | Memory                                | Speed                                                   | Notes                                           |
| ------------ | ------------------------------------------------------------------------- | ------------------------------------- | ------------------------------------------------------- | ----------------------------------------------- |
| `.unique()`  | Extracts all unique values (including NaN, if present) and returns them as an array. | Stores all unique values | Cost grows with the number of rows and distinct values. | Useful when you actually need the unique items. |
| `.nunique()` | Returns only the **count** of unique values, excluding NaN by default (`dropna=True`); for a Series, pandas computes the unique values internally and then counts them. | About the same as `.unique()` (the unique values are still computed) | About the same as `.unique()`. | Ideal if you only need the count; more concise than `len(col.unique())`. |


Question 4
How many columns in the dataset have missing values? Answer: 4 columns

In [13]:
# Count the missing entries in each column.
data.isnull().sum()

engine_displacement      0
num_cylinders          482
horsepower             708
vehicle_weight           0
acceleration           930
model_year               0
origin                   0
fuel_type                0
drivetrain               0
num_doors              502
fuel_efficiency_mpg      0
dtype: int64

In [15]:
# Total number of columns with missing values:
# compare each column's null count to 0, then sum the resulting booleans.
(data.isnull().sum() > 0).sum()

np.int64(4)

Question 5
What's the maximum fuel efficiency of cars from Asia in the dataset?

In [18]:
# Summarise fuel efficiency per region of origin, highest mean first.
# (agg already names the columns 'min', 'mean', 'max', 'count', so no rename is needed.)
region_stats = (
    data
    .groupby('origin')['fuel_efficiency_mpg']
    .agg(['min', 'mean', 'max', 'count'])
    .sort_values(by='mean', ascending=False)
)
print("Fuel efficiency by region:")
print(region_stats)

# Pick out the Asia row with a case-insensitive match on the region label.
is_asia = region_stats.index.str.lower() == 'asia'
if is_asia.any():
    asia_max = region_stats.loc[is_asia, 'max'].iloc[0]
    print(f"\nMaximum fuel efficiency for Asian cars: {asia_max}")

Fuel efficiency by region:
             min       mean        max  count
origin                                       
USA     6.695483  15.040204  24.971452   3203
Asia    6.886245  14.973830  23.759123   3247
Europe  6.200971  14.942532  25.967222   3254

Maximum fuel efficiency for Asian cars: 23.759122836520497


Question 6
Find the median value of horsepower column in the dataset.
Next, calculate the most frequent value of the same horsepower column.
Use fillna method to fill the missing values in horsepower column with the most frequent value from the previous step.
Now, calculate the median value of horsepower once again.
Has it changed?

In [None]:
def analyze_horsepower_median(data):
    """
    Compare the median of the ``horsepower`` column before and after filling
    its missing values with the most frequent value (mode).

    A step-by-step report is printed. The input frame is not modified:
    ``fillna`` is called without ``inplace`` and its result is kept separately.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``horsepower`` column (may contain missing values).

    Returns
    -------
    dict
        ``original_median`` : median before imputation,
        ``most_frequent``   : value used to fill the gaps (first mode, or the
                              original median when no mode exists),
        ``new_median``      : median after imputation,
        ``missing_count``   : number of missing values before imputation,
        ``filled_series``   : the imputed horsepower Series.
    """
    print(" HORSEPOWER MEDIAN ANALYSIS")
    print("=" * 40)

    # Step 1: Original median
    original_median = data['horsepower'].median()
    missing_count = data['horsepower'].isna().sum()
    total_count = len(data['horsepower'])
    # Guard the percentage against an empty column (avoids a 0/0 division).
    missing_pct = missing_count / total_count * 100 if total_count else 0.0

    print(" Original Data:")
    print(f"   - Total entries: {total_count}")
    print(f"   - Missing values: {missing_count} ({missing_pct:.1f}%)")
    print(f"   - Original median: {original_median:.2f}")

    # Step 2: Most frequent value
    mode_values = data['horsepower'].mode()
    if len(mode_values) == 0:
        # mode() is empty when there are no non-missing values to count.
        most_frequent = original_median
        print(f" Mode: No mode found, using median: {most_frequent:.2f}")
    else:
        # When several values tie, the first entry of mode() is used.
        most_frequent = mode_values.iloc[0]
        mode_frequency = (data['horsepower'] == most_frequent).sum()
        print(f" Mode: {most_frequent} (appears {mode_frequency} times)")
        if len(mode_values) > 1:
            print(f"   Note: Multiple modes exist: {mode_values.tolist()}")

    # Step 3 & 4: Fill and calculate new median
    filled_series = data['horsepower'].fillna(most_frequent)
    new_median = filled_series.median()

    print(" After Mode Imputation:")
    print(f"   - New median: {new_median:.2f}")
    print(f"   - Change: {new_median - original_median:+.2f}")
    print("=" * 40)

    # Report the conclusion that the numbers actually support, instead of
    # always claiming a positive change.
    if new_median > original_median:
        print("yes there is a positive change in median after filling missing values with mode")
    elif new_median < original_median:
        print("yes there is a negative change in median after filling missing values with mode")
    elif new_median == original_median:
        print("no, the median did not change after filling missing values with mode")
    else:
        # Only reachable when a median is NaN (e.g. no non-missing values).
        print("the median is undefined (NaN), so no change can be reported")

    return {
        'original_median': original_median,
        'most_frequent': most_frequent,
        'new_median': new_median,
        'missing_count': missing_count,
        'filled_series': filled_series
    }

# Execute the analysis
result = analyze_horsepower_median(data)

 HORSEPOWER MEDIAN ANALYSIS
 Original Data:
   - Total entries: 9704
   - Missing values: 708 (7.3%)
   - Original median: 149.00
 Mode: 152.0 (appears 142 times)
 After Mode Imputation:
   - New median: 152.00
   - Change: +3.00
yes there is a positive change in median after filling missing values with mode


Question 7
Select all the cars from Europe
Select only columns vehicle_weight and model_year
Select the first 7 values
Get the underlying NumPy array. Let's call it X.
Compute matrix-matrix multiplication between the transpose of X and X. To get the transpose, use X.T. Let's call the result XTX.
Invert XTX.
Create an array y with values [1100, 1300, 800, 900, 1000, 1100, 1200].
Multiply the inverse of XTX with the transpose of X, and then multiply the result by y. Call the result w.
What's the sum of all the elements of the result?

In [26]:
import pandas as pd
import numpy as np

# Keep only the European rows, then narrow them to the two feature columns
european_cars = data.loc[data['origin'] == 'Europe']
selected_data = european_cars.loc[:, ['vehicle_weight', 'model_year']]

print("Step 1 - Selected data shape:", selected_data.shape)
print("First few rows:")
print(selected_data.iloc[:10])

Step 1 - Selected data shape: (3254, 2)
First few rows:
    vehicle_weight  model_year
0      3413.433759        2003
2      3079.038997        2018
4      3460.870990        2009
5      2484.883986        2008
10     3111.810181        2014
13     2274.735191        2000
16     2673.552941        2013
18     2971.214575        2003
19     2986.958657        2004
22     2438.842126        2016


In [27]:
# Take the first 7 rows of the selection
first_7 = selected_data.iloc[:7]
print(f"\nStep 2 - First 7 values shape: {first_7.shape}")
print(first_7)

# Convert the rows to the underlying NumPy array
X = first_7.to_numpy()
print(f"\nNumPy array X shape: {X.shape}")
print("X =")
print(X)


Step 2 - First 7 values shape: (7, 2)
    vehicle_weight  model_year
0      3413.433759        2003
2      3079.038997        2018
4      3460.870990        2009
5      2484.883986        2008
10     3111.810181        2014
13     2274.735191        2000
16     2673.552941        2013

NumPy array X shape: (7, 2)
X =
[[3413.43375861 2003.        ]
 [3079.03899737 2018.        ]
 [3460.87098999 2009.        ]
 [2484.88398604 2008.        ]
 [3111.81018139 2014.        ]
 [2274.73519056 2000.        ]
 [2673.55294096 2013.        ]]


In [29]:
# XTX is the matrix product of X transposed with X
XTX = X.T.dot(X)
print(f"\nStep 3 - XTX shape: {XTX.shape}")
print("XTX =")
print(XTX)


Step 3 - XTX shape: (2, 2)
XTX =
[[61289955.72540274 41191663.53461886]
 [41191663.53461886 28260843.        ]]


In [30]:
# Invert XTX; if NumPy reports it cannot be inverted, fall back to the pseudoinverse
try:
    XTX_inv = np.linalg.inv(XTX)
except np.linalg.LinAlgError:
    print("Matrix is singular, cannot be inverted")
    XTX_inv = np.linalg.pinv(XTX)
    print("Using pseudoinverse instead")
    print(XTX_inv)
else:
    print(f"\nStep 4 - XTX_inv shape: {XTX_inv.shape}")
    print("XTX_inv =")
    print(XTX_inv)


Step 4 - XTX_inv shape: (2, 2)
XTX_inv =
[[ 7.99397664e-07 -1.16516410e-06]
 [-1.16516410e-06  1.73367254e-06]]


In [31]:
# Target values given in the exercise statement
y = np.array([1100, 1300, 800, 900, 1000, 1100, 1200])
print(f"\nStep 5 - y shape: {y.shape}")
print("y =", y)

# w = (XTX_inv @ X.T) @ y, evaluated left to right
w = np.matmul(np.matmul(XTX_inv, X.T), y)
print(f"\nStep 6 - w shape: {w.shape}")
print("w =", w)


Step 5 - y shape: (7,)
y = [1100 1300  800  900 1000 1100 1200]

Step 6 - w shape: (2,)
w = [-0.07188499  0.63097065]


In [34]:
def solve_linear_regression_manual(data):
    """
    Work through the normal-equation exercise end to end.

    Takes the first 7 rows of ``data`` whose ``origin`` is 'Europe', builds X
    from the ``vehicle_weight`` and ``model_year`` columns, and computes
    w = (X^T X)^-1 X^T y for the fixed y of the exercise. Every intermediate
    result is printed and returned.

    Returns a dict with keys 'X', 'XTX', 'XTX_inv', 'y', 'w' and 'sum_result'.
    """
    print(" LINEAR REGRESSION MANUAL CALCULATION")
    print("=" * 50)

    # European rows only, two feature columns, first 7 rows.
    europe_mask = data['origin'] == 'Europe'
    feature_frame = data.loc[europe_mask, ['vehicle_weight', 'model_year']].iloc[:7]

    print(" Selected Data (European cars - first 7):")
    print(feature_frame)

    # Underlying NumPy array of the selected rows.
    design = feature_frame.to_numpy()
    print(f"\n Design Matrix X (shape: {design.shape}):")
    print(design)

    # Gram matrix X^T X.
    gram = design.T @ design
    print(f"\n XTX (X.T × X) (shape: {gram.shape}):")
    print(gram)

    # Invert X^T X, using the pseudoinverse if NumPy cannot invert it.
    try:
        gram_inv = np.linalg.inv(gram)
    except np.linalg.LinAlgError:
        print(" XTX is singular, using pseudoinverse")
        gram_inv = np.linalg.pinv(gram)
        print(f"Pseudoinverse of XTX (shape: {gram_inv.shape}):")
        print(gram_inv)
    else:
        print(f"\n Inverse of XTX (shape: {gram_inv.shape}):")
        print(gram_inv)

    # Fixed target vector from the exercise statement.
    targets = np.array([1100, 1300, 800, 900, 1000, 1100, 1200])
    print(f"\n Target vector y (shape: {targets.shape}):")
    print(targets)

    # Weights: (X^T X)^-1 X^T y, multiplied left to right.
    weights = gram_inv @ design.T @ targets
    print(f"\n Weight vector w (shape: {weights.shape}):")
    print(weights)

    # Sum of all elements of w.
    weight_total = np.sum(weights)
    print(f"\n FINAL RESULT: Sum of all elements in w = {weight_total}")

    return {
        'X': design,
        'XTX': gram,
        'XTX_inv': gram_inv,
        'y': targets,
        'w': weights,
        'sum_result': weight_total,
    }

# Execute the solution.
# Note: this rebinds `result`, which previously held the horsepower analysis output.
result = solve_linear_regression_manual(data)

 LINEAR REGRESSION MANUAL CALCULATION
 Selected Data (European cars - first 7):
    vehicle_weight  model_year
0      3413.433759        2003
2      3079.038997        2018
4      3460.870990        2009
5      2484.883986        2008
10     3111.810181        2014
13     2274.735191        2000
16     2673.552941        2013

 Design Matrix X (shape: (7, 2)):
[[3413.43375861 2003.        ]
 [3079.03899737 2018.        ]
 [3460.87098999 2009.        ]
 [2484.88398604 2008.        ]
 [3111.81018139 2014.        ]
 [2274.73519056 2000.        ]
 [2673.55294096 2013.        ]]

 XTX (X.T × X) (shape: (2, 2)):
[[61289955.72540274 41191663.53461886]
 [41191663.53461886 28260843.        ]]

 Inverse of XTX (shape: (2, 2)):
[[ 7.99397664e-07 -1.16516410e-06]
 [-1.16516410e-06  1.73367254e-06]]

 Target vector y (shape: (7,)):
[1100 1300  800  900 1000 1100 1200]

 Weight vector w (shape: (2,)):
[-0.07188499  0.63097065]

 FINAL RESULT: Sum of all elements in w = 0.559085655434893
