### Q1. Pandas version

In [1]:
import pandas as pd
import numpy as np
# Display the installed pandas version string (answers Q1).
pd.__version__

'2.3.2'

### Getting the data

In [2]:
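# Uncomment on the first run to download the dataset; it is read below as 'car_fuel_efficiency.csv'.
# !wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv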

### Q2. Records count

In [3]:
# Read the file: load the dataset CSV into the DataFrame `df`.
# The path is relative, so 'car_fuel_efficiency.csv' must be in the working directory.
df = pd.read_csv('car_fuel_efficiency.csv')
# Display the frame (pandas truncates long frames in the rendered output).
df

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.870990,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369
...,...,...,...,...,...,...,...,...,...,...,...
9699,140,5.0,164.0,2981.107371,17.3,2013,Europe,Diesel,Front-wheel drive,,15.101802
9700,180,,154.0,2439.525729,15.0,2004,USA,Gasoline,All-wheel drive,0.0,17.962326
9701,220,2.0,138.0,2583.471318,15.1,2008,USA,Diesel,All-wheel drive,-1.0,17.186587
9702,230,4.0,177.0,2905.527390,19.4,2011,USA,Diesel,Front-wheel drive,1.0,15.331551


### Q3. Fuel types

In [4]:
# Count the distinct values in every column (missing values are not counted).
# The `fuel_type` entry is the number of fuel types.
df.nunique(axis=0, dropna=True)

engine_displacement      36
num_cylinders            14
horsepower              192
vehicle_weight         9704
acceleration            162
model_year               24
origin                    3
fuel_type                 2
drivetrain                2
num_doors                 9
fuel_efficiency_mpg    9704
dtype: int64

### Q4. Missing values

In [5]:
# Count the missing (NaN) entries in each column.
df.isna().sum()

engine_displacement      0
num_cylinders          482
horsepower             708
vehicle_weight           0
acceleration           930
model_year               0
origin                   0
fuel_type                0
drivetrain               0
num_doors              502
fuel_efficiency_mpg      0
dtype: int64

### Q5. Max fuel efficiency

In [6]:
# Summary statistics for cars whose origin is Asia;
# the `max` row of `fuel_efficiency_mpg` is the maximum fuel efficiency.
df.loc[df['origin'] == 'Asia'].describe()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,num_doors,fuel_efficiency_mpg
count,3247.0,3099.0,3013.0,3247.0,2933.0,3247.0,3072.0,3247.0
mean,200.052356,4.010326,150.303352,3006.292228,15.036754,2011.489375,-0.001628,14.97383
std,49.287252,1.971199,30.257169,502.434626,2.48834,6.704607,1.051428,2.578345
min,30.0,0.0,40.0,1223.298226,6.4,2000.0,-4.0,6.886245
25%,170.0,3.0,130.0,2665.518985,13.3,2006.0,-1.0,13.25201
50%,200.0,4.0,150.0,2988.585737,15.1,2012.0,0.0,15.03467
75%,230.0,5.0,171.0,3345.355571,16.7,2017.0,1.0,16.726318
max,370.0,11.0,245.0,4661.144932,22.7,2023.0,4.0,23.759123


### Q6. Median value of horsepower

In [7]:
# Median of the horsepower column (missing values are skipped by default).
df['horsepower'].median()

np.float64(149.0)

In [8]:
# Most frequent value(s) of the horsepower column, returned as a Series.
df['horsepower'].mode()

0    152.0
Name: horsepower, dtype: float64

In [9]:
# Fill the missing values in the horsepower column with its most frequent value.
# The fill value is computed from the data instead of being hard-coded from the
# previous cell's output, so this cell stays correct if the dataset changes.
# Series.mode() returns a sorted Series (ties are possible); .iloc[0] takes the first mode.
df_nona = df['horsepower'].fillna(df['horsepower'].mode().iloc[0])
df_nona

0       159.0
1        97.0
2        78.0
3       152.0
4       140.0
        ...  
9699    164.0
9700    154.0
9701    138.0
9702    177.0
9703    140.0
Name: horsepower, Length: 9704, dtype: float64

In [10]:
# Now, calculate the median value of horsepower once again,
# this time on df_nona (the horsepower series with its missing values filled in).
df_nona.median()

np.float64(152.0)

### Q7. Sum of weights

In [11]:
# Keep only the rows whose origin is Asia.
df_sum = df.loc[df['origin'] == 'Asia']
df_sum

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
8,250,1.0,174.0,2714.219310,10.3,2016,Asia,Diesel,Front-wheel drive,-1.0,16.823554
12,320,5.0,145.0,2783.868974,15.1,2010,Asia,Diesel,All-wheel drive,1.0,16.175820
14,200,6.0,160.0,3582.687368,14.9,2007,Asia,Diesel,All-wheel drive,0.0,11.871091
20,150,3.0,197.0,2231.808142,18.7,2011,Asia,Gasoline,Front-wheel drive,1.0,18.889083
21,160,4.0,133.0,2659.431451,,2016,Asia,Gasoline,Front-wheel drive,-1.0,16.077730
...,...,...,...,...,...,...,...,...,...,...,...
9688,260,4.0,,3948.404625,15.5,2018,Asia,Diesel,All-wheel drive,-1.0,11.054830
9692,180,3.0,188.0,3680.341381,18.0,2016,Asia,Gasoline,Front-wheel drive,1.0,11.711653
9693,280,2.0,148.0,2545.070139,15.6,2012,Asia,Diesel,All-wheel drive,0.0,17.202782
9698,180,1.0,131.0,3107.427820,13.2,2005,Asia,Gasoline,Front-wheel drive,-2.0,13.933716


In [12]:
# Narrow the selection to the two columns of interest: vehicle_weight and model_year.
df_sum = df_sum.loc[:, ['vehicle_weight', 'model_year']]
df_sum

Unnamed: 0,vehicle_weight,model_year
8,2714.219310,2016
12,2783.868974,2010
14,3582.687368,2007
20,2231.808142,2011
21,2659.431451,2016
...,...,...
9688,3948.404625,2018
9692,3680.341381,2016
9693,2545.070139,2012
9698,3107.427820,2005


In [13]:
# Keep the first 7 rows (positional slice).
df_sum = df_sum.iloc[:7]
df_sum

Unnamed: 0,vehicle_weight,model_year
8,2714.21931,2016
12,2783.868974,2010
14,3582.687368,2007
20,2231.808142,2011
21,2659.431451,2016
34,2844.227534,2014
38,3761.994038,2019


In [14]:
# Extract the underlying NumPy array from the selected rows. Let's call it X.
X = df_sum.to_numpy()
X

array([[2714.21930965, 2016.        ],
       [2783.86897424, 2010.        ],
       [3582.68736772, 2007.        ],
       [2231.8081416 , 2011.        ],
       [2659.43145076, 2016.        ],
       [2844.22753389, 2014.        ],
       [3761.99403819, 2019.        ]])

In [15]:
# Matrix-matrix product of the transpose of X with X. The transpose is kept
# in X_transpose; the product is called XTX.
X_transpose = X.T
XTX = X_transpose @ X
XTX

array([[62248334.33150762, 41431216.5073268 ],
       [41431216.5073268 , 28373339.        ]])

In [16]:
# Invert XTX (the product of X's transpose with X computed in the previous step).
XTX_inverse = np.linalg.inv(XTX)
XTX_inverse

array([[ 5.71497081e-07, -8.34509443e-07],
       [-8.34509443e-07,  1.25380877e-06]])

In [17]:
# Target values, one per selected row of X.
# The exercise text calls this array y; in this notebook it is named Y.
Y = np.array([
    1100, 1300, 800, 900, 1000, 1100, 1200,
])
Y

array([1100, 1300,  800,  900, 1000, 1100, 1200])

In [18]:
# Multiply the inverse of XTX by the transpose of X, then multiply that
# intermediate product (XTX_X) by the target array Y. The result is w.
XTX_X = XTX_inverse @ X_transpose
w = XTX_X @ Y
w

array([0.01386421, 0.5049067 ])

In [19]:
# Sum of all the elements of w.
w.sum()

np.float64(0.5187709081074016)