### Q1. Pandas Version

In [1]:
import numpy as np
import pandas as pd
import warnings
# NOTE(review): this ignores every warning category for the rest of the
# session, so library deprecation warnings are hidden as well.
warnings.filterwarnings('ignore')

# Q1: version of the installed pandas
pd.__version__

'2.3.2'

In [2]:
# get data: read the car fuel-efficiency CSV straight from its GitHub URL
# (needs network access) into a DataFrame used by all later cells
url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv"
car_efficiency_df = pd.read_csv(url)
# preview the first rows
car_efficiency_df.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


### Q2. Records count

In [3]:
# Q2: (number of records, number of columns)
car_efficiency_df.shape

(9704, 11)

### Q3. Fuel types

In [4]:
# Q3: number of cars per fuel type
car_efficiency_df['fuel_type'].value_counts()

fuel_type
Gasoline    4898
Diesel      4806
Name: count, dtype: int64

### Q4. Missing values

In [5]:
# Summary statistics of the numeric columns; a 'count' lower than the number
# of rows means that column has missing values.
car_efficiency_df.describe()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,num_doors,fuel_efficiency_mpg
count,9704.0,9222.0,8996.0,9704.0,8774.0,9704.0,9202.0,9704.0
mean,199.708368,3.962481,149.657292,3001.280993,15.021928,2011.484027,-0.006412,14.985243
std,49.455319,1.999323,29.879555,497.89486,2.510339,6.659808,1.048162,2.556468
min,10.0,0.0,37.0,952.681761,6.0,2000.0,-4.0,6.200971
25%,170.0,3.0,130.0,2666.248985,13.3,2006.0,-1.0,13.267459
50%,200.0,4.0,149.0,2993.226296,15.0,2012.0,0.0,15.006037
75%,230.0,5.0,170.0,3334.957039,16.7,2017.0,1.0,16.707965
max,380.0,13.0,271.0,4739.077089,24.3,2023.0,4.0,25.967222


In [6]:
# function to calculate the percentage of missing values
def missing_values_table(df):
    """
    Summarize the missing values of a dataframe, column by column.

    Prints a two-line summary (number of columns in ``df`` and number of
    columns with at least one missing value) and returns the detailed table.

    Parameters
    ----------
    df : pandas.DataFrame
        The dataframe to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` that has at least one missing value,
        indexed by column name, with the columns 'Missing Values' (count)
        and '% of Total Values' (share of rows, rounded to one decimal).
        Rows are sorted by percentage, highest first.
    """
    # Total missing values per column (computed once and reused below)
    mis_val = df.isnull().sum()

    # Percentage of missing values. A dataframe without rows would give
    # 0 / 0 = NaN here, so report 0.0 for every column in that case.
    n_rows = len(df)
    if n_rows:
        mis_val_percent = 100 * mis_val / n_rows
    else:
        mis_val_percent = mis_val.astype(float)

    # Make a table with the results, naming the columns directly
    mis_val_table = pd.DataFrame(
        {'Missing Values': mis_val, '% of Total Values': mis_val_percent})

    # Keep only the columns that really have missing values. Filtering on the
    # count (not on "percentage != 0") keeps NaN percentages from slipping
    # through, since NaN != 0 evaluates to True.
    mis_val_table = mis_val_table[mis_val_table['Missing Values'] > 0]

    # Sort the table by percentage of missing descending
    mis_val_table = mis_val_table.sort_values(
        '% of Total Values', ascending=False).round(1)

    # Print some summary information
    print("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"
          "There are " + str(mis_val_table.shape[0]) +
          " columns that have missing values.")

    # Return the dataframe with missing information
    return mis_val_table

# check for missing values (Q4): columns with missing entries and their share of rows
missing_values_table(car_efficiency_df)

Your selected dataframe has 11 columns.
There are 4 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
acceleration,930,9.6
horsepower,708,7.3
num_doors,502,5.2
num_cylinders,482,5.0


### Q5. Max fuel efficiency

In [7]:
# Q5: highest fuel efficiency among the cars whose origin is Asia
asia_data = car_efficiency_df.loc[car_efficiency_df['origin'] == 'Asia']
asia_data['fuel_efficiency_mpg'].max()

23.759122836520497

### Q6. Median value of horsepower

In [8]:
# Q6: compare the horsepower median before and after replacing the missing
# values with the most frequent value of the column.

# median before filling missing values
median_horsepower = car_efficiency_df['horsepower'].median()
print("The median value of horsepower is: {}".format(round(median_horsepower, 2)))

# get most frequent value of horsepower column
# (mode() returns a Series because ties are possible; the first entry is used)
mode_hp = car_efficiency_df['horsepower'].mode().iloc[0]

# Assign the filled column back instead of calling fillna(..., inplace=True)
# on the selected column: the in-place form operates on an intermediate
# object, which pandas 2.x reports with a FutureWarning and which no longer
# updates the DataFrame once copy-on-write is enabled.
car_efficiency_df['horsepower'] = car_efficiency_df['horsepower'].fillna(mode_hp)

# median after filling missing values
median_after_filling_na = car_efficiency_df['horsepower'].median()
print("The median value of horsepower after filling missing values is: {}".format(round(median_after_filling_na, 2)))


The median value of horsepower is: 149.0
The median value of horsepower after filling missing values is: 152.0


In [9]:
# linear algebra
def vector_vector_multiplication(u, v):
    """Return the dot product of two vectors of equal length.

    The element-wise products are accumulated one by one, in order, starting
    from 0.0. An AssertionError is raised when the first dimensions of ``u``
    and ``v`` differ.
    """
    assert u.shape[0] == v.shape[0]

    dot_product = 0.0
    for left, right in zip(u, v):
        dot_product = dot_product + left * right

    return dot_product

def matrix_vector_multiplication(U, v):
    """Multiply matrix ``U`` by vector ``v``.

    Entry ``i`` of the returned 1-D array is the dot product of row ``i`` of
    ``U`` with ``v``. An AssertionError is raised when the number of columns
    of ``U`` differs from the length of ``v``.
    """
    assert U.shape[1] == v.shape[0]

    products = np.zeros(U.shape[0])
    for row_index, row in enumerate(U):
        products[row_index] = vector_vector_multiplication(row, v)

    return products


def matrix_matrix_multiplication(U, V):
    """Multiply matrix ``U`` by matrix ``V``, one column of ``V`` at a time.

    Column ``j`` of the result is ``U`` times column ``j`` of ``V``. An
    AssertionError is raised when the number of columns of ``U`` differs from
    the number of rows of ``V``.
    """
    assert U.shape[1] == V.shape[0]

    product = np.zeros((U.shape[0], V.shape[1]))

    # iterating over the transpose walks through the columns of V
    for col_index, column in enumerate(V.T):
        product[:, col_index] = matrix_vector_multiplication(U, column)

    return product

### Q7. Sum of weights

In [10]:
# Q7 setup: X holds vehicle weight and model year of the first 7 cars from Asia
all_asia_cars = car_efficiency_df.loc[car_efficiency_df['origin'] == 'Asia']
X = all_asia_cars.loc[:, ['vehicle_weight', 'model_year']].iloc[:7].to_numpy()
X


array([[2714.21930965, 2016.        ],
       [2783.86897424, 2010.        ],
       [3582.68736772, 2007.        ],
       [2231.8081416 , 2011.        ],
       [2659.43145076, 2016.        ],
       [2844.22753389, 2014.        ],
       [3761.99403819, 2019.        ]])

In [11]:
# X^T X, computed with the hand-written matrix-matrix multiplication
X_transpose = X.transpose()
XTX = matrix_matrix_multiplication(X_transpose, X)
XTX

array([[62248334.33150762, 41431216.50732678],
       [41431216.50732678, 28373339.        ]])

In [12]:
# invert XTX (np.linalg.inv raises LinAlgError when the matrix is singular)
XTX_inverse = np.linalg.inv(XTX)
XTX_inverse

array([[ 5.71497081e-07, -8.34509443e-07],
       [-8.34509443e-07,  1.25380877e-06]])

In [13]:
# target values, one per row of X
y = np.array([1100, 1300, 800, 900, 1000, 1100, 1200])
# w = (X^T X)^-1 X^T y, multiplied from left to right
w = np.dot(np.dot(XTX_inverse, X_transpose), y)
w

array([0.01386421, 0.5049067 ])

In [14]:
# Q7: sum of all elements of w
w.sum()

np.float64(0.5187709081074012)