In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sb

## Q1. Pandas version

In [3]:
# Q1: display the version string of the imported pandas package.
pd.__version__

'2.2.2'

## Getting the data


In [5]:
# Load the car fuel-efficiency dataset. The path has no directory component,
# so the CSV must sit in the kernel's current working directory.
data = pd.read_csv('car_fuel_efficiency.csv')

In [12]:
# Build the working frame `df` from the loaded data and display it.
# NOTE(review): `data` comes from pd.read_csv, which presumably already
# returns a DataFrame, so this wrap looks redundant — confirm before
# simplifying, since later cells overwrite df['horsepower'].
df = pd.DataFrame(data)
df

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.870990,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369
...,...,...,...,...,...,...,...,...,...,...,...
9699,140,5.0,164.0,2981.107371,17.3,2013,Europe,Diesel,Front-wheel drive,,15.101802
9700,180,,154.0,2439.525729,15.0,2004,USA,Gasoline,All-wheel drive,0.0,17.962326
9701,220,2.0,138.0,2583.471318,15.1,2008,USA,Diesel,All-wheel drive,-1.0,17.186587
9702,230,4.0,177.0,2905.527390,19.4,2011,USA,Diesel,Front-wheel drive,1.0,15.331551


## Q2. Records count


In [8]:
# Q2: (rows, columns) of the full dataset.
df.shape

(9704, 11)

In [9]:
# Q2: the record count is the number of rows in the frame.
records = len(df)
records

9704

## Q3. Fuel types


In [18]:
# Q3: number of distinct fuel types (missing values are not counted).
fuel_types = df['fuel_type'].nunique(dropna=True)
fuel_types

2

## Q4. Missing values

In [31]:
# Q4: per-column count of missing entries.
missing_columns = df.isna().sum()
missing_columns

engine_displacement      0
num_cylinders          482
horsepower             708
vehicle_weight           0
acceleration           930
model_year               0
origin                   0
fuel_type                0
drivetrain               0
num_doors              502
fuel_efficiency_mpg      0
dtype: int64

In [33]:
# Q4: how many columns have at least one missing entry.
num_missing_columns = missing_columns.gt(0).sum()
num_missing_columns

4

## Q5. Max fuel efficiency

In [38]:
# Q5: restrict the dataset to cars whose origin is Asia.
in_asia = df.loc[df['origin'].eq('Asia')]
in_asia

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
8,250,1.0,174.0,2714.219310,10.3,2016,Asia,Diesel,Front-wheel drive,-1.0,16.823554
12,320,5.0,145.0,2783.868974,15.1,2010,Asia,Diesel,All-wheel drive,1.0,16.175820
14,200,6.0,160.0,3582.687368,14.9,2007,Asia,Diesel,All-wheel drive,0.0,11.871091
20,150,3.0,197.0,2231.808142,18.7,2011,Asia,Gasoline,Front-wheel drive,1.0,18.889083
21,160,4.0,133.0,2659.431451,,2016,Asia,Gasoline,Front-wheel drive,-1.0,16.077730
...,...,...,...,...,...,...,...,...,...,...,...
9688,260,4.0,,3948.404625,15.5,2018,Asia,Diesel,All-wheel drive,-1.0,11.054830
9692,180,3.0,188.0,3680.341381,18.0,2016,Asia,Gasoline,Front-wheel drive,1.0,11.711653
9693,280,2.0,148.0,2545.070139,15.6,2012,Asia,Diesel,All-wheel drive,0.0,17.202782
9698,180,1.0,131.0,3107.427820,13.2,2005,Asia,Gasoline,Front-wheel drive,-2.0,13.933716


In [40]:
# Q5: highest fuel efficiency among the Asian cars.
in_asia.loc[:, 'fuel_efficiency_mpg'].max()

23.759122836520497

## Q6. Median value of horsepower

In [47]:
# Q6: median horsepower at this point in the notebook (the missing values
# are only filled in a later cell).
median_horsepower = df.loc[:, 'horsepower'].median()
median_horsepower

149.0

In [51]:
# Q6: most frequent horsepower value(s); mode() returns a Series because
# several values can tie. Missing values are ignored.
mode_horsepower = df.loc[:, 'horsepower'].mode(dropna=True)
mode_horsepower

0    152.0
Name: horsepower, dtype: float64

In [53]:
# Q6: most frequent horsepower value as a scalar.
# It is derived directly from the column rather than by indexing the Series
# that previously held this name, so the cell is idempotent: re-running it
# no longer tries to index a float. mode() returns its values sorted, so
# position 0 is the smallest mode when several values tie.
mode_horsepower = df['horsepower'].mode().iloc[0]
mode_horsepower

152.0

In [114]:
# Q6: impute the missing horsepower entries with the most frequent value.
# This overwrites the column in `df`, so cells run after this one see the
# filled data.
df['horsepower'] = df['horsepower'].fillna(value=mode_horsepower)
df['horsepower']

0       159.0
1        97.0
2        78.0
3       152.0
4       140.0
        ...  
9699    164.0
9700    154.0
9701    138.0
9702    177.0
9703    140.0
Name: horsepower, Length: 9704, dtype: float64

In [59]:
# Q6: median horsepower recomputed on the column as it stands now; this
# replaces the value bound to the same name earlier.
median_horsepower = df.loc[:, 'horsepower'].median()
median_horsepower

152.0

## Q7. Sum of weights

In [64]:
# Q7: keep only the two feature columns for the Asian cars.
selected_car_asia = in_asia.loc[:, ['vehicle_weight', 'model_year']]
selected_car_asia

Unnamed: 0,vehicle_weight,model_year
8,2714.219310,2016
12,2783.868974,2010
14,3582.687368,2007
20,2231.808142,2011
21,2659.431451,2016
...,...,...
9688,3948.404625,2018
9692,3680.341381,2016
9693,2545.070139,2012
9698,3107.427820,2005


In [68]:
# Q7: first seven rows by position (not by index label).
first_seven_values = selected_car_asia.iloc[:7]
first_seven_values

Unnamed: 0,vehicle_weight,model_year
8,2714.21931,2016
12,2783.868974,2010
14,3582.687368,2007
20,2231.808142,2011
21,2659.431451,2016
34,2844.227534,2014
38,3761.994038,2019


In [82]:
# Q7: feature matrix X as a plain NumPy array (one row per car).
x = np.asarray(first_seven_values)
x

array([[2714.21930965, 2016.        ],
       [2783.86897424, 2010.        ],
       [3582.68736772, 2007.        ],
       [2231.8081416 , 2011.        ],
       [2659.43145076, 2016.        ],
       [2844.22753389, 2014.        ],
       [3761.99403819, 2019.        ]])

In [84]:
# Dimensions of X.
np.shape(x)

(7, 2)

In [86]:
# Transpose of X.
x.transpose()

array([[2714.21930965, 2783.86897424, 3582.68736772, 2231.8081416 ,
        2659.43145076, 2844.22753389, 3761.99403819],
       [2016.        , 2010.        , 2007.        , 2011.        ,
        2016.        , 2014.        , 2019.        ]])

In [88]:
# Dimensions of the transpose of X.
x.transpose().shape

(2, 7)

In [90]:
# Gram matrix X^T X of the feature matrix.
XTX = x.T @ x
XTX

array([[62248334.33150762, 41431216.50732678],
       [41431216.50732678, 28373339.        ]])

In [92]:
# Dimensions of X^T X.
np.shape(XTX)

(2, 2)

In [98]:
# Q7: target vector, one entry per row of X, built directly as an ndarray.
y = np.array([1100, 1300, 800, 900, 1000, 1100, 1200])
y

array([1100, 1300,  800,  900, 1000, 1100, 1200])

In [102]:
# Q7: inverse of the Gram matrix, (X^T X)^{-1}.
# NOTE(review): np.linalg.inv presumably raises LinAlgError when XTX is
# singular — confirm if the selected rows ever change.
XTX_inverse = np.linalg.inv(XTX)
XTX_inverse


array([[ 5.71497081e-07, -8.34509443e-07],
       [-8.34509443e-07,  1.25380877e-06]])

In [106]:
# Q7: intermediate product (X^T X)^{-1} X^T.
# `XTX_inverse` is the name the inverse is bound to in the earlier cell;
# referring to it by that exact name keeps this cell runnable on a fresh
# kernel (Restart & Run All).
w = np.matmul(XTX_inverse, x.T)
w

array([[-1.31202622e-04, -8.63909858e-05,  3.72634923e-04,
        -4.02726650e-04, -1.62513724e-04, -5.52342829e-05,
         4.65094049e-04],
       [ 2.62636846e-04,  1.96990690e-04, -4.73392228e-04,
         6.58944477e-04,  3.08357831e-04,  1.51636137e-04,
        -6.07979633e-04]])

In [110]:
# Q7: normal-equation weights w = (X^T X)^{-1} X^T y.
# Computed from names this notebook actually defines (XTX_inverse, x, y), so
# the cell does not rely on leftover kernel state and can be re-run safely.
# `@` is left-associative: (XTX_inverse @ x.T) is evaluated first, then @ y.
w = XTX_inverse @ x.T @ y

In [112]:
# Q7: sum of all the weights.
np.sum(w)

0.5187709081074007