## Preparing the dataset

In [2]:
# The goal of this homework is to create a regression model for predicting the car fuel efficiency (column 'fuel_efficiency_mpg').

# Preparing the dataset
# Preparation:

# Fill missing values with zeros.
# Do train/validation/test split with 60%/20%/20% distribution.
# Use the train_test_split function and set the random_state parameter to 1.
# Use DictVectorizer(sparse=True) to turn the dataframes into matrices.

import pandas as pd

# Read the raw dataset (path is relative to the working directory) and show the
# last five rows as a quick check that the file parsed as expected.
df = pd.read_csv(filepath_or_buffer="car_fuel_efficiency.csv")
df.tail(n=5)

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
9699,140,5.0,164.0,2981.107371,17.3,2013,Europe,Diesel,Front-wheel drive,,15.101802
9700,180,,154.0,2439.525729,15.0,2004,USA,Gasoline,All-wheel drive,0.0,17.962326
9701,220,2.0,138.0,2583.471318,15.1,2008,USA,Diesel,All-wheel drive,-1.0,17.186587
9702,230,4.0,177.0,2905.52739,19.4,2011,USA,Diesel,Front-wheel drive,1.0,15.331551
9703,270,3.0,140.0,2908.043477,14.7,2005,Asia,Diesel,All-wheel drive,-1.0,14.884467


In [3]:
# Before filling missing values with zeros, look at the summary statistics of
# the numeric columns.
# NB - num_doors contains negative values (its min is -4), which make no sense
# for a door count; the next cell replaces them with their absolute values.

df.describe()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,num_doors,fuel_efficiency_mpg
count,9704.0,9222.0,8996.0,9704.0,8774.0,9704.0,9202.0,9704.0
mean,199.708368,3.962481,149.657292,3001.280993,15.021928,2011.484027,-0.006412,14.985243
std,49.455319,1.999323,29.879555,497.89486,2.510339,6.659808,1.048162,2.556468
min,10.0,0.0,37.0,952.681761,6.0,2000.0,-4.0,6.200971
25%,170.0,3.0,130.0,2666.248985,13.3,2006.0,-1.0,13.267459
50%,200.0,4.0,149.0,2993.226296,15.0,2012.0,0.0,15.006037
75%,230.0,5.0,170.0,3334.957039,16.7,2017.0,1.0,16.707965
max,380.0,13.0,271.0,4739.077089,24.3,2023.0,4.0,25.967222


In [5]:
# Turn negative door counts positive by taking the absolute value; missing
# values stay NaN here and are zero-filled in a later step.
# NOTE(review): the task statement only asks for zero-filling missing values -
# confirm that changing the sign of num_doors is intended.
df["num_doors"] = df["num_doors"].abs()
df.describe()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,num_doors,fuel_efficiency_mpg
count,9704.0,9222.0,8996.0,9704.0,8774.0,9704.0,9202.0,9704.0
mean,199.708368,3.962481,149.657292,3001.280993,15.021928,2011.484027,0.766246,14.985243
std,49.455319,1.999323,29.879555,497.89486,2.510339,6.659808,0.715184,2.556468
min,10.0,0.0,37.0,952.681761,6.0,2000.0,0.0,6.200971
25%,170.0,3.0,130.0,2666.248985,13.3,2006.0,0.0,13.267459
50%,200.0,4.0,149.0,2993.226296,15.0,2012.0,1.0,15.006037
75%,230.0,5.0,170.0,3334.957039,16.7,2017.0,1.0,16.707965
max,380.0,13.0,271.0,4739.077089,24.3,2023.0,4.0,25.967222


In [7]:
# Count the missing values in each column.
df.isna().sum(axis=0)

engine_displacement      0
num_cylinders          482
horsepower             708
vehicle_weight           0
acceleration           930
model_year               0
origin                   0
fuel_type                0
drivetrain               0
num_doors              502
fuel_efficiency_mpg      0
dtype: int64

In [10]:
# Replace every NaN with 0, then confirm that no column has missing values left.
df = df.fillna(value=0)
df.isna().sum(axis=0)

engine_displacement    0
num_cylinders          0
horsepower             0
vehicle_weight         0
acceleration           0
model_year             0
origin                 0
fuel_type              0
drivetrain             0
num_doors              0
fuel_efficiency_mpg    0
dtype: int64

In [11]:
# Summary statistics again, to compare against the pre-fill numbers above
# (assumes the zero-fill cell has already been run).
df.describe()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,num_doors,fuel_efficiency_mpg
count,9704.0,9704.0,9704.0,9704.0,9704.0,9704.0,9704.0,9704.0
mean,199.708368,3.765664,138.738355,3001.280993,13.582275,2011.484027,0.726608,14.985243
std,49.455319,2.130715,48.401189,497.89486,5.025294,6.659808,0.716819,2.556468
min,10.0,0.0,0.0,952.681761,0.0,2000.0,0.0,6.200971
25%,170.0,2.0,124.0,2666.248985,12.6,2006.0,0.0,13.267459
50%,200.0,4.0,146.0,2993.226296,14.7,2012.0,1.0,15.006037
75%,230.0,5.0,168.0,3334.957039,16.5,2017.0,1.0,16.707965
max,380.0,13.0,271.0,4739.077089,24.3,2023.0,4.0,25.967222


In [17]:
# Do train/validation/test split with 60%/20%/20% distribution.
# Use the train_test_split function and set the random_state parameter to 1.

from sklearn.model_selection import train_test_split

# Two-stage split using the same seed for both stages:
#   1. hold out 20% of all rows as the test set;
#   2. take 25% of the remaining 80% (i.e. 20% of all rows) as validation,
#      which leaves 60% of all rows for training.
df_full_train, df_test = train_test_split(df, random_state=1, test_size=0.2)
df_train, df_val = train_test_split(df_full_train, random_state=1, test_size=0.25)

In [18]:
# Row counts of the first-stage split (train+validation vs. test).
df_full_train.shape[0], df_test.shape[0]  # (7763, 1941)

(7763, 1941)

In [19]:
# Row counts of the second-stage split (train vs. validation).
df_train.shape[0], df_val.shape[0]  # (5822, 1941)

(5822, 1941)

In [20]:
# Give every split a fresh 0..n-1 index; the row labels carried over from the
# original frame are discarded (drop=True) instead of being kept as a column.
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)