# Week 09 - Imputation Using Prediction

In [1]:
# load the Auto dataset from the GitHub-hosted CSV, restricted to the seven columns listed below
import pandas as pd

auto = pd.read_csv(
    'https://raw.githubusercontent.com/gitmystuff/INFO4050/main/Datasets/Auto.csv',
    usecols=[
        'mpg', 'cylinders', 'displacement', 'horsepower',
        'weight', 'acceleration', 'year',
    ],
)
auto.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year
0,18.0,8,307.0,130,3504,12.0,70
1,15.0,8,350.0,165,3693,11.5,70
2,18.0,8,318.0,150,3436,11.0,70
3,16.0,8,304.0,150,3433,12.0,70
4,17.0,8,302.0,140,3449,10.5,70


In [2]:
# a horsepower entry counts as missing when its string form is not made up solely of digits
hp_is_digits = auto['horsepower'].astype(str).str.isdigit()
missing_values = auto.loc[~hp_is_digits]
print(missing_values.head())

      mpg  cylinders  displacement horsepower  weight  acceleration  year
32   25.0          4          98.0          ?    2046          19.0    71
126  21.0          6         200.0          ?    2875          17.0    74
330  40.9          4          85.0          ?    1835          17.3    80
336  23.6          4         140.0          ?    2905          14.3    80
354  34.5          4         100.0          ?    2320          15.8    81


In [3]:
# drop every row that contains a '?' placeholder, then convert horsepower to int64
import numpy as np

# .copy() makes the filtered frame independent of the frame it was sliced from,
# so the column assignment below operates on a real DataFrame rather than on a
# possible view (avoids pandas' SettingWithCopyWarning)
auto = auto[(auto != '?').all(axis=1)].copy()
auto['horsepower'] = auto['horsepower'].astype(np.int64)
auto.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 392 entries, 0 to 396
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           392 non-null    float64
 1   cylinders     392 non-null    int64  
 2   displacement  392 non-null    float64
 3   horsepower    392 non-null    int64  
 4   weight        392 non-null    int64  
 5   acceleration  392 non-null    float64
 6   year          392 non-null    int64  
dtypes: float64(3), int64(4)
memory usage: 24.5 KB


In [4]:
# split the complete rows into train/test sets; horsepower is the value to be predicted
from sklearn.model_selection import train_test_split

predictors = auto.drop(columns='horsepower')
target = auto['horsepower']

X_train, X_test, y_train, y_test = train_test_split(
    predictors,
    target,
    test_size=0.25,
    random_state=42,
)

In [5]:
# create and train the model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

model = LinearRegression()
model.fit(X_train, y_train)

# check the model on the held-out test rows before using it for imputation
test_preds = model.predict(X_test)
print(f'held-out R^2:  {r2_score(y_test, test_preds):.3f}')
print(f'held-out RMSE: {mean_squared_error(y_test, test_preds) ** 0.5:.2f}')

# predict horsepower for the rows where it is missing (these are NOT the test set);
# the feature columns are the same ones the model was trained on
yhats = model.predict(missing_values.drop('horsepower', axis=1))
yhats

array([ 64.9350177 ,  97.29551547,  57.76742504, 101.01401941,
        75.931622  ])

In [6]:
# reload the original dataset from the same URL used at the top of the notebook,
# so this cell does not depend on a local copy of Auto.csv being present
auto = pd.read_csv(
    'https://raw.githubusercontent.com/gitmystuff/INFO4050/main/Datasets/Auto.csv',
    usecols=['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'year'],
)

# get missing values (rows whose horsepower is not a string of digits)
missing_values = auto.loc[~auto['horsepower'].astype(str).str.isdigit()]

# the predictions are matched to the missing rows by position, so the counts must agree
assert len(yhats) == len(missing_values), 'one prediction is required per missing row'

# replace missing values with the rounded predictions in a single aligned assignment
auto.loc[missing_values.index, 'horsepower'] = np.round(yhats).astype(np.int64)

# convert horsepower to numeric
auto['horsepower'] = auto['horsepower'].astype(np.int64)
print(auto.loc[missing_values.index])

# info() prints its report itself and returns None, so it is not wrapped in print()
auto.info()

      mpg  cylinders  displacement  horsepower  weight  acceleration  year
32   25.0          4          98.0          65    2046          19.0    71
126  21.0          6         200.0          97    2875          17.0    74
330  40.9          4          85.0          58    1835          17.3    80
336  23.6          4         140.0         101    2905          14.3    80
354  34.5          4         100.0          76    2320          15.8    81
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 397 entries, 0 to 396
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           397 non-null    float64
 1   cylinders     397 non-null    int64  
 2   displacement  397 non-null    float64
 3   horsepower    397 non-null    int64  
 4   weight        397 non-null    int64  
 5   acceleration  397 non-null    float64
 6   year          397 non-null    int64  
dtypes: float64(3), int64(4)
memory usage: 21.8 KB
None


In [7]:
# list any rows whose horsepower is still not purely digits
remaining_hp_is_digits = auto['horsepower'].astype(str).str.isdigit()
auto.loc[~remaining_hp_is_digits]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year
