In [1]:
import seaborn as sns
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor

# Load the data; the free-text "name" column is not used as a feature in this run
df = sns.load_dataset("mpg")
df = df.drop('name', axis = 1)

# Create dummy variables for the non-numeric columns
df_dummies = pd.get_dummies(df, columns=["origin"], prefix="origin")

# Keep only the fully observed rows: they form the training set for the
# horsepower model
df_clean = df_dummies.dropna()

# Split the clean data into features and labels
X = df_clean.drop("horsepower", axis=1)
y = df_clean["horsepower"]

# Feature table for every row, including the rows whose horsepower is missing
features_all = df_dummies.drop("horsepower", axis=1)

# Fit the imputer on the clean training features only, then use it to fill any
# gaps in the full feature table
imputer = SimpleImputer(strategy="mean")
imputer.fit(X)

# transform() hands back a bare ndarray by default; wrap it in a DataFrame again
# so the model predicts on the same named columns it was fitted with (this
# avoids scikit-learn's "X does not have valid feature names" warning)
X_imputed = pd.DataFrame(
    imputer.transform(features_all),
    columns=features_all.columns,
    index=features_all.index,
)

# Fixed seed so the imputed values are identical on every Restart & Run All
model = RandomForestRegressor(random_state=42)

# Fit the model to the clean data
model.fit(X, y)

# Predict horsepower for every row (observed and missing alike)
y_pred = model.predict(X_imputed)

# Add the predicted values to the data frame
df["horsepower_pred"] = y_pred

# Replace only the missing values with the predictions. Plain assignment is
# used instead of a chained `fillna(..., inplace=True)`, which is deprecated and
# has no effect on the parent frame once pandas Copy-on-Write is active.
df["horsepower"] = df["horsepower"].fillna(df["horsepower_pred"])

# NOTE: "horsepower_pred" is deliberately kept on df; later cells display it
# next to the filled-in horsepower column.




In [2]:
# Per-column count of values that are still missing in df
df.isna().sum()

mpg                0
cylinders          0
displacement       0
horsepower         0
weight             0
acceleration       0
model_year         0
origin             0
horsepower_pred    0
dtype: int64

In [3]:
# Reload the raw dataset and cut every car name down to its first
# space-separated word
df2 = sns.load_dataset("mpg")
df2["name"] = df2["name"].map(lambda full_name: full_name.split(' ')[0])

# One-hot encode both text columns in a single pass; the dummy columns are
# appended in the order listed here (name_* first, then origin_*)
df2_dummies = pd.get_dummies(
    df2,
    columns=["name", "origin"],
    prefix=["name", "origin"],
)


In [4]:


# Keep only the fully observed rows of the encoded table: they form the
# training set for the horsepower model
df2_clean = df2_dummies.dropna()

# Split the clean data into features and labels
X = df2_clean.drop("horsepower", axis=1)
y = df2_clean["horsepower"]

# Feature table for every row, including the rows whose horsepower is missing
features_all = df2_dummies.drop("horsepower", axis=1)

# Fit the imputer on the clean training features only, then use it to fill any
# gaps in the full feature table
imputer = SimpleImputer(strategy="mean")
imputer.fit(X)

# transform() hands back a bare ndarray by default; wrap it in a DataFrame again
# so the model predicts on the same named columns it was fitted with (this
# avoids scikit-learn's "X does not have valid feature names" warning)
X_imputed = pd.DataFrame(
    imputer.transform(features_all),
    columns=features_all.columns,
    index=features_all.index,
)

# Fixed seed so the imputed values are identical on every Restart & Run All
model = RandomForestRegressor(random_state=42)

# Fit the model to the clean data
model.fit(X, y)

# Predict horsepower for every row (observed and missing alike)
y_pred = model.predict(X_imputed)

# Add the predicted values to the data frame
df2["horsepower_pred"] = y_pred

# Replace only the missing values with the predictions. Plain assignment is
# used instead of a chained `fillna(..., inplace=True)`, which is deprecated and
# has no effect on the parent frame once pandas Copy-on-Write is active.
df2["horsepower"] = df2["horsepower"].fillna(df2["horsepower_pred"])

# NOTE: "horsepower_pred" is deliberately kept on df2; later cells display it
# next to the filled-in horsepower column.





In [5]:
# Rows whose horsepower was missing in the untouched dataset. The mask comes
# from a fresh load because df's own horsepower column has already been filled.
df.loc[sns.load_dataset("mpg")["horsepower"].isna()]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,horsepower_pred
32,25.0,4,98.0,68.85,2046,19.0,71,usa,68.85
126,21.0,6,200.0,89.82,2875,17.0,74,usa,89.82
330,40.9,4,85.0,60.47,1835,17.3,80,europe,60.47
336,23.6,4,140.0,99.99,2905,14.3,80,usa,99.99
354,34.5,4,100.0,79.59,2320,15.8,81,europe,79.59
374,23.0,4,151.0,87.26,3035,20.5,82,usa,87.26


In [6]:
# Rows whose horsepower was missing in the untouched dataset. The mask comes
# from a fresh load because df2's own horsepower column has already been filled.
df2.loc[sns.load_dataset("mpg")["horsepower"].isna()]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name,horsepower_pred
32,25.0,4,98.0,69.56,2046,19.0,71,usa,ford,69.56
126,21.0,6,200.0,87.48,2875,17.0,74,usa,ford,87.48
330,40.9,4,85.0,60.91,1835,17.3,80,europe,renault,60.91
336,23.6,4,140.0,95.65,2905,14.3,80,usa,ford,95.65
354,34.5,4,100.0,77.97,2320,15.8,81,europe,renault,77.97
374,23.0,4,151.0,86.83,3035,20.5,82,usa,amc,86.83


In [7]:
# Display df2 as the cell result; by this point it carries the shortened
# "name" column, the filled-in "horsepower" and the "horsepower_pred" column.
df2

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name,horsepower_pred
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet,137.77
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick,162.51
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth,150.17
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc,148.80
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford,139.94
...,...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790,15.6,82,usa,ford,88.51
394,44.0,4,97.0,52.0,2130,24.6,82,europe,vw,53.94
395,32.0,4,135.0,84.0,2295,11.6,82,usa,dodge,87.49
396,28.0,4,120.0,79.0,2625,18.6,82,usa,ford,81.59
