In [11]:
# Load the 'auto-mpg.data' dataset into a pandas DataFrame with `read_csv`.
# Prerequisite: pandas must be installed (e.g. run "pip3 install pandas" in a terminal).

import pandas as pd

# The raw file has no header row, so the column names are supplied explicitly.
column_names = ['MPG', 'Cylinder', 'Displacement', 'Horsepower', 'Weight', 'Acceleration', 'Model Year', 'Origin', 'Car Name']

# Fields are separated by runs of whitespace. `sep=r'\s+'` is the documented
# equivalent of `delim_whitespace=True`, which is deprecated since pandas 2.2
# and emits a FutureWarning.
data = pd.read_csv('auto-mpg.data', sep=r'\s+', names=column_names)

# Display the first 5 rows of the DataFrame together with the column headers.
print(data.head())

    MPG  Cylinder  Displacement Horsepower  Weight  Acceleration  Model Year  \
0  18.0         8         307.0      130.0  3504.0          12.0          70   
1  15.0         8         350.0      165.0  3693.0          11.5          70   
2  18.0         8         318.0      150.0  3436.0          11.0          70   
3  16.0         8         304.0      150.0  3433.0          12.0          70   
4  17.0         8         302.0      140.0  3449.0          10.5          70   

   Origin                   Car Name  
0       1  chevrolet chevelle malibu  
1       1          buick skylark 320  
2       1         plymouth satellite  
3       1              amc rebel sst  
4       1                ford torino  


In [12]:
# Handle missing values: the 'Horsepower' column uses '?' as a placeholder for
# missing entries. Those entries are replaced with the mean of the other values
# in the column.

import numpy as np

# Replace '?' with NaN and convert the column to float.
# The result is assigned back to the column instead of calling
# `.replace(..., inplace=True)` on `data['Horsepower']`: in-place methods on a
# column selection are chained assignment, which warns in pandas 2.x and no
# longer updates `data` once Copy-on-Write is enabled.
data['Horsepower'] = data['Horsepower'].replace('?', np.nan).astype(float)

# Fill the NaNs with the column mean (`mean()` skips NaN by default),
# again assigning back rather than mutating a column selection in place.
data['Horsepower'] = data['Horsepower'].fillna(data['Horsepower'].mean())
print(data.head())

    MPG  Cylinder  Displacement  Horsepower  Weight  Acceleration  Model Year  \
0  18.0         8         307.0       130.0  3504.0          12.0          70   
1  15.0         8         350.0       165.0  3693.0          11.5          70   
2  18.0         8         318.0       150.0  3436.0          11.0          70   
3  16.0         8         304.0       150.0  3433.0          12.0          70   
4  17.0         8         302.0       140.0  3449.0          10.5          70   

   Origin                   Car Name  
0       1  chevrolet chevelle malibu  
1       1          buick skylark 320  
2       1         plymouth satellite  
3       1              amc rebel sst  
4       1                ford torino  


In [13]:
# Min-max scale the 'MPG' column: subtract the column minimum and divide by
# the column range, so the smallest value maps to 0 and the largest to 1.
data['MPG'] = (
    data['MPG']
    .sub(data['MPG'].min())
    .div(data['MPG'].max() - data['MPG'].min())
)
print(data.head())

        MPG  Cylinder  Displacement  Horsepower  Weight  Acceleration  \
0  0.239362         8         307.0       130.0  3504.0          12.0   
1  0.159574         8         350.0       165.0  3693.0          11.5   
2  0.239362         8         318.0       150.0  3436.0          11.0   
3  0.186170         8         304.0       150.0  3433.0          12.0   
4  0.212766         8         302.0       140.0  3449.0          10.5   

   Model Year  Origin                   Car Name  
0          70       1  chevrolet chevelle malibu  
1          70       1          buick skylark 320  
2          70       1         plymouth satellite  
3          70       1              amc rebel sst  
4          70       1                ford torino  


In [15]:
# 'Origin' holds categorical codes, so expand it into one indicator column per
# distinct value (named 'Origin_<value>'); the original 'Origin' column is
# replaced by these indicator columns.

data = pd.get_dummies(data=data, columns=['Origin'])
print(data.head())

        MPG  Cylinder  Displacement  Horsepower  Weight  Acceleration  \
0  0.239362         8         307.0       130.0  3504.0          12.0   
1  0.159574         8         350.0       165.0  3693.0          11.5   
2  0.239362         8         318.0       150.0  3436.0          11.0   
3  0.186170         8         304.0       150.0  3433.0          12.0   
4  0.212766         8         302.0       140.0  3449.0          10.5   

   Model Year                   Car Name  Origin_1  Origin_2  Origin_3  
0          70  chevrolet chevelle malibu      True     False     False  
1          70          buick skylark 320      True     False     False  
2          70         plymouth satellite      True     False     False  
3          70              amc rebel sst      True     False     False  
4          70                ford torino      True     False     False  


In [16]:
# Write the preprocessed DataFrame to 'cleaned_data.csv'.
# index=False keeps the row index out of the file.

data.to_csv(path_or_buf='cleaned_data.csv', index=False)