In [None]:
# Mount Google Drive at /content/drive so later cells can read files stored there
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Import packages
import pandas as pd
import numpy as np

# Location of the automobile CSV inside the mounted Drive folder
AUTO_CSV_PATH = "/content/drive/MyDrive/Colab Notebooks/ZybookDataScience/Ch5-Data-Wrangling/Data/autodata.csv"

# Read the CSV into a DataFrame
auto = pd.read_csv(AUTO_CSV_PATH)

# Render the whole table as the cell output
auto

Unnamed: 0,Manufacturer,Model,Drive,EngineType,Cylinders,Liters,MPG
0,Audi,A4,All,Gas,4.0,2.0,24.0
1,BMW,328 Ci,Rear,Gas,6.0,3.6,20.0
2,Bentley,Continental,Rear,Gas,,,210.0
3,Chevrolet,Malibu,Front,Gas,6.0,3.6,18.0
4,Ford,Mustang,Rear,Gas,6.0,3.7,
5,Rolls-Royce,Ghost,Rear,Gas,12.0,6.6,12.0
6,Chevrolet,Malibu,Front,Gas,6.0,3.6,18.0


In [None]:
# Boolean mask of auto: True wherever a cell holds a missing value
auto.isna()

Unnamed: 0,Manufacturer,Model,Drive,EngineType,Cylinders,Liters,MPG
0,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False
2,False,False,False,False,True,True,False
3,False,False,False,False,False,False,False
4,False,False,False,False,False,False,True
5,False,False,False,False,False,False,False
6,False,False,False,False,False,False,False


In [None]:
# Count the rows with missing values for each feature, starting from the raw boolean array
missing_mask = auto.isnull().values
print(missing_mask)

# The built-in sum adds the row arrays element-wise, leaving one count per column
sum(missing_mask)


[[False False False False False False False]
 [False False False False False False False]
 [False False False False  True  True False]
 [False False False False False False False]
 [False False False False False False  True]
 [False False False False False False False]
 [False False False False False False False]]


array([0, 0, 0, 0, 1, 1, 1])

In [None]:
# Preferred approach: let pandas total the missing-value mask column by column
auto.isna().sum()


Unnamed: 0,0
Manufacturer,0
Model,0
Drive,0
EngineType,0
Cylinders,1
Liters,1
MPG,1


In [None]:
# Drop repeated rows from auto in place, keeping the first occurrence of each
auto.drop_duplicates(keep='first', inplace=True)
auto

Unnamed: 0,Manufacturer,Model,Drive,EngineType,Cylinders,Liters,MPG
0,Audi,A4,All,Gas,4.0,2.0,24.0
1,BMW,328 Ci,Rear,Gas,6.0,3.6,20.0
2,Bentley,Continental,Rear,Gas,,,210.0
3,Chevrolet,Malibu,Front,Gas,6.0,3.6,18.0
4,Ford,Mustang,Rear,Gas,6.0,3.7,
5,Rolls-Royce,Ghost,Rear,Gas,12.0,6.6,12.0


In [None]:
# Drop (in place) the rows in which Cylinders and Liters are both missing
# how='all' -> a row is deleted only if every listed column is NaN (AND condition)
# how='any' -> a row would be deleted if at least one listed column is NaN (OR condition)
auto.dropna(how='all', subset=['Cylinders', 'Liters'], inplace=True)

auto

Unnamed: 0,Manufacturer,Model,Drive,EngineType,Cylinders,Liters,MPG
0,Audi,A4,All,Gas,4.0,2.0,24.0
1,BMW,328 Ci,Rear,Gas,6.0,3.6,20.0
3,Chevrolet,Malibu,Front,Gas,6.0,3.6,18.0
4,Ford,Mustang,Rear,Gas,6.0,3.7,
5,Rolls-Royce,Ghost,Rear,Gas,12.0,6.6,12.0


In [None]:
# Remove features (columns) that have a missing value in a given row.
#
# With axis=1 this is a column-wise operation and `subset` lists ROW LABELS,
# not positions: a column is dropped when it holds NaN in any listed row.
# Rows were removed earlier without resetting the index, so labels and
# positions no longer line up. Row label 4 (Ford Mustang) is the row with a
# missing MPG; row label 3 (Chevrolet Malibu) has no missing values, so
# subset=[3] would not drop any column.
auto1 = auto.dropna(axis=1, subset=[4])
print(auto1)



Unnamed: 0,Manufacturer,Model,Drive,EngineType,Cylinders,Liters
0,Audi,A4,All,Gas,4.0,2.0
1,BMW,328 Ci,Rear,Gas,6.0,3.6
3,Chevrolet,Malibu,Front,Gas,6.0,3.6
4,Ford,Mustang,Rear,Gas,6.0,3.7
5,Rolls-Royce,Ghost,Rear,Gas,12.0,6.6


In [None]:
# Typically, remove a column if that column has NaN,
# or remove a row if that row has NaN.
# Small demo frame: columns A and C each contain one NaN, column B is complete.
data = dict(
    A=[1, 2, np.nan, 4],
    B=[5, 6, 7, 8],
    C=[np.nan, 2, 3, 4],
)
df3 = pd.DataFrame(data)
df3


Unnamed: 0,A,B,C
0,1.0,5,
1,2.0,6,2.0
2,,7,3.0
3,4.0,8,4.0


In [None]:
# Typically, remove a column if that column has NaN,
# or remove a row if that row has NaN.

# Drop column C in place when it contains NaN. The membership check keeps the
# cell safe to re-run: once C has been dropped, df3["C"] would raise KeyError.
if "C" in df3.columns and df3["C"].isna().any():
  print("C has NaN")
  df3.drop(columns=["C"], inplace = True)

print (df3)
print ("\n\n")

# Drop row 2 into a new frame when that row contains NaN. Otherwise fall back
# to an unchanged copy so temp3 is always defined before it is printed.
if df3.loc[2].isna().any():
  temp3 = df3.drop(index=2)
else:
  temp3 = df3.copy()
print (temp3)

C has NaN
     A  B
0  1.0  5
1  2.0  6
2  NaN  7
3  4.0  8



     A  B
0  1.0  5
1  2.0  6
3  4.0  8


In [None]:
# Show the frame as it stands before imputation
# NOTE(review): the saved output below includes the Bentley row, which the
# earlier in-place dropna cell removes -- the outputs appear to come from
# out-of-order execution; confirm with Restart & Run All.
print(auto)

# Column-wise averages of the numeric features only
mean = auto.mean(numeric_only=True)
print("\n\n")
print(mean)
print("\n\n")

# Replace each NaN with the mean of its own column, modifying auto in place
auto.fillna(mean, inplace=True)
auto

  Manufacturer        Model  Drive EngineType  Cylinders  Liters    MPG
0         Audi           A4    All        Gas        4.0     2.0   24.0
1          BMW       328 Ci   Rear        Gas        6.0     3.6   20.0
2      Bentley  Continental   Rear        Gas        NaN     NaN  210.0
3    Chevrolet       Malibu  Front        Gas        6.0     3.6   18.0
4         Ford      Mustang   Rear        Gas        6.0     3.7    NaN
5  Rolls-Royce        Ghost   Rear        Gas       12.0     6.6   12.0



Cylinders     6.8
Liters        3.9
MPG          56.8
dtype: float64





Unnamed: 0,Manufacturer,Model,Drive,EngineType,Cylinders,Liters,MPG
0,Audi,A4,All,Gas,4.0,2.0,24.0
1,BMW,328 Ci,Rear,Gas,6.0,3.6,20.0
2,Bentley,Continental,Rear,Gas,6.8,3.9,210.0
3,Chevrolet,Malibu,Front,Gas,6.0,3.6,18.0
4,Ford,Mustang,Rear,Gas,6.0,3.7,56.8
5,Rolls-Royce,Ghost,Rear,Gas,12.0,6.6,12.0
