### Handling Missing Data

In [4]:
import pandas as pd
import warnings 
  
# Settings the warnings to be ignored 
warnings.filterwarnings('ignore') 

dataframe  = pd.read_csv('employee.csv')

In [5]:
# Displaying using Jupyter Notebook
dataframe

Unnamed: 0,First Name,Gender,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,97308.0,6.945,True,Marketing
1,Thomas,Male,61933.0,,True,
2,Jerry,Male,,9.34,True,Finance
3,Dennis,n.a.,115163.0,10.125,False,Legal
4,,Female,0.0,11.598,,Finance
5,Angela,,,18.523,True,Engineering
6,Shawn,Male,111737.0,6.414,False,na
7,Rachel,Female,142032.0,12.599,False,Business Development
8,Linda,Female,57427.0,9.557,True,Client Services
9,Stephanie,Female,36844.0,5.574,True,Business Development


#### Exercise 1 Customizing Missing Data Values

In [6]:
import pandas as pd

# --- Added the code here ---
df  = pd.read_csv('employee.csv', na_values = {"Salary" : [0], "" : ['na']})
# ---------------------------

In [7]:
# Display using Jupyter Notebook
df

Unnamed: 0,First Name,Gender,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,97308.0,6.945,True,Marketing
1,Thomas,Male,61933.0,,True,
2,Jerry,Male,,9.34,True,Finance
3,Dennis,n.a.,115163.0,10.125,False,Legal
4,,Female,,11.598,,Finance
5,Angela,,,18.523,True,Engineering
6,Shawn,Male,111737.0,6.414,False,na
7,Rachel,Female,142032.0,12.599,False,Business Development
8,Linda,Female,57427.0,9.557,True,Client Services
9,Stephanie,Female,36844.0,5.574,True,Business Development


In [8]:
# --- Added the code here ---
missing_values = ["n.a.", "NA", "n/a", "na", 0]
# ---------------------------

In [10]:
# --- Added the code here ---
df  = pd.read_csv('employee.csv', na_values = missing_values)
# ---------------------------

In [11]:
# Display using Jupyter Notebook
df

Unnamed: 0,First Name,Gender,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,97308.0,6.945,True,Marketing
1,Thomas,Male,61933.0,,True,
2,Jerry,Male,,9.34,True,Finance
3,Dennis,,115163.0,10.125,False,Legal
4,,Female,,11.598,,Finance
5,Angela,,,18.523,True,Engineering
6,Shawn,Male,111737.0,6.414,False,
7,Rachel,Female,142032.0,12.599,False,Business Development
8,Linda,Female,57427.0,9.557,True,Client Services
9,Stephanie,Female,36844.0,5.574,True,Business Development


#### Exercise 2 Removing Rows With Missing Values

In [12]:
# drops all rows with NaN values

# --- Added the code here ---
clean_df = df.dropna(axis = 0, inplace = False)
# ---------------------------

In [13]:
clean_df

Unnamed: 0,First Name,Gender,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,97308.0,6.945,True,Marketing
7,Rachel,Female,142032.0,12.599,False,Business Development
8,Linda,Female,57427.0,9.557,True,Client Services
9,Stephanie,Female,36844.0,5.574,True,Business Development


In [14]:
# --- Added the code here ---
df.dropna(axis = 0, inplace = True, how= 'all')
# ---------------------------

In [15]:
df

Unnamed: 0,First Name,Gender,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,97308.0,6.945,True,Marketing
1,Thomas,Male,61933.0,,True,
2,Jerry,Male,,9.34,True,Finance
3,Dennis,,115163.0,10.125,False,Legal
4,,Female,,11.598,,Finance
5,Angela,,,18.523,True,Engineering
6,Shawn,Male,111737.0,6.414,False,
7,Rachel,Female,142032.0,12.599,False,Business Development
8,Linda,Female,57427.0,9.557,True,Client Services
9,Stephanie,Female,36844.0,5.574,True,Business Development


#### Filling out Missing Values

#### Exercise 3 Filling Missing Values with Column Mean

In [16]:
import pandas as pd

# Cleansing the missing values
missing_values = ["n.a.", "NA", "n/a", "na", 0]
df  = pd.read_csv('employee.csv', na_values = missing_values)

# Removing all NaN
df.dropna(axis = 0, inplace = True, how = 'all')

# Show df
df

Unnamed: 0,First Name,Gender,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,97308.0,6.945,True,Marketing
1,Thomas,Male,61933.0,,True,
2,Jerry,Male,,9.34,True,Finance
3,Dennis,,115163.0,10.125,False,Legal
4,,Female,,11.598,,Finance
5,Angela,,,18.523,True,Engineering
6,Shawn,Male,111737.0,6.414,False,
7,Rachel,Female,142032.0,12.599,False,Business Development
8,Linda,Female,57427.0,9.557,True,Client Services
9,Stephanie,Female,36844.0,5.574,True,Business Development


In [17]:
# Using mean

# --- Added the code here ---
df['Salary'].fillna(int(df['Salary'].mean()), inplace = True)
# ---------------------------

In [18]:
df

Unnamed: 0,First Name,Gender,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,97308.0,6.945,True,Marketing
1,Thomas,Male,61933.0,,True,
2,Jerry,Male,88920.0,9.34,True,Finance
3,Dennis,,115163.0,10.125,False,Legal
4,,Female,88920.0,11.598,,Finance
5,Angela,,88920.0,18.523,True,Engineering
6,Shawn,Male,111737.0,6.414,False,
7,Rachel,Female,142032.0,12.599,False,Business Development
8,Linda,Female,57427.0,9.557,True,Client Services
9,Stephanie,Female,36844.0,5.574,True,Business Development


#### Exercise 4 Filling Missing Values with Column Median

In [19]:
import pandas as pd

# Cleansing the missing values
missing_values = ["n.a.", "NA", "n/a", "na", 0]
df  = pd.read_csv('employee.csv', na_values = missing_values)

# Removing all NaN
df.dropna(axis = 0, inplace = True, how = 'all')

# Show df
df

Unnamed: 0,First Name,Gender,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,97308.0,6.945,True,Marketing
1,Thomas,Male,61933.0,,True,
2,Jerry,Male,,9.34,True,Finance
3,Dennis,,115163.0,10.125,False,Legal
4,,Female,,11.598,,Finance
5,Angela,,,18.523,True,Engineering
6,Shawn,Male,111737.0,6.414,False,
7,Rachel,Female,142032.0,12.599,False,Business Development
8,Linda,Female,57427.0,9.557,True,Client Services
9,Stephanie,Female,36844.0,5.574,True,Business Development


In [20]:
# Using median

# --- Added the code here ---
df['Salary'].fillna(int(df['salary'].median()), inplace = True)
# ---------------------------

KeyError: 'salary'

In [21]:
df

Unnamed: 0,First Name,Gender,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,97308.0,6.945,True,Marketing
1,Thomas,Male,61933.0,,True,
2,Jerry,Male,,9.34,True,Finance
3,Dennis,,115163.0,10.125,False,Legal
4,,Female,,11.598,,Finance
5,Angela,,,18.523,True,Engineering
6,Shawn,Male,111737.0,6.414,False,
7,Rachel,Female,142032.0,12.599,False,Business Development
8,Linda,Female,57427.0,9.557,True,Client Services
9,Stephanie,Female,36844.0,5.574,True,Business Development


#### Exercise 5 Filling Missing Values with Column Mode

In [22]:
import pandas as pd

# Cleansing the missing values
missing_values = ["n.a.", "NA", "n/a", "na", 0]
df  = pd.read_csv('employee.csv', na_values = missing_values)

# Removing all NaN
df.dropna(axis = 0, inplace = True, how = 'all')

# Show df
df

Unnamed: 0,First Name,Gender,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,97308.0,6.945,True,Marketing
1,Thomas,Male,61933.0,,True,
2,Jerry,Male,,9.34,True,Finance
3,Dennis,,115163.0,10.125,False,Legal
4,,Female,,11.598,,Finance
5,Angela,,,18.523,True,Engineering
6,Shawn,Male,111737.0,6.414,False,
7,Rachel,Female,142032.0,12.599,False,Business Development
8,Linda,Female,57427.0,9.557,True,Client Services
9,Stephanie,Female,36844.0,5.574,True,Business Development


In [23]:
# Using mode

# --- Added the code here ---
df['Salary'].fillna(df['Salary'].mode(), inplace = True)
# ---------------------------

In [24]:
df

Unnamed: 0,First Name,Gender,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,97308.0,6.945,True,Marketing
1,Thomas,Male,61933.0,,True,
2,Jerry,Male,61933.0,9.34,True,Finance
3,Dennis,,115163.0,10.125,False,Legal
4,,Female,111737.0,11.598,,Finance
5,Angela,,115163.0,18.523,True,Engineering
6,Shawn,Male,111737.0,6.414,False,
7,Rachel,Female,142032.0,12.599,False,Business Development
8,Linda,Female,57427.0,9.557,True,Client Services
9,Stephanie,Female,36844.0,5.574,True,Business Development


#### Exercise 6 Filling Missing Values with a Constant

In [25]:
import pandas as pd

# Cleansing the missing values
missing_values = ["n.a.", "NA", "n/a", "na", 0]
df  = pd.read_csv('employee.csv', na_values = missing_values)

# Removing all NaN
df.dropna(axis = 0, inplace = True, how = 'all')

# Show df
df

Unnamed: 0,First Name,Gender,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,97308.0,6.945,True,Marketing
1,Thomas,Male,61933.0,,True,
2,Jerry,Male,,9.34,True,Finance
3,Dennis,,115163.0,10.125,False,Legal
4,,Female,,11.598,,Finance
5,Angela,,,18.523,True,Engineering
6,Shawn,Male,111737.0,6.414,False,
7,Rachel,Female,142032.0,12.599,False,Business Development
8,Linda,Female,57427.0,9.557,True,Client Services
9,Stephanie,Female,36844.0,5.574,True,Business Development


In [26]:
# --- Added the code here ---
df['Salary'].fillna(0, inplace = True)
# ---------------------------

In [27]:
df

Unnamed: 0,First Name,Gender,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,97308.0,6.945,True,Marketing
1,Thomas,Male,61933.0,,True,
2,Jerry,Male,0.0,9.34,True,Finance
3,Dennis,,115163.0,10.125,False,Legal
4,,Female,0.0,11.598,,Finance
5,Angela,,0.0,18.523,True,Engineering
6,Shawn,Male,111737.0,6.414,False,
7,Rachel,Female,142032.0,12.599,False,Business Development
8,Linda,Female,57427.0,9.557,True,Client Services
9,Stephanie,Female,36844.0,5.574,True,Business Development


#### Exercise 7 Forward Fill Missing DataFrame Values

In [28]:
import pandas as pd

# Cleansing the missing values
missing_values = ["n.a.", "NA", "n/a", "na", 0]
df  = pd.read_csv('employee.csv', na_values = missing_values)

# Removing all NaN
df.dropna(axis = 0, inplace = True, how = 'all')

# Show df
df

Unnamed: 0,First Name,Gender,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,97308.0,6.945,True,Marketing
1,Thomas,Male,61933.0,,True,
2,Jerry,Male,,9.34,True,Finance
3,Dennis,,115163.0,10.125,False,Legal
4,,Female,,11.598,,Finance
5,Angela,,,18.523,True,Engineering
6,Shawn,Male,111737.0,6.414,False,
7,Rachel,Female,142032.0,12.599,False,Business Development
8,Linda,Female,57427.0,9.557,True,Client Services
9,Stephanie,Female,36844.0,5.574,True,Business Development


In [29]:
# --- Added the code here ---
df['Salary'].fillna(method = 'ffill', inplace = True)
# ---------------------------

In [30]:
df

Unnamed: 0,First Name,Gender,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,97308.0,6.945,True,Marketing
1,Thomas,Male,61933.0,,True,
2,Jerry,Male,61933.0,9.34,True,Finance
3,Dennis,,115163.0,10.125,False,Legal
4,,Female,115163.0,11.598,,Finance
5,Angela,,115163.0,18.523,True,Engineering
6,Shawn,Male,111737.0,6.414,False,
7,Rachel,Female,142032.0,12.599,False,Business Development
8,Linda,Female,57427.0,9.557,True,Client Services
9,Stephanie,Female,36844.0,5.574,True,Business Development


#### Exercise 8 Backward Fill Missing DataFrame Values

In [31]:
import pandas as pd

# Cleansing the missing values
missing_values = ["n.a.", "NA", "n/a", "na", 0]
df  = pd.read_csv('employee.csv', na_values = missing_values)

# Removing all NaN
df.dropna(axis = 0, inplace = True, how = 'all')

# Show df
df

Unnamed: 0,First Name,Gender,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,97308.0,6.945,True,Marketing
1,Thomas,Male,61933.0,,True,
2,Jerry,Male,,9.34,True,Finance
3,Dennis,,115163.0,10.125,False,Legal
4,,Female,,11.598,,Finance
5,Angela,,,18.523,True,Engineering
6,Shawn,Male,111737.0,6.414,False,
7,Rachel,Female,142032.0,12.599,False,Business Development
8,Linda,Female,57427.0,9.557,True,Client Services
9,Stephanie,Female,36844.0,5.574,True,Business Development


In [32]:
# --- Added the code here ---
df['Team'].fillna(method = 'bfill', inplace = True)
# ---------------------------

In [33]:
df

Unnamed: 0,First Name,Gender,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,97308.0,6.945,True,Marketing
1,Thomas,Male,61933.0,,True,Finance
2,Jerry,Male,,9.34,True,Finance
3,Dennis,,115163.0,10.125,False,Legal
4,,Female,,11.598,,Finance
5,Angela,,,18.523,True,Engineering
6,Shawn,Male,111737.0,6.414,False,Business Development
7,Rachel,Female,142032.0,12.599,False,Business Development
8,Linda,Female,57427.0,9.557,True,Client Services
9,Stephanie,Female,36844.0,5.574,True,Business Development


#### Exercise 9 Fill Missing DataFrame Values with Interpolation - polynomial

In [34]:
import pandas as pd

# Cleansing the missing values
missing_values = ["n.a.", "NA", "n/a", "na", 0]
df  = pd.read_csv('employee.csv', na_values = missing_values)

# Removing all NaN
df.dropna(axis = 0, inplace = True, how = 'all')

# Show df
df

Unnamed: 0,First Name,Gender,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,97308.0,6.945,True,Marketing
1,Thomas,Male,61933.0,,True,
2,Jerry,Male,,9.34,True,Finance
3,Dennis,,115163.0,10.125,False,Legal
4,,Female,,11.598,,Finance
5,Angela,,,18.523,True,Engineering
6,Shawn,Male,111737.0,6.414,False,
7,Rachel,Female,142032.0,12.599,False,Business Development
8,Linda,Female,57427.0,9.557,True,Client Services
9,Stephanie,Female,36844.0,5.574,True,Business Development


In [35]:
# --- Added the code here ---
df['Salary'].interpolate(method = 'polynomial', order = 5 ,inplace = True)
# ---------------------------

In [36]:
df

Unnamed: 0,First Name,Gender,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,97308.0,6.945,True,Marketing
1,Thomas,Male,61933.0,,True,
2,Jerry,Male,108558.907129,9.34,True,Finance
3,Dennis,,115163.0,10.125,False,Legal
4,,Female,85046.355671,11.598,,Finance
5,Angela,,73044.323666,18.523,True,Engineering
6,Shawn,Male,111737.0,6.414,False,
7,Rachel,Female,142032.0,12.599,False,Business Development
8,Linda,Female,57427.0,9.557,True,Client Services
9,Stephanie,Female,36844.0,5.574,True,Business Development


#### Exercise 10 Fill Missing DataFrame Values with Interpolation - spline

In [37]:
import pandas as pd

# Cleansing the missing values
missing_values = ["n.a.", "NA", "n/a", "na", 0]
df  = pd.read_csv('employee.csv', na_values = missing_values)

# Removing all NaN
df.dropna(axis = 0, inplace = True, how = 'all')

# Show df
df

Unnamed: 0,First Name,Gender,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,97308.0,6.945,True,Marketing
1,Thomas,Male,61933.0,,True,
2,Jerry,Male,,9.34,True,Finance
3,Dennis,,115163.0,10.125,False,Legal
4,,Female,,11.598,,Finance
5,Angela,,,18.523,True,Engineering
6,Shawn,Male,111737.0,6.414,False,
7,Rachel,Female,142032.0,12.599,False,Business Development
8,Linda,Female,57427.0,9.557,True,Client Services
9,Stephanie,Female,36844.0,5.574,True,Business Development


In [38]:
# --- Added the code here ---
df['Salary'].interpolate(method='spline', order = 5, inplace = True)
# ---------------------------

In [39]:
df

Unnamed: 0,First Name,Gender,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,97308.0,6.945,True,Marketing
1,Thomas,Male,61933.0,,True,
2,Jerry,Male,108557.233181,9.34,True,Finance
3,Dennis,,115163.0,10.125,False,Legal
4,,Female,85049.266288,11.598,,Finance
5,Angela,,73048.433135,18.523,True,Engineering
6,Shawn,Male,111737.0,6.414,False,
7,Rachel,Female,142032.0,12.599,False,Business Development
8,Linda,Female,57427.0,9.557,True,Client Services
9,Stephanie,Female,36844.0,5.574,True,Business Development


#### Exercise 11 Fill Missing DataFrame Values with Interpolation - linear

In [40]:
import pandas as pd

# Cleansing the missing values
missing_values = ["n.a.", "NA", "n/a", "na", 0]
df  = pd.read_csv('employee.csv', na_values = missing_values)

# Removing all NaN
df.dropna(axis = 0, inplace = True, how = 'all')

# Show df
df

Unnamed: 0,First Name,Gender,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,97308.0,6.945,True,Marketing
1,Thomas,Male,61933.0,,True,
2,Jerry,Male,,9.34,True,Finance
3,Dennis,,115163.0,10.125,False,Legal
4,,Female,,11.598,,Finance
5,Angela,,,18.523,True,Engineering
6,Shawn,Male,111737.0,6.414,False,
7,Rachel,Female,142032.0,12.599,False,Business Development
8,Linda,Female,57427.0,9.557,True,Client Services
9,Stephanie,Female,36844.0,5.574,True,Business Development


In [41]:
# --- Added the code here ---
df['Salary'].interpolate(method='linear', inplace=True)
# ---------------------------

In [42]:
df

Unnamed: 0,First Name,Gender,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,97308.0,6.945,True,Marketing
1,Thomas,Male,61933.0,,True,
2,Jerry,Male,88548.0,9.34,True,Finance
3,Dennis,,115163.0,10.125,False,Legal
4,,Female,114021.0,11.598,,Finance
5,Angela,,112879.0,18.523,True,Engineering
6,Shawn,Male,111737.0,6.414,False,
7,Rachel,Female,142032.0,12.599,False,Business Development
8,Linda,Female,57427.0,9.557,True,Client Services
9,Stephanie,Female,36844.0,5.574,True,Business Development


#### Exercise 12 Writing CSV File with to_csv( )

In [45]:
# --- Added the code here ---
df.to_csv('cleaned_employee.csv')
# ---------------------------

-----

#### Revised Date: July 18, 2024