# Data Cleaning

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the raw data set into a DataFrame.
# 'test_data.csv' is a relative path, so it is resolved against the
# notebook's current working directory.
df = pd.read_csv('test_data.csv')

In [3]:
# Show summary statistics for the numeric columns (describe() leaves out
# non-numeric columns by default). A `count` lower than the number of rows
# indicates missing values in that column.
df.describe()

Unnamed: 0,ID,Age,Score
count,8.0,6.0,7.0
mean,4.5,25.0,66.285714
std,2.44949,6.78233,32.57665
min,1.0,18.0,-1.0
25%,2.75,19.75,60.0
50%,4.5,23.5,78.0
75%,6.25,29.5,87.5
max,8.0,35.0,92.0


## Filtering a DataFrame

In [4]:
# Keep only the rows whose Age is strictly greater than 18.
# Rows with a missing Age never satisfy the comparison, so they are left out too.
df_filt_age = df.loc[df['Age'].gt(18)]
df_filt_age

Unnamed: 0,ID,Name,Age,City,Score
0,1,Alicia,25.0,NY,85.0
2,3,Charlie,22.0,Chicago,-1.0
3,4,Dominic,19.0,Houston,65.0
4,5,Eve,31.0,Phoenix,78.0
5,6,Frank,35.0,Boston,


In [5]:
# Keep only the rows whose Score lies strictly between 50 and 100
# (both bounds excluded; a missing Score fails both comparisons).
df_filter_score = df.loc[df['Score'].gt(50) & df['Score'].lt(100)]
df_filter_score

Unnamed: 0,ID,Name,Age,City,Score
0,1,Alicia,25.0,NY,85.0
1,2,Bob,,LA,92.0
3,4,Dominic,19.0,Houston,65.0
4,5,Eve,31.0,Phoenix,78.0
6,7,Grace,,Austin,55.0
7,8,Heidi,18.0,San Diego,90.0


## Modifying a DataFrame

In [6]:
# Remove rows where the score is less than 0 (i.e., invalid score data).
# A vectorized boolean mask is used instead of dropping row by row:
# DataFrame.drop(..., inplace=True) returns None, so its result must not be
# assigned to df_cleaned.
# Rows with a missing Score are kept, because NaN < 0 evaluates to False.
# `df` itself is rebound to the filtered frame because later cells build on it.
df = df[~(df['Score'] < 0)]
df_cleaned = df.copy()
df_cleaned

In [7]:
# Drop the City column; the resulting frame becomes the working copy df_cleaned
df_cleaned = df.drop('City', axis='columns')
df_cleaned

Unnamed: 0,ID,Name,Age,Score
0,1,Alicia,25.0,85.0
1,2,Bob,,92.0
3,4,Dominic,19.0,65.0
4,5,Eve,31.0,78.0
5,6,Frank,35.0,
6,7,Grace,,55.0
7,8,Heidi,18.0,90.0


In [8]:
# Fill every missing Age with the mean of the known ages
# (mean() ignores missing values when averaging).
mean_age = df_cleaned['Age'].mean()
df_cleaned.loc[df_cleaned['Age'].isna(), 'Age'] = mean_age
df_cleaned

Unnamed: 0,ID,Name,Age,Score
0,1,Alicia,25.0,85.0
1,2,Bob,25.6,92.0
3,4,Dominic,19.0,65.0
4,5,Eve,31.0,78.0
5,6,Frank,35.0,
6,7,Grace,25.6,55.0
7,8,Heidi,18.0,90.0


In [9]:
# Swap any Score equal to 0 for the mean Score of the current frame.
# The dict form restricts the replacement to the Score column only.
mean_score = df_cleaned['Score'].mean()
df_cleaned = df_cleaned.replace(to_replace={'Score': 0}, value=mean_score)
df_cleaned

Unnamed: 0,ID,Name,Age,Score
0,1,Alicia,25.0,85.0
1,2,Bob,25.6,92.0
3,4,Dominic,19.0,65.0
4,5,Eve,31.0,78.0
5,6,Frank,35.0,
6,7,Grace,25.6,55.0
7,8,Heidi,18.0,90.0


In [10]:
# Discard every row that still has at least one missing value in any column
df_cleaned = df_cleaned.dropna(axis='index', how='any')
df_cleaned

Unnamed: 0,ID,Name,Age,Score
0,1,Alicia,25.0,85.0
1,2,Bob,25.6,92.0
3,4,Dominic,19.0,65.0
4,5,Eve,31.0,78.0
6,7,Grace,25.6,55.0
7,8,Heidi,18.0,90.0


In [11]:
# Promote the ID column to the row index; set_index removes the column
# from the frame by default, so ID no longer appears as a regular column.
df_cleaned = df_cleaned.set_index(keys='ID')
df_cleaned

Unnamed: 0_level_0,Name,Age,Score
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Alicia,25.0,85.0
2,Bob,25.6,92.0
4,Dominic,19.0,65.0
5,Eve,31.0,78.0
7,Grace,25.6,55.0
8,Heidi,18.0,90.0
