0. Setup

In [None]:
import pandas as pd
import numpy as np

# Location of the raw CSV that the rest of the notebook cleans.
url = 'https://bit.ly/messy_csv'

# Read the remote CSV into the untouched "raw" frame.
messy_df = pd.read_csv(filepath_or_buffer=url)

# Preview the first five rows.
messy_df.head(n=5)

1. Removing duplicates

In [None]:
# Keep the first occurrence of every row and discard later exact repeats.
# The original index labels are preserved (no re-numbering).
df_unique = messy_df[~messy_df.duplicated(keep='first')]

df_unique.head(n=5)

2. Handling missing values (either `fillna` to fill them in, or `dropna` to remove rows with missing data)

In [None]:
# Retain only the rows in which every column has a value. The explicit copy
# gives df_cleaned its own data, so assigning to its columns later does not
# affect df_unique.
df_cleaned = df_unique[df_unique.notna().all(axis=1)].copy()

df_cleaned.head(n=5)

3. Ensuring consistent data types (dates, strings)

In [None]:
# Parse the collection dates into pandas datetimes.
df_cleaned['collection date'] = pd.to_datetime(df_cleaned['collection date'])

# Cast every measurement column to float in a single call instead of repeating
# the same statement per column. astype returns a new frame, so re-running this
# cell is harmless.
df_cleaned = df_cleaned.astype({
    'temperature_c': float,
    'pH': float,
    'dissolved_oxygen_mg_L': float,
})

# One dtype per column is enough to verify the conversion; printing
# series.apply(type) would emit one line per row.
df_cleaned.dtypes

4. Formatting the ‘site’ column for consistency

In [None]:
# Normalize every value in 'site' with one vectorized rule instead of patching
# individual rows by index label. Hard-coded labels only fit this exact file,
# and if a label was removed by the earlier drop steps,
# `.loc[label, col] = value` would silently append a new, mostly-empty row.
# NOTE(review): assumes the inconsistent entries differ from the 'site_x' form
# only by case, surrounding whitespace, or a space/hyphen separator -- confirm
# with df_cleaned['site'].unique().
df_cleaned['site'] = (
    df_cleaned['site']
    .str.strip()
    .str.lower()
    .str.replace(r'[\s\-]+', '_', regex=True)
)

df_cleaned.head()

5. Making sure all column names are lower case, without whitespace.

In [None]:
# Rename the columns on the frame itself. Iterating a DataFrame yields its
# column labels, so assigning a list comprehension over df_cleaned back to
# df_cleaned would replace the whole frame with a list of strings.
df_cleaned = df_cleaned.rename(
    columns=lambda col: col.strip().lower().replace(" ", "_")
)

# Show the resulting column names, then a preview of the cleaned data.
print(list(df_cleaned.columns))
df_cleaned.head()