# Data Cleaning

In [1]:
import re

import numpy as np
import pandas as pd

In [11]:
# Read the raw survey export from disk, then preview its first five rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a
# previously saved row index -- confirm whether it should be dropped.
raw_data = pd.read_csv(filepath_or_buffer='survey_results_raw.csv')

raw_data.head(n=5)


Unnamed: 0,Owner Name,Cat Name,Cat Age,Cat Breed,Hair Type,Self-Playing Activities,Interactive Activities,Activity Duration (min),Zoomies,Food Types,...,Teeth Cleaning Method,Hairball Prevention Frequency (times/week),Hairball Prevention Measures,Average Sleep Duration (hours/night),Sleep Quality (1-10),Awareness of Teeth Cleaning Needs,Awareness of Shedding Management,Awareness of Ear and Eye Care,Awareness of Playtime Importance,Awareness of Hairball Prevention
0,Alexander Wilkinson,Ginger,7,Persian,Short Hair,Hunting,,66,No,Dry Food,...,Professional Cleaning,3,Special Food,12.1,6,Yes,Yes,No,Yes,No
1,Beth Payne,Callie,15,Siamese,Middle Hair,Running,Fetching,38,No,Wet Food,...,Brushing,4,Malt Paste,11.9,5,No,Yes,Yes,No,No
2,Kathy Wright,Pepper,11,Siamese,Short Hair,Running,Fetching,87,Yes,Dry Food,...,Brushing,4,Malt Paste,13.3,7,Yes,No,No,No,No
3,Brandon Miles MD,Callie,8,Bengal,Middle Hair,Running,,101,No,Dry Food,...,Professional Cleaning,6,,9.1,5,Yes,No,No,No,No
4,Jeremy Bradley,Sammy,7,Siamese,Middle Hair,Running,Playing with toys,78,No,Raw Diet,...,,4,Brushing,8.2,9,No,No,No,Yes,Yes


In [12]:
# Data Cleaning

# Standardize Date Formats
# Every listed column is parsed as a date (entries that cannot be parsed are
# coerced to missing) and then rewritten as 'YYYY-MM-DD' text.
date_columns = [
    'Last Vet Visit',
    'Next Vaccination Date',
    'Last Anti-parasitic Treatment',
    'Last Teeth Cleaning',
]
for date_col in date_columns:
    parsed_dates = pd.to_datetime(raw_data[date_col], errors='coerce')
    raw_data[date_col] = parsed_dates.dt.strftime('%Y-%m-%d')

# Standardize Weight
def convert_weight(value):
    """Convert one raw weight entry to kilograms, rounded to two decimals.

    Accepted inputs:
      * numbers and plain numeric strings ('4.5') -- taken as kilograms;
      * '<n> and a half kilos'                    -- n + 0.5;
      * '<n> pounds' / '<n> lb' / '<n>lbs'        -- n divided by 2.20462;
      * '<n> kg' / '<n>kg' / '<n> kilos' etc.     -- n (already kilograms).

    Unit matching is case-insensitive and tolerates surrounding whitespace
    and a missing space between the number and the unit.

    Parameters
    ----------
    value : Any
        A single cell from the weight column.

    Returns
    -------
    float
        Weight in kilograms rounded to 2 decimals, or ``np.nan`` when the
        entry cannot be interpreted (missing value, unknown unit, free text).
    """
    pounds_per_kg = 2.20462
    kilogram_units = ('kg', 'kgs', 'kilo', 'kilos', 'kilogram', 'kilograms')
    try:
        if isinstance(value, str):
            text = value.strip().lower()
            try:
                # Plain numeric strings need no unit handling at all.
                return round(float(text), 2)
            except ValueError:
                pass
            # Split the entry into a leading number and whatever follows it.
            found = re.match(
                r'([+-]?(?:\d+(?:\.\d*)?|\.\d+)(?:e[+-]?\d+)?)\s*(.*)',
                text,
                flags=re.DOTALL,
            )
            if found is None:
                return np.nan
            amount = float(found.group(1))
            unit = found.group(2)
            if 'and a half kilos' in unit:
                value = amount + 0.5
            elif 'pounds' in unit or 'lb' in unit:
                value = amount / pounds_per_kg
            elif unit in kilogram_units:
                value = amount
            else:
                # Unknown trailing text: refuse to guess the unit.
                return np.nan
        return round(float(value), 2)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "unparseable"; anything else
        # (e.g. KeyboardInterrupt) must propagate.
        return np.nan

# Normalize every weight entry to a kilogram float.
raw_weights = raw_data['Current Weight (kg)']
raw_data['Current Weight (kg)'] = raw_weights.apply(convert_weight)

# Handle Missing Values
# Text columns receive an explicit placeholder instead of staying empty.
# NOTE(review): pandas.read_csv may treat the literal 'None' as a missing
# value when the saved file is loaded again -- confirm the placeholder
# survives a round trip.
placeholder_by_column = {
    'Next Vaccination Date': 'Unknown',
    'Last Vet Visit': 'Unknown',
    'Feeding Schedule': 'Unknown',
    'Interactive Activities': 'None',
}
for text_col, placeholder in placeholder_by_column.items():
    raw_data[text_col] = raw_data[text_col].fillna(placeholder)

# Ensure no missing values for Sleep Quality and Average Sleep Duration:
# gaps are filled with the mean of the column's own values.
for sleep_col in ('Sleep Quality (1-10)', 'Average Sleep Duration (hours/night)'):
    column_mean = raw_data[sleep_col].mean()
    raw_data[sleep_col] = raw_data[sleep_col].fillna(column_mean)

# Standardize Categorical Data
def standardize_feeding_schedule(schedule):
    """Collapse a free-text feeding schedule into a canonical label.

    Matching is case-insensitive:
      * text containing 'routine'          -> 'Routine Feeding'
      * text containing 'free' or 'always' -> 'Free Feeding'
      * the 'Unknown' placeholder          -> 'Unknown' (capitalization kept,
        so it matches the placeholder used for the other columns)
      * any other text                     -> the text lowercased

    Parameters
    ----------
    schedule : Any
        A single cell from the 'Feeding Schedule' column.

    Returns
    -------
    Any
        The canonical label for string input. Non-string input (for example
        a missing value) is returned unchanged instead of raising
        AttributeError on ``.lower()``.
    """
    if not isinstance(schedule, str):
        return schedule

    lowered = schedule.lower()
    if 'routine' in lowered:
        return 'Routine Feeding'
    if 'free' in lowered or 'always' in lowered:
        return 'Free Feeding'
    if lowered.strip() == 'unknown':
        # Keep the missing-value placeholder spelled the same everywhere.
        return 'Unknown'
    return lowered

# Map every feeding-schedule entry onto its canonical label.
normalized_schedule = raw_data['Feeding Schedule'].apply(standardize_feeding_schedule)
raw_data['Feeding Schedule'] = normalized_schedule

# Save the cleaned data (the row index is not written to the file)
raw_data.to_csv(path_or_buf='cleaned_cat_survey_results.csv', index=False)

print("Data cleaning completed and saved as 'cleaned_cat_survey_results.csv'.")

Data cleaning completed and saved as 'cleaned_cat_survey_results.csv'.
