# Data Cleaning
Jupyter notebook that filters the [Indoor House Plants Dataset with Care Instructions](https://www.kaggle.com/datasets/prakash27x/indoor-house-plants-dataset-with-care-instructions) dataset so that it only includes species present in the [🌱 House Plant Species 🌱](https://www.kaggle.com/datasets/kacpergregorowicz/house-plant-species) dataset, i.e. the species the developed ML model was trained on.

### **Setup:**

To run the notebook, you'll first need to download the Kaggle datasets available at:

[Indoor House Plants Dataset with Care Instructions](https://www.kaggle.com/datasets/prakash27x/indoor-house-plants-dataset-with-care-instructions)

[🌱 House Plant Species 🌱](https://www.kaggle.com/datasets/kacpergregorowicz/house-plant-species)

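Save these datasets in the `Data/` folder located one level above this notebook before running the code: the cells below read the species folders from `../Data/house_plant_species/` and the care instructions from `../Data/house_plants.json`.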

---

In [1]:
# Obtain a list of species
import os
import re

BASE_DIR = '../Data/house_plant_species'

# Captures the text inside the first pair of parentheses of a folder name.
# Compiled once here instead of being looked up on every loop iteration.
PAREN_NAME_RE = re.compile(r".*?\((.*?)\)")

species = []
# os.listdir() returns entries in arbitrary, platform-dependent order. The
# order of `species` drives the order of the matches (and the ids assigned to
# them) further down, so sort case-insensitively to make the run reproducible.
dir_content = sorted(os.listdir(BASE_DIR), key=str.casefold)

for class_folder in dir_content:
    folder_path = os.path.join(BASE_DIR, class_folder)
    if not os.path.isdir(folder_path):
        # Only sub-folders are considered; stray files are skipped.
        continue
    specie = class_folder.lower()
    # When the folder name contains a parenthesised part, keep only that part
    # (judging by the recorded output this is the scientific name); otherwise
    # keep the whole lower-cased folder name.
    match = PAREN_NAME_RE.match(specie)
    species.append(match.group(1) if match else specie)

len(species), species

(47,
 ['saintpaulia ionantha',
  'aloe vera',
  'anthurium andraeanum',
  'dypsis lutescens',
  'asparagus setaceus',
  'begonia spp.',
  'strelitzia reginae',
  'asplenium nidus',
  'nephrolepis exaltata',
  'calathea',
  'aspidistra elatior',
  'aglaonema',
  'pilea peperomioides',
  'schlumbergera bridgesii',
  'chrysanthemum',
  'ctenanthe',
  'narcissus spp.',
  'dracaena',
  'dieffenbachia spp.',
  'alocasia spp.',
  'hedera helix',
  'hyacinthus orientalis',
  'begonia masoniana',
  'crassula ovata',
  'kalanchoe',
  'hemerocallis',
  'convallaria majalis',
  'pachira aquatica',
  'monstera deliciosa',
  'orchid',
  'chamaedorea elegans',
  'peace lily',
  'euphorbia pulcherrima',
  'hypoestes phyllostachya',
  'beaucarnea recurvata',
  'ivy arum',
  'maranta leuconeura',
  'calathea lancifolia',
  'ficus elastica',
  'cycas revoluta',
  'schefflera',
  'sanseviera',
  'tradescantia',
  'tulip',
  'venus flytrap',
  'yucca',
  'zamioculcas zamiifolia'])

In [2]:
# Load the json dataset
import json

# Read the care-instruction records. The encoding is named explicitly because
# open() otherwise falls back to the platform's locale encoding, which can
# mis-decode any non-ASCII characters in the file on some systems.
with open('../Data/house_plants.json', encoding='utf-8') as f:
    data = json.load(f)

data

[{'id': 0,
  'latin': 'Aeschynanthus lobianus',
  'family': 'Gesneriaceae',
  'common': ['Lipstick'],
  'category': 'Hanging',
  'origin': 'Java',
  'climate': 'Tropical',
  'tempmax': {'celsius': 32, 'fahrenheit': 89.6},
  'tempmin': {'celsius': 14, 'fahrenheit': 57.2},
  'ideallight': 'Bright light',
  'toleratedlight': 'Direct sunlight',
  'watering': 'Keep moist between watering. Can be a bit dry between watering',
  'insects': ['Mealy bug', 'Aphid', 'Thrips'],
  'diseases': 'N/A',
  'use': ['Hanging', 'Flower', 'Tertiary']},
 {'id': 1,
  'latin': 'Adiantum raddianum',
  'family': 'Polypodiaceae',
  'common': ['Maindenhair', 'Delta maidenhair'],
  'category': 'Fern',
  'origin': 'Brazil',
  'climate': 'Tropical',
  'tempmax': {'celsius': 30, 'fahrenheit': 86},
  'tempmin': {'celsius': 12, 'fahrenheit': 53.6},
  'ideallight': 'Bright light',
  'toleratedlight': 'Diffused',
  'watering': 'Keep moist between watering. Must not be dry between watering',
  'insects': ['Mealy bug', 'Aphi

In [None]:
def normalize(text):
    """Return ``text`` lower-cased with combining accent marks stripped.

    The string is lower-cased, decomposed with Unicode NFD so that accented
    letters become a base letter followed by combining marks, and every
    character in the ``Mn`` (nonspacing mark) category is then dropped.
    """
    import unicodedata

    decomposed = unicodedata.normalize('NFD', text.lower())
    kept = []
    for ch in decomposed:
        if unicodedata.category(ch) == 'Mn':
            # Combining mark left over from the decomposition: discard it.
            continue
        kept.append(ch)
    return ''.join(kept)

def is_specie_in_entry(specie, entry):
    """Check whether a species name matches a dataset entry, annotating it on success.

    Matching is done on normalized text (see ``normalize``) using substring
    tests, tried in this order; the first rule that hits wins:

    1. ``'latin'``          - the whole name occurs in ``entry['latin']``.
    2. ``'common'``         - the whole name occurs in one of ``entry['common']``.
    3. ``'latin_partial'``  - for multi-word names, the first or the last word
       occurs in ``entry['latin']``.
    4. ``'common_partial'`` - for multi-word names, the first or the last word
       occurs in one of ``entry['common']``.

    Args:
        specie: Species name to look for.
        entry: Dataset record; must provide ``'latin'`` (str) and ``'common'``
            (iterable of str). On a match it is modified in place:
            ``entry['matched_as']`` is set to the rule name and
            ``entry['matched_name']`` to ``specie``.

    Returns:
        True if one of the rules matched, False otherwise. A blank name never
        matches and leaves ``entry`` untouched.
    """
    specie_normalized = normalize(specie)
    # The empty string is a substring of every string, so without this guard a
    # blank name would "match" every entry as 'latin'.
    if not specie_normalized.strip():
        return False

    latin_normalized = normalize(entry['latin'])
    common_names = [normalize(name) for name in entry['common']]

    def in_latin(term):
        return term in latin_normalized

    def in_common(term):
        return any(term in name for name in common_names)

    matched_as = None
    if in_latin(specie_normalized):
        matched_as = 'latin'
    elif in_common(specie_normalized):
        matched_as = 'common'
    else:
        # Fall back to the first and last word of a multi-word name.
        specie_parts = specie_normalized.split()
        if len(specie_parts) > 1:
            genus, epithet = specie_parts[0], specie_parts[-1]
            if in_latin(genus) or in_latin(epithet):
                matched_as = 'latin_partial'
            elif in_common(genus) or in_common(epithet):
                matched_as = 'common_partial'

    if matched_as is None:
        return False

    # Record how the entry was matched and which species name triggered it.
    entry['matched_as'] = matched_as
    entry['matched_name'] = specie
    return True

no_found = 0
missing = []
matches = []

# Match kinds that cover the whole species name. They take precedence over the
# '*_partial' kinds, which only require a single word of the name to occur.
FULL_MATCH_KINDS = ('latin', 'common')

# Main loop: pick at most one dataset entry per species.
for specie in species:
    best = None
    for entry in data:
        if not is_specie_in_entry(specie, entry):
            continue
        # is_specie_in_entry() annotates the entry in place. Take a snapshot
        # so that two species hitting the same entry do not share (and
        # overwrite) one dict inside `matches`.
        candidate = dict(entry)
        if candidate['matched_as'] in FULL_MATCH_KINDS:
            # A full-name match is the best possible result: stop searching.
            best = candidate
            break
        if best is None:
            # Remember the first partial hit, but keep scanning: a later
            # entry may still match the full name.
            best = candidate

    if best is None:
        missing.append(specie)
        print(f'Specie {specie} not found!')
    else:
        print(f'Specie {specie} found! Matched as {best["matched_as"]}.')
        no_found += 1
        matches.append(best)

print(f"Total found: {no_found}")
print(f"Missing species: {missing}")

Specie saintpaulia ionantha not found!
Specie aloe vera not found!
Specie anthurium andraeanum found! Matched as latin_partial.
Specie dypsis lutescens found! Matched as latin.
Specie asparagus setaceus not found!
Specie begonia spp. not found!
Specie strelitzia reginae found! Matched as latin_partial.
Specie asplenium nidus found! Matched as latin.
Specie nephrolepis exaltata found! Matched as latin.
Specie calathea found! Matched as latin.
Specie aspidistra elatior found! Matched as latin.
Specie aglaonema found! Matched as latin.
Specie pilea peperomioides not found!
Specie schlumbergera bridgesii not found!
Specie chrysanthemum not found!
Specie ctenanthe found! Matched as latin.
Specie narcissus spp. not found!
Specie dracaena found! Matched as latin.
Specie dieffenbachia spp. found! Matched as latin_partial.
Specie alocasia spp. found! Matched as latin_partial.
Specie hedera helix found! Matched as latin_partial.
Specie hyacinthus orientalis not found!
Specie begonia masoniana no

In [23]:
# Overwrite each kept entry's 'id' with a consecutive number, counting up from
# 20 in list order.
# NOTE(review): the reason for starting at 20 is not visible here - confirm.
for new_id, plant in enumerate(matches, start=20):
    plant['id'] = new_id

matches

[{'id': 20,
  'latin': 'Anthurium superbum',
  'family': 'Araceae',
  'common': ['Bronze Anthurium'],
  'category': 'Anthurium',
  'origin': 'Ecuador',
  'climate': 'Tropical',
  'tempmax': {'celsius': 28, 'fahrenheit': 82.4},
  'tempmin': {'celsius': 18, 'fahrenheit': 64.4},
  'ideallight': 'Bright light',
  'toleratedlight': 'Diffused',
  'watering': 'Water when soil is half dry. Can be dry between watering.',
  'insects': ['Spider mite', 'Mealy bug'],
  'diseases': 'N/A',
  'use': ['Table top', 'Colors / Forms', 'Secondary'],
  'matched_as': 'latin_partial',
  'matched_name': 'anthurium andraeanum'},
 {'id': 21,
  'latin': 'Dypsis lutescens',
  'family': 'Arecaceae',
  'common': ['Areca palm', 'Butterfly palm'],
  'category': 'Palm',
  'origin': 'Madagascar',
  'climate': 'Tropical',
  'tempmax': {'celsius': 28, 'fahrenheit': 82.4},
  'tempmin': {'celsius': 10, 'fahrenheit': 50},
  'ideallight': '6 or more hours of direct sunlight per day.',
  'toleratedlight': 'Direct sunlight.',
 

In [24]:
import json

# Write the filtered entries as JSON into the current working directory.
filtered_path = 'house_plants_filtered.json'
with open(filtered_path, 'w') as out_file:
    json.dump(matches, out_file)