In [3]:
#test

perfume_name = "Coco Mademoiselle - Chanel" 
base_url = "https://www.wikiparfum.com/en/"
search_url = base_url + "?q=" + urllib.parse.quote(perfume_name)

print("Search URL:", search_url)


Search URL: https://www.wikiparfum.com/en/?q=Coco%20Mademoiselle%20-%20Chanel


In [6]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Load CSV file
file_path = "Cleaned_Perfume_Data.csv"  # Update with correct path if needed
df = pd.read_csv(file_path)

# Headers to mimic a real browser request
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36"
}

# Function to scrape perfume details from the given URL
def get_perfume_info(perfume_url):
    try:
        response = requests.get(perfume_url, headers=headers)
        if response.status_code != 200:
            print(f"Failed to fetch {perfume_url} (Status {response.status_code})")
            return None, None, None
        
        soup = BeautifulSoup(response.text, "html.parser")

        # Extract Family
        family_tag = soup.find("p", class_="text-12 text-grey600 uppercase text-center", string="Family")
        family = family_tag.find_next("p").text.strip() if family_tag else "Not Found"

        # Extract Subfamily
        subfamily_tag = soup.find("p", class_="text-12 text-grey600 uppercase text-center", string="Subfamily")
        subfamily = subfamily_tag.find_next("p").text.strip() if subfamily_tag else "Not Found"

        # Extract Ingredients
        ingredients_tags = soup.select("a.h-4.text-black.ps-2")
        ingredients = ", ".join([tag.text.strip() for tag in ingredients_tags]) if ingredients_tags else "Not Found"

        print(f"Scraped: {perfume_url} - Family: {family}, Subfamily: {subfamily}, Ingredients: {ingredients}")
        return family, subfamily, ingredients

    except Exception as e:
        print(f"Error scraping {perfume_url}: {e}")
        return None, None, None

# Loop through each perfume
for index, row in df.iterrows():
    perfume_url = row.get("url")  # Ensure the column is named correctly

    if pd.isna(perfume_url) or perfume_url.strip() == "":
        continue

    family, subfamily, ingredients = get_perfume_info(perfume_url)

    # Update DataFrame
    df.at[index, "Family"] = family
    df.at[index, "Subfamily"] = subfamily
    df.at[index, "Ingredients"] = ingredients

    time.sleep(2)  # Add delay to avoid being blocked

# Save results to a new CSV
df.to_csv("Final_Perfume_Data.csv", index=False)
print("Scraping completed. Data saved to Final_Perfume_Data.csv")


Scraped: https://www.wikiparfum.com/en/fragrances/valentino-donna-born-in-roma - Family: WOODY, Subfamily: AMBERY (ORIENTAL), Ingredients: Cashmeran (Woody musky), Blackcurrant Bud, Jasmine (Grandiflorum), Bergamot, Vanilla (Bourbon), Pink Pepper, Jasmine (Sambac), Cashmeran (Woody musky), Blackcurrant Bud, Jasmine (Grandiflorum), Bergamot, Vanilla (Bourbon), Pink Pepper, Jasmine (Sambac)


  df.at[index, "Family"] = family
  df.at[index, "Subfamily"] = subfamily
  df.at[index, "Ingredients"] = ingredients


Scraped: https://www.wikiparfum.com/en/fragrances/coco-mademoiselle-eau-de-parfum - Family: CHYPRE, Subfamily: FRUITY, Ingredients: Patchouli, Rose, Vanilla, Jasmine, Vetiver, Bergamot, Patchouli, Rose, Vanilla, Jasmine, Vetiver, Bergamot
Scraped: https://www.wikiparfum.com/en/fragrances/miss-dior-parfum - Family: FLORAL, Subfamily: FRUITY, Ingredients: Jasmine, Mandarin, Cedarwood (Alaska), Strawberry, Patchouli, Jasmine, Mandarin, Cedarwood (Alaska), Strawberry, Patchouli
Scraped: https://www.wikiparfum.com/en/fragrances/good-girl-blush - Family: AMBERY (ORIENTAL), Subfamily: FLORAL, Ingredients: Vanilla (Madagascar), Ylang-ylang (Madagascar), Rose, Peony, Bergamot, Mandarin, Tonka Bean, Vanilla (Madagascar), Ylang-ylang (Madagascar), Rose, Peony, Bergamot, Mandarin, Tonka Bean
Scraped: https://www.wikiparfum.com/en/fragrances/paradoxe-1 - Family: FLORAL, Subfamily: AMBERY (ORIENTAL), Ingredients: Neroli, Jasmine (Sambac), Ambrofix (Ambergris), Serenolide (Musk), Vanilla (Bourbon), B