In [1]:
import json
import pandas as pd

# Load the scraped pages from disk.
# The encoding is passed explicitly: without it, open() falls back to the
# platform's locale encoding, which is not UTF-8 on every system, and the
# scraped text includes non-ASCII characters (e.g. "Nestlé").
with open("nestle_scraped.json", encoding="utf-8") as f:
    raw_data = json.load(f)

# NOTE(review): assumes the JSON is a list of per-page records; the preview
# below suggests the fields url, title, text and links -- confirm against
# the scraper's output.
df = pd.DataFrame(raw_data)
print(f"Total pages scraped: {len(df)}")
df.head()


Total pages scraped: 76


Unnamed: 0,url,title,text,links
0,https://www.madewithnestle.ca/,"Nestlé Brands' Products, Recipes and News | Ma...",Learn about our commitment to sustainable coco...,"[#main-content, /, https://www.madewithnestle...."
1,https://www.madewithnestle.ca/aero,AERO | Feel the Bubbles Melt | Nestlé Canada,Enter for a chance to WIN headphones or Spotif...,"[/#facebook, /#twitter, /#pinterest, /#email, ..."
2,https://www.madewithnestle.ca/coffee-crisp,COFFEE CRISP | Makes a Nice Light Snack!,Wake up your taste buds with the perfect blend...,"[/#facebook, /#twitter, /#pinterest, /#email, ..."
3,https://www.madewithnestle.ca/kit-kat,KITKAT | Have a Break | Made with Nestlé Canada,These cookies are necessary for the website t...,"[#main-content, /, https://www.madewithnestle...."
4,https://www.madewithnestle.ca/smarties,SMARTIES | How Do You Smarties?,"Whether you shake, sort, or make your own crea...","[/#facebook, /#twitter, /#pinterest, /#email, ..."


In [3]:
# Keep a page only when all three conditions hold:
#   * its text is longer than 50 characters (missing text fails this test),
#   * its title does not contain the suspicious phrase "Just a moment",
#   * its title is present.
# The surviving rows are renumbered from 0.
df = df[
    (df['text'].str.len() > 50)
    & ~df['title'].str.contains("Just a moment", na=False)
    & df['title'].notnull()
].reset_index(drop=True)
print(f"Pages after cleanup: {len(df)}")


Pages after cleanup: 75


In [5]:
def clean_text(text, junk_phrases=None):
    """Strip boilerplate lines from scraped page text and flatten it.

    The text is split on newlines. Any line containing one of the junk
    phrases (case-insensitive substring match) is dropped entirely. The
    remaining lines are stripped of surrounding whitespace, lines that are
    empty after stripping are discarded, and the survivors are joined with
    single spaces.

    Args:
        text: Raw page text. Empty strings and non-string values (``None``,
            or the float NaN pandas uses for missing data) yield ``""``.
        junk_phrases: Optional iterable of phrases that mark a line as
            boilerplate. When omitted, the built-in cookie/legal/footer
            phrases are used.

    Returns:
        The cleaned text as a single space-separated string.
    """
    # Guard against missing values: NaN is truthy, so a plain `not text`
    # check would let it through and fail on `.split`.
    if not isinstance(text, str) or not text:
        return ""

    if junk_phrases is None:
        # NOTE(review): "Nestlé Canada" drops every line that mentions the
        # brand, not only footer lines -- confirm this is intended.
        junk_phrases = (
            "By continuing to use", "Terms of Service", "Privacy Policy",
            "Accept cookies", "Nestlé Canada", "All rights reserved",
        )
    # Lower-case the phrases once instead of once per line.
    junk_lowered = [phrase.lower() for phrase in junk_phrases]

    kept = []
    for line in text.split("\n"):
        lowered = line.lower()
        if any(junk in lowered for junk in junk_lowered):
            continue
        stripped = line.strip()
        # Skip blank lines so the joined result has no runs of spaces.
        if stripped:
            kept.append(stripped)
    return " ".join(kept)

# Run every page's raw text through clean_text and store the result in a
# new 'clean_text' column alongside the original.
df['clean_text'] = df['text'].map(clean_text)


In [7]:
# Export one record per page with the keys url, title and text, where
# "text" now carries the cleaned text rather than the raw scrape.
cleaned = df.loc[:, ['url', 'title', 'clean_text']].rename(
    columns={'clean_text': 'text'}
)

# Serialise the records as pretty-printed JSON and write them out.
with open("nestle_cleaned.json", "w") as out_file:
    out_file.write(json.dumps(cleaned.to_dict(orient="records"), indent=2))

print("✅ Cleaned data saved to nestle_cleaned.json")


✅ Cleaned data saved to nestle_cleaned.json
