In [3]:
import requests
from bs4 import BeautifulSoup

url = "https://www.cnn.com/2025/06/13/style/why-luxury-brands-are-so-expensive"
# Send a browser-style User-Agent and bound the wait so the cell cannot hang
# indefinitely on an unresponsive server.
response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=30)
# Stop here on a 4xx/5xx reply instead of parsing an error page as the article.
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# Collect the text of every <p> element on the page and join the pieces with
# single spaces (each piece is stripped of surrounding whitespace first).
paragraphs = soup.find_all("p")
text = " ".join(p.get_text(strip=True) for p in paragraphs)

# Preview slice shown below; shorter than 700 if the page has less text.
first_700 = text[:700]

print("1. First 700 characters of extracted text:")
print(first_700)
print(f"\nLength: {len(first_700)} characters")


1. First 700 characters of extracted text:
More than ever, high-end brands want you to know exactly how, and where, their goods are made. They are producing enormous glossy coffee table books showing white-coated workers hand-stitching products in glamorous workshops, and creating marketing campaigns emphasizing the exquisite materials and dedicated handiwork that go into the making of their very, very expensive products. These companies are trying to explain the value of their creations to consumers because their profits are slowing, even as their prices are increasing. While the personal luxury goods market was worth €363 billion (about $415 billion) in 2024, up from €223 billion ($242 billion) a decade prior, according to the glob

Length: 700 characters


In [None]:
import requests
from bs4 import BeautifulSoup

# Step 1: Read the content from the URL
url = "https://www.cnn.com/2025/06/13/style/why-luxury-brands-are-so-expensive"
# Same browser-style User-Agent as the first cell, plus a timeout so the
# request cannot block the notebook forever.
response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=30)
# Raise on a 4xx/5xx reply rather than processing an error page downstream.
response.raise_for_status()
html_content = response.text  # Raw HTML

# Parse the HTML and pull out its text. No separator or stripping is requested
# here, so whitespace from the markup is kept and the preview below contains
# long runs of blank lines.
soup = BeautifulSoup(html_content, 'html.parser')
raw_text = soup.get_text()  # Text content with the tags removed

# Print first 700 characters
first_700 = raw_text[:700]
print("1. First 700 characters of raw text:")
print(first_700)
print(f"\nLength: {len(first_700)} characters")


1. First 700 characters of raw text:
 



































Luxury brands are more expensive than ever. They’re telling you why they’re worth it | CNN










































































CNN values your feedback




                                                        1. How relevant is this ad to you?
                                                






























                                                2. Did you encounter any technical issues?
                                        











                                                                        Video player was slow to load content
                                              

Length: 700 characters


In [None]:
# Step 2: Extract the page text again from the parsed document, this time
# stripping each text fragment and joining the fragments with a single space.
clean_text = soup.get_text(separator=' ', strip=True)

# Header line and 300-character preview, emitted on separate lines.
print(
    "\n2. Text after HTML removal (first 300 chars):",
    clean_text[:300],
    sep="\n",
)


2. Text after HTML removal (first 300 chars):
Luxury brands are more expensive than ever. They’re telling you why they’re worth it | CNN CNN values your feedback 1. How relevant is this ad to you? 2. Did you encounter any technical issues? Video player was slow to load content Video content never loaded Ad froze or did not finish loading Video 


In [None]:
import re

# Step 3: Lowercase the text, then blank out punctuation.
text_lower = clean_text.lower()

# Every character that is neither a word character nor whitespace becomes a
# space; split()/join() then collapses whitespace runs to single spaces.
non_word_char = re.compile(r'[^\w\s]')
text_no_punct = ' '.join(non_word_char.sub(' ', text_lower).split())

print("\n3. Lower/no punctuation (first 300 chars):", text_no_punct[:300], sep="\n")


3. Lower/no punctuation (first 300 chars):
luxury brands are more expensive than ever they re telling you why they re worth it cnn cnn values your feedback 1 how relevant is this ad to you 2 did you encounter any technical issues video player was slow to load content video content never loaded ad froze or did not finish loading video content


In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Request the NLTK resources used by the tokenizer and the stopword list.
for resource in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(resource, quiet=True)

# Step 4: Tokenize, then drop stopwords and any token that is not purely
# alphabetic (this also removes standalone numbers).
stop_words = set(stopwords.words('english'))
tokens = word_tokenize(text_no_punct)

filtered_tokens = []
for token in tokens:
    if token.isalpha() and token not in stop_words:
        filtered_tokens.append(token)

print("\n4. After stopword removal (first 50 tokens):")
print(filtered_tokens[:50])


4. After stopword removal (first 50 tokens):
['luxury', 'brands', 'expensive', 'ever', 'telling', 'worth', 'cnn', 'cnn', 'values', 'feedback', 'relevant', 'ad', 'encounter', 'technical', 'issues', 'video', 'player', 'slow', 'load', 'content', 'video', 'content', 'never', 'loaded', 'ad', 'froze', 'finish', 'loading', 'video', 'content', 'start', 'ad', 'audio', 'ad', 'loud', 'issues', 'ad', 'never', 'loaded', 'ad', 'prevented', 'slowed', 'page', 'loading', 'content', 'moved', 'around', 'ad', 'loaded', 'ad']


In [None]:
import nltk
from nltk.stem import WordNetLemmatizer, PorterStemmer
# Request the 'wordnet' and 'omw-1.4' NLTK resources before lemmatizing.
for resource in ('wordnet', 'omw-1.4'):
    nltk.download(resource, quiet=True)

# Step 5: Lemmatize every filtered token.
# lemmatize() receives only the token, so no part of speech is supplied.
lemmatizer = WordNetLemmatizer()
lemmatized_words = list(map(lemmatizer.lemmatize, filtered_tokens))

print("\n5. First 50 lemmatized words:")
print(lemmatized_words[:50])

# Stem the same tokens so the two normalizations can be compared side by side.
stemmer = PorterStemmer()
stemmed_words = list(map(stemmer.stem, filtered_tokens))

print("\nFirst 50 stemmed words:")
print(stemmed_words[:50])

# Positions within the first 50 tokens where the lemma and the stem disagree,
# recorded as (index, lemma, stem). Both lists come from filtered_tokens, so
# they line up one-to-one.
differences = [
    (position, lemma, stem)
    for position, (lemma, stem) in enumerate(
        zip(lemmatized_words[:50], stemmed_words[:50])
    )
    if lemma != stem
]
print(f"\nStemming vs Lemmatization differences in first 50: {len(differences)} positions")
print("Examples:", differences[:10])


5. First 50 lemmatized words:
['luxury', 'brand', 'expensive', 'ever', 'telling', 'worth', 'cnn', 'cnn', 'value', 'feedback', 'relevant', 'ad', 'encounter', 'technical', 'issue', 'video', 'player', 'slow', 'load', 'content', 'video', 'content', 'never', 'loaded', 'ad', 'froze', 'finish', 'loading', 'video', 'content', 'start', 'ad', 'audio', 'ad', 'loud', 'issue', 'ad', 'never', 'loaded', 'ad', 'prevented', 'slowed', 'page', 'loading', 'content', 'moved', 'around', 'ad', 'loaded', 'ad']

First 50 stemmed words:
['luxuri', 'brand', 'expens', 'ever', 'tell', 'worth', 'cnn', 'cnn', 'valu', 'feedback', 'relev', 'ad', 'encount', 'technic', 'issu', 'video', 'player', 'slow', 'load', 'content', 'video', 'content', 'never', 'load', 'ad', 'froze', 'finish', 'load', 'video', 'content', 'start', 'ad', 'audio', 'ad', 'loud', 'issu', 'ad', 'never', 'load', 'ad', 'prevent', 'slow', 'page', 'load', 'content', 'move', 'around', 'ad', 'load', 'ad']

Stemming vs Lemmatization differences in first 50: 1