# Importing the necessary libraries

In [None]:
import pandas as pd
import aiohttp
import asyncio
from bs4 import BeautifulSoup
from google.colab import files
import nest_asyncio
from urllib.parse import urlencode

## Extracting Data from Furniture Store URLs Using AsyncIO

I know there're plenty of other options for extracting data from webpages (for example, Selenium that I have some experience with). I chose AsyncIO here because it can perform concurrent, non-blocking I/O operations efficiently (it's fast and lightweight). Also, I did the majority of the task in Google Colab (partly because of the free GPU). Using Selenium there was more time-consuming, so I sticked with asyncio this time.


In [None]:
# Apply nest_asyncio to allow nested event loops
nest_asyncio.apply()

# Upload the CSV file
uploaded = files.upload()  # This will prompt you to upload 'URL_list.csv'

# Load the CSV file containing URLs
csv_file = list(uploaded.keys())[0]  # Get the uploaded file name dynamically
urls = pd.read_csv(csv_file)

# Extract the URLs from the 'max(page)' column
url_list = urls['max(page)'].tolist()

# Function to fetch a page asynchronously with retries and SSL handling
async def fetch(session, url, retries=3):
    for _ in range(retries):
        try:
            async with session.get(url, ssl=False, timeout=10) as response:
                response.raise_for_status()
                return await response.text()
        except Exception as e:
            print(f"Error fetching {url}: {e}")
            await asyncio.sleep(2)  # Retry after a short delay
    return None  # Return None if all retries fail

# Function to scrape a single page and parse the text
async def scrape_page(session, url):
    page_content = await fetch(session, url)
    if page_content:
        soup = BeautifulSoup(page_content, 'html.parser')
        return soup.get_text(separator=' ', strip=True)
    return None

# Function to handle multiple requests concurrently
async def scrape_all(urls):
    page_texts = {}
    async with aiohttp.ClientSession() as session:
        tasks = []
        for url in urls:
            tasks.append(scrape_page(session, url))

            # Process tasks in batches of 100 to avoid overwhelming the server
            if len(tasks) == 100:
                results = await asyncio.gather(*tasks)

                # Store results in a dictionary
                for i, result in enumerate(results):
                    if result:
                        page_texts[urls[i - len(tasks)]] = result

                # Clear the tasks list for the next batch
                tasks = []

        # Handle any remaining tasks after the loop
        if tasks:
            results = await asyncio.gather(*tasks)
            for i, result in enumerate(results):
                if result:
                    page_texts[urls[i - len(tasks)]] = result

    return page_texts

# Run the asynchronous scraping for all URLs
page_texts = asyncio.run(scrape_all(url_list))

# Save the extracted data to a file
with open('scraped_data.txt', 'w', encoding='utf-8') as f:
    for url, text in page_texts.items():
        f.write(f"URL: {url}\n\n{text}\n\n{'-'*80}\n\n")

print("Data extraction complete.")

# Download the file to your local machine
files.download('scraped_data.txt')

Saving URL_list.csv to URL_list.csv
Error fetching https://home-buy.com.au/products/bridger-pendant-larger-lamp-metal-brass: Cannot connect to host home-buy.com.au:443 ssl:False [Name or service not known]
Error fetching https://beckurbanfurniture.com.au/products/page/2/: Cannot connect to host beckurbanfurniture.com.au:443 ssl:False [Name or service not known]
Error fetching https://edenliving.online/collections/summerloving/products/nice-lounge-1: 404, message='Not Found', url='https://edenliving.online/collections/summerloving/products/nice-lounge-1'
Error fetching https://furnish123watertown.com/products/: 404, message='Not Found', url='https://furnish123watertown.com/products/'
Error fetching https://www.fentonandfenton.com.au/products/gift-card&media=http://cdn.shopify.com/s/files/1/1909/9637/products/GiftVouchers_Product_1024x1024.jpg&description=Gift%20Card%20%23Art-Lover%20%23Birthday%20%23Colour-Lover: 404, message='Not Found', url='https://www.fentonandfenton.com.au/products

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

As you can see above, I had trouble crawling data from most of these links. I modified the code above a lot in an attempt to gather more data. In this notebook, you can see the final version that was kinda better than the others.

In [None]:
# Here's a function that encapsulates the whole process of scraping data (This notebook reuses the same code a lot, which is bad. 
# I will change that if I have time)

# Apply nest_asyncio to allow nested event loops
nest_asyncio.apply()

def scrape_furniture_data():
    # Upload the CSV file
    uploaded = files.upload()  # This will prompt you to upload 'URL_list.csv'

    # Load the CSV file containing URLs
    csv_file = list(uploaded.keys())[0]  # Get the uploaded file name dynamically
    urls = pd.read_csv(csv_file)

    # Extract the URLs from the 'max(page)' column
    url_list = urls['max(page)'].tolist()

    # Function to fetch a page asynchronously with retries and SSL handling
    async def fetch(session, url, retries=3):
        for _ in range(retries):
            try:
                async with session.get(url, ssl=False, timeout=10) as response:
                    response.raise_for_status()
                    return await response.text()
            except Exception as e:
                print(f"Error fetching {url}: {e}")
                await asyncio.sleep(2)  # Retry after a short delay
        return None  # Return None if all retries fail

    # Function to scrape a single page and parse the text
    async def scrape_page(session, url):
        page_content = await fetch(session, url)
        if page_content:
            soup = BeautifulSoup(page_content, 'html.parser')
            return soup.get_text(separator=' ', strip=True)
        return None

    # Function to handle multiple requests concurrently
    async def scrape_all(urls):
        page_texts = {}
        async with aiohttp.ClientSession() as session:
            tasks = []
            for url in urls:
                tasks.append(scrape_page(session, url))

                # Process tasks in batches of 100 to avoid overwhelming the server
                if len(tasks) == 100:
                    results = await asyncio.gather(*tasks)

                    # Store results in a dictionary
                    for i, result in enumerate(results):
                        if result:
                            page_texts[urls[i - len(tasks)]] = result

                    # Clear the tasks list for the next batch
                    tasks = []

            # Handle any remaining tasks after the loop
            if tasks:
                results = await asyncio.gather(*tasks)
                for i, result in enumerate(results):
                    if result:
                        page_texts[urls[i - len(tasks)]] = result

        return page_texts

    # Run the asynchronous scraping for all URLs
    page_texts = asyncio.run(scrape_all(url_list))

    # Save the extracted data to a file
    with open('scraped_data.txt', 'w', encoding='utf-8') as f:
        for url, text in page_texts.items():
            f.write(f"URL: {url}\n\n{text}\n\n{'-'*80}\n\n")

    print("Data extraction complete.")

    # Download the file to your local machine
    files.download('scraped_data.txt')

# Call the function to execute the scraping process
scrape_furniture_data()

# Using Wayback to access disfunctional links

The Wayback Machine can help me to access dysfunctional links by retrieving the archived versions of web pages that are no longer available. After using it, I did actually get a bit more data. But the improvement was too slight. So I tried other things that you'll see below.

In [1]:
import pandas as pd
import aiohttp
import asyncio
from bs4 import BeautifulSoup
from google.colab import files
import nest_asyncio
from urllib.parse import urlencode
import random
import time

# Apply nest_asyncio to allow nested event loops
nest_asyncio.apply()

# Upload the CSV file
uploaded = files.upload()  # Prompt to upload 'URL_list.csv'

# Load the CSV file containing URLs
csv_file = list(uploaded.keys())[0]  # Get the uploaded file name dynamically
urls = pd.read_csv(csv_file)

# Extract the URLs from the 'max(page)' column
url_list = urls['max(page)'].tolist()

# Wayback Machine API URL for archived versions
WAYBACK_API = "http://archive.org/wayback/available?"

# Function to fetch a page asynchronously with retries and SSL handling
async def fetch(session, url, retries=2, timeout=15):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
    for attempt in range(retries):
        try:
            async with session.get(url, ssl=False, timeout=timeout, headers=headers) as response:
                if response.status == 200:
                    return await response.text()
        except Exception as e:
            print(f"Error fetching {url}: {e}")
            await asyncio.sleep(random.uniform(1, 3))  # Short random delay to avoid being blocked
    return None  # Return None if all retries fail

# Function to check if Wayback Machine has an archived version of the page
async def fetch_wayback(session, url):
    try:
        api_url = WAYBACK_API + urlencode({'url': url})
        async with session.get(api_url) as response:
            response.raise_for_status()
            data = await response.json()
            if 'archived_snapshots' in data and 'closest' in data['archived_snapshots']:
                archived_url = data['archived_snapshots']['closest']['url']
                print(f"Fetching archived version from Wayback: {archived_url}")
                async with session.get(archived_url) as archived_response:
                    archived_response.raise_for_status()
                    return await archived_response.text()
    except Exception as e:
        print(f"Error fetching Wayback version of {url}: {e}")
    return None

# Function to scrape a single page and parse the text
async def scrape_page(session, url):
    page_content = await fetch(session, url)
    if not page_content:
        print(f"Fetching from Wayback for: {url}")
        page_content = await fetch_wayback(session, url)

    if page_content:
        soup = BeautifulSoup(page_content, 'html.parser')
        return soup.get_text(separator=' ', strip=True)
    return None

# Function to handle multiple requests concurrently
async def scrape_all(urls):
    page_texts = {}
    async with aiohttp.ClientSession() as session:
        tasks = []
        for url in urls:
            tasks.append(scrape_page(session, url))

            # Process tasks in batches of 50 to avoid overwhelming the server
            if len(tasks) == 50:
                results = await asyncio.gather(*tasks)

                for i, result in enumerate(results):
                    if result:
                        page_texts[urls[i]] = result

                # Clear tasks for the next batch
                tasks = []

                # Add a small random delay between batches to avoid rate limiting
                await asyncio.sleep(random.uniform(1, 3))

        # Handle any remaining tasks after the loop
        if tasks:
            results = await asyncio.gather(*tasks)
            for i, result in enumerate(results):
                if result:
                    page_texts[urls[i]] = result

    return page_texts

# Run the asynchronous scraping for all URLs
start_time = time.time()
page_texts = asyncio.run(scrape_all(url_list))
elapsed_time = time.time() - start_time
print(f"Data extraction completed in {elapsed_time:.2f} seconds.")

# Save the extracted data to a file
with open('scraped_data.txt', 'w', encoding='utf-8') as f:
    for url, text in page_texts.items():
        f.write(f"URL: {url}\n\n{text}\n\n{'-'*80}\n\n")

print("Data extraction complete.")

# Download the file to your local machine
files.download('scraped_data.txt')


Saving URL_list.csv to URL_list.csv
Error fetching https://home-buy.com.au/products/bridger-pendant-larger-lamp-metal-brass: Cannot connect to host home-buy.com.au:443 ssl:False [Name or service not known]
Error fetching https://beckurbanfurniture.com.au/products/page/2/: Cannot connect to host beckurbanfurniture.com.au:443 ssl:False [Name or service not known]
Fetching from Wayback for: https://edenliving.online/collections/summerloving/products/nice-lounge-1
Error fetching https://www.homekoncepts.com/products/furniture/tables/end-tables/: Cannot connect to host www.homekoncepts.com:443 ssl:False [Name or service not known]
Error fetching https://nicoyafurniture.com.au/products/playa-bowl: Cannot connect to host nicoyafurniture.com.au:443 ssl:False [Name or service not known]
Fetching from Wayback for: https://hemisphereliving.com.au/products/
Fetching from Wayback for: https://furnish123watertown.com/products/
Error fetching https://beckurbanfurniture.com.au/products/page/2/: Cannot

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Getting more links

After getting a relatively small amount of data, I decided to find more. The task guidelines didn't prohibit expanding the dataset (which probably means that's allowed). I tried finding datasets for the task, but I didn't like the results.

So I found some online articles with links to furniture recommendations. The code below scrapes some links and saves them into csv files. After getting the links, I repeated the scraping process above. 

In [2]:
# Apply nest_asyncio to allow nested event loops in Colab
nest_asyncio.apply()

# URL of the webpage containing furniture store links
target_url = "https://www.housebeautiful.com/shopping/furniture/g22548814/best-online-furniture-stores-websites/"

# Function to fetch a page asynchronously with retries
async def fetch(session, url, retries=3):
    for _ in range(retries):
        try:
            async with session.get(url, ssl=False) as response:
                response.raise_for_status()
                return await response.text()
        except Exception as e:
            print(f"Error fetching {url}: {e}")
            await asyncio.sleep(2)
    return None  # Return None if all retries fail

# Function to scrape the webpage and extract furniture store URLs
async def scrape_furniture_stores(url):
    async with aiohttp.ClientSession() as session:
        page_content = await fetch(session, url)
        if page_content:
            soup = BeautifulSoup(page_content, 'html.parser')
            # Find all <a> tags with href attribute
            store_links = [a['href'] for a in soup.find_all('a', href=True) if "http" in a['href']]
            return store_links
        return []

# Function to save the scraped links into a CSV file
def save_links_to_csv(links, filename='furniture_store_links.csv'):
    df = pd.DataFrame(links, columns=["URL"])
    df.to_csv(filename, index=False)
    print(f"Links saved to {filename}")
    files.download(filename)  # Download the file in Colab

# Main function to run the asyncio scraping task
async def main():
    links = await scrape_furniture_stores(target_url)
    if links:
        save_links_to_csv(links)
    else:
        print("No links found.")

# Run the main function
asyncio.run(main())


Links saved to furniture_store_links.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [3]:
links_2 = pd.read_csv("/content/furniture_store_links.csv")

In [6]:
# Remove rows from index 1 to 17 (inclusive) from the DataFrame 'links_2'
# These links weren't connected with furniture, so I manually removed them.
links_2 = links_2.drop(links_2.index[1:18])

# Reset the index after dropping the rows, if needed
links_2 = links_2.reset_index(drop=True)

# Display the DataFrame to verify the result
print(links_2)

                                                  URL
0   https://shop.housebeautiful.com/house-beautifu...
1   https://go.redirectingat.com?id=74968X1525080&...
2   https://go.redirectingat.com?id=74968X1525080&...
3   https://www.housebeautiful.com/author/270315/j...
4   https://go.redirectingat.com?id=74968X1525080&...
..                                                ...
80  https://www.hearst.com/-/us-magazines-privacy-...
81  https://www.hearst.com/-/us-magazines-privacy-...
82  https://www.hearst.com/-/us-magazines-privacy-...
83  https://www.hearst.com/-/us-magazines-privacy-...
84  https://www.hearst.com/-/us-magazines-terms-of...

[85 rows x 1 columns]


# Scraping from my links

In [8]:
# Apply nest_asyncio to allow nested event loops (for environments like Jupyter/Colab)
nest_asyncio.apply()

# Assuming 'links_2' is your DataFrame and contains a column named 'url' with links
urls = links_2['URL'].tolist()  # Convert the 'url' column into a list

# Function to fetch page content asynchronously with retries
async def fetch(session, url, retries=3):
    for _ in range(retries):
        try:
            async with session.get(url, ssl=False, timeout=10) as response:
                response.raise_for_status()  # Raise an error for bad responses (4xx, 5xx)
                return await response.text()
        except Exception as e:
            print(f"Error fetching {url}: {e}")
            await asyncio.sleep(2)  # Retry after a short delay
    return None  # Return None if all retries fail

# Function to scrape a page and parse its content
async def scrape_page(session, url):
    page_content = await fetch(session, url)
    if page_content:
        soup = BeautifulSoup(page_content, 'html.parser')
        return soup.get_text(separator=' ', strip=True)  # Get page text
    return None

# Function to handle multiple concurrent requests
async def scrape_all(urls):
    results = {}
    async with aiohttp.ClientSession() as session:
        tasks = []
        for url in urls:
            tasks.append(scrape_page(session, url))

            # Process tasks in batches of 100 to avoid overwhelming the server
            if len(tasks) == 100:
                scraped_pages = await asyncio.gather(*tasks)

                # Store results in a dictionary
                for i, result in enumerate(scraped_pages):
                    if result:
                        results[urls[i]] = result

                # Clear the tasks list for the next batch
                tasks = []

        # Handle remaining tasks after the loop
        if tasks:
            scraped_pages = await asyncio.gather(*tasks)
            for i, result in enumerate(scraped_pages):
                if result:
                    results[urls[i]] = result

    return results

# Run the asynchronous scraping for all URLs
scraped_data = asyncio.run(scrape_all(urls))

# Save the scraped data to a text file
with open('scraped_furniture_data_my_links.txt', 'w', encoding='utf-8') as f:
    for url, content in scraped_data.items():
        f.write(f"URL: {url}\n\n")
        f.write(content)
        f.write("\n\n" + "-"*80 + "\n\n")  # Add a separator between contents

print("Data scraping complete. Data saved in 'scraped_furniture_data.txt'.")


Error fetching https://www.wayfair.com/bed-bath/pdp/spirit-linen-6-piece-foliage-reversible-comforter-set-w006112385.html: 429, message='Too Many Requests (CDN PX)', url='https://www.wayfair.com/bed-bath/pdp/spirit-linen-6-piece-foliage-reversible-comforter-set-w006112385.html'
Error fetching https://www.amazon.com/?tag=housebeautiful_auto-append-20: 503, message='', url='https://www.amazon.com/?tag=housebeautiful_auto-append-20'
Error fetching https://www.wayfair.com/furniture/pdp/whifea-bar-carts-hnnj1052.html?piid=89079751: 429, message='Too Many Requests (CDN PX)', url='https://www.wayfair.com/furniture/pdp/whifea-bar-carts-hnnj1052.html?piid=89079751'
Error fetching https://www.wayfair.com/lighting/pdp/wade-logan-allana-58-dimmable-led-novelty-corner-floor-lamp-w100186534.html?piid=1849096619: 429, message='Too Many Requests (CDN PX)', url='https://www.wayfair.com/lighting/pdp/wade-logan-allana-58-dimmable-led-novelty-corner-floor-lamp-w100186534.html?piid=1849096619'
Error fetchi

# Another bunch of furniture web page links

In [12]:
# Apply nest_asyncio to allow nested event loops in Colab
nest_asyncio.apply()

# URL of the webpage containing furniture store links
target_url = "https://people.com/best-furniture-deals-amazon-september-2024-8719913"

# Function to fetch a page asynchronously with retries
async def fetch(session, url, retries=3):
    for _ in range(retries):
        try:
            async with session.get(url, ssl=False) as response:
                response.raise_for_status()
                return await response.text()
        except Exception as e:
            print(f"Error fetching {url}: {e}")
            await asyncio.sleep(2)
    return None  # Return None if all retries fail

# Function to scrape the webpage and extract furniture store URLs
async def scrape_furniture_stores(url):
    async with aiohttp.ClientSession() as session:
        page_content = await fetch(session, url)
        if page_content:
            soup = BeautifulSoup(page_content, 'html.parser')
            # Find all <a> tags with href attribute
            store_links = [a['href'] for a in soup.find_all('a', href=True) if "http" in a['href']]
            return store_links
        return []

# Function to save the scraped links into a CSV file
def save_links_to_csv(links, filename='amazon_links.csv'):
    df = pd.DataFrame(links, columns=["URL"])
    df.to_csv(filename, index=False)
    print(f"Links saved to {filename}")
    files.download(filename)  # Download the file in Colab

# Main function to run the asyncio scraping task
async def main():
    links = await scrape_furniture_stores(target_url)
    if links:
        save_links_to_csv(links)
    else:
        print("No links found.")

# Run the main function
asyncio.run(main())

Links saved to amazon_links.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Removing some links that don't contain furniture stuff

In [14]:
import pandas as pd

# Load the CSV file
file_path = '/content/amazon_links.csv'
df = pd.read_csv(file_path)

# Remove rows 2-99 (1-98 in zero-indexing) and 201-218 (200-217 in zero-indexing)
# Note: We use `drop()` to remove the specified indexes
rows_to_drop = list(range(1, 99)) + list(range(200, 217))
df_dropped = df.drop(index=rows_to_drop)

# Save the modified DataFrame back to a CSV file
df_dropped.to_csv(file_path, index=False)

print(f"Removed the specified rows, and saved the updated CSV.")

Removed the specified rows, and saved the updated CSV.


## Scraping amazon finds from the links

In [15]:
# Apply nest_asyncio to allow nested event loops
nest_asyncio.apply()

# Upload the CSV file
uploaded = files.upload()  # This will prompt you to upload 'amazon_links.csv'

# Load the CSV file containing URLs
csv_file = list(uploaded.keys())[0]  # Get the uploaded file name dynamically
urls = pd.read_csv(csv_file)

# Extract the URLs from the column (assuming they are in the first column)
url_list = urls.iloc[:, 0].tolist()

# Function to fetch a page asynchronously with retries and SSL handling
async def fetch(session, url, retries=3):
    for _ in range(retries):
        try:
            async with session.get(url, ssl=False, timeout=15) as response:
                if response.status == 200:
                    return await response.text()
                elif response.status == 429:  # Too many requests
                    print(f"Too many requests for {url}, waiting before retry...")
                    await asyncio.sleep(5)  # Wait before retrying
                else:
                    response.raise_for_status()
        except Exception as e:
            print(f"Error fetching {url}: {e}")
            await asyncio.sleep(2)  # Retry after a short delay
    return None  # Return None if all retries fail

# Function to scrape a single page and parse the text
async def scrape_page(session, url):
    # Fetch the live page
    page_content = await fetch(session, url)

    if page_content:
        soup = BeautifulSoup(page_content, 'html.parser')
        return soup.get_text(separator=' ', strip=True)  # Extract text
    return None

# Function to handle multiple requests concurrently
async def scrape_all(urls):
    page_texts = {}
    async with aiohttp.ClientSession() as session:
        tasks = []
        for url in urls:
            tasks.append(scrape_page(session, url))

            # Process tasks in batches of 100 to avoid overwhelming the server
            if len(tasks) == 100:
                results = await asyncio.gather(*tasks)

                # Store results in a dictionary
                for i, result in enumerate(results):
                    if result:
                        page_texts[urls[i - len(tasks)]] = result

                # Clear the tasks list for the next batch
                tasks = []

        # Handle any remaining tasks after the loop
        if tasks:
            results = await asyncio.gather(*tasks)
            for i, result in enumerate(results):
                if result:
                    page_texts[urls[i - len(tasks)]] = result

    return page_texts

# Run the asynchronous scraping for all URLs
page_texts = asyncio.run(scrape_all(url_list))

# Save the extracted data to a file
output_file = 'scraped_amazon_data.txt'
with open(output_file, 'w', encoding='utf-8') as f:
    for url, text in page_texts.items():
        f.write(f"URL: {url}\n\n{text}\n\n{'-'*80}\n\n")

print("Data extraction complete.")

# Download the file to your local machine
files.download(output_file)


Saving amazon_links (2).csv to amazon_links (2).csv
Error fetching https://www.amazon.com/gp/customer-reviews/RBR1UFO7O7FQZ/?tag=people-onsite-backup-20: 503, message='Service Unavailable', url='https://www.amazon.com/gp/customer-reviews/RBR1UFO7O7FQZ/?tag=people-onsite-backup-20'
Error fetching https://www.amazon.com/gp/customer-reviews/R1CGUJEKPYTDIO/?tag=people-onsite-backup-20: 503, message='Service Unavailable', url='https://www.amazon.com/gp/customer-reviews/R1CGUJEKPYTDIO/?tag=people-onsite-backup-20'
Error fetching https://www.amazon.com/MoNiBloom-Drafting-Adjustable-Versatile-Replaceable/dp/B0C3RFVW35?tag=people-onsite-backup-20: 500, message='Internal Server Error', url='https://www.amazon.com/MoNiBloom-Drafting-Adjustable-Versatile-Classroom/dp/B0CY2KDDKK'
Error fetching https://www.amazon.com/gp/customer-reviews/RBR1UFO7O7FQZ/?tag=people-onsite-backup-20: 503, message='Service Unavailable', url='https://www.amazon.com/gp/customer-reviews/RBR1UFO7O7FQZ/?tag=people-onsite-bac

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import pandas as pd
import aiohttp
import asyncio
from bs4 import BeautifulSoup
import nest_asyncio

# Apply nest_asyncio to allow nested event loops (for environments like Jupyter/Colab)
nest_asyncio.apply()

# Assuming 'links_2' is your DataFrame and contains a column named 'url' with links
urls = links_2['URL'].tolist()  # Convert the 'url' column into a list

# Function to fetch page content asynchronously with retries
async def fetch(session, url, retries=3):
    for _ in range(retries):
        try:
            async with session.get(url, ssl=False, timeout=10) as response:
                response.raise_for_status()  # Raise an error for bad responses (4xx, 5xx)
                return await response.text()
        except Exception as e:
            print(f"Error fetching {url}: {e}")
            await asyncio.sleep(2)  # Retry after a short delay
    return None  # Return None if all retries fail

# Function to scrape a page and parse its content
async def scrape_page(session, url):
    page_content = await fetch(session, url)
    if page_content:
        soup = BeautifulSoup(page_content, 'html.parser')
        return soup.get_text(separator=' ', strip=True)  # Get page text
    return None

# Function to handle multiple concurrent requests
async def scrape_all(urls):
    results = {}
    async with aiohttp.ClientSession() as session:
        tasks = []
        for url in urls:
            tasks.append(scrape_page(session, url))

            # Process tasks in batches of 100 to avoid overwhelming the server
            if len(tasks) == 100:
                scraped_pages = await asyncio.gather(*tasks)

                # Store results in a dictionary
                for i, result in enumerate(scraped_pages):
                    if result:
                        results[urls[i]] = result

                # Clear the tasks list for the next batch
                tasks = []

        # Handle remaining tasks after the loop
        if tasks:
            scraped_pages = await asyncio.gather(*tasks)
            for i, result in enumerate(scraped_pages):
                if result:
                    results[urls[i]] = result

    return results

# Run the asynchronous scraping for all URLs
scraped_data = asyncio.run(scrape_all(urls))

# Save the scraped data to a text file
with open('amazon_furniture.txt', 'w', encoding='utf-8') as f:
    for url, content in scraped_data.items():
        f.write(f"URL: {url}\n\n")
        f.write(content)
        f.write("\n\n" + "-"*80 + "\n\n")  # Add a separator between contents

print("Data scraping complete. Data saved in 'amazon_furniture.txt'.")

# Saving all scraped data into a single text file

In [None]:
# File paths for the three input text files. Got two additional files apart from the ones in the task
file1 = '/content/scraped_furniture_data_my_links.txt'
file2 = '/content/scraped_data.txt'
file3 = '/content/scraped_amazon_data.txt'

# Output file where the merged content will be written
output_file = 'merged_scraped_data.txt'

# Open the output file in write mode
with open(output_file, 'w', encoding='utf-8') as outfile:
    # First, read and write the content of the first file
    with open(file1, 'r', encoding='utf-8') as infile1:
        content1 = infile1.read()
        outfile.write(content1)
        outfile.write("\n\n" + "-"*80 + "\n\n")  # Separator for readability

    # Next, read and write the content of the second file
    with open(file2, 'r', encoding='utf-8') as infile2:
        content2 = infile2.read()
        outfile.write(content2)
        outfile.write("\n\n" + "-"*80 + "\n\n")  # Separator for readability

    # Lastly, read and write the content of the third file
    with open(file3, 'r', encoding='utf-8') as infile3:
        content3 = infile3.read()
        outfile.write(content3)

print(f"Files merged successfully into '{output_file}'.")