In [440]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import pyarrow as pa # optional
import seaborn as sns
import matplotlib.pyplot as plt
import re
import io
import os.path
import json
import time
import glob
from IPython.core.display import HTML
from IPython import display
from base64 import b64decode
import sqlite3
import os
import csv
import re


In [418]:
style = '''<style>
h3, h4 {
    background-color: #7efcf5;
    border-left: 5px solid #7ec4fc;
    border-right: 5px solid #7ec4fc;
    padding: 0em;
}
h3 {
    background-color: #7efcf5;
    border-top: 5px solid #7ec4fc;
    border-left: 5px solid #7ec4fc;
    border-right: 5px solid #7ec4fc;
    padding: 0.5em;
}
p {
    padding: 0.5em;
    max-width: 34em;
    font-weight:400;
}
.md {
    max-width: 80ch;

}
.prompt {    
    background-color: lightgreen;
    border-color: #dFb5b4;
    border-left: 5px solid #f57efc;
    padding: 0.5em;
    font-weight:500;
    }
 </style>'''
HTML(style)

In [423]:
# creating database
connection = sqlite3.connect('furniture.db')
cursor = connection.cursor()


cursor.execute('''
    CREATE TABLE IF NOT EXISTS furniture (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        type TEXT NOT NULL,
        productId INTEGER,
        productName TEXT,
        productUrl TEXT,
        productImage TEXT,
        currentPrice REAL,
        productDescription TEXT,
        dimensions TEXT
    )
''')

<sqlite3.Cursor at 0x1594af5c0>

In [425]:
#OPEN FILE HTML and read from it 
def read_html_file(url):
    with open(url, "r", encoding="utf-8") as file:
        content = file.read()
        soup = BeautifulSoup(content, "html.parser")
        return content, soup
    
def read_url(url):
    response = requests.get(url)
    response.raise_for_status()
    html_content = response.text

    soup = BeautifulSoup(html_content, 'html.parser')
    return soup

In [442]:

living_array = []



furniture = {
            'store': "N/A",
            'type': "N/A",
            'productName': "N/A",
            'productUrl': "N/A",
            'productImage': "N/A",
            'currentPrice': 0,
            'productDescription': "N/A",
            'dimensions': {},
            "productId": 0
        }

In [443]:

# FOR CB2
# connecting to the living room furniture html within CB2
url = "data/html/cb2/cb2_livingroom.html"

content, soup = read_html_file(url)

living_furniture = re.search(r'productData', content, re.DOTALL)


def get_description(url):
    soup = read_url(url)
    description = soup.find("div", class_= "details-description").find("p").get_text(strip=True)
    if description is None:
        return 'N/A'
    return description



def search_furniture(json_text):
   
   if json_text:
        products_data = json.loads(f'{{"product": {json_text.group(1)}}}')
        for product in products_data["product"]:
            exists = any(item.get("productId") == product["productInfo"].get("productId") for item in living_array)
            if(exists == False):
                furniture = {
                    'store': "N/A",
                    'type': "N/A",
                    'productName': "N/A",
                    'productUrl': "N/A",
                    'productImage': "N/A",
                    'currentPrice': 0,
                    'productDescription': "N/A",
                    'dimensions': {},
                    "productId": 0
                }
                furniture["store"] = "CB2"
                product_info = product["productInfo"]
                attributes = product["attributes"]
                furniture["productId"] =  product_info.get("productId")
                furniture["productName"] =  product_info.get("productName")
                furniture["productUrl"] =  "https://www.cb2.com" + product_info.get("productURL")
                furniture["productImage"] = "https://cb2.scene7.com/is/image/CB2/" + product_info.get("productImage")
                furniture["currentPrice"] =  attributes["price"].get("currentPrice")
                title = product_info.get("productName").lower()
                if "sofa" in title:
                    type = 'sofa'
                elif "chair" in title:
                    type = 'chair'
                elif "side table" in title:
                    type = 'side_table'
                elif "media table" in title:
                    type = 'media_table'
                elif "console table" in title:
                    type = 'console_table'
                elif "coffee table" in title:
                    type = 'coffee_table'
                elif "table" in title:
                    type = 'coffee_table'
                elif "ottoman" in title:
                    type = 'ottoman'
                furniture['type'] = type
                furniture['dimensions'] = {}
                furniture['productDescription'] = ""
                if type != 'NA':
                    living_array.append(furniture)
                    # print(furniture)

json_text = re.search(r'"product":\s*(\[\{.*?\}\])', living_furniture.string, re.DOTALL)
search_furniture(json_text)





In [446]:
for i in living_array:
    print(i)

len(living_array)

{'store': 'CB2', 'type': 'sofa', 'productName': 'Ceva 103" Light Blue Performance Velvet Sofa', 'productUrl': 'https://www.cb2.com/ceva-103-light-blue-performance-velvet-sofa/s669338', 'productImage': 'https://cb2.scene7.com/is/image/CB2/CevaLtBlueVelvetSofaSHF23', 'currentPrice': 2499.0, 'productDescription': '', 'dimensions': {}, 'productId': '669338'}
{'store': 'CB2', 'type': 'sofa', 'productName': 'Faible 100" Wheat Performance Velvet Sofa', 'productUrl': 'https://www.cb2.com/faible-100-wheat-performance-velvet-sofa/s668917', 'productImage': 'https://cb2.scene7.com/is/image/CB2/FaibleWheatVelvetSofaSHF23', 'currentPrice': 2499.0, 'productDescription': '', 'dimensions': {}, 'productId': '668917'}
{'store': 'CB2', 'type': 'sofa', 'productName': 'Marguerite 102" White Performance Fabric Sofa', 'productUrl': 'https://www.cb2.com/marguerite-102-white-performance-fabric-sofa/s682377', 'productImage': 'https://cb2.scene7.com/is/image/CB2/MargueriteWhtLrgSofaSHF23', 'currentPrice': 2699.0,

300

In [487]:
circle_array = []

In [488]:

# For circle furniture 

def get_dimensions(html):
    dimensions = html.find('div', class_='product_item_accrodian').find_all("p")
    dimensions_dict = {}
    getDimension = False
    if(len(dimensions) >= 1 and len(dimensions) <= 3):
         if(len(dimensions) == 1):
             dimensions = dimensions[0].get_text(strip=True)
         else:
             dimensions = dimensions[1].get_text(strip=True)
         getDimension = True
    elif (len(dimensions) > 3):
         dimensions = dimensions[2].get_text(strip=True)
         getDimension = True
    else:
        print("new")
        print(dimensions)
        print(len(dimensions))
    

    if getDimension:
        if 'w' in dimensions:
            width = re.search(r'(\d+)"w', dimensions)
            if width:
                dimensions_dict["Width"] = int(width.group(1))
                
        if 'd' in dimensions:
            length = re.search(r'(\d+)"d', dimensions)
            if length:
                dimensions_dict["Length"] = int(length.group(1))
        if 'h' in dimensions:
            height = re.search(r'(\d+)"h', dimensions)
            if height:
                dimensions_dict["Height"] = int(height.group(1))
    return dimensions_dict

def get_description(html):
    try:
        description = html.find("p").get_text(strip=True)
        if description is None:
            return 'N/A'
        return description
    except requests.exceptions.RequestException as e:
        print(f"Error processing {url} for {url}: {e}")
        pass

def page_details(url):
    soup = read_url(url)
    page_descriptin = soup.find('div', class_= "product_des")
    description = get_description(page_descriptin)
    dimensions = get_dimensions(page_descriptin)
    return description, dimensions


def get_furn(url, furniture_type):

    soup = read_url(url)
    items = soup.find_all('div', class_="topic_item equal")

    for item in items:
        furniture = {}  

        price_tag = item.find('div', class_='product-price').find('span')
        price = price_tag.text.strip() if price_tag else "0"
        price_int = int(price.replace('$', '').replace(',', ''))

        furniture["store"] = "Circle Furniture"
        furniture["productName"] = item.find('img')['alt'] if item.find('img') else 'Unknown'
        furniture["productUrl"] = "https://www.circlefurniture.com" + item.find('a')['href'] if item.find('a') else 'N/A'
        furniture["productImage"] = "https://www.circlefurniture.com" + item.find('img')['src'] if item.find('img') else 'N/A'
        furniture["currentPrice"] = price_int
        furniture["type"] = furniture_type

        productDescription, dimensions = page_details(furniture["productUrl"])
        furniture["productDescription"] = productDescription
        furniture["dimensions"] = dimensions


        if type != 'N/A' and furniture["productDescription"] != 'N/A':
            circle_array.append(furniture)

sofa_url = "https://www.circlefurniture.com/products/living/sofas-and-loveseats"
media_url = "https://www.circlefurniture.com/products/living/media-consoles"
end_url = "https://www.circlefurniture.com/products/living/end-tables"
ottoman_url = "https://www.circlefurniture.com/products/living/accent-benches-and-ottomans"
rug_url = "https://www.circlefurniture.com/products/living/rugs"

get_furn(sofa_url, "sofa")
get_furn(media_url, "media_table")
get_furn(end_url, "side_table")
get_furn(ottoman_url, "ottoman")
get_furn(rug_url, "rug")

In [490]:
print(len(circle_array))
def find_duplicate_product_names(array_of_dicts):
    seen_names = set()
    duplicates = set()
    for item in array_of_dicts:
        product_name = item.get("productName")
        if product_name in seen_names:
            duplicates.add(product_name)
        seen_names.add(product_name)
    return list(duplicates)  # Return the duplicate product names

print(find_duplicate_product_names(circle_array))  # Output: ['Chair']

414
['Shaker Standard 50', 'Shaker Deluxe 50', 'Shaker Standard 65', 'Willow Deluxe 65']


In [474]:
bo_array = []
print(len(living_array))

300


In [475]:


# for Bo Furniture

def convert_measurement(measurement_str):
    fraction_to_decimal = {
        '¼': 0.25,
        '½': 0.5,
        '¾': 0.75
    }
    
    measurement_str = measurement_str.replace('"', '').strip()
    
    # Use a regular expression to split the numeric and fractional parts
    match = re.match(r"(\d+)([¼½¾]?)", measurement_str)
    if match:
        whole_number = int(match.group(1))  # The numeric part
        fraction = fraction_to_decimal.get(match.group(2), 0)  # The fractional part
        return whole_number + fraction
    else:
        return measurement_str
        # raise ValueError(f"Invalid measurement format: {measurement_str} , {url}")

def get_dimensions(url):
    try:
        soup = read_url(url)
        dimensions = soup.find('dl', class_='MeasurementsSidebar_dataList__nk0Kw')
        measurements = {}

        # Iterate over dt and dd pairs
        for dt, dd in zip(dimensions.find_all('dt'), dimensions.find_all('dd')):
            term = dt.get_text(strip=True)
            definition = convert_measurement(dd.get_text(strip=True))
            
            # Only add the term if it doesn't already exist in the dictionary
            if term not in measurements:
                measurements[term] = definition
        return measurements
    except requests.exceptions.RequestException as e:
        print(f"Error processing {url} for {url}: {e}")
        return {}

def get_description(url):
    try:
        soup = read_url(url)
        description = soup.find('p', class_='ProductAttributes_description__yOc1g Headline_heading4__cIAAR typography_heading4__5l4k8')
        if description is None:
            return 'N/A'
        return description.text.strip() 
    except requests.exceptions.RequestException as e:
        print(f"Error processing {url} for {url}: {e}")
        return 'N/A'



# ProductAttributes_container__F3kP0

def get_bo_furn(url, furniture_type):
    soup = read_url(url)

    product_cards = soup.find_all('figure', class_='ProductCard_figure__8hO5m')

    for card in product_cards:
        furniture = {
                    'store': "N/A",
                    'type': "N/A",
                    'productName': "N/A",
                    'productUrl': "N/A",
                    'productImage': "N/A",
                    'currentPrice': 0,
                    'productDescription': "N/A",
                    'dimensions': {},
                    "productId": 0
                }
        
        product_name = card.find('div', class_='ProductCard_name__Cc2fR').text.strip() if card.find('div', class_='ProductCard_name__Cc2fR') else 'N/A'
        product_url = "https://www.boconcept.com" + card.find('a')['href'] if card.find('a') else 'N/A'
        img_tag = card.find('img')
        image_url = img_tag['src'] if img_tag and 'src' in img_tag.attrs else 'N/A'
        price_tag = card.find('div', class_='ProductCard_price__nKioQ')
        price_int = int(float(price_tag.text.strip().replace('$', '').replace(',', '')))
        price = price_int
        
        title = product_name.lower()
        if "side table" in title:
            furniture_type = 'side_table'
        elif "media table" in title:
            furniture_type = 'media_table'
        elif "console table" in title:
            furniture_type = 'console_table'
        elif "coffee table" in title:
            furniture_type = 'coffee_table'
        


        productDescription = get_description(product_url)
        dimensions = get_dimensions(product_url)
        furniture["store"] = 'Bo Concept'
        furniture["productName"] = product_name
        furniture['productImage'] = image_url
        furniture["productUrl"] = product_url
        furniture["currentPrice"] = price
        furniture["type"] = furniture_type
        furniture["productName"] = product_name
        furniture['productDescription'] = productDescription
        furniture['dimensions'] = dimensions
        
        if image_url != 'N/A' and furniture_type != 'table' and productDescription != 'N/A':
            bo_array.append(furniture)
            print(furniture)

sofa_url = "https://www.boconcept.com/en-us/shop/sofas/"
table_url = "https://www.boconcept.com/en-us/shop/tables/?q=page--2"
rug_url = "https://www.boconcept.com/en-us/shop/rugs/"

get_bo_furn(sofa_url, "sofa")
get_bo_furn(table_url, "table")
get_bo_furn(rug_url, "rug")

{'store': 'Bo Concept', 'type': 'sofa', 'productName': 'Bolzano 2.5-seater', 'productUrl': 'https://www.boconcept.com/en-us/p/bolzano/4462501-14:3096/', 'productImage': 'https://assets.boconcept.com/5e877119-a345-491f-a01d-af84017feeeb/2063704_PNG-Web%2072dpi.png?format=pjpg&auto=webp&fit=bounds&width=3020&quality=75%2C60', 'currentPrice': 3696, 'productDescription': 'Soft curves and organic forms take center stage in the Bolzano sofa. With a sculptural silhouette, this elegant curved sofa has a streamlined look and a cocooning feel. Designed by Morten Georgsen, it’s made for relaxation with the perfect foam-to-frame ratio for superior seating comfort. Make your home a warm, welcoming space with Bolzano.', 'dimensions': {'Armrest height': 30, 'Depth': 36, 'Height': 30, 'Legs height': 1.25, 'Seating height': 16.75, 'Weight': 123, 'Width': 72.5}, 'productId': 0}
{'store': 'Bo Concept', 'type': 'sofa', 'productName': 'Indivi 3-seater sofa', 'productUrl': 'https://www.boconcept.com/en-us/p

In [477]:
print(len(bo_array))

def find_duplicate_product_names(array_of_dicts):
    seen_names = set()
    duplicates = set()
    for item in array_of_dicts:
        product_name = item.get("productName")
        if product_name in seen_names:
            duplicates.add(product_name)
        seen_names.add(product_name)
    return list(duplicates)  # Return the duplicate product names

print(find_duplicate_product_names(bo_array))  # Output: ['Chair']

57
['Chiva functional coffee table with storage', 'Madrid side table']


In [491]:
combined_array = []


print(len(combined_array))

print(len(living_array))
print(len(circle_array))
print(len(bo_array))

combined_array = living_array + circle_array + bo_array
print(len(combined_array))

print(combined_array[500])

0
300
414
57
771
{'store': 'Circle Furniture', 'productName': 'Reveal End Table in Natural Walnut', 'productUrl': 'https://www.circlefurniture.com/products/living/end-tables/reveal-end-table-nat-wal', 'productImage': 'https://www.circlefurniture.com/userfiles/images/Products/bdi/reveal-table/end-table/reveal-end-1196-BDI-WL-1-thumbthumb.jpg', 'currentPrice': 1199, 'type': 'side_table', 'productDescription': 'The Reveal End Table in Natural Walnut is now in stock.Slender and versatile, the Reveal End Table offers a convenient setup for your home. It stands as a beautifully crafted table featuring a glass top and lower shelf for display along with a divided compartment for storage.\xa0The table pairs perfectly with theReveal Lift Coffee Table, but it also works well on its own.Designed by BDI.SKU #18950', 'dimensions': {}}


In [492]:

# Specify the CSV file name



csv_file = "furniture_data.csv"

# Write data to CSV
with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
    # Create a writer object with fieldnames based on the dictionary keys
    writer = csv.DictWriter(file, fieldnames=combined_array[0].keys())

    # Write header row
    writer.writeheader()

    # Write rows from the array of dictionaries
    writer.writerows(combined_array)

print(f"Data has been written to {csv_file}")

Data has been written to furniture_data.csv


In [None]:


cursor.execute('''
    CREATE TABLE IF NOT EXISTS furniture (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        store TEXT,
        type TEXT NOT NULL,
        productId INTEGER,
        productName TEXT,
        productUrl TEXT,
        productImage TEXT,
        currentPrice REAL,
        productDescription TEXT,
        dimensions TEXT
    )
''')

for item in living_array:
    cursor.execute('''
        INSERT INTO furniture (
        , productId, productName, type, productUrl, productImage, currentPrice, productDescription, dimensions)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
    ''', (
        item['productId'],
        item['productName'],
        item['type'],
        item['productUrl'],
        item['productImage'],
        item['currentPrice'],
        item['productDescription'],
        item['dimensions'],
    ))


    cursor.execute('''
    CREATE TABLE IF NOT EXISTS furniture (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        store TEXT,
        type TEXT NOT NULL,
        productId INTEGER,
        productName TEXT,
        productUrl TEXT,
        productImage TEXT,
        currentPrice REAL,
        productDescription TEXT,
        dimensions TEXT
    )
''')

connection.commit()
print("Data inserted successfully.")

cursor.execute("SELECT * FROM furniture")
rows = cursor.fetchall()

for row in rows:
    print(row)

# Close the connection
connection.close()
