In [None]:
import pandas as pd
import numpy as np
# Raw inputs: the category lookup table and the product listing.
categories_df = pd.read_csv('../data/amazon_categories.csv')
products_df = pd.read_csv('../data/amazon_products.csv')

# Inner-join every product onto its category row (categories.id == products.category_id),
# then discard the category key, which duplicates category_id after the join.
data = (
    categories_df
    .merge(products_df, left_on='id', right_on='category_id')
    .drop(columns='id')
)

In [None]:
# Preview the first 10 rows of the merged category/product frame.
data.head(10)

In [None]:
# Raise pandas' row display cap (a global option that stays in effect for later
# cells) so that long listings such as the per-category counts are not elided.
pd.set_option('display.max_rows', 4000)
category_counts = data['category_id'].value_counts()
print(category_counts)


In [None]:
# Category ids retained for the rest of the notebook, kept as strings and
# converted to integers below before matching against 'category_id'.
category_id_to_keep_str = [
    '45', '46', '47', '48', '49', '50', '52', '71', '72', '84',
    '90', '91', '97', '101', '103', '104', '105', '107', '108', '109',
    '110', '111', '112', '113', '114', '116', '118', '120', '121', '122',
    '123', '173', '174', '270',
]
category_id_to_keep = list(map(int, category_id_to_keep_str))

# Keep only the products whose category is in the allow-list.
keep_mask = data['category_id'].isin(category_id_to_keep)
filtered_data = data[keep_mask]
filtered_data.info()


In [None]:
# Preview the first 20 rows of the category-filtered frame.
filtered_data.head(20)

In [None]:
# Frequency of each distinct 'boughtInLastMonth' value in the filtered frame.
print(filtered_data['boughtInLastMonth'].value_counts())


In [None]:
# Drop products with zero purchases in the last month. The explicit .copy()
# makes the result an independent frame, so columns added to it later do not
# trigger pandas' SettingWithCopyWarning.
filtered_data = filtered_data[~filtered_data['boughtInLastMonth'].isin([0])].copy()


In [None]:
# Summarize row count, dtypes and non-null counts of the current filtered frame.
filtered_data.info()

In [None]:
import requests
from io import BytesIO
from azure.storage.blob import BlobServiceClient
import os
from urllib.parse import urlparse
from concurrent.futures import ThreadPoolExecutor, as_completed
from dotenv import load_dotenv, find_dotenv
import time
import pandas as pd

# Load environment variables from the nearest .env file, if one is found.
load_dotenv(find_dotenv())

connection_string = os.getenv("BLOB_CONNECTION_STRING")
if not connection_string:
    # Fail fast with an actionable message instead of passing None into the
    # Azure SDK, which would surface as an opaque error further down.
    raise RuntimeError(
        "BLOB_CONNECTION_STRING is not set; define it in the environment or in a .env file."
    )
container_name = "amazondata1"

# Client objects shared by the upload helper defined below.
blob_service_client = BlobServiceClient.from_connection_string(connection_string)
container_client = blob_service_client.get_container_client(container_name)

# A single HTTP session is reused for all image downloads.
session = requests.Session()

def download_image(url):
    """Download ``url`` through the shared session and return its body.

    Returns a ``BytesIO`` holding the response content, or ``None`` when the
    request fails or the server answers with an error status (the failure is
    printed).
    """
    try:
        reply = session.get(url, timeout=10)
        reply.raise_for_status()
        payload = reply.content
    except requests.RequestException as err:
        print(f"Failed to download image from {url}: {str(err)}")
        return None
    return BytesIO(payload)

def upload_to_blob(image_data, blob_name):
    """Upload ``image_data`` to the container under ``blob_name``.

    An existing blob with the same name is overwritten. Returns the blob's URL,
    or ``None`` if anything goes wrong (the failure is printed, not raised).
    """
    uploaded_url = None
    try:
        client = container_client.get_blob_client(blob_name)
        client.upload_blob(image_data, overwrite=True)
        uploaded_url = client.url
    except Exception as err:
        print(f"Failed to upload {blob_name}: {str(err)}")
    return uploaded_url

def process_row(row):
    """Mirror one product image into blob storage.

    ``row`` must provide an ``'imgUrl'`` entry and a ``name`` (its index label).
    Returns ``(row.name, blob_name, blob_url)``; both blob fields are ``None``
    when the download fails, and ``blob_url`` alone is ``None`` when only the
    upload fails.
    """
    source_url = row['imgUrl']
    payload = download_image(source_url)
    if not payload:
        return row.name, None, None

    # The blob is named after the last path segment of the image URL.
    blob_name = os.path.basename(urlparse(source_url).path)
    return row.name, blob_name, upload_to_blob(payload, blob_name)

def process_batch(batch):
    """Run ``process_row`` over every row of ``batch`` on a 10-thread pool.

    Returns the list of per-row results in completion order, which is not
    necessarily the row order of ``batch``.
    """
    results = []
    with ThreadPoolExecutor(max_workers=10) as pool:
        pending = [pool.submit(process_row, record) for _, record in batch.iterrows()]
        for finished in as_completed(pending):
            results.append(finished.result())
    return results

batch_size = 1000
# Ceiling division: a trailing partial batch still counts as one batch.
total_batches = -(-len(filtered_data) // batch_size)

# Work on an explicit copy: filtered_data comes from boolean filtering, and adding
# columns to such a frame triggers pandas' SettingWithCopyWarning.
filtered_data = filtered_data.copy()
filtered_data['blob_name'] = None
filtered_data['blob_url'] = None

start_time = time.time()
try:
    for i in range(0, len(filtered_data), batch_size):
        batch = filtered_data.iloc[i:i+batch_size]
        results = process_batch(batch)

        # Results arrive in completion order, so write them back by index label.
        for index, blob_name, blob_url in results:
            filtered_data.loc[index, 'blob_name'] = blob_name
            filtered_data.loc[index, 'blob_url'] = blob_url

        batch_number = i // batch_size + 1
        elapsed_time = time.time() - start_time
        avg_time_per_batch = elapsed_time / batch_number
        estimated_time_remaining = avg_time_per_batch * (total_batches - batch_number)

        print(f"Processed batch {batch_number}/{total_batches}")
        print(f"Elapsed time: {elapsed_time:.2f} seconds")
        print(f"Estimated time remaining: {estimated_time_remaining:.2f} seconds")
        print("--------------------")

        # Short pause between batches.
        time.sleep(1)
finally:
    # Release the HTTP session even when a batch raises or the run is interrupted.
    session.close()

print("Processing complete!")
print(filtered_data)

In [None]:
import requests
from bs4 import BeautifulSoup
import json

def scrape_description(url):
    """Fetch a product page and extract its title, rating and description.

    Parameters
    ----------
    url : str
        Address of the product page to download.

    Returns
    -------
    dict or None
        A dict with keys ``"title"``, ``"rating"`` and ``"description"``; each
        value is ``None`` when the corresponding element is not found on the
        page. ``None`` is returned when the HTTP request fails (the error is
        printed rather than raised).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept-Language': 'en-US,en;q=0.9',
    }

    try:
        # The timeout keeps a stalled connection from blocking the notebook indefinitely.
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching the webpage: {url}")
        print(f"Error details: {e}")
        return None

    soup = BeautifulSoup(response.text, "lxml")

    # Extract title
    title_element = soup.select_one("span#productTitle")
    title = title_element.text.strip() if title_element else None

    # Extract rating: the first whitespace-separated token of the element's text.
    rating = None
    rating_element = soup.select_one("span.a-icon-alt")
    if rating_element:
        rating_parts = (rating_element.text or "").split()
        # Whitespace-only text yields no tokens; report "no rating" instead of raising.
        rating = rating_parts[0] if rating_parts else None

    # Extract description from the first JSON-LD payload that carries one.
    description = None
    for script in soup.find_all("script", type="application/ld+json"):
        raw = script.string
        if not raw:
            # Empty or multi-child <script> tags expose no single string to parse.
            continue
        try:
            payload = json.loads(raw)
        except json.JSONDecodeError:
            continue

        # A JSON-LD document may be a single object or an array of objects.
        nodes = payload if isinstance(payload, list) else [payload]
        for node in nodes:
            if isinstance(node, dict) and "description" in node:
                description = node["description"]
                break
        else:
            continue
        break

    return {
        "title": title,
        "rating": rating,
        "description": description
    }


In [None]:
# Smoke-test the scraper on a single product URL (this issues a live HTTP request).
print(scrape_description('https://www.amazon.com/dp/B001O867N6'))

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
from time import sleep
from tqdm import tqdm
import random

# Alias, not a copy: `df` and `filtered_data` refer to the same DataFrame, so the
# 'description' column assigned to `df` later in this cell also appears on `filtered_data`.
df = filtered_data

def scrape_description(url):
    """Scrape a textual description from a product page, retrying on HTTP errors.

    The page is probed in priority order: a list of well-known ``div`` ids, then
    a list of ``div`` class strings, then any ``<p>`` whose text mentions a
    description-like keyword. The chosen text has all whitespace runs collapsed
    to single spaces.

    Parameters
    ----------
    url : str
        Address of the product page.

    Returns
    -------
    str
        The normalized description, ``"Description not found"`` when nothing on
        the page matches, or ``"Error: Unable to scrape"`` when every one of the
        ``max_retries`` attempts fails with a ``requests.RequestException``.
    """
    # `re` is needed for whitespace normalization but is not imported by this
    # notebook's import cells; importing it here keeps the function self-contained.
    import re

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept-Language': 'en-US,en;q=0.9',
    }
    max_retries = 5
    # div ids tried in order; the first one yielding non-empty text wins.
    candidate_ids = ['productDescription', 'feature-bullets', 'technical-details', 'prodDetails']
    desc_classes = ['a-section a-spacing-medium a-spacing-top-small', 'a-expander-content a-expander-partial-collapse-content']
    keywords = ['description', 'about this item', 'product description', 'overview']

    for attempt in range(max_retries):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            description = None

            for element_id in candidate_ids:
                element = soup.find('div', {'id': element_id})
                if element:
                    description = element.text.strip()
                if description:
                    break

            if not description:
                for class_name in desc_classes:
                    desc_div = soup.find('div', {'class': class_name})
                    if desc_div:
                        # The first matching div ends the class search even if its text is empty.
                        description = desc_div.text.strip()
                        break

            if not description:
                # Last resort: the first paragraph that mentions a description-like keyword.
                for p in soup.find_all('p'):
                    if any(keyword in p.text.lower() for keyword in keywords):
                        description = p.text.strip()
                        break

            if description:
                # \s+ also matches newlines, so one substitution flattens all whitespace.
                return re.sub(r'\s+', ' ', description).strip()
            return "Description not found"

        except requests.RequestException as e:
            if attempt == max_retries - 1:
                print(f"Failed to scrape {url}: {str(e)}")
                return "Error: Unable to scrape"
            else:
                sleep(random.uniform(1, 3))  # Random delay between retries

    # Only reachable if max_retries is set to 0.
    return "Error: Max retries reached"
def process_batch(urls):
    """Scrape every URL in ``urls`` concurrently on a 10-thread pool.

    Returns a dict mapping each URL to whatever ``scrape_description`` returned
    for it; a URL whose scrape raised is mapped to
    ``"Error: Exception occurred"`` and the exception is printed.
    """
    results = {}
    with ThreadPoolExecutor(max_workers=10) as pool:
        pending = {pool.submit(scrape_description, link): link for link in urls}
        for finished in as_completed(pending):
            link = pending[finished]
            try:
                outcome = finished.result()
            except Exception as exc:
                print(f'{link} generated an exception: {exc}')
                outcome = "Error: Exception occurred"
            results[link] = outcome
    return results

batch_size = 100
# Number of batches, rounding up so a trailing partial batch is included.
num_batches = -(-len(df) // batch_size)

# URL -> scraped description, accumulated batch by batch.
descriptions = {}
for batch_no in tqdm(range(num_batches), desc="Processing batches"):
    start_idx = batch_no * batch_size
    end_idx = min(start_idx + batch_size, len(df))
    batch_urls = df['productURL'].iloc[start_idx:end_idx].tolist()
    descriptions.update(process_batch(batch_urls))
    # Random pause between batches.
    sleep(random.uniform(1, 3))

# Attach each row's description by looking up its product URL.
df['description'] = df['productURL'].map(descriptions)

df.head(10)



In [None]:
# Write the frame (now carrying the scraped 'description' column) to CSV, omitting the index.
df.to_csv('../data/updated_dataset.csv', index=False)


In [None]:
# NOTE(review): `df` was bound to this same object (`df = filtered_data`, no copy), so this
# file appears to receive the same columns as updated_dataset.csv — confirm whether a
# snapshot without the scraped description was intended here.
filtered_data.to_csv('../data/filtered_dataset.csv', index=False)
