In [1]:
import csv
import logging
import time
import re
import urllib3
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from typing import Dict, List, Optional, Set, Tuple
from urllib.parse import urljoin, urlparse

import pandas as pd
import requests
from bs4 import BeautifulSoup
from fuzzywuzzy import fuzz
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

In [2]:
# Module-level logger used by the scraper; the root logger is configured
# right below to emit INFO-and-above records as "timestamp - level - message".
logger = logging.getLogger(__name__)
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

class StudioEquipmentScraper:
    """Crawl recording-studio websites and locate their equipment pages.

    For each studio the landing page is fetched, every same-host page it links
    to is downloaded and scored with :meth:`calculate_relevance_score`, and the
    best-scoring page is reported when its score exceeds ``MIN_CONFIDENCE``.
    Links to equipment-list documents found on those pages are reported too.

    Args:
        max_workers: Number of studios scraped concurrently within a batch.
        rate_limit_delay: Seconds slept after each internal page request.
        batch_size: Number of studios submitted to the thread pool at a time.
        verify_ssl: Whether TLS certificates are verified. When True, a site
            whose certificate fails verification is retried once without
            verification and a warning is logged.
    """

    # Timeout (seconds) applied to every HTTP request.
    REQUEST_TIMEOUT = 30

    # A page must score strictly above this value to be reported.
    MIN_CONFIDENCE = 0.5

    # File extensions treated as downloadable equipment documents.
    DOCUMENT_EXTENSIONS = ('.pdf', '.doc', '.docx')

    # HTML elements inspected when collecting context snippets.
    CONTEXT_TAGS = ['p', 'li', 'h1', 'h2', 'h3']

    # Column order of the two CSV reports written by scrape_studios().
    RESULT_COLUMNS = [
        'studio_name', 'status', 'error_message', 'equipment_page_url',
        'download_link', 'confidence_score', 'context'
    ]
    DETAILED_COLUMNS = [
        'studio_name', 'equipment_page_url', 'confidence_score', 'context'
    ]

    # Equipment-related keywords looked for in URLs and link texts.
    DEFAULT_URL_KEYWORDS = frozenset({
        'equipment', 'gear', 'tech', 'hardware', 'spec', 'specs',
        'control room', 'studio', 'live room', 'tracking', 'isolation booth', 'recording',
        'equipment list', 'gear list', 'inventory', 'facilities', 'mic list',
        'instruments', 'microphones', 'microphone', 'equip', 'technik',
        'plugins', 'plug-ins', 'outboard', 'backline', 'desk', 'console',
        'monitoring', 'monitors', 'speakers',
        'equalizer', 'equaliser', 'compressors', 'kompressor',
        'processors', 'pre-amps', 'preamps',
        'tube', 'dynamic', 'condensers', 'ribbon'
    })

    # Sample manufacturers (expand this list). Entries are lower-case because
    # they are compared against lower-cased page text.
    DEFAULT_MANUFACTURERS = frozenset({
        'neumann', 'shure', 'sennheiser', 'akg', 'trident',
        'universal audio', 'chandler', 'yamaha', 'hammond',
        'telefunken', 'neve', 'ssl', 'krk', 'avantone', 'adam audio',
        'genelec', 'pmc', 'quested', 'atc', 'barefoot sound', 'behringer',
        'gml', 'manley', 'pultec', 'focusrite', 'presonus', 'shadow hills',
        'thermionic culture', 'dbx', 'tube-tech', 'empirical labs',
        'teletronix', 'fairchild', 'royer', 'schoeps',
        'coles', 'earthworks audio', 'dpa', 'lewitt', 'focal', 'eve',
        'hedd', 'api', 'avalon', 'burl', 'dangerous music', 'mäag', 'moog',
        'sequential', 'arturia', 'antelope audio', 'apogee', 'eventide',
        'lexicon', 'bricasti design', 'radial engineering', 'heritage audio',
        'protools', 'logic', 'tla', 'urei'
    })

    # Sample equipment models (expand this list). Every entry needs its own
    # trailing comma: adjacent string literals are silently concatenated.
    DEFAULT_EQUIPMENT = frozenset({
        'telefunken ela m 251', 'neumann u47', 'neumann u67', 'akg c12',
        'neumann u87', 'neumann m49', 'telefunken u47', 'akg c414',
        'coles 4038', 'schoeps cmc6', 'royer r-121', 'akg c414 eb', 'neumann km84',
        'sennheiser md421', 'shure sm7b', 'electro-voice re20', 'neumann tlm103',
        'shure sm57', 'shure sm58', 'fairchild 670', 'neve 2254', 'teletronix la2a',
        'universal audio 1176ln', 'empirical labs distressor', 'tube-tech cl1b',
        'ssl g bus compressor', 'dbx 160vu', 'api 2500', 'thermionic culture phoenix',
        'manley vari-mu', 'focusrite red', 'neve 1073', 'api 312', 'api 3124+',
        'api 512c', 'neve 1084', 'ssl superanalogue', 'universal audio 610', 'focusrite isa 110',
        'shadow hills gama', 'chandler tg2', 'focusrite scarlett', 'presonus studio 24c',
        'pultec eqp-1a', 'manley massive passive', 'api 550a', 'gml 8200',
        'ssl 4000', 'api 550b', 'chandler curve bender', 'behringer graphic',
        'barefoot sound mm27', 'atc scm150asl pro', 'atc scm25a', 'focal twin 6 be',
        'genelec 8351a', 'pmc bb5/xbd', 'quested hq210', 'focal sm9', 'genelec 1031a',
        'yamaha ns10m', 'adam audio s3h', 'yamaha hs8', 'krk rokit', 'avantone mixcubes',
        'flea 47', 'flea 12', 'josephson c700a', 'brauner vm1', 'aea r84', 'sony c800g',
        'earthworks sr314', 'dpa 4011', 'pearlman tm-1', 'api 529', 'rupert neve designs portico ii',
        'chandler limited rs124', 'bricasti m7', 'smart research c1', 'avedis ma5',
        'rupert neve 5211', 'grace design m101', 'avalon vt-737sp', 'api 560',
        'rupert neve 5033', 'mäag audio eq4', 'sontec mes-432c', 'atc scm45a pro',
        'dynaudio core 59', 'hedd type 20', 'focal trio11 be', 'pmc mb3-xbd',
        'burl b2 bomber adc', 'antelope audio orion 32', 'dangerous music monitor st',
        'thermionic culture fat bustard'
    })

    def __init__(
        self,
        max_workers: int = 5,
        rate_limit_delay: float = 1.0,
        batch_size: int = 50,
        verify_ssl: bool = True
    ):
        self.max_workers = max_workers
        self.rate_limit_delay = rate_limit_delay
        self.batch_size = batch_size
        self.verify_ssl = verify_ssl

        # Per-instance, mutable copies so callers can extend the vocabularies.
        self.url_keywords = set(self.DEFAULT_URL_KEYWORDS)
        self.manufacturers = set(self.DEFAULT_MANUFACTURERS)
        self.equipment = set(self.DEFAULT_EQUIPMENT)

        # Configure session with retries and backoff
        self.session = requests.Session()
        retries = Retry(
            total=5,
            backoff_factor=0.5,
            status_forcelist=[500, 502, 503, 504]
        )
        self.session.mount('http://', HTTPAdapter(max_retries=retries))
        self.session.mount('https://', HTTPAdapter(max_retries=retries))

        # Proxy configuration (implement your proxy rotation logic here)
        self.proxies = []

    def normalize_text(self, text: str) -> str:
        """Lower-case ``text``, drop punctuation and collapse whitespace.

        Every character that is not a word character, whitespace or a hyphen
        is removed. Empty or ``None`` input yields an empty string.
        """
        if not text:
            return ''
        text = text.lower()
        text = re.sub(r'[^\w\s-]', '', text)
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def generate_equipment_variations(self, model: str) -> Set[str]:
        """Return common spelling variations of an equipment name.

        The set contains the name itself, versions with spaces removed or
        replaced by hyphens, versions with hyphens replaced by spaces or
        removed, and, for names containing a digit, the name prefixed with
        "model " and "type ".
        """
        variations = {
            model,
            model.replace(' ', ''),
            model.replace(' ', '-'),
            model.replace('-', ' '),
            model.replace('-', ''),
        }

        # Handle common prefixes
        if any(char.isdigit() for char in model):
            variations.add(f"model {model}")
            variations.add(f"type {model}")

        return variations

    @staticmethod
    def _contains_term(text: str, term: str) -> bool:
        """Return True when ``term`` occurs in ``text`` as a whole word or phrase."""
        # The cheap substring test filters out most terms before a regex is built.
        if not term or term not in text:
            return False
        # Lookarounds instead of a plain substring test so that short names
        # such as "api" or "eve" do not match inside "rapid" or "every".
        return re.search(rf'(?<!\w){re.escape(term)}(?!\w)', text) is not None

    def calculate_relevance_score(self, url: str, content: str) -> float:
        """Score how likely a page is to be an equipment page.

        The score is the sum of three parts:

        * 0.3 when any URL keyword occurs in the URL (also tried with ``-``,
          ``_``, ``+``, ``/``, ``.`` and ``%20`` read as spaces, so multi-word
          keywords can match);
        * 0.05 per distinct manufacturer mentioned, capped at 0.3;
        * 0.05 per distinct equipment model mentioned, capped at 0.4.

        Manufacturer and equipment names are normalised the same way as the
        page text and must appear as whole words.

        Args:
            url: Address of the page.
            content: Visible text of the page.

        Returns:
            The relevance score (at most 1.0, up to float rounding).
        """
        score = 0.0
        normalized_content = self.normalize_text(content)

        # URL relevance
        url_lower = url.lower()
        url_words = re.sub(r'(?:%20|[-_+/.])+', ' ', url_lower)
        if any(
            keyword in url_lower or keyword in url_words
            for keyword in self.url_keywords
        ):
            score += 0.3

        # Manufacturer mentions (normalised into a set so that differently
        # spelt duplicates count once)
        manufacturer_terms = {self.normalize_text(m) for m in self.manufacturers}
        manufacturer_matches = sum(
            1 for term in manufacturer_terms
            if self._contains_term(normalized_content, term)
        )
        score += min(0.3, manufacturer_matches * 0.05)

        # Equipment mentions
        equipment_matches = sum(
            1 for e in self.equipment
            if any(
                self._contains_term(normalized_content, self.normalize_text(variation))
                for variation in self.generate_equipment_variations(e)
            )
        )
        score += min(0.4, equipment_matches * 0.05)

        return score

    def extract_download_links(self, soup: "BeautifulSoup", base_url: str) -> List[str]:
        """Extract potential equipment list download links.

        A link qualifies when its text contains one of the URL keywords, is
        not a floor plan, and its path ends in one of ``DOCUMENT_EXTENSIONS``.
        The extension test ignores case and any query string or fragment.

        Args:
            soup: Parsed page whose ``<a>`` elements are inspected.
            base_url: URL that relative ``href`` values are resolved against.

        Returns:
            Absolute URLs in document order, without duplicates.
        """
        download_links: List[str] = []

        for link in soup.find_all('a'):
            href = link.get('href')
            if not href:
                continue
            href = href.strip()
            text = link.get_text().lower()

            # Skip floor plans
            if 'floor' in text and 'plan' in text:
                continue

            # Look for equipment-related download links
            if not any(keyword in text for keyword in self.url_keywords):
                continue

            # Test the path only, so "list.PDF?v=2" is still recognised.
            if urlparse(href).path.lower().endswith(self.DOCUMENT_EXTENSIONS):
                full_url = urljoin(base_url, href)
                if full_url not in download_links:
                    download_links.append(full_url)

        return download_links

    @staticmethod
    def _empty_result(studio_name: str) -> Dict:
        """Return the result record for a studio with nothing found yet."""
        return {
            'studio_name': studio_name,
            'status': 'Inactive',
            'error_message': '',
            'equipment_page_url': '',
            'download_link': '',
            'confidence_score': 0.0,
            'context': ''
        }

    def _get_with_ssl_fallback(self, url: str):
        """Fetch ``url``; return ``(response, verify)`` with the verify flag that worked."""
        try:
            response = self.session.get(
                url,
                timeout=self.REQUEST_TIMEOUT,
                verify=self.verify_ssl
            )
            return response, self.verify_ssl
        except requests.exceptions.SSLError:
            if not self.verify_ssl:
                # Verification was already off, so retrying cannot help.
                raise
            logger.warning(f"SSL verification failed for {url}; retrying without verification")
            urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
            response = self.session.get(
                url,
                timeout=self.REQUEST_TIMEOUT,
                verify=False
            )
            return response, False

    def _collect_internal_links(self, soup: "BeautifulSoup", base_url: str) -> List[str]:
        """Return unique same-host http(s) URLs linked from ``soup``, in document order."""
        base_netloc = urlparse(base_url).netloc
        internal_pages: List[str] = []
        seen: Set[str] = set()

        for link in soup.find_all('a'):
            href = link.get('href')
            if not href:
                continue

            parsed = urlparse(urljoin(base_url, href.strip()))
            if parsed.scheme not in ('http', 'https') or parsed.netloc != base_netloc:
                continue

            # "#section" anchors point at the same document; fetch it once.
            page_url = parsed._replace(fragment='').geturl()
            if page_url not in seen:
                seen.add(page_url)
                internal_pages.append(page_url)

        return internal_pages

    def _extract_context(self, page_soup: "BeautifulSoup", limit: int = 3) -> str:
        """Join up to ``limit`` distinct text snippets that mention a manufacturer."""
        terms = [
            term for term in (self.normalize_text(m) for m in self.manufacturers)
            if term
        ]
        snippets: List[str] = []

        for element in page_soup.find_all(self.CONTEXT_TAGS):
            snippet = ' '.join(element.get_text().split())
            if not snippet or snippet in snippets:
                continue
            normalized = self.normalize_text(snippet)
            if any(self._contains_term(normalized, term) for term in terms):
                snippets.append(snippet)
                if len(snippets) >= limit:
                    break

        return '; '.join(snippets)

    def scrape_studio(self, studio_name: str, website: str) -> Dict:
        """Scrape a single studio website for equipment information.

        Args:
            studio_name: Name copied into the result record.
            website: Landing page URL. ``https://`` is prepended when the
                value has no scheme.

        Returns:
            A dict with the keys ``studio_name``, ``status`` ('Active' once
            the landing page was fetched, otherwise 'Inactive'),
            ``error_message``, ``equipment_page_url``, ``download_link``
            ('; '-joined), ``confidence_score`` and ``context``. Exceptions
            are not propagated; they end up in ``error_message``.
        """
        result = self._empty_result(studio_name)

        # pandas yields NaN (a float) for empty CSV cells.
        if not isinstance(website, str) or not website.strip():
            result['error_message'] = 'Missing website URL'
            return result
        website = website.strip()
        if '://' not in website:
            website = f'https://{website}'

        try:
            # Try with the configured SSL setting, falling back if needed
            response, verify = self._get_with_ssl_fallback(website)

            result['status'] = 'Active'

            soup = BeautifulSoup(response.text, 'html.parser')
            base_url = response.url

            # Process each internal page
            best_score = 0.0
            best_url = ''
            best_context = ''
            download_links: List[str] = []

            for page_url in self._collect_internal_links(soup, base_url):
                try:
                    # Reuse the verify flag that worked for the landing page.
                    page_response = self.session.get(
                        page_url,
                        timeout=self.REQUEST_TIMEOUT,
                        verify=verify
                    )
                    page_soup = BeautifulSoup(page_response.text, 'html.parser')

                    # Calculate relevance score
                    score = self.calculate_relevance_score(
                        page_url,
                        page_soup.get_text()
                    )

                    if score > best_score:
                        best_score = score
                        best_url = page_url
                        best_context = self._extract_context(page_soup)

                    # Collect download links from every page; relative hrefs
                    # are resolved against the page they were found on.
                    for link in self.extract_download_links(page_soup, page_response.url):
                        if link not in download_links:
                            download_links.append(link)

                except Exception as e:
                    logger.warning(f"Error processing internal page {page_url}: {str(e)}")

                time.sleep(self.rate_limit_delay)

            result['download_link'] = '; '.join(download_links)

            if best_score > self.MIN_CONFIDENCE:  # Minimum confidence threshold
                result['equipment_page_url'] = best_url
                result['confidence_score'] = best_score
                result['context'] = best_context

        except Exception as e:
            result['status'] = 'Inactive'
            result['error_message'] = str(e)

        return result

    def process_batch(self, studios: List[Dict]) -> List[Dict]:
        """Process a batch of studios using parallel execution.

        Args:
            studios: Records with the keys ``studio_name`` and ``website``.

        Returns:
            One result record per studio, in completion order (not input
            order). A studio whose task raises is reported as 'Inactive'
            with the exception text instead of aborting the batch.
        """
        results = []
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            future_to_studio = {
                executor.submit(
                    self.scrape_studio,
                    studio['studio_name'],
                    studio['website']
                ): studio
                for studio in studios
            }

            for future in as_completed(future_to_studio):
                studio = future_to_studio[future]
                try:
                    results.append(future.result())
                except Exception as e:
                    logger.error(f"Unexpected error scraping {studio['studio_name']}: {str(e)}")
                    failed = self._empty_result(studio['studio_name'])
                    failed['error_message'] = str(e)
                    results.append(failed)

        return results

    def scrape_studios(self, input_file: str, output_file: str, detailed_output_file: str):
        """Main method to process all studios from CSV.

        Args:
            input_file: CSV with at least the columns ``studio_name`` and
                ``website``.
            output_file: Destination CSV with one row per studio.
            detailed_output_file: Destination CSV listing only the studios
                for which an equipment page was found.

        Raises:
            KeyError: If ``input_file`` lacks a required column.
        """
        # Read input CSV
        df = pd.read_csv(input_file)
        missing = {'studio_name', 'website'} - set(df.columns)
        if missing:
            raise KeyError(f"Input CSV is missing required column(s): {sorted(missing)}")
        studios = df.to_dict('records')

        # Create the output directories before scraping, so a missing folder
        # cannot discard the results of a long run at write time.
        for target in (output_file, detailed_output_file):
            Path(target).parent.mkdir(parents=True, exist_ok=True)

        # Process in batches
        all_results = []
        for i in range(0, len(studios), self.batch_size):
            batch = studios[i:i + self.batch_size]
            batch_results = self.process_batch(batch)
            all_results.extend(batch_results)

            # Progress tracking
            logger.info(f"Processed {min(i + self.batch_size, len(studios))}/{len(studios)} studios")

        # Save basic results (explicit columns keep the header even when empty)
        pd.DataFrame(all_results, columns=self.RESULT_COLUMNS).to_csv(output_file, index=False)

        # Save detailed results
        detailed_results = [
            {column: r[column] for column in self.DETAILED_COLUMNS}
            for r in all_results
            if r['equipment_page_url']  # Only include studios with found equipment pages
        ]
        pd.DataFrame(detailed_results, columns=self.DETAILED_COLUMNS).to_csv(
            detailed_output_file, index=False
        )

In [3]:
# Example run: scrape every studio listed in the input CSV and write both reports.
if __name__ == "__main__":
    scraper = StudioEquipmentScraper(
        verify_ssl=True,
        batch_size=50,
        rate_limit_delay=1.0,
        max_workers=5,
    )

    # Positional order: input_file, output_file, detailed_output_file
    scraper.scrape_studios(
        'data/studio_websites_ii.csv',
        'data/studio_equip_lists/equip_search_output_ii.csv',
        'data/detailed/detailed_equip_search_ii.csv',
    )

2025-02-10 11:09:31,304 - INFO - Processed 50/51 studios
2025-02-10 11:09:31,747 - INFO - Processed 51/51 studios
