In [4]:
import gc
import json
import logging
import os
import re
import sys
import time
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
from typing import Dict, List, Optional
from urllib.parse import urljoin, urlparse

import pandas as pd
import requests
from bs4 import BeautifulSoup
from thefuzz import fuzz
from thefuzz import process

In [5]:
class EquipmentMatcher:
    """Locate known studio equipment in free text via exact and fuzzy matching.

    Attributes:
        equipment_categories: Maps a category label (e.g. ``'Microphones 0'``)
            to ``{canonical equipment name: [known aliases]}``.
        manufacturer_variations: Maps a manufacturer name to alternative or
            misspelled forms that are substituted for it when generating
            name variations for fuzzy matching.
    """

    def __init__(self):
        self.equipment_categories = {
            'Microphones 0': {
                'Telefunken ELA M 251': ['ELAM 251', 'ELA M251', 'ELA-M 251', 'Telefunken 251', 'ELA M 251E'],
                'Neumann U47': ['U-47', 'U 47', 'VF14 U47', 'U47 Vintage', 'Neumann VF14'],
                'Neumann U67': ['U-67', 'U 67', 'U.67', 'U67 Vintage', 'Neumann 67'],
                'AKG C12': ['C-12', 'C 12', 'C.12', 'AKG C12VR', 'C12 Vintage']
            },
            'Microphones 1': {
                'Neumann U87': ['U-87', 'U 87', 'U87ai', 'U87 Ai', 'U87A', 'U87 Vintage'],
                'Neumann M49': ['M-49', 'M 49', 'M.49', 'M49 Vintage', 'Neumann M 49'],
                'Telefunken U47': ['Telefunken U-47', 'Tele U47', 'Telefunken U 47'],
                'AKG C414': ['C-414', 'C414 XLII', 'C414 XLS', 'C414B-ULS', 'C414 EB'],
                'Coles 4038': ['Coles 4038S', '4038 Ribbon', 'Coles Ribbon'],
                'Schoeps CMC6': ['CMC 6', 'CMC-6', 'Schoeps CMC6 MK', 'CMC6/MK'],
                'Royer R-121': ['R121', 'R 121', 'Royer 121', 'R-121 Ribbon']
            },
            'Microphones 2': {
                'AKG C414 EB': ['C414EB', 'C-414 EB', 'C414 EB Silver', 'C414 EB Vintage'],
                'Neumann KM84': ['KM-84', 'KM 84', 'KM84i', 'KM 84 Vintage'],
                'Sennheiser MD421': ['MD-421', 'MD 421 II', 'MD421-U'],
                'Shure SM7B': ['SM7 B', 'SM-7B', 'SM7b', 'SM7'],
                'Electro-Voice RE20': ['RE-20', 'RE 20', 'EV RE20', 'RE20 Broadcast'],
                'Neumann TLM103': ['TLM-103', 'TLM 103', 'TLM.103']
            },
            'Microphones 3': {
                'Shure SM57': ['SM-57', 'SM 57', 'SM57-LC'],
                'Shure SM58': ['SM-58', 'SM 58', 'SM58-LC']
            },
            'Compressors 0': {
                'Fairchild 670': ['Fairchild 670 Stereo', '670 Limiter', 'Fairchild 670/660'],
                'Neve 2254': ['2254/E', '2254E', '2254A', 'Neve 2254E'],
                'Teletronix LA2A': ['LA-2A', 'LA 2A', 'Teletronix LA-2A', 'LA2A Leveler']
            },
            'Compressors 1': {
                'Universal Audio 1176LN': ['1176 LN', 'UA 1176', 'UREI 1176', '1176LN Rev'],
                'Empirical Labs Distressor': ['EL8 Distressor', 'EL-8', 'EL8-X'],
                'Tube-Tech CL1B': ['CL 1B', 'CL-1B', 'Tube Tech CL1B'],
                'SSL G Bus Compressor': ['SSL G-Series', 'SSL G Comp', 'G Series Comp']
            },
            'Compressors 2': {
                'DBX 160VU': ['DBX 160', '160 VU', 'DBX VU Compressor'],
                'API 2500': ['API-2500', '2500 Comp', '2500 Compressor'],
                'Thermionic Culture Phoenix': ['Phoenix Compressor', 'Thermionic Phoenix', 'Phoenix Tube Compressor'],
                'Manley Vari-Mu': ['Vari Mu', 'VariMu', 'Manley Tube Compressor']
            },
            'Compressors 3': {
                'Focusrite Red': ['Red Compressor', 'Focusrite Red Series', 'Focusrite Red 3']
            },
            'Preamps 0': {
                'Neve 1073': ['1073 Preamp', 'AMS Neve 1073', 'Neve Classic 1073'],
                'API 312': ['API-312', '312 Preamp', '312 Mic Pre']
            },
            'Preamps 1': {
                'Neve 1073': ['1073 Pre', 'Neve Pre 1073', 'AMS 1073'],
                'API 3124+': ['3124+', 'API 3124 Plus', '3124 Mic Pre'],
                'API 512c': ['512C', 'API-512', '512 Mic Preamp'],
                'Neve 1084': ['1084 Preamp', 'Neve 1084 EQ/Pre', 'AMS Neve 1084'],
                'SSL SuperAnalogue': ['SuperAnalogue Preamp', 'SSL Super Analogue', 'SSL SA Pre']
            },
            'Preamps 2': {
                'Universal Audio 610': ['UA 610', '610 Tube Preamp', 'Universal 610'],
                'Focusrite ISA 110': ['ISA 110', 'Focusrite 110 Pre', 'Focusrite ISA'],
                'Shadow Hills Gama': ['Gama Preamp', 'Shadow Hills Mic Pre', 'Shadow Hills G.A.M.A.'],
                'Chandler TG2': ['TG2 Preamp', 'Chandler TG-2', 'Abbey Road TG2']
            },
            'Preamps 3': {
                'Focusrite Scarlett': ['Scarlett Preamp', 'Focusrite Scarlett Series'],
                'Presonus Studio 24C': ['24C Preamp', 'Studio 24C']
            },
            'Equalisation 0': {
                'Pultec EQP-1A': ['EQP 1A', 'Pultec 1A EQ', 'Pultec Tube EQ'],
                'Manley Massive Passive': ['Massive Passive EQ', 'Manley MP EQ']
            },
            'Equalisation 1': {
                'API 550A': ['550A EQ', 'API 550 EQ'],
                'Neve 1073': ['1073 EQ', 'Neve EQ 1073'],
                'GML 8200': ['8200 EQ', 'GML Parametric EQ'],
                'SSL 4000': ['SSL EQ 4000', '4000 Series EQ', 'SSL 4K EQ']
            },
            'Equalisation 2': {
                'API 550B': ['550B EQ', 'API 550 EQ-B'],
                'Focusrite ISA 110': ['ISA 110 EQ', 'Focusrite 110 EQ'],
                'Chandler Curve Bender': ['Curve Bender', 'Chandler CB EQ']
            },
            'Equalisation 3': {
                'Behringer graphic': ['Behringer EQ', 'Behringer Graphic EQ']
            },
            'Monitors 0': {
                'Barefoot Sound MM27': ['MM27 Monitors', 'Barefoot MM27', 'Barefoot Sound'],
                'ATC SCM150ASL Pro': ['SCM150 ASL', 'ATC SCM150', 'SCM150 Pro']
            },
            'Monitors 1': {
                'ATC SCM25A': ['SCM25A', 'ATC SCM25'],
                'Focal Twin 6 Be': ['Twin 6 Be', 'Focal Twin6', 'Focal Twin'],
                'Genelec 8351A': ['8351A Monitors', 'Genelec 8351', 'The Ones 8351'],
                'PMC BB5/XBD': ['BB5 XBD', 'PMC BB5 Monitors', 'PMC XBD'],
                'Quested HQ210': ['HQ210 Monitors', 'Quested HQ-210', 'Quested Studio Monitors']
            },
            'Monitors 2': {
                'Focal SM9': ['SM9 Monitors', 'Focal SM-9', 'Focal 3-Way Monitors'],
                'Genelec 1031A': ['1031A Monitors', 'Genelec 1031'],
                'Yamaha NS10M': ['NS-10M', 'NS10 Monitors', 'Yamaha NS10'],
                'ADAM Audio S3H': ['S3H Monitors', 'Adam S3H', 'Adam Audio']
            },
            'Monitors 3': {
                'Yamaha HS8': ['HS8 Monitors', 'Yamaha HS-8'],
                'KRK Rokit': ['Rokit Monitors', 'KRK Rokit Series'],
                'Avantone Mixcubes': ['Mixcubes', 'Avantone Cubes']
            }
        }

        # Common manufacturer variations
        self.manufacturer_variations = {
            'Neumann': ['Neum', 'Neuman', 'Neumaan', 'Numann'],
            'Telefunken': ['Telefunken Elektroakustik', 'Telef', 'Tele', 'Telefungen'],
            'AKG': ['AKG Acoustics', 'AKG-', 'AGK'],
            'Shure': ['Sure', 'Schure', 'Shr'],
            'Universal Audio': ['UA', 'UREI', 'Uaudio'],
            'Neve': ['Nieve', 'Nev', 'AMS Neve'],
            'SSL': ['Solid State Logic', 'SSl', 'S.S.L.']
        }

    def fuzzy_match(self, text: str, equipment: str, threshold: int = 85) -> Optional[Dict]:
        """
        Fuzzy-search ``text`` for ``equipment`` or one of its generated variations.

        The text is cleaned with :meth:`clean_text`, split into overlapping
        chunks with :meth:`split_into_chunks`, and every chunk is compared
        against every variation from :meth:`generate_variations`.

        Args:
            text: Source text to search in
            equipment: Equipment name to search for
            threshold: Minimum similarity score (0-100)

        Returns:
            Dict with keys ``matched_text``, ``confidence``, ``context`` and
            ``original_term`` for the best-scoring match, or None when no
            chunk/variation pair reaches ``threshold``.
        """
        chunks = self.split_into_chunks(self.clean_text(text))
        return self._best_fuzzy_match(chunks, equipment, threshold)

    def _best_fuzzy_match(self, chunks: List[str], equipment: str, threshold: int = 85) -> Optional[Dict]:
        """Score pre-chunked text against every variation of ``equipment``.

        Split out of :meth:`fuzzy_match` so callers that test many equipment
        names against the same text can clean and chunk it only once.
        """
        # Lower-case each variation once instead of once per chunk comparison.
        variants = [(variant, variant.lower()) for variant in self.generate_variations(equipment)]

        best_match = None
        highest_score = 0
        match_context = ''

        for chunk in chunks:
            chunk_lower = chunk.lower()
            for variant, variant_lower in variants:
                # Use the highest score among the different fuzzy algorithms.
                score = max(
                    fuzz.ratio(chunk_lower, variant_lower),
                    fuzz.partial_ratio(chunk_lower, variant_lower),
                    fuzz.token_sort_ratio(chunk_lower, variant_lower),
                )

                if score > highest_score and score >= threshold:
                    highest_score = score
                    best_match = variant
                    match_context = chunk
                    if highest_score >= 100:
                        break
            # Only a strictly higher score can replace the current best, so a
            # perfect score cannot be beaten: stop scanning.
            if highest_score >= 100:
                break

        if best_match is None:
            return None

        return {
            'matched_text': best_match,
            'confidence': highest_score,
            'context': match_context,
            'original_term': equipment
        }

    def generate_variations(self, equipment: str) -> List[str]:
        """Generate various possible representations of an equipment name.

        The result starts with ``equipment`` itself, followed by forms with the
        manufacturer replaced by each entry of ``manufacturer_variations``
        (manufacturer names may span several words, e.g. 'Universal Audio'),
        followed by forms where the last word is joined to the rest with
        '-', ' ' and '.'. Duplicates are removed, keeping first occurrences.

        Returns an empty list for an empty or whitespace-only name.
        """
        parts = equipment.split()
        if not parts:
            return []

        variations = [equipment]

        # Handle manufacturer variations: compare whole leading words so that
        # multi-word manufacturers are recognised as well as single-word ones.
        for manufacturer, aliases in self.manufacturer_variations.items():
            manufacturer_parts = manufacturer.split()
            if parts[:len(manufacturer_parts)] == manufacturer_parts:
                remainder = ' '.join(parts[len(manufacturer_parts):])
                for alias in aliases:
                    variations.append(f"{alias} {remainder}".strip())

        # Handle model number variations with different separators.
        if len(parts) > 1:
            prefix = ' '.join(parts[:-1])
            model = parts[-1]
            variations.extend([
                f"{prefix}-{model}",
                f"{prefix} {model}",
                f"{prefix}.{model}"
            ])

        # dict.fromkeys de-duplicates while preserving insertion order.
        return list(dict.fromkeys(variations))

    def clean_text(self, text: str) -> str:
        """Clean and normalize text for better matching.

        Every character that is not a word character, whitespace or a hyphen
        is replaced by a space, then runs of whitespace are collapsed to a
        single space and leading/trailing whitespace is dropped.
        """
        # Remove special characters but keep spaces
        text = re.sub(r'[^\w\s-]', ' ', text)
        # Normalize whitespace
        text = ' '.join(text.split())
        return text

    def split_into_chunks(self, text: str, chunk_size: int = 100) -> List[str]:
        """Split text into overlapping chunks of up to ``chunk_size`` words.

        Consecutive chunks start ``chunk_size // 2`` words apart (at least one
        word), so neighbouring chunks overlap by roughly half.

        Raises:
            ValueError: if ``chunk_size`` is smaller than 1.
        """
        if chunk_size < 1:
            raise ValueError("chunk_size must be at least 1")

        words = text.split()
        # A chunk_size of 1 would otherwise give range() a step of 0.
        step = max(1, chunk_size // 2)

        return [' '.join(words[start:start + chunk_size]) for start in range(0, len(words), step)]

    def find_equipment_matches(self, text: str) -> Dict[str, List[Dict]]:
        """Find all equipment matches in text using exact, then fuzzy, matching.

        For each equipment item the canonical name and its aliases are first
        looked up as case-insensitive substrings of ``text``; only when none
        is present is fuzzy matching attempted.

        Returns:
            ``{category: [match, ...]}`` containing only categories with at
            least one match. Each match has ``equipment``, ``confidence`` and
            ``match_type`` ('exact' or 'fuzzy'); fuzzy matches also carry the
            matching chunk under ``context``. Empty text yields ``{}``.
        """
        results: Dict[str, List[Dict]] = {}
        if not text:
            return results

        text_lower = text.lower()
        chunks = None  # cleaned/chunked lazily, the first time fuzzy matching is needed

        for category, equipment_dict in self.equipment_categories.items():
            category_matches = []

            for equipment, variations in equipment_dict.items():
                # Try exact matches first (canonical name as well as aliases).
                candidates = [equipment, *variations]
                exact_match = any(candidate.lower() in text_lower for candidate in candidates)

                if exact_match:
                    category_matches.append({
                        'equipment': equipment,
                        'confidence': 100,
                        'match_type': 'exact'
                    })
                    continue

                # Try fuzzy matching
                if chunks is None:
                    chunks = self.split_into_chunks(self.clean_text(text))
                fuzzy_match = self._best_fuzzy_match(chunks, equipment)
                if fuzzy_match:
                    category_matches.append({
                        'equipment': equipment,
                        'confidence': fuzzy_match['confidence'],
                        'match_type': 'fuzzy',
                        'context': fuzzy_match['context']
                    })

            if category_matches:
                results[category] = category_matches

        return results

class StudioScraper:
    """Scrape studio websites listed in a CSV and record the equipment they mention.

    The CSV is expected to provide ``studio_name`` and ``website`` columns.
    Studios are processed in batches; each studio's internal links are scored
    for relevance and the relevant pages are searched with EquipmentMatcher.
    """

    # Directory receiving the log file and intermediate result checkpoints.
    LOG_DIR = 'data/log'
    # A link must score strictly above this to be downloaded and searched.
    RELEVANCE_THRESHOLD = 0.4
    # Write a checkpoint once at least this many new results have accumulated.
    CHECKPOINT_EVERY = 100

    def __init__(self, csv_path: str, batch_size: int = 50):
        """
        Initialize the scraper with enhanced logging and equipment matcher.

        Args:
            csv_path: Path to CSV containing studio information
            batch_size: Number of studios to process in each batch
        """
        # Setup logging
        self.setup_logging()

        # Load studios in batches
        self.csv_path = csv_path
        self.batch_size = batch_size

        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
        }

        # Initialize equipment matcher
        self.equipment_matcher = EquipmentMatcher()

    def setup_logging(self):
        """Configure logging to a timestamped file under LOG_DIR and to stdout."""
        # FileHandler does not create missing directories, so make sure it exists.
        os.makedirs(self.LOG_DIR, exist_ok=True)

        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler(f'{self.LOG_DIR}/scraper_log_{timestamp}.log'),
                logging.StreamHandler(sys.stdout)
            ]
        )
        self.logger = logging.getLogger(__name__)

    def scrape_single_studio(self, row: pd.Series) -> Dict:
        """
        Scrape equipment information for a single studio.

        Args:
            row: Pandas Series containing studio information; must provide
                the ``website`` and ``studio_name`` fields.

        Returns:
            Dict with ``studio_name``, ``website``, ``status``
            ('Active', 'Inactive', 'Error' or 'Unknown'), ``error_message``,
            ``equipment_page_url`` and one string column per equipment
            category describing the matches found (empty when none).
        """
        url = row['website']  # Note: capitalization matters
        studio_name = row['studio_name']  # Note: capitalization matters

        self.logger.info(f"Processing studio: {studio_name}")

        # Initialize result dictionary with basic info
        result = {
            'studio_name': studio_name,
            'website': url,
            'status': 'Unknown',
            'error_message': None,
            'equipment_page_url': None
        }

        # Add empty columns for each equipment category
        for category in self.equipment_matcher.equipment_categories.keys():
            result[category] = ''

        # Check website status
        try:
            response = requests.get(url, headers=self.headers, timeout=10)
            response.raise_for_status()
            result['status'] = 'Active'

            # Reuse the homepage HTML just downloaded rather than fetching it a
            # second time; response.url is the post-redirect address, so links
            # on the final host are recognised as internal.
            all_links = self.get_all_links(response.url, html=response.text)

            all_equipment_matches = self._collect_equipment_matches(all_links)

            # Update result with comprehensive matches
            for category, matches in all_equipment_matches.items():
                match_texts = []
                for match in matches:
                    match_text = f"{match['equipment']} ({match['confidence']}% {match['match_type']}) - Page: {match['page_url']}"
                    match_texts.append(match_text)

                result[category] = ' | '.join(match_texts)

            # Update equipment page URL if matches found
            if all_equipment_matches:
                # dict.fromkeys de-duplicates while keeping a stable order.
                page_urls = dict.fromkeys(
                    match['page_url']
                    for matches in all_equipment_matches.values()
                    for match in matches
                )
                result['equipment_page_url'] = ', '.join(page_urls)

        except requests.exceptions.ConnectionError:
            result['status'] = 'Inactive'
            result['error_message'] = 'Connection Error'
        except requests.exceptions.Timeout:
            result['status'] = 'Inactive'
            result['error_message'] = 'Timeout Error'
        except requests.exceptions.HTTPError as e:
            result['status'] = 'Inactive'
            result['error_message'] = f'HTTP Error: {str(e)}'
        except Exception as e:
            result['status'] = 'Error'
            result['error_message'] = f'Unknown Error: {str(e)}'

        return result

    def _collect_equipment_matches(self, links: List[str]) -> Dict[str, List[Dict]]:
        """Download each sufficiently relevant link and gather equipment matches per category."""
        all_equipment_matches: Dict[str, List[Dict]] = {}

        for link in links:
            # Score first: matches from low-relevance pages are discarded
            # anyway, so skip the download and the matching for them.
            if self.score_link_relevance(link) <= self.RELEVANCE_THRESHOLD:
                continue

            try:
                page_response = requests.get(link, headers=self.headers, timeout=10)
                # Do not search the body of an error page for equipment.
                page_response.raise_for_status()

                equipment_matches = self.equipment_matcher.find_equipment_matches(page_response.text)

                # Add page URL to matches
                for category, matches in equipment_matches.items():
                    bucket = all_equipment_matches.setdefault(category, [])
                    for match in matches:
                        match_details = match.copy()
                        match_details['page_url'] = link
                        bucket.append(match_details)

            except Exception as e:
                # Best effort: one failing page must not abort the whole studio.
                self.logger.error(f"Error processing page {link}: {str(e)}")
            finally:
                time.sleep(2)  # Rate limiting, also after a failed request

        return all_equipment_matches

    def score_link_relevance(self, link: str) -> float:
        """Score link relevance for equipment pages.

        Each relevance keyword contained in the lower-cased link adds 0.3 and
        each non-empty path segment subtracts 0.1; the result is clamped to
        [0, 1] and rounded to two decimals.
        """
        relevance_keywords = ['gear', 'equipment', 'studio', 'tech', 'specs', 'desk',
                              'microphone', 'microphones', 'recording', 'facilities',
                              'console', 'outboard','instruments', 'monitoring', 'monitors']
        link_lower = link.lower()

        # Keyword presence
        score = 0.3 * sum(1 for keyword in relevance_keywords if keyword in link_lower)

        # Path depth (prefer shorter paths). Empty segments produced by the
        # leading or a trailing '/' are not real depth and are ignored.
        depth = sum(1 for segment in urlparse(link).path.split('/') if segment)
        score -= 0.1 * depth

        # Round away binary floating-point noise (e.g. 0.39999999999999997).
        return round(max(0.0, min(1.0, score)), 2)

    def get_all_links(self, url: str, html: Optional[str] = None) -> List[str]:
        """Get all internal links from a webpage with rate limiting.

        Args:
            url: Address of the page; used to resolve relative links and to
                decide which links are internal (same network location).
            html: Already-downloaded HTML of the page. When omitted the page
                is fetched from ``url`` (followed by the rate-limiting pause).

        Returns:
            De-duplicated http(s) links on the same host, fragments removed,
            in order of first appearance. An empty list if anything fails.
        """
        try:
            if html is None:
                response = requests.get(url, headers=self.headers, timeout=10)
                html = response.text
                # Use the post-redirect address as the base for internal links.
                url = response.url
                time.sleep(2)  # Rate limiting

            soup = BeautifulSoup(html, 'html.parser')
            base_domain = urlparse(url).netloc

            links = {}
            for a in soup.find_all('a', href=True):
                parsed = urlparse(urljoin(url, a['href']))
                if parsed.scheme in ('http', 'https') and parsed.netloc == base_domain:
                    # '#section' variants point at the same document; drop the
                    # fragment so each page is only listed once.
                    links[parsed._replace(fragment='').geturl()] = None

            return list(links)
        except Exception as e:
            self.logger.error(f"Error getting links from {url}: {str(e)}")
            return []

    def process_batch(self, batch_df: pd.DataFrame) -> List[Dict]:
        """Process a batch of studios with parallel execution.

        Results are returned in the order the studios appear in ``batch_df``;
        a studio whose scrape raises is logged and left out of the results.
        """
        results = []
        with ThreadPoolExecutor(max_workers=3) as executor:  # Limited workers for rate limiting
            future_to_studio = {
                executor.submit(self.scrape_single_studio, row): row['studio_name']
                for _, row in batch_df.iterrows()
            }

            for future, studio_name in future_to_studio.items():
                try:
                    results.append(future.result())
                except Exception as e:
                    self.logger.error(f"Batch processing error for {studio_name}: {str(e)}")

        return results

    def scrape_all_studios(self) -> pd.DataFrame:
        """
        Scrape all studios in batches to manage memory usage.

        A checkpoint of everything gathered so far is written whenever at
        least CHECKPOINT_EVERY results have been added since the previous one.

        Returns:
            DataFrame with all results
        """
        all_results = []
        last_checkpoint_size = 0

        # Read CSV in chunks
        for chunk_num, chunk in enumerate(pd.read_csv(self.csv_path, chunksize=self.batch_size)):
            self.logger.info(f"Processing batch {chunk_num + 1}")

            all_results.extend(self.process_batch(chunk))

            # Force garbage collection
            gc.collect()

            # Compare against the last checkpoint instead of testing for an
            # exact multiple, which is skipped whenever the running total
            # does not land precisely on it.
            if len(all_results) - last_checkpoint_size >= self.CHECKPOINT_EVERY:
                self.save_intermediate_results(all_results)
                last_checkpoint_size = len(all_results)

        return pd.DataFrame(all_results)

    def save_intermediate_results(self, results: List[Dict]):
        """Save intermediate results to a timestamped CSV under LOG_DIR to prevent data loss."""
        os.makedirs(self.LOG_DIR, exist_ok=True)
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        pd.DataFrame(results).to_csv(f'{self.LOG_DIR}/intermediate_results_{timestamp}.csv', index=False)

    def save_results(self, results_df: pd.DataFrame, output_path: str):
        """Save the final results to a CSV file, creating its directory if needed."""
        output_dir = os.path.dirname(output_path)
        if output_dir:
            # Avoid losing a finished scrape to a missing output directory.
            os.makedirs(output_dir, exist_ok=True)
        results_df.to_csv(output_path, index=False)
        self.logger.info(f"Results saved to {output_path}")

In [6]:
# Input list of studios and destination of the final equipment table.
STUDIO_CSV_PATH = 'data/studio_websites_ii.csv'
RESULTS_CSV_PATH = 'data/studio_equip_lists/studio_equipment_results_ii.csv'

# Scrape every studio in batches of 20, then persist the combined results.
scraper = StudioScraper(csv_path=STUDIO_CSV_PATH, batch_size=20)
results = scraper.scrape_all_studios()
scraper.save_results(results_df=results, output_path=RESULTS_CSV_PATH)

2025-02-03 12:24:41,641 - INFO - Processing batch 1
2025-02-03 12:24:41,646 - INFO - Processing studio: Audio Sorcery
2025-02-03 12:24:41,648 - INFO - Processing studio: Audio-Vision
2025-02-03 12:24:41,648 - INFO - Processing studio: Big Scary Tree
2025-02-03 12:25:33,814 - INFO - Processing studio: Bridger Productions
2025-02-03 12:25:47,799 - INFO - Processing studio: Clear Lake Recording Studios
2025-02-03 12:27:21,524 - INFO - Processing studio: Dark Horse Recording
2025-02-03 12:28:07,046 - INFO - Processing studio: Downtown Music Studios
2025-02-03 12:28:20,081 - INFO - Processing studio: Fonoprint
2025-02-03 12:28:50,590 - INFO - Processing studio: Geejam Studios
2025-02-03 12:29:09,272 - INFO - Processing studio: Glasgow Recording Studio
2025-02-03 12:30:44,202 - INFO - Processing studio: Magik Studios
2025-02-03 12:33:15,280 - ERROR - Error processing page https://themagikstudios.com/houston-vocal-recording-services/: ('Connection aborted.', RemoteDisconnected('Remote end clo