In [5]:
import csv
import logging
import os
import queue
import re
import sys
import threading
import time
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor
from typing import Dict, List, Optional, Set, Tuple
from urllib.parse import urljoin, urlparse

import pandas as pd
import requests
from bs4 import BeautifulSoup, Tag
from fuzzywuzzy import fuzz
from memory_profiler import profile
from requests.exceptions import RequestException

In [15]:
# Configure logging to output to both a log file and the console.
# logging.FileHandler opens its file immediately and does not create missing
# parent directories, so make sure the log directory exists first.
os.makedirs('data/log', exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('data/log/studio_scraper.log'),
        logging.StreamHandler(sys.stdout)
    ]
)

class StudioScraper:
    def __init__(self):
        """Set up the lookup tables and tuning constants used by the scraper.

        No I/O happens here; the constructor only builds in-memory
        configuration (keyword sets, the equipment catalogue, HTTP headers,
        scoring weights and the grading tables).
        """
        # Lower-case keywords that mark a URL, link text, heading or CSS class
        # as equipment-related. They are matched by plain substring tests.
        # ('recording' and 'instruments' are listed twice; harmless in a set.)
        self.equipment_categories = {
            'gear', 'equipment', 'equip', 'control room', 'studio', 'tech', 'technik', 'recording',
            'specs', 'desk', 'microphone', 'microphones', 'recording', 'equalizer', 'equaliser',
            'facilities', 'console', 'outboard', 'instruments', 'monitoring', 
            'monitors', 'speakers', 'processors', 'compressors', 'kompressor', 'plugins',
            'pre-amps', 'preamps', 'plug-ins', 'backline', 'instruments', 'tube', 'dynamic', 
            'condensers', 'ribbon'
        }

        # Equipment catalogue. Keys have the form "<base category> <tier>"
        # (parsed by get_category_base / get_tier); each value maps a canonical
        # equipment name to a list of alternative spellings searched for on
        # scraped pages.
        # NOTE(review): 'Neve 1073' appears under 'Preamps 0', 'Preamps 1' and
        # 'Equalisation 1', and 'Focusrite ISA 110' under 'Preamps 2' and
        # 'Equalisation 2'. scrape_page files a match under every category key
        # that lists the name, so one hit scores in several tiers/categories —
        # confirm this is intended.
        self.target_equipment = {
            'Microphones 0': {
                'Telefunken ELA M 251': ['ELAM 251', 'ELA M251', 'ELA-M 251', 'Telefunken 251', 'ELA M 251E'],
                'Neumann U47': ['U-47', 'U 47', 'VF14 U47', 'U47 Vintage', 'Neumann VF14'],
                'Neumann U67': ['U-67', 'U 67', 'U.67', 'U67 Vintage', 'Neumann 67'],
                'AKG C12': ['C-12', 'C 12', 'C.12', 'AKG C12VR', 'C12 Vintage']
            },
            'Microphones 1': {
                'Neumann U87': ['U-87', 'U 87', 'U87ai', 'U87 Ai', 'U87A', 'U87 Vintage'],
                'Neumann M49': ['M-49', 'M 49', 'M.49', 'M49 Vintage', 'Neumann M 49'],
                'Telefunken U47': ['Telefunken U-47', 'Tele U47', 'Telefunken U 47'],
                'AKG C414': ['C-414', 'C414 XLII', 'C414 XLS', 'C414B-ULS', 'C414 EB'],
                'Coles 4038': ['Coles 4038S', '4038 Ribbon', 'Coles Ribbon'],
                'Schoeps CMC6': ['CMC 6', 'CMC-6', 'Schoeps CMC6 MK', 'CMC6/MK'],
                'Royer R-121': ['R121', 'R 121', 'Royer 121', 'R-121 Ribbon']
            },
            'Microphones 2': {
                'AKG C414 EB': ['C414EB', 'C-414 EB', 'C414 EB Silver', 'C414 EB Vintage'],
                'Neumann KM84': ['KM-84', 'KM 84', 'KM84i', 'KM 84 Vintage'],
                'Sennheiser MD421': ['MD-421', 'MD 421 II', 'MD421-U'],
                'Shure SM7B': ['SM7 B', 'SM-7B', 'SM7b', 'SM7'],
                'Electro-Voice RE20': ['RE-20', 'RE 20', 'EV RE20', 'RE20 Broadcast'],
                'Neumann TLM103': ['TLM-103', 'TLM 103', 'TLM.103']
            },
            'Microphones 3': {
                'Shure SM57': ['SM-57', 'SM 57', 'SM57-LC'],
                'Shure SM58': ['SM-58', 'SM 58', 'SM58-LC']
            },
            'Compressors 0': {
                'Fairchild 670': ['Fairchild 670 Stereo', '670 Limiter', 'Fairchild 670/660'],
                'Neve 2254': ['2254/E', '2254E', '2254A', 'Neve 2254E'],
                'Teletronix LA2A': ['LA-2A', 'LA 2A', 'Teletronix LA-2A', 'LA2A Leveler']
            },
            'Compressors 1': {
                'Universal Audio 1176LN': ['1176 LN', 'UA 1176', 'UREI 1176', '1176LN Rev'],
                'Empirical Labs Distressor': ['EL8 Distressor', 'EL-8', 'EL8-X'],
                'Tube-Tech CL1B': ['CL 1B', 'CL-1B', 'Tube Tech CL1B'],
                'SSL G Bus Compressor': ['SSL G-Series', 'SSL G Comp', 'G Series Comp']
            },
            'Compressors 2': {
                'DBX 160VU': ['DBX 160', '160 VU', 'DBX VU Compressor'],
                'API 2500': ['API-2500', '2500 Comp', '2500 Compressor'],
                'Thermionic Culture Phoenix': ['Phoenix Compressor', 'Thermionic Phoenix', 'Phoenix Tube Compressor'],
                'Manley Vari-Mu': ['Vari Mu', 'VariMu', 'Manley Tube Compressor']
            },
            'Compressors 3': {
                'Focusrite Red': ['Red Compressor', 'Focusrite Red Series', 'Focusrite Red 3']
            },
            'Preamps 0': {
                'Neve 1073': ['1073 Preamp', 'AMS Neve 1073', 'Neve Classic 1073'],
                'API 312': ['API-312', '312 Preamp', '312 Mic Pre']
            },
            'Preamps 1': {
                'Neve 1073': ['1073 Pre', 'Neve Pre 1073', 'AMS 1073'],
                'API 3124+': ['3124+', 'API 3124 Plus', '3124 Mic Pre'],
                'API 512c': ['512C', 'API-512', '512 Mic Preamp'],
                'Neve 1084': ['1084 Preamp', 'Neve 1084 EQ/Pre', 'AMS Neve 1084'],
                'SSL SuperAnalogue': ['SuperAnalogue Preamp', 'SSL Super Analogue', 'SSL SA Pre']
            },
            'Preamps 2': {
                'Universal Audio 610': ['UA 610', '610 Tube Preamp', 'Universal 610'],
                'Focusrite ISA 110': ['ISA 110', 'Focusrite 110 Pre', 'Focusrite ISA'],
                'Shadow Hills Gama': ['Gama Preamp', 'Shadow Hills Mic Pre', 'Shadow Hills G.A.M.A.'],
                'Chandler TG2': ['TG2 Preamp', 'Chandler TG-2', 'Abbey Road TG2']
            },
            'Preamps 3': {
                'Focusrite Scarlett': ['Scarlett Preamp', 'Focusrite Scarlett Series'],
                'Presonus Studio 24C': ['24C Preamp', 'Studio 24C']
            },
            'Equalisation 0': {
                'Pultec EQP-1A': ['EQP 1A', 'Pultec 1A EQ', 'Pultec Tube EQ'],
                'Manley Massive Passive': ['Massive Passive EQ', 'Manley MP EQ']
            },
            'Equalisation 1': {
                'API 550A': ['550A EQ', 'API 550 EQ'],
                'Neve 1073': ['1073 EQ', 'Neve EQ 1073'],
                'GML 8200': ['8200 EQ', 'GML Parametric EQ'],
                'SSL 4000': ['SSL EQ 4000', '4000 Series EQ', 'SSL 4K EQ']
            },
            'Equalisation 2': {
                'API 550B': ['550B EQ', 'API 550 EQ-B'],
                'Focusrite ISA 110': ['ISA 110 EQ', 'Focusrite 110 EQ'],
                'Chandler Curve Bender': ['Curve Bender', 'Chandler CB EQ']
            },
            'Equalisation 3': {
                'Behringer graphic': ['Behringer EQ', 'Behringer Graphic EQ']
            },
            'Monitors 0': {
                'Barefoot Sound MM27': ['MM27 Monitors', 'Barefoot MM27', 'Barefoot Sound'],
                'ATC SCM150ASL Pro': ['SCM150 ASL', 'ATC SCM150', 'SCM150 Pro']
            },
            'Monitors 1': {
                'ATC SCM25A': ['SCM25A', 'ATC SCM25'],
                'Focal Twin 6 Be': ['Twin 6 Be', 'Focal Twin6', 'Focal Twin'],
                'Genelec 8351A': ['8351A Monitors', 'Genelec 8351', 'The Ones 8351'],
                'PMC BB5/XBD': ['BB5 XBD', 'PMC BB5 Monitors', 'PMC XBD'],
                'Quested HQ210': ['HQ210 Monitors', 'Quested HQ-210', 'Quested Studio Monitors']
            },
            'Monitors 2': {
                'Focal SM9': ['SM9 Monitors', 'Focal SM-9', 'Focal 3-Way Monitors'],
                'Genelec 1031A': ['1031A Monitors', 'Genelec 1031'],
                'Yamaha NS10M': ['NS-10M', 'NS10 Monitors', 'Yamaha NS10'],
                'ADAM Audio S3H': ['S3H Monitors', 'Adam S3H', 'Adam Audio']
            },
            'Monitors 3': {
                'Yamaha HS8': ['HS8 Monitors', 'Yamaha HS-8'],
                'KRK Rokit': ['Rokit Monitors', 'KRK Rokit Series'],
                'Avantone Mixcubes': ['Mixcubes', 'Avantone Cubes']
            }
        }

        # HTTP headers sent with every requests.get call (browser-like User-Agent).
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

        # URLs already handed to scrape_page; such URLs are skipped on later calls.
        self.visited_urls = set()
        # Seconds slept (time.sleep) before each page request in scrape_page.
        self.rate_limit_delay = 3
        # Minimum fuzz.partial_ratio score for a fuzzy equipment match to count.
        self.fuzzy_match_threshold = 85

        # Relative weight of each component score in calculate_url_rating.
        self.scoring_weights = {
            'equipment_match': 3.0,    # Direct equipment matches
            'category_match': 1.5,     # Equipment category matches
            'url_structure': 1.0,      # URL contains relevant terms
            'page_structure': 1.0,     # Page has equipment-like structure
            'context_quality': 2.0     # Quality of surrounding content
        }

        # Grading system configuration: per-category cap applied to the summed
        # tier points in calculate_equipment_grade.
        # NOTE(review): with tier_points 7/5/3/1 and the bonus of at most +4
        # for tiers 0 and 1, a category can reach at most 24 points, so this
        # cap of 30 and the 'Exceptional' band below look unreachable — confirm.
        self.category_points = {
            'Microphones': 30,
            'Compressors': 30,
            'Preamps': 30,
            'Equalisation': 30,
            'Monitors': 30
        }

        # Base points awarded when at least one item of a tier is found.
        self.tier_points = {
            0: 7,  # Exceptional/Rare Vintage
            1: 5,  # High-End
            2: 3,  # Mid-Range
            3: 1   # Entry-Level/Common
        }

        # Inclusive (min_score, max_score) ranges of the total score mapped to
        # a grade label.
        self.grading_scale = {
            (135, 150): 'Exceptional',
            (110, 134): 'Excellent',
            (85, 109): 'Good',
            (60, 84): 'Fair',
            (0, 59): 'Needs Improvement'
        }

    def get_category_base(self, category: str) -> str:
        """Extract base category name from category key."""
        return ''.join([i for i in category if not i.isdigit()]).strip()

    def get_tier(self, category: str) -> int:
        """Extract tier number from category key."""
        return int(re.findall(r'\d+', category)[0])

    def calculate_equipment_grade(self, matches: Dict[str, List]) -> Dict:
        """Calculate equipment grade based on matches."""
        # Initialize category scores
        category_scores = defaultdict(int)
        equipment_counts = defaultdict(lambda: defaultdict(int))
        
        # Process matches
        for category, equipment_matches in matches.items():
            if not equipment_matches:
                continue
                
            # Get unique equipment matches
            unique_matches = {match['equipment'] for match in equipment_matches}
            base_category = self.get_category_base(category)
            tier = self.get_tier(category)
            
            # Count equipment in each tier
            equipment_counts[base_category][tier] = len(unique_matches)
            
            # Calculate base points for this tier
            if len(unique_matches) > 0:
                # Base points for having equipment in this tier
                base_points = self.tier_points[tier]
                
                # Add bonus points for multiple Tier 0 or Tier 1 items
                if tier in [0, 1] and len(unique_matches) > 1:
                    if len(unique_matches) == 2:
                        base_points += 2  # Two items = +2 points
                    else:
                        base_points += 4  # Three or more items = +4 points
                
                category_scores[base_category] += base_points
        
        # Calculate total score and ensure it doesn't exceed category maximums
        total_score = 0
        for category, score in category_scores.items():
            category_scores[category] = min(score, self.category_points[category])
            total_score += category_scores[category]
        
        # Determine grade
        grade = None
        for (min_score, max_score), grade_label in self.grading_scale.items():
            if min_score <= total_score <= max_score:
                grade = grade_label
                break
        
        return {
            'total_score': total_score,
            'grade': grade,
            'category_scores': dict(category_scores),
            'equipment_counts': dict(equipment_counts)
        }
        
    def rate_page_structure(self, soup: BeautifulSoup) -> float:
        """Rate the page structure based on how likely it is to be an equipment list."""
        score = 0.0
        
        # Check for structured lists
        lists = soup.find_all(['ul', 'ol', 'dl'])
        if lists:
            score += 0.3
            
        # Check for tables
        tables = soup.find_all('table')
        if tables:
            score += 0.3
            
        # Check for equipment-related headers
        headers = soup.find_all(['h1', 'h2', 'h3', 'h4'])
        for header in headers:
            if any(term in header.get_text().lower() for term in self.equipment_categories):
                score += 0.2
                
        # Check for equipment sections
        sections = soup.find_all(['div', 'section'], class_=lambda x: x and any(
            term in x.lower() for term in self.equipment_categories))
        if sections:
            score += 0.2
            
        return min(score, 1.0)  # Normalize to 0-1

    def rate_context_quality(self, content: str, matches: List[Dict]) -> float:
        """Rate the quality of context around equipment matches."""
        score = 0.0
        
        # Check for technical specifications
        tech_terms = ['specifications', 'specs', 'technical', 'model', 'serial']
        if any(term in content.lower() for term in tech_terms):
            score += 0.3
            
        # Check for categorization
        if any(cat in content.lower() for cat in ['microphones', 'preamps', 'compressors', 'equalizers']):
            score += 0.3
            
        # Check for professional terminology
        pro_terms = ['studio', 'professional', 'audio', 'recording', 'mixing']
        if any(term in content.lower() for term in pro_terms):
            score += 0.2
            
        # Check for match density
        if matches:
            unique_matches = len(set(m['equipment'] for m in matches))
            score += min(0.2, unique_matches * 0.02)
            
        return min(score, 1.0)

    def calculate_url_rating(self, url: str, soup: BeautifulSoup, matches: List[Dict], content: str) -> Dict:
        """Calculate a comprehensive rating for the URL based on multiple factors."""
        url_lower = url.lower()
        
        # Calculate individual scores
        scores = {
            'equipment_match': min(1.0, len(matches) * 0.1),  # Equipment matches score
            'category_match': sum(term in url_lower for term in self.equipment_categories) * 0.2,  # Category relevance
            'url_structure': 1.0 if any(term in url_lower for term in ['gear', 'equipment', 'tech']) else 0.0,  # URL relevance
            'page_structure': self.rate_page_structure(soup),  # Page structure score
            'context_quality': self.rate_context_quality(content, matches)  # Context quality score
        }
        
        # Calculate weighted total
        weighted_total = sum(
            scores[key] * self.scoring_weights[key] 
            for key in scores
        )
        
        # Normalize to 0-100 scale
        max_possible = sum(self.scoring_weights.values())
        final_score = (weighted_total / max_possible) * 100
        
        return {
            'score': round(final_score, 2),
            'component_scores': {k: round(v * 100, 2) for k, v in scores.items()}
        }

    def calculate_relevance_score(self, url: str, text_content: str) -> float:
        score = 0
        url_lower = url.lower()
        for term in self.equipment_categories:
            if term in url_lower:
                score += 2
        
        text_lower = text_content.lower()
        for term in self.equipment_categories:
            if term in text_lower:
                score += 1
                
        for category in self.target_equipment.values():
            for equipment in category:
                if equipment.lower() in text_lower:
                    score += 3
        
        return score

    def fuzzy_match_equipment(self, text: str, equipment: str) -> Tuple[bool, float]:
        if equipment.lower() in text.lower():
            return True, 100
        
        words = re.findall(r'\b[\w\s-]+\b', text)
        max_score = 0
        
        for word_group in words:
            score = fuzz.partial_ratio(equipment.lower(), word_group.lower())
            max_score = max(max_score, score)
            
            if score >= self.fuzzy_match_threshold:
                return True, score
                
        return False, max_score

    def extract_page_content(self, soup: BeautifulSoup) -> Dict[str, str]:
        content = defaultdict(str)
        
        # Headers and Titles
        headers = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'title'])
        content['headers'] = ' '.join(h.get_text(strip=True) for h in headers)
        
        # Navigation menus
        nav_elements = soup.find_all(['nav', 'menu'])
        nav_elements.extend(soup.find_all(class_=lambda x: x and ('nav' in x or 'menu' in x)))
        content['navigation'] = ' '.join(n.get_text(strip=True) for n in nav_elements)
        
        # Lists and Tables
        lists = soup.find_all(['ul', 'ol', 'dl', 'table'])
        content['lists'] = ' '.join(l.get_text(strip=True) for l in lists)
        
        # Buttons, Links, and Interactive Elements
        interactive = soup.find_all(['button', 'a', 'input', 'select'])
        content['interactive'] = ' '.join(i.get_text(strip=True) if isinstance(i, Tag) else '' for i in interactive)
        
        # Equipment sections
        equipment_sections = soup.find_all(
            ['div', 'section', 'article'],
            class_=lambda x: x and any(term in x.lower() for term in self.equipment_categories)
        )
        content['equipment_sections'] = ' '.join(e.get_text(strip=True) for e in equipment_sections)
        
        # Downloads
        downloads = soup.find_all(
            lambda tag: tag.name == 'a' and (
                'download' in tag.get('href', '').lower() or
                'download' in tag.get_text().lower() or
                '.pdf' in tag.get('href', '').lower() or
                '.doc' in tag.get('href', '').lower() or
                'equipment' in tag.get('href', '').lower()
            )
        )
        content['downloads'] = ' '.join(d.get('href', '') + ' ' + d.get_text(strip=True) for d in downloads)
        
        return content

    def find_relevant_subpages(self, soup: BeautifulSoup, base_url: str) -> List[Dict]:
        relevant_pages = []
        
        for link in soup.find_all('a', href=True):
            href = link['href']
            absolute_url = urljoin(base_url, href)
            
            if (absolute_url in self.visited_urls or
                urlparse(absolute_url).netloc != urlparse(base_url).netloc):
                continue
                
            url_score = 0
            url_lower = absolute_url.lower()
            
            for term in self.equipment_categories:
                if term in url_lower or term in link.get_text().lower():
                    url_score += 1
                    
            if url_score > 0:
                relevant_pages.append({
                    'url': absolute_url,
                    'initial_score': url_score,
                    'link_text': link.get_text()
                })
                
        return sorted(relevant_pages, key=lambda x: x['initial_score'], reverse=True)

    @profile
    def scrape_page(self, url: str) -> Dict:
        """Enhanced scrape_page method with unique equipment matching."""
        if url in self.visited_urls:
            logging.info(f"Skipping already visited page: {url}")
            return None
            
        self.visited_urls.add(url)
        time.sleep(self.rate_limit_delay)
        
        try:
            response = requests.get(url, headers=self.headers, timeout=10)
            response.raise_for_status()
            
            soup = BeautifulSoup(response.text, 'html.parser')
            page_content = self.extract_page_content(soup)
            full_content = ' '.join(page_content.values())
            
            # Track unique matches using equipment name as key
            unique_matches = {}
            
            for category, equipment_list in self.target_equipment.items():
                for equipment, variants in equipment_list.items():
                    # Check main equipment name and variants
                    all_names = [equipment] + variants
                    best_match = None
                    best_score = 0
                    
                    for name in all_names:
                        matched, score = self.fuzzy_match_equipment(full_content, name)
                        if matched and score > best_score:
                            best_score = score
                            best_match = {
                                'equipment': equipment,
                                'match_score': score,
                                'context': self.get_match_context(full_content, name)
                            }
                    
                    if best_match and best_score > self.fuzzy_match_threshold:
                        unique_matches[equipment] = best_match
            
            # Organize unique matches by category
            matches_by_category = defaultdict(list)
            for match in unique_matches.values():
                for category, equipment_list in self.target_equipment.items():
                    if match['equipment'] in equipment_list:
                        matches_by_category[category].append(match)
            
            url_rating = self.calculate_url_rating(url, soup, list(unique_matches.values()), full_content)
            
            logging.info(f"Successfully scraped page: {url} (Score: {url_rating['score']})")
            return {
                'url': url,
                'url_rating': url_rating,
                'matches': dict(matches_by_category),
                'content_types': page_content
            }
            
        except Exception as e:
            logging.error(f"Error scraping {url}: {str(e)}")
            return None


    def get_match_context(self, text: str, equipment: str, context_chars: int = 100) -> str:
        text_lower = text.lower()
        equip_lower = equipment.lower()
        
        index = text_lower.find(equip_lower)
        if index == -1:
            return ""
            
        start = max(0, index - context_chars)
        end = min(len(text), index + len(equipment) + context_chars)
        
        return f"...{text[start:end]}..."

    
    def scrape_website(self, studio_name: str, website_url: str) -> Dict:
        """Scrape entire website for equipment information."""
        logging.info(f"\nProcessing studio: {studio_name}")
        logging.info(f"Website: {website_url}")
        
        result = {
            'Studio Name': studio_name,
            'Website': website_url,
            'Status': 'Inactive',
            'Error': '',
            'Pages': [],
            'Equipment_URLs': '',
            'Equipment_Grade': '',
            **{category: '' for category in self.target_equipment.keys()}
        }
        
        try:
            # Initial page scan
            initial_page = self.scrape_page(website_url)
            if not initial_page:
                raise Exception("Failed to scan initial page")
                
            result['Status'] = 'Active'
            result['Pages'].append(initial_page)
            
            # Find and scan relevant subpages
            response = requests.get(website_url, headers=self.headers, timeout=10)
            soup = BeautifulSoup(response.text, 'html.parser')
            relevant_subpages = self.find_relevant_subpages(soup, website_url)
            
            logging.info(f"Found {len(relevant_subpages)} relevant subpages")
            
            # Use ThreadPoolExecutor for parallel scanning of subpages
            with ThreadPoolExecutor(max_workers=3) as executor:
                future_to_url = {
                    executor.submit(self.scrape_page, page['url']): page['url']
                    for page in relevant_subpages[:5]
                }
                
                for future in future_to_url:
                    page_result = future.result()
                    if page_result:
                        result['Pages'].append(page_result)
            
            # Aggregate matches across all pages
            all_matches = defaultdict(list)
            equipment_urls_with_scores = []
            
            for page in result['Pages']:
                has_matches = False
                for category, matches in page['matches'].items():
                    if matches:
                        has_matches = True
                        all_matches[category].extend(matches)
                
                if has_matches:
                    equipment_urls_with_scores.append(
                        (page['url'], page['url_rating']['score'])
                    )
            
            # Calculate equipment grade
            grade_info = self.calculate_equipment_grade(all_matches)
            
            # Format equipment grade string
            grade_details = (
                f"Grade: {grade_info['grade']} "
                f"(Total: {grade_info['total_score']}/150 pts - "
                f"Mic: {grade_info['category_scores'].get('Microphones', 0)}, "
                f"Comp: {grade_info['category_scores'].get('Compressors', 0)}, "
                f"Pre: {grade_info['category_scores'].get('Preamps', 0)}, "
                f"EQ: {grade_info['category_scores'].get('Equalisation', 0)}, "
                f"Mon: {grade_info['category_scores'].get('Monitors', 0)})"
            )
            
            result['Equipment_Grade'] = grade_details
            
            # Format final results
            for category in self.target_equipment.keys():
                if category in all_matches:
                    # Ensure unique matches by equipment name
                    unique_matches = {
                        match['equipment']: match 
                        for match in all_matches[category]
                    }.values()
                    
                    result[category] = '; '.join(
                        f"{m['equipment']} (score: {m['match_score']})" 
                        for m in sorted(unique_matches, key=lambda x: x['match_score'], reverse=True)
                    )
            
            # Add equipment URLs with scores
            equipment_urls_with_scores.sort(key=lambda x: x[1], reverse=True)
            result['Equipment_URLs'] = '; '.join(
                f"{url} (score: {score:.2f})"
                for url, score in equipment_urls_with_scores
            )
            
        except Exception as e:
            result['Status'] = 'Error'
            result['Error'] = str(e)
            logging.error(f"Error processing {studio_name}: {str(e)}")
            
        return result

    
    def process_studio_list(self, studios: List[Dict[str, str]], output_file: str):
        """Process list of studios and save results."""
        results = []
        
        for idx, studio in enumerate(studios, 1):
            logging.info(f"\nProcessing studio {idx}/{len(studios)}")
            result = self.scrape_website(studio['studio_name'], studio['website'])
            results.append(result)
            
            if len(results) % 5 == 0:
                self.save_results(results, output_file)
                logging.info(f"Progress saved after {len(results)} studios")
        
        self.save_results(results, output_file)
        logging.info(f"\nProcessing complete. Processed {len(results)} studios.")

    
    def save_results(self, results: List[Dict], output_file: str):
        """Enhanced save_results method with equipment grading."""
        # Save main results
        main_df = pd.DataFrame([
            {k: v for k, v in r.items() if k != 'Pages'}
            for r in results
        ])
        
        # Reorder columns to include Equipment_Grade after Equipment_URLs
        columns = ['Studio Name', 'Website', 'Equipment_URLs', 'Equipment_Grade', 'Status', 'Error'] + \
                 [col for col in main_df.columns if col not in [
                     'Studio Name', 'Website', 'Equipment_URLs', 'Equipment_Grade', 'Status', 'Error'
                 ]]
        main_df = main_df[columns]
        
        main_df.to_csv(output_file, index=False)
        
        # Save detailed page-level results
        detailed_results = []
        for result in results:
            for page in result.get('Pages', []):
                detailed_results.append({
                    'Studio Name': result['Studio Name'],
                    'Page URL': page['url'],
                    'Overall Score': page['url_rating']['score'],
                    'Equipment Match Score': page['url_rating']['component_scores']['equipment_match'],
                    'Category Match Score': page['url_rating']['component_scores']['category_match'],
                    'URL Structure Score': page['url_rating']['component_scores']['url_structure'],
                    'Page Structure Score': page['url_rating']['component_scores']['page_structure'],
                    'Context Quality Score': page['url_rating']['component_scores']['context_quality'],
                    'Content Types': str(page['content_types']),
                    'Matches': str(page['matches'])
                })
                
        if detailed_results:
            detailed_file = output_file.replace('.csv', '_detailed.csv')
            pd.DataFrame(detailed_results).to_csv(detailed_file, index=False)
            logging.info(f"Results saved to {output_file} and detailed results to {detailed_file}")
        else:
            logging.info(f"Results saved to {output_file}")
    

    @classmethod
    def from_csv(cls, csv_file: str) -> 'StudioScraper':
        """Create a StudioScraper instance and load studios from CSV."""
        logging.info(f"\nLoading studios from {csv_file}")
        scraper = cls()
        try:
            df = pd.read_csv(csv_file)
            
            required_columns = {'studio_name', 'website'}
            if not all(col in df.columns for col in required_columns):
                raise ValueError(f"CSV must contain columns: {required_columns}")
            
            df['website'] = df['website'].apply(lambda x: x.strip() if isinstance(x, str) else x)
            df = df.dropna(subset=['website'])
            
            scraper.studios = df.to_dict('records')
            logging.info(f"Successfully loaded {len(scraper.studios)} studios")
            
            return scraper
            
        except Exception as e:
            logging.error(f"Error loading CSV file {csv_file}: {str(e)}")
            raise

    def run(self, output_file: str = 'data/studio_equip_lists/studio_equipment_results_ii.csv'):
        """Run the scraper on all loaded studios."""
        if not hasattr(self, 'studios'):
            raise ValueError("No studios loaded. Use from_csv() to load studios first.")
            
        self.process_studio_list(self.studios, output_file)

In [17]:
# Example usage: load the studio list from CSV, scrape every site and write
# the results; any failure is logged instead of propagating.
if __name__ == "__main__":
    input_csv = 'data/studio_websites_ii.csv'
    results_csv = 'data/studio_equip_lists/equipment_results_ii.csv'

    try:
        # Build the scraper from the CSV, then process all studios.
        scraper = StudioScraper.from_csv(input_csv)
        scraper.run(results_csv)
    except Exception as e:
        logging.error(f"Script execution failed: {str(e)}")

2025-02-03 12:09:34,125 - INFO - 
Loading studios from studio_websites_ii.csv
2025-02-03 12:09:34,163 - INFO - Successfully loaded 51 studios
2025-02-03 12:09:34,177 - INFO - 
Processing studio 1/51
2025-02-03 12:09:34,177 - INFO - 
Processing studio: Audio Sorcery
2025-02-03 12:09:34,178 - INFO - Website: https://audiosorcery.com/
ERROR: Could not find file /var/folders/zf/w4lxv8m15z9442k97fpp97ch0000gn/T/ipykernel_51672/755900924.py
2025-02-03 12:09:37,477 - ERROR - Error scraping https://audiosorcery.com/: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
2025-02-03 12:09:37,480 - ERROR - Error processing Audio Sorcery: Failed to scan initial page
2025-02-03 12:09:37,480 - INFO - 
Processing studio 2/51
2025-02-03 12:09:37,481 - INFO - 
Processing studio: Audio-Vision
2025-02-03 12:09:37,482 - INFO - Website: https://audiovisionstudios.com/
ERROR: Could not find file /var/folders/zf/w4lxv8m15z9442k97fpp97ch0000gn/T/ipykernel_51672/755900924

ERROR: Could not find file /var/folders/zf/w4lxv8m15z9442k97fpp97ch0000gn/T/ipykernel_51672/755900924.py
2025-02-03 12:10:22,997 - ERROR - Error scraping https://www.mission-control-studios.com/: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
2025-02-03 12:10:23,001 - ERROR - Error processing Mission Control Studios: Failed to scan initial page
2025-02-03 12:10:23,001 - INFO - 
Processing studio 15/51
2025-02-03 12:10:23,002 - INFO - 
Processing studio: Morplay Studios
2025-02-03 12:10:23,002 - INFO - Website: https://morplaystudios.com/
ERROR: Could not find file /var/folders/zf/w4lxv8m15z9442k97fpp97ch0000gn/T/ipykernel_51672/755900924.py
2025-02-03 12:10:26,631 - ERROR - Error scraping https://morplaystudios.com/: 520 Server Error:  for url: https://morplaystudios.com/
2025-02-03 12:10:26,635 - ERROR - Error processing Morplay Studios: Failed to scan initial page
2025-02-03 12:10:26,643 - INFO - Results saved to studio_equip_lists/equipm

2025-02-03 12:11:10,382 - INFO - Progress saved after 25 studios
2025-02-03 12:11:10,383 - INFO - 
Processing studio 26/51
2025-02-03 12:11:10,383 - INFO - 
Processing studio: Rak Studio 3
2025-02-03 12:11:10,384 - INFO - Website: https://rakstudios.co.uk/studios/studio-3
ERROR: Could not find file /var/folders/zf/w4lxv8m15z9442k97fpp97ch0000gn/T/ipykernel_51672/755900924.py
2025-02-03 12:11:13,591 - ERROR - Error scraping https://rakstudios.co.uk/studios/studio-3: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
2025-02-03 12:11:13,595 - ERROR - Error processing Rak Studio 3: Failed to scan initial page
2025-02-03 12:11:13,596 - INFO - 
Processing studio 27/51
2025-02-03 12:11:13,597 - INFO - 
Processing studio: Rak Studio 4
2025-02-03 12:11:13,599 - INFO - Website: https://rakstudios.co.uk/studios/studio-4
ERROR: Could not find file /var/folders/zf/w4lxv8m15z9442k97fpp97ch0000gn/T/ipykernel_51672/755900924.py
2025-02-03 12:11:16,786 - ERROR

ERROR: Could not find file /var/folders/zf/w4lxv8m15z9442k97fpp97ch0000gn/T/ipykernel_51672/755900924.py
2025-02-03 12:11:58,215 - ERROR - Error scraping https://www.sunsetsound.com/: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
2025-02-03 12:11:58,217 - ERROR - Error processing Sunset Sound: Failed to scan initial page
2025-02-03 12:11:58,218 - INFO - 
Processing studio 39/51
2025-02-03 12:11:58,219 - INFO - 
Processing studio: Tape To Tape
2025-02-03 12:11:58,219 - INFO - Website: https://tapelondonstudio.com/
ERROR: Could not find file /var/folders/zf/w4lxv8m15z9442k97fpp97ch0000gn/T/ipykernel_51672/755900924.py
2025-02-03 12:12:01,480 - ERROR - Error scraping https://tapelondonstudio.com/: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
2025-02-03 12:12:01,483 - ERROR - Error processing Tape To Tape: Failed to scan initial page
2025-02-03 12:12:01,484 - INFO - 
Processing studio 40/51
2025-

2025-02-03 12:13:06,099 - ERROR - Error scraping https://tweedrecording.com/: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
2025-02-03 12:13:06,102 - ERROR - Error processing Tweed Recording: Failed to scan initial page
2025-02-03 12:13:06,103 - INFO - 
Processing studio 47/51
2025-02-03 12:13:06,104 - INFO - 
Processing studio: Village (Recorder), The
2025-02-03 12:13:06,104 - INFO - Website: https://www.villagestudios.com/studios
ERROR: Could not find file /var/folders/zf/w4lxv8m15z9442k97fpp97ch0000gn/T/ipykernel_51672/755900924.py
2025-02-03 12:13:11,482 - INFO - Successfully scraped page: https://www.villagestudios.com/studios (Score: 28.0)
2025-02-03 12:13:11,679 - INFO - Found 19 relevant subpages
ERROR: Could not find file /var/folders/zf/w4lxv8m15z9442k97fpp97ch0000gn/T/ipykernel_51672/755900924.py
ERROR: Could not find file /var/folders/zf/w4lxv8m15z9442k97fpp97ch0000gn/T/ipykernel_51672/755900924.py
ERROR: Could not find file /v