In [2]:
import json
import os
import shutil
from datetime import datetime

def backup_corrupted_log(log_file="scrape_log.json"):
    """
    Create a backup of the corrupted log file
    """
    if os.path.exists(log_file):
        backup_name = f"{log_file}.corrupted_backup_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
        shutil.copy2(log_file, backup_name)
        print(f"Backed up corrupted log to: {backup_name}")
        return backup_name
    return None

def attempt_json_recovery(log_file="scrape_log.json"):
    """
    Attempt to recover as much data as possible from the corrupted JSON
    """
    print(f"Attempting to recover data from {log_file}...")
    
    try:
        with open(log_file, 'r') as f:
            content = f.read()
        
        print(f"File size: {len(content)} characters")
        
        # Try to find where the corruption starts
        # Look for the last complete entry
        lines = content.split('\n')
        print(f"Total lines: {len(lines)}")
        
        # Strategy 1: Try to find the last valid JSON structure
        # Look for the pattern that indicates complete entries
        recovered_data = {}
        
        # Find all complete key-value pairs
        import re
        
        # Pattern to match complete log entries
        pattern = r'"(\d+_\d+)"\s*:\s*\{[^}]*"error_msg":\s*(?:"[^"]*"|null)\s*\}'
        matches = re.findall(pattern, content, re.DOTALL)
        
        print(f"Found {len(matches)} potentially complete entries using regex")
        
        # Try a different approach: manually parse line by line
        print("\nTrying line-by-line recovery...")
        
        # Look for the opening brace
        brace_count = 0
        in_string = False
        escape_next = False
        last_complete_pos = 0
        
        for i, char in enumerate(content):
            if escape_next:
                escape_next = False
                continue
                
            if char == '\\' and in_string:
                escape_next = True
                continue
                
            if char == '"' and not escape_next:
                in_string = not in_string
                continue
                
            if not in_string:
                if char == '{':
                    brace_count += 1
                elif char == '}':
                    brace_count -= 1
                    if brace_count == 1:  # Back to main object level
                        last_complete_pos = i
        
        if last_complete_pos > 0:
            print(f"Found last complete position at character {last_complete_pos}")
            
            # Extract the valid portion and try to close it properly
            valid_portion = content[:last_complete_pos + 1]
            
            # Make sure it ends properly
            if not valid_portion.rstrip().endswith('}'):
                valid_portion += '\n}'
            
            # Try to parse this portion
            try:
                recovered_data = json.loads(valid_portion)
                print(f"✓ Successfully recovered {len(recovered_data)} entries")
                return recovered_data
            except json.JSONDecodeError as e:
                print(f"Still couldn't parse recovered portion: {e}")
        
        # Strategy 2: Try to extract individual entries manually
        print("\nTrying manual entry extraction...")
        
        # Look for individual complete entries
        entry_pattern = r'"(\d+_\d+)"\s*:\s*(\{(?:[^{}]|(?2))*\})'
        individual_matches = re.finditer(entry_pattern, content)
        
        recovered_entries = {}
        valid_entries = 0
        
        for match in individual_matches:
            key = match.group(1)
            entry_json = match.group(2)
            try:
                entry_data = json.loads(entry_json)
                recovered_entries[key] = entry_data
                valid_entries += 1
            except json.JSONDecodeError:
                continue
        
        if valid_entries > 0:
            print(f"✓ Manually recovered {valid_entries} individual entries")
            return recovered_entries
        
        print("❌ Could not recover any data from the corrupted file")
        return {}
        
    except Exception as e:
        print(f"Error during recovery attempt: {e}")
        return {}

def rebuild_log_from_csv(output_dir="scraped_data"):
    """
    Rebuild the log from existing CSV files
    """
    print("Rebuilding log from existing CSV files...")
    
    if not os.path.exists(output_dir):
        print("No scraped data directory found")
        return {}
    
    rebuilt_log = {}
    
    # Find all year directories
    year_dirs = [d for d in os.listdir(output_dir) if d.startswith('year_') and os.path.isdir(os.path.join(output_dir, d))]
    
    for year_dir in year_dirs:
        year_path = os.path.join(output_dir, year_dir)
        year = year_dir.replace('year_', '')
        
        # Check for combined file first
        combined_file = os.path.join(year_path, f"combined_video_details_{year}.csv")
        files_to_process = []
        
        if os.path.exists(combined_file):
            files_to_process.append(combined_file)
        else:
            # Get all batch files
            batch_files = [f for f in os.listdir(year_path) if f.startswith('batch_') and f.endswith('.csv')]
            files_to_process = [os.path.join(year_path, f) for f in batch_files]
        
        # Process each file
        for file_path in files_to_process:
            try:
                import pandas as pd
                df = pd.read_csv(file_path)
                
                if 'def_id' in df.columns and 'gi' in df.columns:
                    # Group by player_id and game_id to count records
                    grouped = df.groupby(['def_id', 'gi']).agg({
                        'team_id': 'first',
                        'player_name': 'first',
                        'year': 'first'
                    }).reset_index()
                    
                    for _, row in grouped.iterrows():
                        player_id = str(row['def_id'])
                        game_id = str(row['gi'])
                        key = f"{player_id}_{game_id}"
                        
                        # Count records for this combination
                        record_count = len(df[(df['def_id'] == row['def_id']) & (df['gi'] == row['gi'])])
                        
                        rebuilt_log[key] = {
                            "player_id": player_id,
                            "game_id": game_id,
                            "team_id": str(row['team_id']),
                            "player_name": row['player_name'],
                            "year": int(row['year']),
                            "timestamp": "rebuilt_from_csv",
                            "success": True,
                            "record_count": record_count,
                            "has_data": True,
                            "error_msg": None
                        }
                
            except Exception as e:
                print(f"Error processing {file_path}: {e}")
    
    print(f"Rebuilt log with {len(rebuilt_log)} entries from CSV files")
    return rebuilt_log

def fix_corrupted_log(log_file="scrape_log.json"):
    """
    Main function to fix the corrupted log file
    """
    print("=== FIXING CORRUPTED SCRAPE LOG ===")
    
    # Step 1: Backup the corrupted file
    backup_path = backup_corrupted_log(log_file)
    
    # Step 2: Try to recover data from the corrupted file
    recovered_data = attempt_json_recovery(log_file)
    
    # Step 3: Rebuild from CSV files
    csv_data = rebuild_log_from_csv()
    
    # Step 4: Merge recovered data with CSV data (CSV data takes precedence for conflicts)
    final_log = {}
    
    # Start with recovered data
    final_log.update(recovered_data)
    print(f"Started with {len(recovered_data)} recovered entries")
    
    # Add CSV data (will overwrite any conflicts)
    added_from_csv = 0
    for key, value in csv_data.items():
        if key not in final_log:
            added_from_csv += 1
        final_log[key] = value
    
    print(f"Added {added_from_csv} new entries from CSV files")
    print(f"Final log contains {len(final_log)} total entries")
    
    # Step 5: Save the fixed log
    try:
        with open(log_file, 'w') as f:
            json.dump(final_log, f, indent=2)
        print(f"✓ Successfully saved fixed log to {log_file}")
        
        # Verify the fixed file
        with open(log_file, 'r') as f:
            verification = json.load(f)
        print(f"✓ Verification successful: {len(verification)} entries loaded")
        
        return True
        
    except Exception as e:
        print(f"❌ Error saving fixed log: {e}")
        return False

if __name__ == "__main__":
    success = fix_corrupted_log()
    if success:
        print("\n🎉 Log recovery completed successfully!")
        print("You can now restart your scraping process.")
    else:
        print("\n❌ Log recovery failed. You may need to start with a fresh log.")
        print("Consider renaming the corrupted file and starting fresh.")

=== FIXING CORRUPTED SCRAPE LOG ===
Backed up corrupted log to: scrape_log.json.corrupted_backup_20250619_091454
Attempting to recover data from scrape_log.json...
File size: 2 characters
Total lines: 1
Found 0 potentially complete entries using regex

Trying line-by-line recovery...

Trying manual entry extraction...
Error during recovery attempt: unknown extension ?2 at position 31
Rebuilding log from existing CSV files...
No scraped data directory found
Started with 0 recovered entries
Added 0 new entries from CSV files
Final log contains 0 total entries
✓ Successfully saved fixed log to scrape_log.json
✓ Verification successful: 0 entries loaded

🎉 Log recovery completed successfully!
You can now restart your scraping process.
