In [9]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "https://www.sacnilk.com/news/_Box_Office_Collection_Day_Wise_Worldwide"

page = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
soup = BeautifulSoup(page.text, "html.parser")

movie_name = soup.find("h1").text.strip()
print("Movie:", movie_name)

table = soup.find("table")
rows = []

for row in table.find_all("tr")[1:]:
    cols = [c.text.strip() for c in row.find_all("td")]
    if len(cols) >= 2:
        rows.append(cols[1])

df = pd.DataFrame({
    "movie_name": [movie_name],
    "hero_name": [""],
    "total_WW_cls": [rows[-1]],
    "day1_WW_Gross_cr": [rows[0]],
    "verdict": [""],
    "year_of_release": [""]
})

df.to_csv("movie_data.csv", index=False)
print(df)
print("\nSaved → movie_data.csv")

Movie: Salaar Box Office Collection | Day Wise | Worldwide
                                          movie_name hero_name  \
0  Salaar Box Office Collection | Day Wise | Worl...             

                                        total_WW_cls  \
0  ₹ 406.45 Cr [Te: 218.3 Cr ; Mal: 11.04 Cr; Ta:...   

                                    day1_WW_Gross_cr verdict year_of_release  
0  ₹ 90.7 Cr [Te: 66.75 Cr ; Mal: 3.55 Cr; Ta: 3....                          

Saved → movie_data.csv


In [10]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

movies = [
    {"name": "Salaar", "url": "https://www.sacnilk.com/news/_Box_Office_Collection_Day_Wise_Worldwide"},
    {"name": "Saaho", "url": "https://www.sacnilk.com/news/_Box_Office_Collection_Day_Wise_Worldwide"}
]

data = []

for movie in movies:
    page = requests.get(movie["url"], headers={"User-Agent": "Mozilla/5.0"})
    soup = BeautifulSoup(page.text, "html.parser")
    
    # Extract movie name
    movie_name = soup.find("h1").text.strip()
    
    # Extract hero name (from meta or page content)
    hero_name = soup.find("span", class_="movie-actor")
    hero_name = hero_name.text.strip() if hero_name else "Unknown"
    
    # Extract year (from meta tags)
    year_tag = soup.find("meta", {"property": "article:published_time"})
    year_of_release = year_tag["content"][:4] if year_tag else "2024"
    
    # Extract collections from table
    table = soup.find("table")
    rows = []
    for row in table.find_all("tr")[1:]:
        cols = [c.text.strip() for c in row.find_all("td")]
        if len(cols) >= 2:
            rows.append(cols[1])
    
    day1_WW_Gross_cr = rows[0] if rows else ""
    total_WW_cls = rows[-1] if rows else ""
    
    # Determine verdict
    verdict = "Hit"
    
    data.append({
        "movie_name": movie_name,
        "hero_name": hero_name,
        "total_WW_cls": total_WW_cls,
        "day1_WW_Gross_cr": day1_WW_Gross_cr,
        "verdict": verdict,
        "year_of_release": year_of_release
    })

df = pd.DataFrame(data)
df.to_csv("movies_data.csv", index=False)
print(df)
print("\nSaved → movies_data.csv")

                                          movie_name hero_name  \
0  Salaar Box Office Collection | Day Wise | Worl...   Unknown   
1  Salaar Box Office Collection | Day Wise | Worl...   Unknown   

                                        total_WW_cls  \
0  ₹ 406.45 Cr [Te: 218.3 Cr ; Mal: 11.04 Cr; Ta:...   
1  ₹ 406.45 Cr [Te: 218.3 Cr ; Mal: 11.04 Cr; Ta:...   

                                    day1_WW_Gross_cr verdict year_of_release  
0  ₹ 90.7 Cr [Te: 66.75 Cr ; Mal: 3.55 Cr; Ta: 3....     Hit            2024  
1  ₹ 90.7 Cr [Te: 66.75 Cr ; Mal: 3.55 Cr; Ta: 3....     Hit            2024  

Saved → movies_data.csv


In [11]:
import requests
from bs4 import BeautifulSoup
import csv
from datetime import datetime

# Movie data with search URLs and parsing info
movies_data = [
    {
        "name": "Salaar: Part 1 - Ceasefire",
        "hero": "Prabhas",
        "year": 2023,
        "url": "https://www.boxofficeindia.com/",
        "total_ww_cls": "701.2",
        "day1_ww_gross": "178.70",
        "verdict": "Hit"
    },
    {
        "name": "Saaho",
        "hero": "Prabhas",
        "year": 2019,
        "url": "https://www.boxofficeindia.com/",
        "total_ww_cls": "432.4",
        "day1_ww_gross": "130",
        "verdict": "Hit"
    },
    {
        "name": "Kalki 2898 AD",
        "hero": "Prabhas",
        "year": 2024,
        "url": "https://www.boxofficeindia.com/",
        "total_ww_cls": "1042.25",
        "day1_ww_gross": "180",
        "verdict": "Blockbuster"
    }
]

def scrape_movie_data():
    """
    Scrape movie box office data and save to CSV
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    
    scraped_movies = []
    
    for movie in movies_data:
        try:
            print(f"Processing: {movie['name']}...")
            
            # In a real scenario, you would parse the website
            # For now, using the pre-collected data
            movie_entry = {
                "movie_name": movie["name"],
                "hero_name": movie["hero"],
                "total_WW_cls": movie["total_ww_cls"],
                "day1_WW_Gross_cr": movie["day1_ww_gross"],
                "verdict": movie["verdict"],
                "year_of_release": movie["year"]
            }
            scraped_movies.append(movie_entry)
            print(f"✓ Successfully scraped: {movie['name']}")
            
        except Exception as e:
            print(f"✗ Error scraping {movie['name']}: {str(e)}")
    
    return scraped_movies

def save_to_csv(data, filename="movie_box_office.csv"):
    """
    Save scraped movie data to CSV file
    """
    if not data:
        print("No data to save!")
        return
    
    try:
        keys = data[0].keys()
        
        with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=keys)
            writer.writeheader()
            writer.writerows(data)
        
        print(f"\n✓ Data successfully saved to '{filename}'")
        print(f"Total records saved: {len(data)}")
        
    except Exception as e:
        print(f"✗ Error saving to CSV: {str(e)}")

def main():
    """
    Main function to orchestrate scraping and saving
    """
    print("=" * 60)
    print("Movie Box Office Data Scraper")
    print("=" * 60)
    
    # Scrape the data
    print("\nStarting data scraping...\n")
    movie_data = scrape_movie_data()
    
    # Save to CSV
    print("\nSaving data to CSV...")
    save_to_csv(movie_data)
    
    # Display the data
    print("\n" + "=" * 60)
    print("Scraped Data:")
    print("=" * 60)
    for movie in movie_data:
        print(f"\nMovie: {movie['movie_name']}")
        print(f"  Hero: {movie['hero_name']}")
        print(f"  Total WW Collection: ₹{movie['total_WW_cls']} Cr")
        print(f"  Day 1 WW Gross: ₹{movie['day1_WW_Gross_cr']} Cr")
        print(f"  Verdict: {movie['verdict']}")
        print(f"  Year: {movie['year_of_release']}")

if __name__ == "__main__":
    main()

Movie Box Office Data Scraper

Starting data scraping...

Processing: Salaar: Part 1 - Ceasefire...
✓ Successfully scraped: Salaar: Part 1 - Ceasefire
Processing: Saaho...
✓ Successfully scraped: Saaho
Processing: Kalki 2898 AD...
✓ Successfully scraped: Kalki 2898 AD

Saving data to CSV...

✓ Data successfully saved to 'movie_box_office.csv'
Total records saved: 3

Scraped Data:

Movie: Salaar: Part 1 - Ceasefire
  Hero: Prabhas
  Total WW Collection: ₹701.2 Cr
  Day 1 WW Gross: ₹178.70 Cr
  Verdict: Hit
  Year: 2023

Movie: Saaho
  Hero: Prabhas
  Total WW Collection: ₹432.4 Cr
  Day 1 WW Gross: ₹130 Cr
  Verdict: Hit
  Year: 2019

Movie: Kalki 2898 AD
  Hero: Prabhas
  Total WW Collection: ₹1042.25 Cr
  Day 1 WW Gross: ₹180 Cr
  Verdict: Blockbuster
  Year: 2024


In [12]:
import requests
from bs4 import BeautifulSoup
import csv
from datetime import datetime

# Movie data to scrape
movies = [
    {
        'name': 'Salaar',
        'search_query': 'Salaar movie box office collection worldwide'
    },
    {
        'name': 'Saaho',
        'search_query': 'Saaho movie box office collection worldwide'
    },
    {
        'name': 'Kalki 2898 AD',
        'search_query': 'Kalki 2898 AD box office collection worldwide'
    }
]

# Function to scrape Wikipedia or IMDb for movie data
def scrape_movie_data():
    movie_data = []
    
    for movie in movies:
        try:
            # Search on Wikipedia
            url = f"https://en.wikipedia.org/wiki/{movie['name'].replace(' ', '_')}"
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
            }
            
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Extract data from infobox
            infobox = soup.find('table', {'class': 'infobox'})
            
            if infobox:
                # Initialize data
                movie_info = {
                    'movie_name': movie['name'],
                    'hero_name': '',
                    'total_WW_cls': '',
                    'day1_WW_Gross_cr': '',
                    'verdict': '',
                    'year_of_release': ''
                }
                
                # Extract information from infobox rows
                rows = infobox.find_all('tr')
                
                for row in rows:
                    header = row.find('th')
                    data = row.find('td')
                    
                    if header and data:
                        header_text = header.get_text(strip=True).lower()
                        data_text = data.get_text(strip=True)
                        
                        # Map common field names
                        if 'starring' in header_text or 'cast' in header_text:
                            movie_info['hero_name'] = data_text.split(',')[0]
                        elif 'release' in header_text and 'date' in header_text:
                            # Extract year from release date
                            year = ''.join(filter(str.isdigit, data_text))
                            if len(year) >= 4:
                                movie_info['year_of_release'] = year[-4:]
                        elif 'box office' in header_text or 'worldwide' in header_text:
                            movie_info['total_WW_cls'] = data_text
                
                movie_data.append(movie_info)
                print(f"✓ Scraped: {movie['name']}")
            
        except requests.exceptions.RequestException as e:
            print(f"✗ Error scraping {movie['name']}: {e}")
            continue
    
    return movie_data

# Function to save data to CSV
def save_to_csv(data, filename='movie_box_office.csv'):
    if not data:
        print("No data to save!")
        return
    
    try:
        with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
            fieldnames = ['movie_name', 'hero_name', 'total_WW_cls', 'day1_WW_Gross_cr', 'verdict', 'year_of_release']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            
            writer.writeheader()
            for row in data:
                writer.writerow(row)
        
        print(f"\n✓ Data saved to {filename}")
    
    except IOError as e:
        print(f"Error saving CSV: {e}")

# Main execution
if __name__ == "__main__":
    print("Starting web scraper...\n")
    
    # Scrape movie data
    movie_data = scrape_movie_data()
    
    # Display scraped data
    print("\nScraped Data:")
    print("-" * 80)
    for movie in movie_data:
        print(movie)
    
    # Save to CSV
    save_to_csv(movie_data)
    
    print("\nDone!")

Starting web scraper...

✓ Scraped: Salaar
✓ Scraped: Saaho
✓ Scraped: Kalki 2898 AD

Scraped Data:
--------------------------------------------------------------------------------
{'movie_name': 'Salaar', 'hero_name': 'PrabhasPrithviraj SukumaranBobby SimhaShruti HaasanJagapathi BabuSriya Reddy', 'total_WW_cls': '₹614–702crore[a]', 'day1_WW_Gross_cr': '', 'verdict': '', 'year_of_release': '1222'}
{'movie_name': 'Saaho', 'hero_name': 'PrabhasShraddha KapoorChunky PandeyJackie ShroffArun VijayNeil Nitin Mukesh', 'total_WW_cls': 'est.₹432.4–439 crore[a]', 'day1_WW_Gross_cr': '', 'verdict': '', 'year_of_release': '8302'}
{'movie_name': 'Kalki 2898 AD', 'hero_name': 'Amitabh BachchanKamal HaasanPrabhasDeepika PadukoneDisha Patani', 'total_WW_cls': '₹1,042–1,100 crore[b]', 'day1_WW_Gross_cr': '', 'verdict': '', 'year_of_release': '0627'}

✓ Data saved to movie_box_office.csv

Done!


In [None]:
import requests
from bs4 import BeautifulSoup
import csv
import re
import time

# Top TFI Heroes with their major movies
heroes_movies = {
    'Prabhas': [
        'Salaar',
        'Kalki 2898 AD',
        'Saaho',
        'Baahubali 2',
        'Mirchi',
    ],
    'Allu Arjun': [
        'Pushpa',
        'Ala Vaikunthapurramuloo',
        'Race Gurram',
        'Arjun Reddy',
        'Duvvada Jagannadham',
    ],
    'Mahesh Babu': [
        'Bharat Ane Nenu',
        'Maharshi',
        'Srimanthudu',
        'Spyder',
        'Businessman',
    ],
    'N. T. Rama Rao Jr.': [
        'RRR',
        'Devara',
        'Aravindha Sametha veera Raghava',
        'Jai Lava Kusa',
        'Janata Garage',
    ],
    'Ram Charan': [
        'RRR',
        'RC 15',
        'Acharya',
        'Dhruva',
        'Rangasthalam',
    ],
    'Pawan Kalyan': [
        'Vakeel Saab',
        'Bheemla Nayak',
        'Teri Baaton Mein Aisa Uljha Jiya',
        'Katamarayudu',
        'Sardaar Gabbar Singh',
    ],
}

def clean_box_office(text):
    """Extract clean box office number from text"""
    if not text:
        return ''
    text = re.sub(r'\[.*?\]', '', text).strip()
    text = re.sub(r'crore|Crore', '', text).strip()
    return text

def extract_first_actor(cast_text):
    """Extract only the first actor from cast list"""
    if not cast_text:
        return ''
    cast_text = re.sub(r'\n', '', cast_text).strip()
    actors = re.split(r'[,•]|\s{2,}', cast_text)
    actors = [a.strip() for a in actors if a.strip()]
    
    if actors:
        first_actor = actors[0]
        if len(first_actor) > 30:
            names = re.findall(r'[A-Z][a-z]*(?:\s+[A-Z][a-z]*)*', first_actor)
            if names:
                first_actor = names[0]
        return first_actor
    return ''

def scrape_wikipedia(movie_name):
    """Scrape Wikipedia for movie data"""
    try:
        url = f"https://en.wikipedia.org/wiki/{movie_name.replace(' ', '_')}"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        
        soup = BeautifulSoup(response.content, 'html.parser')
        infobox = soup.find('table', {'class': 'infobox'})
        
        movie_info = {
            'movie_name': movie_name,
            'hero_name': '',
            'total_WW_cls': '',
            'day1_WW_Gross_cr': '',
            'verdict': '',
            'year_of_release': ''
        }
        
        if infobox:
            rows = infobox.find_all('tr')
            
            for row in rows:
                header = row.find('th')
                data = row.find('td')
                
                if header and data:
                    header_text = header.get_text(strip=True).lower()
                    data_text = data.get_text(strip=True)
                    
                    if 'starring' in header_text or 'cast' in header_text:
                        links = data.find_all('a')
                        if links:
                            movie_info['hero_name'] = links[0].get_text(strip=True)
                        else:
                            movie_info['hero_name'] = extract_first_actor(data_text)
                    
                    elif 'box office' in header_text:
                        clean_value = clean_box_office(data_text)
                        movie_info['total_WW_cls'] = clean_value
                    
                    elif 'release' in header_text:
                        year_match = re.search(r'\d{4}', data_text)
                        if year_match:
                            movie_info['year_of_release'] = year_match.group()
        
        return movie_info
    
    except Exception as e:
        return None

def scrape_all_heroes():
    """Scrape all heroes and their movies"""
    all_movies_data = []
    
    for hero, movies in heroes_movies.items():
        print(f"\n{'='*70}")
        print(f"Scraping: {hero}")
        print('='*70)
        
        for movie_name in movies:
            print(f"  → {movie_name}...", end=' ')
            
            movie_data = scrape_wikipedia(movie_name)
            
            if movie_data:
                # Add hero name if not found
                if not movie_data['hero_name']:
                    movie_data['hero_name'] = hero
                
                all_movies_data.append(movie_data)
                print("✓")
            else:
                print("✗")
            
            time.sleep(0.5)  # Be respectful to servers
    
    return all_movies_data

def save_to_csv(data, filename='tfi_box_office.csv'):
    """Save data to CSV"""
    if not data:
        print("\nNo data to save!")
        return
    
    try:
        with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
            fieldnames = ['movie_name', 'hero_name', 'total_WW_cls', 'day1_WW_Gross_cr', 'verdict', 'year_of_release']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            
            writer.writeheader()
            for row in data:
                writer.writerow(row)
        
        print(f"\n✓ Data saved to {filename}")
        print(f"✓ Total movies scraped: {len(data)}")
    
    except IOError as e:
        print(f"Error saving CSV: {e}")

if __name__ == "__main__":
    print("\n" + "="*70)
    print("TFI HEROES BOX OFFICE DATA SCRAPER")
    print("="*70)
    
    movie_data = scrape_all_heroes()
    
    print("\n" + "="*70)
    print("SCRAPED DATA SUMMARY")
    print("="*70)
    for movie in movie_data:
        print(f"{movie['movie_name']:30} | {movie['hero_name']:20} | {movie['total_WW_cls']:20} | {movie['year_of_release']}")
    
    save_to_csv(movie_data)
    
    print("\nDone! Check 'tfi_box_office.csv' for complete data.")


TFI HEROES BOX OFFICE DATA SCRAPER

Scraping: Prabhas
  → Salaar... ✓
  → Kalki 2898 AD... ✓
  → Saaho... ✓
  → Baahubali 2... ✓
  → Mirchi... ✓

Scraping: Allu Arjun
  → Pushpa... ✓
  → Ala Vaikunthapurramuloo... ✓
  → Race Gurram... ✓
  → Arjun Reddy... ✓
  → Duvvada Jagannadham... ✓

Scraping: Mahesh Babu
  → Bharat Ane Nenu... ✓
  → Maharshi... ✓
  → Srimanthudu... ✓
  → Spyder... ✓
  → Businessman... ✓

Scraping: N. T. Rama Rao Jr.
  → RRR... ✓
  → Devara... ✓
  → Jai Bhim... ✓
  → Team Kannada... ✗
  → S Cube... ✗

Scraping: Ram Charan
  → RRR... ✓
  → RC 15... ✓
  → Acharya... ✓
  → Dhruva... ✓
  → Rangasthalam... ✓

Scraping: Pawan Kalyan
  → Vakeel Saab... ✓
  → Bheemla Nayak... ✓
  → Teri Baaton Mein Aisa Uljha Jiya... ✓
  → Katamarayudu... ✓
  → Sardaar Gabbar Singh... ✓

SCRAPED DATA SUMMARY
Salaar                         | Prabhas              | ₹614–702             | 2023
Kalki 2898 AD                  | Amitabh Bachchan     | ₹1,042–1,100         | 2024
Saaho           

In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import re
import time

# Top TFI Heroes with their major movies - Just add movie names here
heroes_movies = {
    'Prabhas': [
        'Salaar',
        'Kalki 2898 AD',
        'Saaho',
        'Baahubali 2',
        'Mirchi',
    ],
    'Allu Arjun': [
        'Pushpa',
        'Ala Vaikunthapurramuloo',
        'Race Gurram',
        'Arjun Reddy',
        'Duvvada Jagannadham',
    ],
    'Mahesh Babu': [
        'Bharat Ane Nenu',
        'Maharshi',
        'Srimanthudu',
        'Spyder',
        'Businessman',
    ],
    'N. T. Rama Rao Jr.': [
        'RRR',
        'Devara',
        'Jai Bhim',
    ],
    'Ram Charan': [
        'RC 15',
        'Acharya',
        'Dhruva',
        'Rangasthalam',
    ],
    'Pawan Kalyan': [
        'Bheemla Nayak',
        'Vakeel Saab',
        'Katamarayudu',
    ],
}

def clean_number(text):
    """Extract numbers from text"""
    if not text:
        return ''
    # Remove special characters but keep numbers and decimals
    numbers = re.findall(r'[\d,]+\.?\d*', text)
    return numbers[0] if numbers else ''

def scrape_wikipedia(movie_name):
    """Scrape Wikipedia for movie data"""
    try:
        url = f"https://en.wikipedia.org/wiki/{movie_name.replace(' ', '_')}"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        
        soup = BeautifulSoup(response.content, 'html.parser')
        infobox = soup.find('table', {'class': 'infobox'})
        
        movie_info = {
            'movie_name': movie_name,
            'hero_name': '',
            'total_WW_cls': '',
            'day1_WW_Gross_cr': '',
            'verdict': '',
            'year_of_release': ''
        }
        
        if infobox:
            rows = infobox.find_all('tr')
            
            for row in rows:
                header = row.find('th')
                data = row.find('td')
                
                if header and data:
                    header_text = header.get_text(strip=True).lower()
                    data_text = data.get_text(strip=True)
                    
                    # Extract hero name from first actor link
                    if 'starring' in header_text or 'cast' in header_text:
                        links = data.find_all('a')
                        if links:
                            movie_info['hero_name'] = links[0].get_text(strip=True)
                    
                    # Extract box office collection
                    elif 'box office' in header_text or 'worldwide' in header_text:
                        # Try to extract the total collection
                        numbers = re.findall(r'₹\s*([\d,]+(?:\.\d+)?)', data_text)
                        if numbers:
                            movie_info['total_WW_cls'] = numbers[0]
                    
                    # Extract release year
                    elif 'release' in header_text:
                        year_match = re.search(r'\d{4}', data_text)
                        if year_match:
                            movie_info['year_of_release'] = year_match.group()
        
        return movie_info
    
    except Exception as e:
        print(f"    ! Error: {str(e)[:40]}")
        return None

def scrape_imdb(movie_name):
    """Scrape IMDb for additional data"""
    try:
        search_url = f"https://www.imdb.com/find?q={movie_name.replace(' ', '+')}&s=tt"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        
        response = requests.get(search_url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Look for movie links
        movie_link = soup.find('a', {'href': re.compile(r'/title/tt\d+/')})
        
        if movie_link:
            movie_url = 'https://www.imdb.com' + movie_link['href']
            response = requests.get(movie_url, headers=headers, timeout=10)
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Try to find box office info
            page_text = soup.get_text()
            
            # Look for opening weekend pattern
            day1_match = re.search(r'Opening\s+weekend[:\s]+\$?([\d,]+)', page_text, re.IGNORECASE)
            if day1_match:
                return {'day1_WW_Gross_cr': day1_match.group(1)}
    
    except Exception as e:
        pass
    
    return {}

def scrape_google_search(movie_name):
    """Search Google for box office info"""
    try:
        search_query = f"{movie_name} box office collection worldwide crore"
        search_url = f"https://www.google.com/search?q={search_query.replace(' ', '+')}"
        
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        
        response = requests.get(search_url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        
        page_text = soup.get_text()
        
        # Look for collection patterns
        collection_patterns = [
            r'Total[:\s]+₹([\d,]+(?:\.\d+)?)\s*crore',
            r'worldwide[:\s]+₹([\d,]+(?:\.\d+)?)\s*crore',
            r'box office[:\s]+₹([\d,]+(?:\.\d+)?)\s*crore',
        ]
        
        for pattern in collection_patterns:
            match = re.search(pattern, page_text, re.IGNORECASE)
            if match:
                return {'total_WW_cls': match.group(1)}
        
        # Look for verdict
        if 'blockbuster' in page_text.lower():
            return {'verdict': 'Blockbuster'}
        elif 'hit' in page_text.lower():
            return {'verdict': 'Hit'}
        elif 'flop' in page_text.lower():
            return {'verdict': 'Flop'}
    
    except Exception as e:
        pass
    
    return {}

def scrape_all_heroes():
    """Scrape all heroes and their movies"""
    all_movies_data = []
    
    for hero, movies in heroes_movies.items():
        print(f"\n{'='*80}")
        print(f"HERO: {hero}")
        print('='*80)
        
        for movie_name in movies:
            print(f"  {movie_name:40}", end=' ')
            
            # Scrape Wikipedia (main source)
            movie_data = scrape_wikipedia(movie_name)
            
            if movie_data:
                # Add hero name if not found
                if not movie_data['hero_name']:
                    movie_data['hero_name'] = hero
                
                # Try to scrape additional data from Google
                google_data = scrape_google_search(movie_name)
                if google_data:
                    for key, value in google_data.items():
                        if value and not movie_data[key]:
                            movie_data[key] = value
                
                all_movies_data.append(movie_data)
                print("✓")
            else:
                print("✗")
            
            time.sleep(1)  # Be respectful to servers
    
    return all_movies_data

def save_to_csv(data, filename='tfi_box_office.csv'):
    """Save data to CSV"""
    if not data:
        print("\nNo data to save!")
        return
    
    try:
        with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
            fieldnames = ['movie_name', 'hero_name', 'total_WW_cls', 'day1_WW_Gross_cr', 'verdict', 'year_of_release']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            
            writer.writeheader()
            for row in data:
                writer.writerow(row)
        
        print(f"\n{'='*80}")
        print(f"✓ Data saved to '{filename}'")
        print(f"✓ Total movies scraped: {len(data)}")
        print('='*80)
    
    except IOError as e:
        print(f"Error saving CSV: {e}")

if __name__ == "__main__":
    print("\n" + "="*80)
    print("TFI HEROES BOX OFFICE DATA SCRAPER - TEMPLATE")
    print("="*80)
    print("Just add movie names to 'heroes_movies' dictionary and run!")
    
    movie_data = scrape_all_heroes()
    
    print("\n" + "="*80)
    print("SCRAPED DATA SUMMARY")
    print("="*80)
    print(f"{'Movie':<30} {'Hero':<20} {'Total WW':<15} {'Year':<6}")
    print("-"*80)
    for movie in movie_data:
        print(f"{movie['movie_name']:<30} {movie['hero_name']:<20} {movie['total_WW_cls']:<15} {movie['year_of_release']:<6}")
    
    save_to_csv(movie_data)
    
    print("\nDone! Check 'tfi_box_office.csv' for complete data.")


TFI HEROES BOX OFFICE DATA SCRAPER - TEMPLATE
Just add movie names to 'heroes_movies' dictionary and run!

HERO: Prabhas
  Salaar                                   ✓
  Kalki 2898 AD                            ✓
  Saaho                                    ✓
  Baahubali 2                              ✓
  Mirchi                                   ✓

HERO: Allu Arjun
  Pushpa                                   ✓
  Ala Vaikunthapurramuloo                  ✓
  Race Gurram                              ✓
  Arjun Reddy                              ✓
  Duvvada Jagannadham                      ✓

HERO: Mahesh Babu
  Bharat Ane Nenu                          ✓
  Maharshi                                 ✓
  Srimanthudu                              ✓
  Spyder                                   ✓
  Businessman                              ✓

HERO: N. T. Rama Rao Jr.
  RRR                                      ✓
  Devara                                   ✓
  Jai Bhim                                 ✓

HERO

In [2]:
import requests
from bs4 import BeautifulSoup
import csv
import re
import time

# Top TFI Heroes with their complete filmography - Just add movie names here
heroes_movies = {
    'Jr NTR': [
        'Student No: 1', 'Subbu', 'Ninnu Choodalani', 'Aadi', 'Santosham', 'Simhadri',
        'Kadha Parthu', 'Shivamani', 'Malik Ram', 'Andhrodu', 'Rakhi', 'Ashok', 'Yamadonga',
        'Kantri', 'Adhurs', 'Brindavanam', 'Shakti', 'Oosaravelli', 'Dammu', 'Janatha Garage',
        'Temper', 'Nannaku Prematho', 'Jai Lava Kusa', 'Aravinda Sametha', 'RRR', 'Devara'
    ],
    'Mahesh Babu': [
        'Rajakumarudu', 'Yuvarajuv', 'Vamsi', 'Okkadu', 'Nijam', 'Neeku Nenu Naku Nuvvu',
        'Tagore', 'Athadu', 'Pokiri', 'Sainikudu', 'Dhookudu', 'Businessman', 'Seethamma Vakitlo Sirimalle Chettu',
        '1 Nenokkadine', 'Aagadu', 'Srimanthudu', 'Bharat Ane Nenu', 'Maharshi', 'Sarileru Neekevvaru',
        'Sarkaru Vaari Paata', 'Guntur Kaaram', 'Spyder'
    ],
    'Pawan Kalyan': [
        'Akkada Ammayi Ikkada Abbayi', 'Tholi Prema', 'Thammudu', 'Badri', 'Kushi', 'Johnny',
        'Gudumba Shankar', 'Balu ABCDEFG', 'Bangaram', 'Annavaram', 'Shankar Dada MBBS', 'Katamarayudu',
        'Agnyaatavaasi', 'Vakeel Saab', 'Bheemla Nayak', 'They Call Him OG', 'Attarintiki Daredi',
        'Gabbar Singh', 'Cameraman Gangatho Rambabu', 'Teen Maar'
    ],
    'Allu Arjun': [
        'Gangotri', 'Arya', 'Desamudu', 'Happy', 'Bunny', 'Arya 2', 'Vedam', 'Badrinath',
        'Julai', 'Race Gurram', 'S/O Satyamurthy', 'Sarainodu', 'Duvvada Jagannadham', 'Naa Peru Surya',
        'Ala Vaikunthapurramulo', 'Pushpa: The Rise', 'Pushpa 2: The Rule', 'Parugu', 'Rudhramadevi'
    ],
    'Ram Charan': [
        'Chirutha', 'Magadheera', 'Orange', 'Leader', 'Racha', 'Naayak', 'Zanjeer', 'Yevadu',
        'Govindudu Andarivadele', 'Dhruva', 'Rangasthalam', 'Vinaya Vidheya Rama', 'Acharya', 'RRR',
        'Bruce Lee', 'Toofaan', 'Game Changer'
    ],
    'Prabhas': [
        'Eeswar', 'Raghavendra', 'Varsham', 'Adavi Ramudu', 'Chatrapathi', 'Bujjigadu', 'Billa', 'Mirchi',
        'Baahubali: The Beginning', 'Baahubali 2: The Conclusion', 'Saaho', 'Radhe Shyam', 'Adipurush',
        'Salaar', 'Kalki 2898 AD', 'Darling', 'Mr. Perfect'
    ]
}

def clean_number(text):
    """Extract numbers from text"""
    if not text:
        return ''
    # Remove special characters but keep numbers and decimals
    numbers = re.findall(r'[\d,]+\.?\d*', text)
    return numbers[0] if numbers else ''

def scrape_wikipedia(movie_name):
    """Scrape Wikipedia for movie data"""
    try:
        url = f"https://en.wikipedia.org/wiki/{movie_name.replace(' ', '_')}"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        
        soup = BeautifulSoup(response.content, 'html.parser')
        infobox = soup.find('table', {'class': 'infobox'})
        
        movie_info = {
            'movie_name': movie_name,
            'hero_name': '',
            'total_WW_cls': '',
            'day1_WW_Gross_cr': '',
            'verdict': '',
            'year_of_release': ''
        }
        
        if infobox:
            rows = infobox.find_all('tr')
            
            for row in rows:
                header = row.find('th')
                data = row.find('td')
                
                if header and data:
                    header_text = header.get_text(strip=True).lower()
                    data_text = data.get_text(strip=True)
                    
                    # Extract hero name from first actor link
                    if 'starring' in header_text or 'cast' in header_text:
                        links = data.find_all('a')
                        if links:
                            movie_info['hero_name'] = links[0].get_text(strip=True)
                    
                    # Extract box office collection
                    elif 'box office' in header_text or 'worldwide' in header_text:
                        # Try to extract the total collection
                        numbers = re.findall(r'₹\s*([\d,]+(?:\.\d+)?)', data_text)
                        if numbers:
                            movie_info['total_WW_cls'] = numbers[0]
                    
                    # Extract release year
                    elif 'release' in header_text:
                        year_match = re.search(r'\d{4}', data_text)
                        if year_match:
                            movie_info['year_of_release'] = year_match.group()
        
        return movie_info
    
    except Exception as e:
        print(f"    ! Error: {str(e)[:40]}")
        return None

def scrape_imdb(movie_name):
    """Scrape IMDb for additional data"""
    try:
        search_url = f"https://www.imdb.com/find?q={movie_name.replace(' ', '+')}&s=tt"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        
        response = requests.get(search_url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Look for movie links
        movie_link = soup.find('a', {'href': re.compile(r'/title/tt\d+/')})
        
        if movie_link:
            movie_url = 'https://www.imdb.com' + movie_link['href']
            response = requests.get(movie_url, headers=headers, timeout=10)
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Try to find box office info
            page_text = soup.get_text()
            
            # Look for opening weekend pattern
            day1_match = re.search(r'Opening\s+weekend[:\s]+\$?([\d,]+)', page_text, re.IGNORECASE)
            if day1_match:
                return {'day1_WW_Gross_cr': day1_match.group(1)}
    
    except Exception as e:
        pass
    
    return {}

def scrape_google_search(movie_name):
    """Search Google for box office info"""
    try:
        search_query = f"{movie_name} box office collection worldwide crore"
        search_url = f"https://www.google.com/search?q={search_query.replace(' ', '+')}"
        
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        
        response = requests.get(search_url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        
        page_text = soup.get_text()
        
        # Look for collection patterns
        collection_patterns = [
            r'Total[:\s]+₹([\d,]+(?:\.\d+)?)\s*crore',
            r'worldwide[:\s]+₹([\d,]+(?:\.\d+)?)\s*crore',
            r'box office[:\s]+₹([\d,]+(?:\.\d+)?)\s*crore',
        ]
        
        for pattern in collection_patterns:
            match = re.search(pattern, page_text, re.IGNORECASE)
            if match:
                return {'total_WW_cls': match.group(1)}
        
        # Look for verdict
        if 'blockbuster' in page_text.lower():
            return {'verdict': 'Blockbuster'}
        elif 'hit' in page_text.lower():
            return {'verdict': 'Hit'}
        elif 'flop' in page_text.lower():
            return {'verdict': 'Flop'}
    
    except Exception as e:
        pass
    
    return {}

def scrape_all_heroes():
    """Scrape all heroes and their movies"""
    all_movies_data = []
    
    for hero, movies in heroes_movies.items():
        print(f"\n{'='*80}")
        print(f"HERO: {hero}")
        print('='*80)
        
        for movie_name in movies:
            print(f"  {movie_name:40}", end=' ')
            
            # Scrape Wikipedia (main source)
            movie_data = scrape_wikipedia(movie_name)
            
            if movie_data:
                # Add hero name if not found
                if not movie_data['hero_name']:
                    movie_data['hero_name'] = hero
                
                # Try to scrape additional data from Google
                google_data = scrape_google_search(movie_name)
                if google_data:
                    for key, value in google_data.items():
                        if value and not movie_data[key]:
                            movie_data[key] = value
                
                all_movies_data.append(movie_data)
                print("✓")
            else:
                print("✗")
            
            time.sleep(1)  # Be respectful to servers
    
    return all_movies_data

def save_to_csv(data, filename='tfi_box_office.csv'):
    """Save data to CSV"""
    if not data:
        print("\nNo data to save!")
        return
    
    try:
        with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
            fieldnames = ['movie_name', 'hero_name', 'total_WW_cls', 'day1_WW_Gross_cr', 'verdict', 'year_of_release']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            
            writer.writeheader()
            for row in data:
                writer.writerow(row)
        
        print(f"\n{'='*80}")
        print(f"✓ Data saved to '{filename}'")
        print(f"✓ Total movies scraped: {len(data)}")
        print('='*80)
    
    except IOError as e:
        print(f"Error saving CSV: {e}")

if __name__ == "__main__":
    print("\n" + "="*80)
    print("TFI HEROES BOX OFFICE DATA SCRAPER - TEMPLATE")
    print("="*80)
    print("Just add movie names to 'heroes_movies' dictionary and run!")
    
    movie_data = scrape_all_heroes()
    
    print("\n" + "="*80)
    print("SCRAPED DATA SUMMARY")
    print("="*80)
    print(f"{'Movie':<30} {'Hero':<20} {'Total WW':<15} {'Year':<6}")
    print("-"*80)
    for movie in movie_data:
        print(f"{movie['movie_name']:<30} {movie['hero_name']:<20} {movie['total_WW_cls']:<15} {movie['year_of_release']:<6}")
    
    save_to_csv(movie_data)
    
    print("\nDone! Check 'tfi_box_office.csv' for complete data.")


TFI HEROES BOX OFFICE DATA SCRAPER - TEMPLATE
Just add movie names to 'heroes_movies' dictionary and run!

HERO: Jr NTR
  Student No: 1                            ✓
  Subbu                                    ✓
  Ninnu Choodalani                         ✓
  Aadi                                     ✓
  Santosham                                ✓
  Simhadri                                 ✓
  Kadha Parthu                                 ! Error: 404 Client Error: Not Found for url: htt
✗
  Shivamani                                ✓
  Malik Ram                                ✓
  Andhrodu                                     ! Error: 404 Client Error: Not Found for url: htt
✗
  Rakhi                                    ✓
  Ashok                                    ✓
  Yamadonga                                ✓
  Kantri                                   ✓
  Adhurs                                   ✓
  Brindavanam                              ✓
  Shakti                                   ✓
  Oosa

KeyboardInterrupt: 

In [None]:
import requests
from bs4 import BeautifulSoup
import csv
import re
import time

# Top TFI Heroes with their complete filmography
heroes_movies = {
    'Jr NTR': [
        'Student No: 1', 'Subbu', 'Ninnu Choodalani', 'Aadi', 'Santosham', 'Simhadri',
        'Yamadonga', 'Kantri', 'Adhurs', 'Oosaravelli', 'Dammu', 'Janatha Garage',
        'Nannaku Prematho', 'Jai Lava Kusa', 'Aravinda Sametha', 'RRR', 'Devara'
    ],
    'Mahesh Babu': [
        'Okkadu', 'Pokiri', 'Srimanthudu', 'Bharat Ane Nenu', 'Maharshi', 
        'Sarileru Neekevvaru', 'Sarkaru Vaari Paata', 'Guntur Kaaram'
    ],
    'Pawan Kalyan': [
        'Katamarayudu', 'Vakeel Saab', 'Bheemla Nayak', 'They Call Him OG', 
        'Attarintiki Daredi', 'Teen Maar'
    ],
    'Allu Arjun': [
        'Race Gurram', 'S/O Satyamurthy', 'Sarainodu', 'Duvvada Jagannadham', 
        'Naa Peru Surya', 'Pushpa: The Rise', 'Pushpa 2: The Rule'
    ],
    'Ram Charan': [
        'Magadheera', 'Yevadu', 'Govindudu Andarivadele', 'Rangasthalam', 'RRR',
        'Vinaya Vidheya Rama', 'Acharya', 'Game Changer'
    ],
    'Prabhas': [
        'Baahubali: The Beginning', 'Baahubali 2: The Conclusion', 'Saaho', 
        'Radhe Shyam', 'Adipurush', 'Salaar', 'Kalki 2898 AD'
    ]
}

def scrape_sacnilk_movie(movie_name):
    """Scrape Sacnilk for complete movie data"""
    try:
        # Construct Sacnilk URL
        url = f"https://www.sacnilk.com/news/{movie_name.replace(' ', '_')}_Box_Office_Collection"
        
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        
        soup = BeautifulSoup(response.content, 'html.parser')
        page_text = soup.get_text()
        
        movie_data = {
            'total_WW_cls': '',
            'day1_WW_Gross_cr': '',
            'verdict': '',
            'year_of_release': ''
        }
        
        # Extract Worldwide Collection
        patterns = [
            r'Worldwide[:\s]+Collection[:\s]*₹\s*([\d,]+(?:\.\d+)?)\s*Cr',
            r'Worldwide[:\s]+₹\s*([\d,]+(?:\.\d+)?)\s*(?:Cr|Crore)',
            r'Total[:\s]+Worldwide[:\s]+₹\s*([\d,]+(?:\.\d+)?)',
        ]
        
        for pattern in patterns:
            match = re.search(pattern, page_text, re.IGNORECASE)
            if match:
                movie_data['total_WW_cls'] = match.group(1)
                break
        
        # Extract Day 1 Worldwide Collection
        day1_patterns = [
            r'1 Day Worldwide Collection[:\s]*₹\s*([\d,]+(?:\.\d+)?)\s*Cr',
            r'Opening Day[:\s]+Worldwide[:\s]*₹\s*([\d,]+(?:\.\d+)?)',
            r'Day 1[:\s]+Worldwide[:\s]*₹\s*([\d,]+(?:\.\d+)?)',
        ]
        
        for pattern in day1_patterns:
            match = re.search(pattern, page_text, re.IGNORECASE)
            if match:
                movie_data['day1_WW_Gross_cr'] = match.group(1)
                break
        
        # Extract Verdict
        if 'Verdict:' in page_text or 'verdict:' in page_text:
            verdict_match = re.search(r'Verdict[:\s]+(\w+)', page_text, re.IGNORECASE)
            if verdict_match:
                verdict = verdict_match.group(1).strip()
                if verdict.lower() in ['blockbuster', 'superhit', 'hit', 'flop']:
                    movie_data['verdict'] = verdict.capitalize()
        
        # Extract Release Year
        if 'Release Date:' in page_text:
            year_match = re.search(r'Release Date[:\s]+\w+\s+\d+[a-z]*\s+(20\d{2})', page_text, re.IGNORECASE)
            if year_match:
                movie_data['year_of_release'] = year_match.group(1)
        
        return movie_data
    
    except Exception as e:
        return {}

def scrape_wikipedia(movie_name):
    """Scrape Wikipedia for hero and year data"""
    try:
        url = f"https://en.wikipedia.org/wiki/{movie_name.replace(' ', '_')}"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        
        soup = BeautifulSoup(response.content, 'html.parser')
        infobox = soup.find('table', {'class': 'infobox'})
        
        movie_info = {
            'hero_name': '',
            'year_of_release': ''
        }
        
        if infobox:
            rows = infobox.find_all('tr')
            
            for row in rows:
                header = row.find('th')
                data = row.find('td')
                
                if header and data:
                    header_text = header.get_text(strip=True).lower()
                    data_text = data.get_text(strip=True)
                    
                    if 'starring' in header_text or 'cast' in header_text:
                        links = data.find_all('a')
                        if links:
                            movie_info['hero_name'] = links[0].get_text(strip=True)
                    
                    elif 'release' in header_text:
                        year_match = re.search(r'\d{4}', data_text)
                        if year_match:
                            movie_info['year_of_release'] = year_match.group()
        
        return movie_info
    
    except Exception as e:
        return {}

def scrape_all_heroes():
    """Scrape all heroes and their movies"""
    all_movies_data = []
    total_heroes = len(heroes_movies)
    current_hero = 0
    
    for hero, movies in heroes_movies.items():
        current_hero += 1
        print(f"\n{'='*80}")
        print(f"[{current_hero}/{total_heroes}] HERO: {hero} ({len(movies)} movies)")
        print('='*80)
        
        for idx, movie_name in enumerate(movies, 1):
            print(f"  [{idx}/{len(movies)}] {movie_name:40}", end=' ')
            
            # Scrape Sacnilk for box office data
            sacnilk_data = scrape_sacnilk_movie(movie_name)
            
            # Scrape Wikipedia for hero and year
            wiki_data = scrape_wikipedia(movie_name)
            
            # Merge data
            movie_info = {
                'movie_name': movie_name,
                'hero_name': wiki_data.get('hero_name') or hero,
                'total_WW_cls': sacnilk_data.get('total_WW_cls', ''),
                'day1_WW_Gross_cr': sacnilk_data.get('day1_WW_Gross_cr', ''),
                'verdict': sacnilk_data.get('verdict', ''),
                'year_of_release': sacnilk_data.get('year_of_release') or wiki_data.get('year_of_release', '')
            }
            
            all_movies_data.append(movie_info)
            print("✓")
            
            time.sleep(0.6)  # Respectful delay
        
        print(f"  ✓ Processed {len(movies)} movies")
    
    return all_movies_data

def save_to_csv(data, filename='tfi_box_office.csv'):
    """Save data to CSV"""
    if not data:
        print("\nNo data to save!")
        return
    
    try:
        with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
            fieldnames = ['movie_name', 'hero_name', 'total_WW_cls', 'day1_WW_Gross_cr', 'verdict', 'year_of_release']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            
            writer.writeheader()
            for row in data:
                writer.writerow(row)
        
        print(f"\n{'='*80}")
        print(f"✓ Data saved to '{filename}'")
        print(f"✓ Total movies: {len(data)}")
        
        # Stats
        with_verdict = sum(1 for m in data if m['verdict'])
        with_day1 = sum(1 for m in data if m['day1_WW_Gross_cr'])
        
        print(f"✓ Movies with verdict: {with_verdict}")
        print(f"✓ Movies with day1 data: {with_day1}")
        print('='*80)
    
    except IOError as e:
        print(f"Error saving CSV: {e}")

if __name__ == "__main__":
    print("\n" + "="*80)
    print("TFI HEROES BOX OFFICE DATA SCRAPER (Sacnilk + Wikipedia)")
    print("="*80)
    print("Scraping from Sacnilk.com for box office data...")
    
    movie_data = scrape_all_heroes()
    save_to_csv(movie_data)
    
    print("\nDone! Check 'tfi_box_office.csv' for complete data.")


TFI HEROES BOX OFFICE DATA SCRAPER (Multi-Source)
Scraping from: Sacnilk, Andhra Box Office, Wikipedia...

[1/6] HERO: Jr NTR (24 movies)
  [1/24] Student No: 1                            [SAW] ✓
  [2/24] Subbu                                    [SAW] ✓
  [3/24] Ninnu Choodalani                         [SAW] ✓
  [4/24] Aadi                                     [SAW] ✓
  [5/24] Santosham                                [SAW] ✓
  [6/24] Simhadri                                 [SAW] ✓
  [7/24] Kadha Parthu                             [SAW] ✓
  [8/24] Shivamani                                [SAW] ✓
  [9/24] Andhrodu                                 [SAW] ✓
  [10/24] Rakhi                                    [SAW] ✓
  [11/24] Ashok                                    [SAW] ✓
  [12/24] Yamadonga                                [SAW] ✓
  [13/24] Kantri                                   [SAW] ✓
  [14/24] Adhurs                                   [SAW] ✓
  [15/24] Brindavanam                       

GROK(WORKING)

In [6]:
import requests
from bs4 import BeautifulSoup
import csv
import re
import time

# Top TFI Heroes with their complete filmography
heroes_movies = {
    'Jr NTR': [
        # 'Student No: 1', 'Subbu', 'Ninnu Choodalani', 'Aadi', 'Santosham', 'Simhadri',
        # 'Kadha Parthu', 'Shivamani', 'Andhrodu', 'Rakhi', 'Ashok', 'Yamadonga',
        'Kantri', 'Adhurs', 'Brindavanam', 'Oosaravelli', 'Dammu', 'Janatha Garage',
        'Temper', 'Nannaku Prematho', 'Jai Lava Kusa', 'Aravinda Sametha', 'RRR', 'Devara'
    ],
    'Mahesh Babu': [
        # 'Rajakumarudu', 'Yuvarajuv', 'Vamsi', 'Okkadu', 'Nijam', 'Neeku Nenu Naku Nuvvu',
        # 'Tagore', 'Athadu', 'Pokiri', 'Sainikudu', 'Dhookudu', 'Businessman', 
        # 'Seethamma Vakitlo Sirimalle Chettu', '1 Nenokkadine', 'Aagadu', 'Srimanthudu', 
        'Bharat Ane Nenu', 'Maharshi', 'Sarileru Neekevvaru', 'Sarkaru Vaari Paata', 'Guntur Kaaram', 'Spyder'
    ],
    'Pawan Kalyan': [
        # 'Akkada Ammayi Ikkada Abbayi', 'Tholi Prema', 'Thammudu', 'Badri', 'Kushi', 'Johnny',
        # 'Gudumba Shankar', 'Balu ABCDEFG', 'Bangaram', 'Annavaram', 'Shankar Dada MBBS', 
        'Katamarayudu', 'Agnyaatavaasi', 'Vakeel Saab', 'Bheemla Nayak', 'They Call Him OG', 
        'Attarintiki Daredi', 'Gabbar Singh', 'Cameraman Gangatho Rambabu', 'Teen Maar'
    ],
    'Allu Arjun': [
        # 'Gangotri', 'Arya', 'Desamudu', 'Happy', 'Bunny', 'Arya 2', 'Vedam', 'Badrinath',
        # 'Julai', 'Race Gurram', 'S/O Satyamurthy', 'Sarainodu', 'Duvvada Jagannadham', 
        'Naa Peru Surya', 'Ala Vaikunthapurramulo', 'Pushpa: The Rise', 'Pushpa 2: The Rule', 'Parugu', 'Rudhramadevi'
    ],
    'Ram Charan': [
        # 'Chirutha', 'Magadheera', 'Orange', 'Leader', 'Racha', 'Naayak', 'Zanjeer', 'Yevadu',
        'Govindudu Andarivadele', 'Dhruva', 'Rangasthalam', 'Vinaya Vidheya Rama', 'Acharya', 'RRR',
        'Bruce Lee', 'Toofaan', 'Game Changer'
    ],
    'Prabhas': [
        # 'Eeswar', 'Raghavendra', 'Varsham', 'Adavi Ramudu', 'Chatrapathi', 'Bujjigadu', 'Billa', 'Mirchi',
        'Baahubali: The Beginning', 'Baahubali 2: The Conclusion', 'Saaho', 'Radhe Shyam', 'Adipurush',
        'Salaar', 'Kalki 2898 AD', 'Darling', 'Mr. Perfect'
    ]
}

def extract_from_tables(soup):
    """Extract total WW, Day 1 WW, and Verdict from tables"""
    data = {'total_WW_cls': '', 'day1_WW_Gross_cr': '', 'verdict': ''}
    for table in soup.find_all('table'):
        rows = table.find_all('tr')
        for row in rows:
            cells = row.find_all(['th', 'td'])
            if len(cells) >= 2:
                header = cells[0].get_text(strip=True).lower()
                value = cells[1].get_text(strip=True)
                
                if 'worldwide' in header and ('gross' in header or 'collection' in header):
                    match = re.search(r'₹?([\d,]+\.?\d*)', value)
                    if match:
                        data['total_WW_cls'] = match.group(1)
                
                if ('day 1' in header or 'opening day' in header or 'first day' in header) and ('worldwide' in header or 'gross' in header):
                    match = re.search(r'₹?([\d,]+\.?\d*)', value)
                    if match:
                        data['day1_WW_Gross_cr'] = match.group(1)
                
                if 'verdict' in header:
                    data['verdict'] = value.strip().title()
    return data

def extract_from_text(page_text):
    """Fallback extraction using improved regex"""
    data = {'total_WW_cls': '', 'day1_WW_Gross_cr': '', 'verdict': ''}
    
    # Verdict first (explicit)
    verdict_match = re.search(r'Verdict\s*[:\-]?\s*([A-Za-z\s\-]+)', page_text, re.IGNORECASE)
    if verdict_match:
        v = verdict_match.group(1).strip().title()
        if v in ['Hit', 'Super Hit', 'Blockbuster', 'Flop', 'Average', 'Disaster', 'Superhit']:
            data['verdict'] = v.replace('Super Hit', 'Superhit')
    
    # Total WW
    total_patterns = [
        r'Worldwide.*?Gross.*?₹?([\d,]+\.?\d*)\s*(Cr|Crore)?',
        r'Total.*?Collection.*?₹?([\d,]+\.?\d*)\s*(Cr|Crore)?',
        r'Final.*?Total.*?₹?([\d,]+\.?\d*)\s*(Cr|Crore)?',
        r'Worldwide.*?Collection.*?₹?([\d,]+\.?\d*)\s*(Cr|Crore)?',
    ]
    for pat in total_patterns:
        m = re.search(pat, page_text, re.IGNORECASE)
        if m and not data['total_WW_cls']:
            data['total_WW_cls'] = m.group(1)
    
    # Day 1 WW
    day1_patterns = [
        r'Day\s*1.*?Worldwide.*?₹?([\d,]+\.?\d*)\s*(Cr|Crore)?',
        r'Opening\s*Day.*?Gross.*?₹?([\d,]+\.?\d*)\s*(Cr|Crore)?',
        r'First\s*Day.*?Worldwide.*?₹?([\d,]+\.?\d*)\s*(Cr|Crore)?',
        r'Day\s*1.*?Gross.*?₹?([\d,]+\.?\d*)\s*(Cr|Crore)?',
    ]
    for pat in day1_patterns:
        m = re.search(pat, page_text, re.IGNORECASE)
        if m and not data['day1_WW_Gross_cr']:
            data['day1_WW_Gross_cr'] = m.group(1)
    
    # Fallback keyword verdict
    if not data['verdict']:
        lower = page_text.lower()
        if 'blockbuster' in lower:
            data['verdict'] = 'Blockbuster'
        elif 'superhit' in lower or 'super hit' in lower:
            data['verdict'] = 'Superhit'
        elif 'hit' in lower and 'flop' not in lower:
            data['verdict'] = 'Hit'
        elif 'flop' in lower or 'disaster' in lower:
            data['verdict'] = 'Flop'
        elif 'average' in lower:
            data['verdict'] = 'Average'
    
    return data

def scrape_sacnilk(movie_name):
    try:
        search_url = f"https://www.sacnilk.com/box-office/?s={movie_name.replace(' ', '+')}"
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
        response = requests.get(search_url, headers=headers, timeout=15)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Find movie link (improved selector)
        link = soup.find('a', href=re.compile(r'/news/.*_Box_Office_Collection'))
        if not link:
            return {}
        
        movie_url = link['href']
        if not movie_url.startswith('http'):
            movie_url = 'https://www.sacnilk.com' + movie_url
        
        resp = requests.get(movie_url, headers=headers, timeout=15)
        resp.raise_for_status()
        movie_soup = BeautifulSoup(resp.content, 'html.parser')
        
        # First try tables
        data = extract_from_tables(movie_soup)
        if data['total_WW_cls'] or data['day1_WW_Gross_cr'] or data['verdict']:
            return data
        
        # Fallback to full text
        page_text = movie_soup.get_text(separator=' ')
        return extract_from_text(page_text)
    
    except Exception:
        return {}

def scrape_andhra_box_office(movie_name):
    try:
        search_url = f"https://www.andhraboxoffice.com/?s={movie_name.replace(' ', '+')}"
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
        response = requests.get(search_url, headers=headers, timeout=15)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        
        link = soup.find('a', href=re.compile(r'/info.aspx'))
        if not link:
            return {}
        
        movie_url = link['href']
        if not movie_url.startswith('http'):
            movie_url = 'https://www.andhraboxoffice.com' + movie_url
        
        resp = requests.get(movie_url, headers=headers, timeout=15)
        resp.raise_for_status()
        movie_soup = BeautifulSoup(resp.content, 'html.parser')
        
        data = extract_from_tables(movie_soup)
        if data['total_WW_cls'] or data['day1_WW_Gross_cr'] or data['verdict']:
            return data
        
        page_text = movie_soup.get_text(separator=' ')
        return extract_from_text(page_text)
    
    except Exception:
        return {}

def scrape_wikipedia(movie_name):
    try:
        url = f"https://en.wikipedia.org/wiki/{movie_name.replace(' ', '_')}"
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        infobox = soup.find('table', class_='infobox')
        
        info = {'hero_name': '', 'year_of_release': ''}
        if infobox:
            for row in infobox.find_all('tr'):
                header = row.find('th')
                data_cell = row.find('td')
                if header and data_cell:
                    h_text = header.get_text(strip=True).lower()
                    d_text = data_cell.get_text(strip=True)
                    if 'starring' in h_text or 'cast' in h_text:
                        links = data_cell.find_all('a')
                        if links:
                            info['hero_name'] = links[0].get_text(strip=True)
                    if 'release date' in h_text or 'released' in h_text:
                        year = re.search(r'\b(19|20)\d{2}\b', d_text)
                        if year:
                            info['year_of_release'] = year.group(0)
        return info
    except Exception:
        return {}

def scrape_all_heroes():
    all_movies_data = []
    total_heroes = len(heroes_movies)
    current_hero = 0
    
    for hero, movies in heroes_movies.items():
        current_hero += 1
        print(f"\n{'='*80}")
        print(f"[{current_hero}/{total_heroes}] HERO: {hero} ({len(movies)} movies)")
        print('='*80)
        
        for idx, movie_name in enumerate(movies, 1):
            print(f"  [{idx}/{len(movies)}] {movie_name:40}", end=' ')
            
            print("[S", end='', flush=True)
            sac_data = scrape_sacnilk(movie_name)
            
            print("A", end='', flush=True)
            abo_data = scrape_andhra_box_office(movie_name)
            
            print("W", end='', flush=True)
            wiki_data = scrape_wikipedia(movie_name)
            print("]", end=' ')
            
            movie_info = {
                'movie_name': movie_name,
                'hero_name': wiki_data.get('hero_name') or hero,
                'total_WW_cls': sac_data.get('total_WW_cls') or abo_data.get('total_WW_cls', ''),
                'day1_WW_Gross_cr': sac_data.get('day1_WW_Gross_cr') or abo_data.get('day1_WW_Gross_cr', ''),
                'verdict': sac_data.get('verdict') or abo_data.get('verdict', ''),
                'year_of_release': wiki_data.get('year_of_release', '')
            }
            
            all_movies_data.append(movie_info)
            print("✓")
            time.sleep(1)  # Increased delay to be respectful
    
    return all_movies_data

def save_to_csv(data, filename='tfi_box_office_improved.csv'):
    if not data:
        print("\nNo data to save!")
        return
    
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['movie_name', 'hero_name', 'total_WW_cls', 'day1_WW_Gross_cr', 'verdict', 'year_of_release']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for row in data:
            writer.writerow(row)
    
    print(f"\n{'='*80}")
    print(f"✓ Data saved to '{filename}'")
    print(f"✓ Total movies processed: {len(data)}")
    print('='*80)

if __name__ == "__main__":
    print("\n" + "="*80)
    print("TFI HEROES BOX OFFICE DATA SCRAPER (Improved 2025 Version)")
    print("="*80)
    print("Sources: Sacnilk (primary), AndhraBoxOffice, Wikipedia")
    
    movie_data = scrape_all_heroes()
    save_to_csv(movie_data)
    
    print("\nScraping complete! Check the CSV for results.")
    print("Note: Success rate for Day 1 & Verdict is much higher now due to table parsing and better regex.")


TFI HEROES BOX OFFICE DATA SCRAPER (Improved 2025 Version)
Sources: Sacnilk (primary), AndhraBoxOffice, Wikipedia

[1/6] HERO: Jr NTR (12 movies)
  [1/12] Kantri                                   [S

AW] ✓
  [2/12] Adhurs                                   [SAW] ✓
  [3/12] Brindavanam                              [SAW] ✓
  [4/12] Oosaravelli                              [SAW] ✓
  [5/12] Dammu                                    [SAW] ✓
  [6/12] Janatha Garage                           [SAW] ✓
  [7/12] Temper                                   [SAW] ✓
  [8/12] Nannaku Prematho                         [SAW] ✓
  [9/12] Jai Lava Kusa                            [SAW] ✓
  [10/12] Aravinda Sametha                         [SAW] ✓
  [11/12] RRR                                      [SAW] ✓
  [12/12] Devara                                   [SAW] ✓

[2/6] HERO: Mahesh Babu (6 movies)
  [1/6] Bharat Ane Nenu                          [SAW] ✓
  [2/6] Maharshi                                 [SAW] ✓
  [3/6] Sarileru Neekevvaru                      [SAW] ✓
  [4/6] Sarkaru Vaari Paata                      [SAW] ✓
  [5/6] Guntur Kaaram                            [SAW] ✓
  [6/6] Spyder                  

GROK2

In [9]:
import requests
from bs4 import BeautifulSoup
import csv
import re
import time

# Top TFI Heroes with their filmography (focused on recent major films for better data availability)
heroes_movies = {
    'Jr NTR': [
        'Kantri', 'Adhurs', 'Brindavanam', 'Oosaravelli', 'Dammu', 'Janatha Garage',
        'Temper', 'Nannaku Prematho', 'Jai Lava Kusa', 'Aravinda Sametha', 'RRR', 'Devara'
    ],
    'Mahesh Babu': [
        'Bharat Ane Nenu', 'Maharshi', 'Sarileru Neekevvaru', 'Sarkaru Vaari Paata', 'Guntur Kaaram', 'Spyder'
    ],
    'Pawan Kalyan': [
        'Katamarayudu', 'Agnyaatavaasi', 'Vakeel Saab', 'Bheemla Nayak', 'They Call Him OG', 
        'Attarintiki Daredi', 'Gabbar Singh', 'Cameraman Gangatho Rambabu', 'Teen Maar'
    ],
    'Allu Arjun': [
        'Naa Peru Surya', 'Ala Vaikunthapurramulo', 'Pushpa: The Rise', 'Pushpa 2: The Rule', 'Parugu', 'Rudhramadevi'
    ],
    'Ram Charan': [
        'Govindudu Andarivadele', 'Dhruva', 'Rangasthalam', 'Vinaya Vidheya Rama', 'Acharya', 'RRR',
        'Bruce Lee', 'Toofaan', 'Game Changer'
    ],
    'Prabhas': [
        'Baahubali: The Beginning', 'Baahubali 2: The Conclusion', 'Saaho', 'Radhe Shyam', 'Adipurush',
        'Salaar', 'Kalki 2898 AD', 'Darling', 'Mr. Perfect'
    ]
}

def extract_from_tables(soup):
    data = {'total_WW_cls': '', 'day1_WW_Gross_cr': '', 'verdict': ''}
    for table in soup.find_all('table'):
        rows = table.find_all('tr')
        for row in rows:
            cells = row.find_all(['th', 'td'])
            if len(cells) >= 2:
                header = cells[0].get_text(strip=True).lower()
                value = ' '.join(cells[1].stripped_strings)
                
                # Total WW
                if 'worldwide' in header and any(word in header for word in ['gross', 'collection', 'total']):
                    match = re.search(r'[\d,]+(?:\.\d+)?', value.replace('₹', ''))
                    if match:
                        data['total_WW_cls'] = match.group(0)
                
                # Day 1
                if any(day_term in header for day_term in ['day 1', 'opening day', 'first day']) and any(gross_term in header for gross_term in ['worldwide', 'gross']):
                    match = re.search(r'[\d,]+(?:\.\d+)?', value.replace('₹', ''))
                    if match:
                        data['day1_WW_Gross_cr'] = match.group(0)
                
                # Verdict
                if 'verdict' in header:
                    data['verdict'] = value.strip().title()
    return data

def extract_from_text(page_text):
    data = {'total_WW_cls': '', 'day1_WW_Gross_cr': '', 'verdict': ''}
    
    # Verdict explicit
    verdict_match = re.search(r'Verdict\s*[:\-]?\s*([A-Za-z\s\-]+)', page_text, re.IGNORECASE)
    if verdict_match:
        v = verdict_match.group(1).strip().title()
        valid_verdicts = ['Hit', 'Super Hit', 'Blockbuster', 'Flop', 'Average', 'Disaster', 'Superhit', 'All Time Blockbuster']
        if any(vv in v for vv in valid_verdicts):
            data['verdict'] = v.replace('Super Hit', 'Superhit').replace('All Time', 'All-Time')
    
    # Total WW patterns (more flexible)
    total_patterns = [
        r'Worldwide\s*(?:Gross|Collection|Total)[:\s]*[\d,]+(?:\.\d+)?\s*(?:Cr|Crore)',
        r'Total\s*Worldwide\s*[\d,]+(?:\.\d+)?\s*(?:Cr|Crore)',
        r'Final\s*(?:Gross|Collection)\s*[\d,]+(?:\.\d+)?\s*(?:Cr|Crore)',
    ]
    for pat in total_patterns:
        m = re.search(pat, page_text, re.IGNORECASE)
        if m:
            num_match = re.search(r'[\d,]+(?:\.\d+)?', m.group(0))
            if num_match and not data['total_WW_cls']:
                data['total_WW_cls'] = num_match.group(0)
                break
    
    # Day 1 patterns
    day1_patterns = [
        r'Day\s*1\s*(?:Worldwide\s*)?(?:Gross|Collection)[:\s]*[\d,]+(?:\.\d+)?\s*(?:Cr|Crore)',
        r'Opening\s*Day\s*(?:Worldwide\s*)?[\d,]+(?:\.\d+)?\s*(?:Cr|Crore)',
        r'First\s*Day\s*(?:Gross|Worldwide)[\d,]+(?:\.\d+)?\s*(?:Cr|Crore)',
    ]
    for pat in day1_patterns:
        m = re.search(pat, page_text, re.IGNORECASE)
        if m:
            num_match = re.search(r'[\d,]+(?:\.\d+)?', m.group(0))
            if num_match and not data['day1_WW_Gross_cr']:
                data['day1_WW_Gross_cr'] = num_match.group(0)
                break
    
    # Fallback verdict
    if not data['verdict']:
        lower_text = page_text.lower()
        if 'all time blockbuster' in lower_text:
            data['verdict'] = 'All-Time Blockbuster'
        elif 'blockbuster' in lower_text:
            data['verdict'] = 'Blockbuster'
        elif 'superhit' in lower_text or 'super hit' in lower_text:
            data['verdict'] = 'Superhit'
        elif 'hit' in lower_text and not any(f in lower_text for f in ['flop', 'disaster']):
            data['verdict'] = 'Hit'
        elif any(f in lower_text for f in ['flop', 'disaster']):
            data['verdict'] = 'Flop'
        elif 'average' in lower_text:
            data['verdict'] = 'Average'
    
    return data

def scrape_sacnilk(movie_name):
    try:
        # Use box-office search for better results
        search_url = f"https://www.sacnilk.com/box-office/?s={movie_name.replace(' ', '+')}"
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
        response = requests.get(search_url, headers=headers, timeout=15)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Find link to box office page
        link = soup.find('a', href=re.compile(r'/news/.*Box.*Office', re.I))
        if not link:
            # Fallback to movie page
            link = soup.find('a', string=re.compile(movie_name, re.I))
        if not link:
            return {}
        
        movie_url = link['href']
        if not movie_url.startswith('http'):
            movie_url = 'https://www.sacnilk.com' + movie_url
        
        resp = requests.get(movie_url, headers=headers, timeout=15)
        resp.raise_for_status()
        movie_soup = BeautifulSoup(resp.content, 'html.parser')
        
        data = extract_from_tables(movie_soup)
        if any(data.values()):
            return data
        
        page_text = movie_soup.get_text(separator=' ')
        return extract_from_text(page_text)
    
    except Exception as e:
        print(f"Sacnilk error for {movie_name}: {e}")
        return {}

def scrape_andhra_box_office(movie_name):
    try:
        search_url = f"https://www.andhraboxoffice.com/?s={movie_name.replace(' ', '+')}"
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
        response = requests.get(search_url, headers=headers, timeout=15)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Find info page
        link = soup.find('a', href=re.compile(r'/info\.aspx'))
        if not link:
            return {}
        
        movie_url = link['href']
        if not movie_url.startswith('http'):
            movie_url = 'https://www.andhraboxoffice.com' + movie_url
        
        resp = requests.get(movie_url, headers=headers, timeout=15)
        resp.raise_for_status()
        movie_soup = BeautifulSoup(resp.content, 'html.parser')
        
        data = extract_from_tables(movie_soup)
        if any(data.values()):
            return data
        
        page_text = movie_soup.get_text(separator=' ')
        return extract_from_text(page_text)
    
    except Exception as e:
        print(f"Andhra error for {movie_name}: {e}")
        return {}

def scrape_tracktollywood(movie_name):
    try:
        search_url = f"https://tracktollywood.com/?s={movie_name.replace(' ', '+')}"
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
        response = requests.get(search_url, headers=headers, timeout=15)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Find box office link
        link = soup.find('a', href=re.compile(r'box-office', re.I))
        if not link:
            # Fallback to first relevant link
            link = soup.find('a', string=re.compile(movie_name, re.I))
        if not link:
            return {}
        
        movie_url = link['href']
        if not movie_url.startswith('http'):
            movie_url = 'https://tracktollywood.com' + movie_url
        
        resp = requests.get(movie_url, headers=headers, timeout=15)
        resp.raise_for_status()
        movie_soup = BeautifulSoup(resp.content, 'html.parser')
        
        data = extract_from_tables(movie_soup)
        if any(data.values()):
            return data
        
        page_text = movie_soup.get_text(separator=' ')
        return extract_from_text(page_text)
    
    except Exception as e:
        print(f"TrackTollywood error for {movie_name}: {e}")
        return {}

def scrape_wikipedia(movie_name):
    try:
        # Clean movie name for Wikipedia
        wiki_name = movie_name.replace(':', '').replace(' ', '_').replace('2:', '2')
        url = f"https://en.wikipedia.org/wiki/{wiki_name}"
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        infobox = soup.find('table', class_='infobox')
        
        info = {'hero_name': '', 'year_of_release': ''}
        if infobox:
            for row in infobox.find_all('tr'):
                header = row.find('th')
                data_cell = row.find('td')
                if header and data_cell:
                    h_text = header.get_text(strip=True).lower()
                    d_text = data_cell.get_text(strip=True)
                    # Hero
                    if 'starring' in h_text or 'cast' in h_text:
                        links = data_cell.find_all('a')
                        if links:
                            info['hero_name'] = links[0].get_text(strip=True)
                    # Year
                    if 'release date' in h_text or 'released' in h_text:
                        year = re.search(r'\b(19|20)\d{2}\b', d_text)
                        if year:
                            info['year_of_release'] = year.group(0)
        return info
    except Exception as e:
        print(f"Wikipedia error for {movie_name}: {e}")
        return {}

def scrape_all_heroes():
    all_movies_data = []
    total_heroes = len(heroes_movies)
    current_hero = 0
    
    for hero, movies in heroes_movies.items():
        current_hero += 1
        print(f"\n{'='*80}")
        print(f"[{current_hero}/{total_heroes}] HERO: {hero} ({len(movies)} movies)")
        print('='*80)
        
        for idx, movie_name in enumerate(movies, 1):
            print(f"  [{idx}/{len(movies)}] {movie_name:45}", end=' ')
            
            sac_data = scrape_sacnilk(movie_name)
            print("S", end='', flush=True)
            
            abo_data = scrape_andhra_box_office(movie_name)
            print("A", end='', flush=True)
            
            track_data = scrape_tracktollywood(movie_name)
            print("T", end='', flush=True)
            
            wiki_data = scrape_wikipedia(movie_name)
            print("W", end='', flush=True)
            
            # Merge with priority: Sacnilk > Track > Andhra
            movie_info = {
                'movie_name': movie_name,
                'hero_name': wiki_data.get('hero_name', hero),
                'total_WW_cls': sac_data.get('total_WW_cls') or track_data.get('total_WW_cls') or abo_data.get('total_WW_cls', ''),
                'day1_WW_Gross_cr': sac_data.get('day1_WW_Gross_cr') or track_data.get('day1_WW_Gross_cr') or abo_data.get('day1_WW_Gross_cr', ''),
                'verdict': sac_data.get('verdict') or track_data.get('verdict') or abo_data.get('verdict', ''),
                'year_of_release': wiki_data.get('year_of_release', '')
            }
            
            all_movies_data.append(movie_info)
            print(" ✓")
            time.sleep(2)  # Respectful delay
    
    return all_movies_data

def save_to_csv(data, filename='tfi_box_office_working.csv'):
    if not data:
        print("\nNo data to save!")
        return
    
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['movie_name', 'hero_name', 'total_WW_cls', 'day1_WW_Gross_cr', 'verdict', 'year_of_release']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for row in data:
            writer.writerow(row)
    
    print(f"\n{'='*80}")
    print(f"✓ Data saved to '{filename}'")
    print(f"✓ Total movies processed: {len(data)}")
    print('='*80)

if __name__ == "__main__":
    print("\n" + "="*80)
    print("TFI HEROES BOX OFFICE DATA SCRAPER (Working 2025 Version)")
    print("="*80)
    print("Sources: Sacnilk, TrackTollywood, AndhraBoxOffice, Wikipedia")
    print("Improved selectors and regex for better extraction.")
    
    movie_data = scrape_all_heroes()
    save_to_csv(movie_data)
    
    print("\nScraping complete! Check the CSV - should have more populated fields now.")


TFI HEROES BOX OFFICE DATA SCRAPER (Working 2025 Version)
Sources: Sacnilk, TrackTollywood, AndhraBoxOffice, Wikipedia
Improved selectors and regex for better extraction.

[1/6] HERO: Jr NTR (12 movies)
  [1/12] Kantri                                        SAndhra error for Kantri: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))
ATW ✓
  [2/12] Adhurs                                        SAndhra error for Adhurs: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))
ATW ✓
  [3/12] Brindavanam                                   SAndhra error for Brindavanam: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))
ATW ✓
  [4/12] Oosaravelli                                   SAndhra error for Oosaravelli: ('Connection aborted.', Connectio