In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd


In [2]:


def scrape_sacnilk(url):
    # Get page
    r = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
    soup = BeautifulSoup(r.text, "html.parser")

    # ---------- MOVIE NAME ----------
    try:
        movie_name = soup.find("h1").text.strip()
    except:
        movie_name = "Unknown Movie"

    print(f"Scraping: {movie_name}")

    # ---------- DAY-WISE COLLECTION TABLE ----------
    day_data = []

    day_table = soup.find("table")
    if day_table:
        for tr in day_table.find_all("tr")[1:]:
            cols = [td.text.strip() for td in tr.find_all("td")]
            if len(cols) >= 3:
                day_data.append({
                    "day": cols[0],
                    "india_net": cols[1],
                    "worldwide": cols[2]
                })

    # ---------- STATE-WISE COLLECTION ----------
    state_data = {}
    state_table = soup.find_all("table")
    if len(state_table) > 1:   # second table = state wise
        rows = state_table[1].find_all("tr")
        for tr in rows[1:]:
            cols = [td.text.strip() for td in tr.find_all("td")]
            if len(cols) == 2:
                state = cols[0]
                gross = cols[1]
                state_data[state] = gross

    # ---------- CREATE DATAFRAME ----------
    df_daywise = pd.DataFrame(day_data)

    # Add movie name to all rows
    df_daywise["movie"] = movie_name

    # Add state-wise data as separate columns
    for state, gross in state_data.items():
        df_daywise[f"state_{state}"] = gross

    return df_daywise


# ---------------------------------------------------
# RUN SCRAPER ON YOUR MOVIE PAGE
# ---------------------------------------------------

url = "https://www.sacnilk.com/news/_Box_Office_Collection_Day_Wise_Worldwide"

df = scrape_sacnilk(url)

df.to_csv("movie_collection.csv", index=False)

print("\nsalaar.csv")
print(df.head())


Scraping: Salaar Box Office Collection | Day Wise | Worldwide

salaar.csv
                    day                                          india_net  \
0    Day 1 [1st Friday]  ₹ 90.7 Cr [Te: 66.75 Cr ; Mal: 3.55 Cr; Ta: 3....   
1  Day 2 [1st Saturday]  ₹ 56.35 Cr [Te: 34.25 Cr ; Mal: 1.75 Cr; Ta: 3...   
2    Day 3 [1st Sunday]  ₹ 62.05 Cr [Te: 35 Cr ; Mal: 1.55 Cr; Ta: 3.2 ...   
3    Day 4 [1st Monday]  ₹ 46.3 Cr [Te: 27.1 Cr ; Mal: 1.3 Cr; Ta: 2.05...   
4   Day 5 [1st Tuesday]  ₹ 24.9 Cr [Te: 13.7 Cr ; Mal: 0.7 Cr; Ta: 1.1 ...   

  worldwide                                              movie  
0         -  Salaar Box Office Collection | Day Wise | Worl...  
1   -37.87%  Salaar Box Office Collection | Day Wise | Worl...  
2    10.12%  Salaar Box Office Collection | Day Wise | Worl...  
3   -25.38%  Salaar Box Office Collection | Day Wise | Worl...  
4   -46.22%  Salaar Box Office Collection | Day Wise | Worl...  


In [3]:
import requests
from bs4 import BeautifulSoup

# URL you want to scrape
url = "https://www.sacnilk.com/news/_Box_Office_Collection_Day_Wise_Worldwide"

# Fetch the website
page = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
soup = BeautifulSoup(page.text, "html.parser")

# ---------- MOVIE NAME ----------
title = soup.find("h1").text
print("Movie:", title)
print()

# ---------- DAY-WISE COLLECTION ----------
print("Day-wise Collection:")
table = soup.find("table")

for row in table.find_all("tr")[1:]:
    cols = [c.text.strip() for c in row.find_all("td")]
    print(cols)

print()

# ---------- STATE-WISE COLLECTION ----------
print("State-wise Collection:")

# Sacnilk usually has 2nd table for states
tables = soup.find_all("table")

if len(tables) > 1:
    state_table = tables[1]
    for row in state_table.find_all("tr")[1:]:
        cols = [c.text.strip() for c in row.find_all("td")]
        print(cols)
else:
    print("No state-wise data found.")


Movie: Salaar Box Office Collection | Day Wise | Worldwide

Day-wise Collection:
['Day 1 [1st Friday]', '₹ 90.7 Cr [Te: 66.75 Cr ; Mal: 3.55 Cr; Ta: 3.75 Cr; Ka: 0.9 Cr; Hi: 15.75 Cr]', '-']
['Day 2 [1st Saturday]', '₹ 56.35 Cr [Te: 34.25 Cr ; Mal: 1.75 Cr; Ta: 3.05 Cr; Ka: 0.95 Cr; Hi: 16.35 Cr]', '-37.87%']
['Day 3 [1st Sunday]', '₹ 62.05 Cr [Te: 35 Cr ; Mal: 1.55 Cr; Ta: 3.2 Cr; Ka: 1.2 Cr; Hi: 21.1 Cr]', '10.12%']
['Day 4 [1st Monday]', '₹ 46.3 Cr [Te: 27.1 Cr ; Mal: 1.3 Cr; Ta: 2.05 Cr; Ka: 0.85 Cr; Hi: 15 Cr]', '-25.38%']
['Day 5 [1st Tuesday]', '₹ 24.9 Cr [Te: 13.7 Cr ; Mal: 0.7 Cr; Ta: 1.1 Cr; Ka: 0.3 Cr; Hi: 9.1 Cr]', '-46.22%']
['Day 6 [1st Wednesday]', '₹ 15.6 Cr [Te: 5.75 Cr ; Mal: 0.5 Cr; Ta: 1.1 Cr; Ka: 0.25 Cr; Hi: 8 Cr]', '-37.35%']
['Day 7 [1st Thursday]', '₹ 12.1 Cr [Te: 3.5 Cr ; Mal: 0.3 Cr; Ta: 0.95 Cr; Ka: 0.15 Cr; Hi: 7.2 Cr]', '-22.44%']
['Week 1 Collection', '₹ 308 Cr [Te: 186.05 Cr ; Mal: 9.65; Ta: 15.2; Ka: 4.6; Hi: 92.5]', '-']
['Day 8 [2nd Friday]', '₹ 9.62 

In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "https://www.sacnilk.com/news/_Box_Office_Collection_Day_Wise_Worldwide"

page = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
soup = BeautifulSoup(page.text, "html.parser")

# Movie name
title = soup.find("h1").text.strip()
print("Movie:", title)

# ---------- DAY-WISE TABLE ----------
table = soup.find("table")

rows = []
for row in table.find_all("tr")[1:]:
    cols = [c.text.strip() for c in row.find_all("td")]
    rows.append(cols)

# Detect number of columns
num_cols = len(rows[0])
print("\nDetected columns:", num_cols)

# Assign column names automatically
if num_cols == 3:
    col_names = ["day", "india_net", "worldwide"]
elif num_cols == 4:
    col_names = ["day", "india_net", "india_gross", "worldwide"]
else:
    col_names = [f"col_{i+1}" for i in range(num_cols)]

df = pd.DataFrame(rows, columns=col_names)

# Add movie name & primary key
df["movie_name"] = title
df["pk"] = df["movie_name"] + "_" + df["day"]

# Save CSV
df.to_csv("Salaar_raw.csv", index=False, encoding="utf-8")
print("\nSaved → Salaar_raw.csv")


Movie: Salaar Box Office Collection | Day Wise | Worldwide

Detected columns: 3

Saved → Salaar_raw.csv


In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "https://www.sacnilk.com/news/_Box_Office_Collection_Day_Wise_Worldwide"

page = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
soup = BeautifulSoup(page.text, "html.parser")

movie_name = soup.find("h1").text.strip()
print("Movie:", movie_name)

# ---------------------------- TABLE 1 (Day wise) ----------------------------
tables = soup.find_all("table")

daywise_table = tables[0]
rows = []

for row in daywise_table.find_all("tr")[1:]:
    cols = [c.text.strip() for c in row.find_all("td")]
    if len(cols) >= 2:
        rows.append({
            "day": cols[0],
            "india_net": cols[1],
            "change_percent": cols[2] if len(cols) > 2 else None
        })

df_daywise = pd.DataFrame(rows)

# ---------------------------- TABLE 2 (State wise) ----------------------------
state_table = tables[1]
state_header = [th.text.strip().replace(" ", "_").lower() for th in state_table.find_all("th")]

state_rows = []
for row in state_table.find_all("tr")[1:]:
    cols = [c.text.strip() for c in row.find_all("td")]
    if cols:
        entry = {}
        for i, col in enumerate(cols):
            entry[state_header[i]] = col
        state_rows.append(entry)

df_state = pd.DataFrame(state_rows)

# ---------------------------- MERGE BOTH TABLES ----------------------------
df = pd.merge(df_daywise, df_state, left_on="day", right_on="day", how="left")

df["movie_name"] = movie_name

df.to_csv("Salaar_raw.csv", index=False)
print("\nSaved → Salaar_raw.csv")
print(df.head())


Movie: Salaar Box Office Collection | Day Wise | Worldwide

Saved → Salaar_raw.csv
                    day                                          india_net  \
0    Day 1 [1st Friday]  ₹ 90.7 Cr [Te: 66.75 Cr ; Mal: 3.55 Cr; Ta: 3....   
1  Day 2 [1st Saturday]  ₹ 56.35 Cr [Te: 34.25 Cr ; Mal: 1.75 Cr; Ta: 3...   
2    Day 3 [1st Sunday]  ₹ 62.05 Cr [Te: 35 Cr ; Mal: 1.55 Cr; Ta: 3.2 ...   
3    Day 4 [1st Monday]  ₹ 46.3 Cr [Te: 27.1 Cr ; Mal: 1.3 Cr; Ta: 2.05...   
4   Day 5 [1st Tuesday]  ₹ 24.9 Cr [Te: 13.7 Cr ; Mal: 0.7 Cr; Ta: 1.1 ...   

  change_percent   karnataka       aptg tamil_nadu     kerala rest_of_india  \
0              -  ₹ 11.65 Cr  ₹ 67.1 Cr   ₹ 4.9 Cr  ₹ 4.65 Cr     ₹ 18.8 Cr   
1        -37.87%   ₹ 7.25 Cr  ₹ 33.5 Cr     ₹ 4 Cr  ₹ 2.65 Cr     ₹ 19.3 Cr   
2         10.12%    ₹ 6.7 Cr  ₹ 35.6 Cr   ₹ 4.1 Cr   ₹ 2.3 Cr     ₹ 24.8 Cr   
3        -25.38%   ₹ 4.65 Cr  ₹ 28.4 Cr   ₹ 2.7 Cr   ₹ 1.8 Cr    ₹ 16.65 Cr   
4        -46.22%    ₹ 2.2 Cr  ₹ 16.2 Cr   ₹ 1.1 Cr  ₹

SALAAR

In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# ------------------- CHANGE ONLY THIS LINE FOR OTHER MOVIES -------------------
url = "https://www.sacnilk.com/news/_Box_Office_Collection_Day_Wise_Worldwide"
# ------------------------------------------------------------------------------

page = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
soup = BeautifulSoup(page.text, "html.parser")

movie_name = soup.find("h1").text.strip()
print("Movie:", movie_name)

# ------------------- FIXED: GRAB BUDGET, VERDICT, SCREENS, RELEASE DATE -------------------
# Search for specific elements with keywords (handles HTML structure like <strong>)
budget = "Not Found"
verdict = "Not Found"
india_screens = "Not Found"
overseas_screens = "Not Found"
worldwide_screens = "Not Found"
release_date = "Not Found"

# Find all relevant text containers (p, div, span, etc.)
all_elements = soup.find_all(['p', 'div', 'span', 'strong'])

for elem in all_elements:
    text = elem.get_text(strip=True)
    
    # Budget
    if "Budget:" in text:
        budget_match = re.search(r'Budget:\s*(₹\s*\d+(?:\.\d+)?\s*Cr\s*\*\s*Approx)', text)
        if budget_match:
            budget = budget_match.group(1).strip()
    
    # Verdict (look for <strong>Salaar Verdict:</strong> <strong>Hit</strong>)
    if "Verdict:" in text:
        # Get the next sibling or parent text for the value
        verdict_elem = elem.find_next_sibling('strong') or elem.parent.find('strong')
        if verdict_elem:
            verdict = verdict_elem.get_text(strip=True)
        else:
            # Fallback regex for plain text
            verdict_match = re.search(r'Verdict:\s*(Hit|Flop|Blockbuster|Super Hit|Average|Disaster)', text)
            verdict = verdict_match.group(1).strip() if verdict_match else "Not Found"
    
    # Screen counts (look for lines with numbers after labels)
    if "India:" in text and re.search(r'\d+', text):
        india_match = re.search(r'India:\s*(\d+)', text)
        if india_match:
            india_screens = f"India: {india_match.group(1)}"
    if "Overseas:" in text and re.search(r'\d+', text):
        overseas_match = re.search(r'Overseas:\s*(\d+)', text)
        if overseas_match:
            overseas_screens = f"Overseas: {overseas_match.group(1)}"
    if "Worldwide total:" in text and re.search(r'\d+', text):
        worldwide_match = re.search(r'Worldwide total:\s*(\d+)', text)
        if worldwide_match:
            worldwide_screens = f"Worldwide total: {worldwide_match.group(1)}"
    
    # Release Date
    if "Release Date:" in text:
        release_match = re.search(r'Release Date:\s*(.+)', text)
        if release_match:
            release_date = release_match.group(1).strip()

print("Budget:", budget)
print("Verdict:", verdict)
print("India Screens:", india_screens)
print("Overseas Screens:", overseas_screens)
print("Worldwide Screens:", worldwide_screens)
print("Release Date:", release_date)

# ------------------- TABLE 1 (Day wise) -------------------
tables = soup.find_all("table")
daywise_table = tables[0]
rows = []

for row in daywise_table.find_all("tr")[1:]:
    cols = [c.text.strip() for c in row.find_all("td")]
    if len(cols) >= 2:
        rows.append({
            "day": cols[0],
            "india_net": cols[1],
            "change_percent": cols[2] if len(cols) > 2 else None
        })

df_daywise = pd.DataFrame(rows)

# ------------------- TABLE 2 (State wise) -------------------
if len(tables) > 1:
    state_table = tables[1]
    state_header = [th.text.strip().replace(" ", "_").lower() for th in state_table.find_all("th")]

    state_rows = []
    for row in state_table.find_all("tr")[1:]:
        cols = [c.text.strip() for c in row.find_all("td")]
        if cols:
            entry = {}
            for i, col in enumerate(cols):
                if i < len(state_header):
                    entry[state_header[i]] = col
            state_rows.append(entry)

    df_state = pd.DataFrame(state_rows)
else:
    df_state = pd.DataFrame()  # empty if no state table

# ------------------- MERGE BOTH TABLES -------------------
df = pd.merge(df_daywise, df_state, left_on="day", right_on="day", how="left")

# ------------------- ADD ALL NEW INFO -------------------
df["movie_name"] = movie_name
df["budget"] = budget
df["verdict"] = verdict
df["india_screens"] = india_screens
df["overseas_screens"] = overseas_screens
df["worldwide_screens"] = worldwide_screens
df["release_date"] = release_date

# ------------------- SAVE TO CSV -------------------
df.to_csv("Salaar_raw.csv", index=False)
print("\nSaved → Salaar_raw.csv")
print("\nFirst 5 rows:")
print(df[["day", "india_net", "budget", "verdict", "india_screens", "release_date"]].head())

Movie: Salaar Box Office Collection | Day Wise | Worldwide
Budget: ₹ 200 Cr * Approx
Verdict: Hit
India Screens: India: 6000
Overseas Screens: Overseas: 1000
Worldwide Screens: Worldwide total: 7000
Release Date: 14th April 2022For more and the latest news aboutBox Office Collection, Stay tuned to us.Disclaimer: The Box Office Data are compiled from various sources and by our own research.

Saved → Salaar_raw.csv

First 5 rows:
                    day                                          india_net  \
0    Day 1 [1st Friday]  ₹ 90.7 Cr [Te: 66.75 Cr ; Mal: 3.55 Cr; Ta: 3....   
1  Day 2 [1st Saturday]  ₹ 56.35 Cr [Te: 34.25 Cr ; Mal: 1.75 Cr; Ta: 3...   
2    Day 3 [1st Sunday]  ₹ 62.05 Cr [Te: 35 Cr ; Mal: 1.55 Cr; Ta: 3.2 ...   
3    Day 4 [1st Monday]  ₹ 46.3 Cr [Te: 27.1 Cr ; Mal: 1.3 Cr; Ta: 2.05...   
4   Day 5 [1st Tuesday]  ₹ 24.9 Cr [Te: 13.7 Cr ; Mal: 0.7 Cr; Ta: 1.1 ...   

              budget verdict india_screens  \
0  ₹ 200 Cr * Approx     Hit   India: 6000   
1  ₹ 20

SAAHO

In [6]:


import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# ------------------- CHANGE ONLY THIS LINE FOR OTHER MOVIES -------------------
url = "https://www.sacnilk.com/news/Saaho_Box_Office_Collection_All_Language_Day_Wise_Worldwide"
# ------------------------------------------------------------------------------

page = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
soup = BeautifulSoup(page.text, "html.parser")

movie_name = soup.find("h1").text.strip()
print("Movie:", movie_name)

# ------------------- FIXED: GRAB BUDGET, VERDICT, SCREENS, RELEASE DATE -------------------
# Search for specific elements with keywords (handles HTML structure like <strong>)
budget = "Not Found"
verdict = "Not Found"
india_screens = "Not Found"
overseas_screens = "Not Found"
worldwide_screens = "Not Found"
release_date = "Not Found"

# Find all relevant text containers (p, div, span, etc.)
all_elements = soup.find_all(['p', 'div', 'span', 'strong'])

for elem in all_elements:
    text = elem.get_text(strip=True)
    
    # Budget
    if "Budget:" in text:
        budget_match = re.search(r'Budget:\s*(₹\s*\d+(?:\.\d+)?\s*Cr\s*\*\s*Approx)', text)
        if budget_match:
            budget = budget_match.group(1).strip()
    
    # Verdict (look for <strong>Salaar Verdict:</strong> <strong>Hit</strong>)
    if "Verdict:" in text:
        # Get the next sibling or parent text for the value
        verdict_elem = elem.find_next_sibling('strong') or elem.parent.find('strong')
        if verdict_elem:
            verdict = verdict_elem.get_text(strip=True)
        else:
            # Fallback regex for plain text
            verdict_match = re.search(r'Verdict:\s*(Hit|Flop|Blockbuster|Super Hit|Average|Disaster)', text)
            verdict = verdict_match.group(1).strip() if verdict_match else "Not Found"
    
    # Screen counts (look for lines with numbers after labels)
    if "India:" in text and re.search(r'\d+', text):
        india_match = re.search(r'India:\s*(\d+)', text)
        if india_match:
            india_screens = f"India: {india_match.group(1)}"
    if "Overseas:" in text and re.search(r'\d+', text):
        overseas_match = re.search(r'Overseas:\s*(\d+)', text)
        if overseas_match:
            overseas_screens = f"Overseas: {overseas_match.group(1)}"
    if "Worldwide total:" in text and re.search(r'\d+', text):
        worldwide_match = re.search(r'Worldwide total:\s*(\d+)', text)
        if worldwide_match:
            worldwide_screens = f"Worldwide total: {worldwide_match.group(1)}"
    
    # Release Date
    if "Release Date:" in text:
        release_match = re.search(r'Release Date:\s*(.+)', text)
        if release_match:
            release_date = release_match.group(1).strip()

print("Budget:", budget)
print("Verdict:", verdict)
print("India Screens:", india_screens)
print("Overseas Screens:", overseas_screens)
print("Worldwide Screens:", worldwide_screens)
print("Release Date:", release_date)

# ------------------- TABLE 1 (Day wise) -------------------
tables = soup.find_all("table")
daywise_table = tables[0]
rows = []

for row in daywise_table.find_all("tr")[1:]:
    cols = [c.text.strip() for c in row.find_all("td")]
    if len(cols) >= 2:
        rows.append({
            "day": cols[0],
            "india_net": cols[1],
            "change_percent": cols[2] if len(cols) > 2 else None
        })

df_daywise = pd.DataFrame(rows)

# ------------------- TABLE 2 (State wise) -------------------
if len(tables) > 1:
    state_table = tables[1]
    state_header = [th.text.strip().replace(" ", "_").lower() for th in state_table.find_all("th")]

    state_rows = []
    for row in state_table.find_all("tr")[1:]:
        cols = [c.text.strip() for c in row.find_all("td")]
        if cols:
            entry = {}
            for i, col in enumerate(cols):
                if i < len(state_header):
                    entry[state_header[i]] = col
            state_rows.append(entry)

    df_state = pd.DataFrame(state_rows)
else:
    df_state = pd.DataFrame()  # empty if no state table

# ------------------- MERGE BOTH TABLES -------------------
df = pd.merge(df_daywise, df_state, left_on="day", right_on="day", how="left")

# ------------------- ADD ALL NEW INFO -------------------
df["movie_name"] = movie_name
df["budget"] = budget
df["verdict"] = verdict
df["india_screens"] = india_screens
df["overseas_screens"] = overseas_screens
df["worldwide_screens"] = worldwide_screens
df["release_date"] = release_date

# ------------------- SAVE TO CSV -------------------
df.to_csv("Saaho_raw.csv", index=False)
print("\nSaved → Saaho_raw.csv")
print("\nFirst 5 rows:")
print(df[["day", "india_net", "budget", "verdict", "india_screens", "release_date"]].head())


Movie: Saaho Box Office Collection | All Language | Day Wise | Worldwide
Budget: Not Found
Verdict: Not Found
India Screens: India: 6500
Overseas Screens: Overseas: 1500
Worldwide Screens: Worldwide total: 8000
Release Date: 30th Aug 2019Saaho Records-Saaho is the second-highest opening day collection movie in India [All Language] after Baahubali 2 [121 Cr].- Saaho may become the second-highest India Net collection movie in India [All Language] after Baahubali 2 [1030.42 Cr].-Saaho is the 10th highest India net collection movie of all time.-Saaho is the 39th highest Hindi net collection movie of all time.Story Line--For more and the latest news aboutBollywood and Tollywood Box Office Collection, Stay tuned to us.Disclaimer: The Box Office Data are compiled from various sources and by our own research.

Saved → Saaho_raw.csv

First 5 rows:
                    day                                          india_net  \
0    Day 1 [1st Friday]  ₹ 89.00 Cr [Hi: 24.40; Ta: 3.20; Te: 60.40 ; M

KALKI2898AD

In [7]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# ------------------- CHANGE ONLY THIS LINE FOR OTHER MOVIES -------------------
url = "https://www.sacnilk.com/news/Project_K_2024_Box_Office_Collection_Day_Wise_Worldwide"
# ------------------------------------------------------------------------------

page = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
soup = BeautifulSoup(page.text, "html.parser")

movie_name = soup.find("h1").text.strip()
print("Movie:", movie_name)

# ------------------- FIXED: GRAB BUDGET, VERDICT, SCREENS, RELEASE DATE -------------------
# Search for specific elements with keywords (handles HTML structure like <strong>)
budget = "Not Found"
verdict = "Not Found"
india_screens = "Not Found"
overseas_screens = "Not Found"
worldwide_screens = "Not Found"
release_date = "Not Found"

# Find all relevant text containers (p, div, span, etc.)
all_elements = soup.find_all(['p', 'div', 'span', 'strong'])

for elem in all_elements:
    text = elem.get_text(strip=True)
    
    # Budget
    if "Budget:" in text:
        budget_match = re.search(r'Budget:\s*(₹\s*\d+(?:\.\d+)?\s*Cr\s*\*\s*Approx)', text)
        if budget_match:
            budget = budget_match.group(1).strip()
    
    # Verdict (look for <strong>Salaar Verdict:</strong> <strong>Hit</strong>)
    if "Verdict:" in text:
        # Get the next sibling or parent text for the value
        verdict_elem = elem.find_next_sibling('strong') or elem.parent.find('strong')
        if verdict_elem:
            verdict = verdict_elem.get_text(strip=True)
        else:
            # Fallback regex for plain text
            verdict_match = re.search(r'Verdict:\s*(Hit|Flop|Blockbuster|Super Hit|Average|Disaster)', text)
            verdict = verdict_match.group(1).strip() if verdict_match else "Not Found"
    
    # Screen counts (look for lines with numbers after labels)
    if "India:" in text and re.search(r'\d+', text):
        india_match = re.search(r'India:\s*(\d+)', text)
        if india_match:
            india_screens = f"India: {india_match.group(1)}"
    if "Overseas:" in text and re.search(r'\d+', text):
        overseas_match = re.search(r'Overseas:\s*(\d+)', text)
        if overseas_match:
            overseas_screens = f"Overseas: {overseas_match.group(1)}"
    if "Worldwide total:" in text and re.search(r'\d+', text):
        worldwide_match = re.search(r'Worldwide total:\s*(\d+)', text)
        if worldwide_match:
            worldwide_screens = f"Worldwide total: {worldwide_match.group(1)}"
    
    # Release Date
    if "Release Date:" in text:
        release_match = re.search(r'Release Date:\s*(.+)', text)
        if release_match:
            release_date = release_match.group(1).strip()

print("Budget:", budget)
print("Verdict:", verdict)
print("India Screens:", india_screens)
print("Overseas Screens:", overseas_screens)
print("Worldwide Screens:", worldwide_screens)
print("Release Date:", release_date)

# ------------------- TABLE 1 (Day wise) -------------------
tables = soup.find_all("table")
daywise_table = tables[0]
rows = []

for row in daywise_table.find_all("tr")[1:]:
    cols = [c.text.strip() for c in row.find_all("td")]
    if len(cols) >= 2:
        rows.append({
            "day": cols[0],
            "india_net": cols[1],
            "change_percent": cols[2] if len(cols) > 2 else None
        })

df_daywise = pd.DataFrame(rows)

# ------------------- TABLE 2 (State wise) -------------------
if len(tables) > 1:
    state_table = tables[1]
    state_header = [th.text.strip().replace(" ", "_").lower() for th in state_table.find_all("th")]

    state_rows = []
    for row in state_table.find_all("tr")[1:]:
        cols = [c.text.strip() for c in row.find_all("td")]
        if cols:
            entry = {}
            for i, col in enumerate(cols):
                if i < len(state_header):
                    entry[state_header[i]] = col
            state_rows.append(entry)

    df_state = pd.DataFrame(state_rows)
else:
    df_state = pd.DataFrame()  # empty if no state table

# ------------------- MERGE BOTH TABLES -------------------
df = pd.merge(df_daywise, df_state, left_on="day", right_on="day", how="left")

# ------------------- ADD ALL NEW INFO -------------------
df["movie_name"] = movie_name
df["budget"] = budget
df["verdict"] = verdict
df["india_screens"] = india_screens
df["overseas_screens"] = overseas_screens
df["worldwide_screens"] = worldwide_screens
df["release_date"] = release_date

# ------------------- SAVE TO CSV -------------------
df.to_csv("Kalki2898AD_raw.csv", index=False)
print("\nSaved → Kalki2898AD_raw.csv")
print("\nFirst 5 rows:")
print(df[["day", "india_net", "budget", "verdict", "india_screens", "release_date"]].head())


Movie: Kalki 2898 AD Box Office Collection | All Language | Day Wise | Worldwide
Budget: ₹ 500 Cr * Approx
Verdict: Blockbuster
India Screens: Not Found
Overseas Screens: Not Found
Worldwide Screens: Not Found
Release Date: 27th June 2024For more and the latest news aboutTollywood Box Office Collection, Stay tuned to us.Disclaimer: The Box Office Data are compiled from various sources and by our own research.

Saved → Kalki2898AD_raw.csv

First 5 rows:
                    day india_net             budget      verdict  \
0  Day 1 [1st Thursday]   ₹ 11 Cr  ₹ 500 Cr * Approx  Blockbuster   
1    Day 2 [1st Friday]    ₹ 6 Cr  ₹ 500 Cr * Approx  Blockbuster   
2  Day 3 [1st Saturday]  ₹ 8.3 Cr  ₹ 500 Cr * Approx  Blockbuster   
3    Day 4 [1st Sunday]  ₹ 9.8 Cr  ₹ 500 Cr * Approx  Blockbuster   
4    Day 5 [1st Monday]  ₹ 3.3 Cr  ₹ 500 Cr * Approx  Blockbuster   

  india_screens                                       release_date  
0     Not Found  27th June 2024For more and the latest news

ADIPURUSH

In [8]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# ------------------- CHANGE ONLY THIS LINE FOR OTHER MOVIES -------------------
url = "https://www.sacnilk.com/news/Adipurush_2022_Box_Office_Collection_Day_Wise_Worldwide"
# ------------------------------------------------------------------------------

page = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
soup = BeautifulSoup(page.text, "html.parser")

movie_name = soup.find("h1").text.strip()
print("Movie:", movie_name)

# ------------------- FIXED: GRAB BUDGET, VERDICT, SCREENS, RELEASE DATE -------------------
# Search for specific elements with keywords (handles HTML structure like <strong>)
budget = "Not Found"
verdict = "Not Found"
india_screens = "Not Found"
overseas_screens = "Not Found"
worldwide_screens = "Not Found"
release_date = "Not Found"

# Find all relevant text containers (p, div, span, etc.)
all_elements = soup.find_all(['p', 'div', 'span', 'strong'])

for elem in all_elements:
    text = elem.get_text(strip=True)
    
    # Budget
    if "Budget:" in text:
        budget_match = re.search(r'Budget:\s*(₹\s*\d+(?:\.\d+)?\s*Cr\s*\*\s*Approx)', text)
        if budget_match:
            budget = budget_match.group(1).strip()
    
    # Verdict (look for <strong>Salaar Verdict:</strong> <strong>Hit</strong>)
    if "Verdict:" in text:
        # Get the next sibling or parent text for the value
        verdict_elem = elem.find_next_sibling('strong') or elem.parent.find('strong')
        if verdict_elem:
            verdict = verdict_elem.get_text(strip=True)
        else:
            # Fallback regex for plain text
            verdict_match = re.search(r'Verdict:\s*(Hit|Flop|Blockbuster|Super Hit|Average|Disaster)', text)
            verdict = verdict_match.group(1).strip() if verdict_match else "Not Found"
    
    # Screen counts (look for lines with numbers after labels)
    if "India:" in text and re.search(r'\d+', text):
        india_match = re.search(r'India:\s*(\d+)', text)
        if india_match:
            india_screens = f"India: {india_match.group(1)}"
    if "Overseas:" in text and re.search(r'\d+', text):
        overseas_match = re.search(r'Overseas:\s*(\d+)', text)
        if overseas_match:
            overseas_screens = f"Overseas: {overseas_match.group(1)}"
    if "Worldwide total:" in text and re.search(r'\d+', text):
        worldwide_match = re.search(r'Worldwide total:\s*(\d+)', text)
        if worldwide_match:
            worldwide_screens = f"Worldwide total: {worldwide_match.group(1)}"
    
    # Release Date
    if "Release Date:" in text:
        release_match = re.search(r'Release Date:\s*(.+)', text)
        if release_match:
            release_date = release_match.group(1).strip()

print("Budget:", budget)
print("Verdict:", verdict)
print("India Screens:", india_screens)
print("Overseas Screens:", overseas_screens)
print("Worldwide Screens:", worldwide_screens)
print("Release Date:", release_date)

# ------------------- TABLE 1 (Day wise) -------------------
tables = soup.find_all("table")
daywise_table = tables[0]
rows = []

for row in daywise_table.find_all("tr")[1:]:
    cols = [c.text.strip() for c in row.find_all("td")]
    if len(cols) >= 2:
        rows.append({
            "day": cols[0],
            "india_net": cols[1],
            "change_percent": cols[2] if len(cols) > 2 else None
        })

df_daywise = pd.DataFrame(rows)

# ------------------- TABLE 2 (State wise) -------------------
if len(tables) > 1:
    state_table = tables[1]
    state_header = [th.text.strip().replace(" ", "_").lower() for th in state_table.find_all("th")]

    state_rows = []
    for row in state_table.find_all("tr")[1:]:
        cols = [c.text.strip() for c in row.find_all("td")]
        if cols:
            entry = {}
            for i, col in enumerate(cols):
                if i < len(state_header):
                    entry[state_header[i]] = col
            state_rows.append(entry)

    df_state = pd.DataFrame(state_rows)
else:
    df_state = pd.DataFrame()  # empty if no state table

# ------------------- MERGE BOTH TABLES -------------------
df = pd.merge(df_daywise, df_state, left_on="day", right_on="day", how="left")

# ------------------- ADD ALL NEW INFO -------------------
df["movie_name"] = movie_name
df["budget"] = budget
df["verdict"] = verdict
df["india_screens"] = india_screens
df["overseas_screens"] = overseas_screens
df["worldwide_screens"] = worldwide_screens
df["release_date"] = release_date

# ------------------- SAVE TO CSV -------------------
df.to_csv("Adipurush_raw.csv", index=False)
print("\nSaved → Adipurush_raw.csv")
print("\nFirst 5 rows:")
print(df[["day", "india_net", "budget", "verdict", "india_screens", "release_date"]].head())


Movie: Adipurush Box Office Collection | Day Wise | Worldwide
Budget: ₹ 450 Cr * Approx
Verdict: Flop
India Screens: India: 7000
Overseas Screens: Overseas: 3000
Worldwide Screens: Worldwide total: 10000
Release Date: 16th June 2023For more and the latest news aboutBollywood Box Office Collection, Stay tuned to us.Disclaimer: The Box Office Data are compiled from various sources and by our own research.

Saved → Adipurush_raw.csv

First 5 rows:
                    day  india_net             budget verdict india_screens  \
0    Day 1 [1st Friday]  ₹ 13.7 Cr  ₹ 450 Cr * Approx    Flop   India: 7000   
1  Day 2 [1st Saturday]  ₹ 7.78 Cr  ₹ 450 Cr * Approx    Flop   India: 7000   
2    Day 3 [1st Sunday]  ₹ 8.15 Cr  ₹ 450 Cr * Approx    Flop   India: 7000   
3    Day 4 [1st Monday]  ₹ 2.15 Cr  ₹ 450 Cr * Approx    Flop   India: 7000   
4   Day 5 [1st Tuesday]  ₹ 1.22 Cr  ₹ 450 Cr * Approx    Flop   India: 7000   

                                        release_date  
0  16th June 2023For 

RADHEYSHYAM

In [9]:

import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# ------------------- CHANGE ONLY THIS LINE FOR OTHER MOVIES -------------------
url = "https://www.sacnilk.com/news/Radhe_Shyam_2021_Box_Office_Collection_Day_Wise_Worldwide"
# ------------------------------------------------------------------------------

page = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
soup = BeautifulSoup(page.text, "html.parser")

movie_name = soup.find("h1").text.strip()
print("Movie:", movie_name)

# ------------------- FIXED: GRAB BUDGET, VERDICT, SCREENS, RELEASE DATE -------------------
# Search for specific elements with keywords (handles HTML structure like <strong>)
budget = "Not Found"
verdict = "Not Found"
india_screens = "Not Found"
overseas_screens = "Not Found"
worldwide_screens = "Not Found"
release_date = "Not Found"

# Find all relevant text containers (p, div, span, etc.)
all_elements = soup.find_all(['p', 'div', 'span', 'strong'])

for elem in all_elements:
    text = elem.get_text(strip=True)
    
    # Budget
    if "Budget:" in text:
        budget_match = re.search(r'Budget:\s*(₹\s*\d+(?:\.\d+)?\s*Cr\s*\*\s*Approx)', text)
        if budget_match:
            budget = budget_match.group(1).strip()
    
    # Verdict (look for <strong>Salaar Verdict:</strong> <strong>Hit</strong>)
    if "Verdict:" in text:
        # Get the next sibling or parent text for the value
        verdict_elem = elem.find_next_sibling('strong') or elem.parent.find('strong')
        if verdict_elem:
            verdict = verdict_elem.get_text(strip=True)
        else:
            # Fallback regex for plain text
            verdict_match = re.search(r'Verdict:\s*(Hit|Flop|Blockbuster|Super Hit|Average|Disaster)', text)
            verdict = verdict_match.group(1).strip() if verdict_match else "Not Found"
    
    # Screen counts (look for lines with numbers after labels)
    if "India:" in text and re.search(r'\d+', text):
        india_match = re.search(r'India:\s*(\d+)', text)
        if india_match:
            india_screens = f"India: {india_match.group(1)}"
    if "Overseas:" in text and re.search(r'\d+', text):
        overseas_match = re.search(r'Overseas:\s*(\d+)', text)
        if overseas_match:
            overseas_screens = f"Overseas: {overseas_match.group(1)}"
    if "Worldwide total:" in text and re.search(r'\d+', text):
        worldwide_match = re.search(r'Worldwide total:\s*(\d+)', text)
        if worldwide_match:
            worldwide_screens = f"Worldwide total: {worldwide_match.group(1)}"
    
    # Release Date
    if "Release Date:" in text:
        release_match = re.search(r'Release Date:\s*(.+)', text)
        if release_match:
            release_date = release_match.group(1).strip()

print("Budget:", budget)
print("Verdict:", verdict)
print("India Screens:", india_screens)
print("Overseas Screens:", overseas_screens)
print("Worldwide Screens:", worldwide_screens)
print("Release Date:", release_date)

# ------------------- TABLE 1 (Day wise) -------------------
tables = soup.find_all("table")
daywise_table = tables[0]
rows = []

for row in daywise_table.find_all("tr")[1:]:
    cols = [c.text.strip() for c in row.find_all("td")]
    if len(cols) >= 2:
        rows.append({
            "day": cols[0],
            "india_net": cols[1],
            "change_percent": cols[2] if len(cols) > 2 else None
        })

df_daywise = pd.DataFrame(rows)

# ------------------- TABLE 2 (State wise) -------------------
if len(tables) > 1:
    state_table = tables[1]
    state_header = [th.text.strip().replace(" ", "_").lower() for th in state_table.find_all("th")]

    state_rows = []
    for row in state_table.find_all("tr")[1:]:
        cols = [c.text.strip() for c in row.find_all("td")]
        if cols:
            entry = {}
            for i, col in enumerate(cols):
                if i < len(state_header):
                    entry[state_header[i]] = col
            state_rows.append(entry)

    df_state = pd.DataFrame(state_rows)
else:
    df_state = pd.DataFrame()  # empty if no state table

# ------------------- MERGE BOTH TABLES -------------------
df = pd.merge(df_daywise, df_state, left_on="day", right_on="day", how="left")

# ------------------- ADD ALL NEW INFO -------------------
df["movie_name"] = movie_name
df["budget"] = budget
df["verdict"] = verdict
df["india_screens"] = india_screens
df["overseas_screens"] = overseas_screens
df["worldwide_screens"] = worldwide_screens
df["release_date"] = release_date

# ------------------- SAVE TO CSV -------------------
df.to_csv("RadheyShyam_raw.csv", index=False)
print("\nSaved → RadheyShyam_raw.csv")
print("\nFirst 5 rows:")
print(df[["day", "india_net", "budget", "verdict", "india_screens", "release_date"]].head())


Movie: Radhe Shyam Box Office Collection | Day Wise | Worldwide
Budget: ₹ 300 Cr* Approx
Verdict: Not Found
India Screens: Not Found
Overseas Screens: Not Found
Worldwide Screens: Not Found
Release Date: 11th March 2022For more and the latest news aboutIndian Box Office Collection, Stay tuned to us.Disclaimer: The Box Office Data are compiled from various sources and by our own research.

Saved → RadheyShyam_raw.csv

First 5 rows:
                    day  india_net            budget    verdict india_screens  \
0    Day 1 [1st Friday]  ₹ 10.8 Cr  ₹ 300 Cr* Approx  Not Found     Not Found   
1  Day 2 [1st Saturday]  ₹ 6.65 Cr  ₹ 300 Cr* Approx  Not Found     Not Found   
2    Day 3 [1st Sunday]  ₹ 4.95 Cr  ₹ 300 Cr* Approx  Not Found     Not Found   
3    Day 4 [1st Monday]  ₹ 0.97 Cr  ₹ 300 Cr* Approx  Not Found     Not Found   
4   Day 5 [1st Tuesday]  ₹ 0.55 Cr  ₹ 300 Cr* Approx  Not Found     Not Found   

                                        release_date  
0  11th March 2022For m

In [19]:

import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# ------------------- CHANGE ONLY THIS LINE FOR OTHER MOVIES -------------------
url = "https://www.sacnilk.com/news/Radhe_Shyam_2021_Box_Office_Collection_Day_Wise_Worldwide"
# ------------------------------------------------------------------------------

page = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
soup = BeautifulSoup(page.text, "html.parser")

movie_name = soup.find("h1").text.strip()
print("Movie:", movie_name)

# ------------------- FIXED: GRAB BUDGET, VERDICT, SCREENS, RELEASE DATE -------------------
# Search for specific elements with keywords (handles HTML structure like <strong>)
budget = "Not Found"
verdict = "Not Found"
india_screens = "Not Found"
overseas_screens = "Not Found"
worldwide_screens = "Not Found"
release_date = "Not Found"

# Find all relevant text containers (p, div, span, etc.)
all_elements = soup.find_all(['p', 'div', 'span', 'strong'])

for elem in all_elements:
    text = elem.get_text(strip=True)
    
    # Budget
    if "Budget:" in text:
        budget_match = re.search(r'Budget:\s*(₹\s*\d+(?:\.\d+)?\s*Cr\s*\*\s*Approx)', text)
        if budget_match:
            budget = budget_match.group(1).strip()
    
    # Verdict (look for <strong>Salaar Verdict:</strong> <strong>Hit</strong>)
    if "Verdict:" in text:
        # Get the next sibling or parent text for the value
        verdict_elem = elem.find_next_sibling('strong') or elem.parent.find('strong')
        if verdict_elem:
            verdict = verdict_elem.get_text(strip=True)
        else:
            # Fallback regex for plain text
            verdict_match = re.search(r'Verdict:\s*(Hit|Flop|Blockbuster|Super Hit|Average|Disaster)', text)
            verdict = verdict_match.group(1).strip() if verdict_match else "Not Found"
    
    # Screen counts (look for lines with numbers after labels)
    if "India:" in text and re.search(r'\d+', text):
        india_match = re.search(r'India:\s*(\d+)', text)
        if india_match:
            india_screens = f"India: {india_match.group(1)}"
    if "Overseas:" in text and re.search(r'\d+', text):
        overseas_match = re.search(r'Overseas:\s*(\d+)', text)
        if overseas_match:
            overseas_screens = f"Overseas: {overseas_match.group(1)}"
    if "Worldwide total:" in text and re.search(r'\d+', text):
        worldwide_match = re.search(r'Worldwide total:\s*(\d+)', text)
        if worldwide_match:
            worldwide_screens = f"Worldwide total: {worldwide_match.group(1)}"
    
    # Release Date
    if "Release Date:" in text:
        release_match = re.search(r'Release Date:\s*(.+)', text)
        if release_match:
            release_date = release_match.group(1).strip()

print("Budget:", budget)
print("Verdict:", verdict)
print("India Screens:", india_screens)
print("Overseas Screens:", overseas_screens)
print("Worldwide Screens:", worldwide_screens)
print("Release Date:", release_date)

# ------------------- TABLE 1 (Day wise) -------------------
tables = soup.find_all("table")
daywise_table = tables[0]
rows = []

for row in daywise_table.find_all("tr")[1:]:
    cols = [c.text.strip() for c in row.find_all("td")]
    if len(cols) >= 2:
        rows.append({
            "day": cols[0],
            "india_net": cols[1],
            "change_percent": cols[2] if len(cols) > 2 else None
        })

df_daywise = pd.DataFrame(rows)

# ------------------- TABLE 2 (State wise) -------------------
if len(tables) > 1:
    state_table = tables[1]
    state_header = [th.text.strip().replace(" ", "_").lower() for th in state_table.find_all("th")]

    state_rows = []
    for row in state_table.find_all("tr")[1:]:
        cols = [c.text.strip() for c in row.find_all("td")]
        if cols:
            entry = {}
            for i, col in enumerate(cols):
                if i < len(state_header):
                    entry[state_header[i]] = col
            state_rows.append(entry)

    df_state = pd.DataFrame(state_rows)
else:
    df_state = pd.DataFrame()  # empty if no state table

# ------------------- MERGE BOTH TABLES -------------------
df = pd.merge(df_daywise, df_state, left_on="day", right_on="day", how="left")

# ------------------- ADD ALL NEW INFO -------------------
df["movie_name"] = movie_name
df["budget"] = budget
df["verdict"] = verdict
df["india_screens"] = india_screens
df["overseas_screens"] = overseas_screens
df["worldwide_screens"] = worldwide_screens
df["release_date"] = release_date

# ------------------- SAVE TO CSV -------------------
df.to_csv("RadheyShyam_raw.csv", index=False)
print("\nSaved → RadheyShyam_raw.csv")
print("\nFirst 5 rows:")
print(df[["day", "india_net", "budget", "verdict", "india_screens", "release_date"]].head())


Movie: Radhe Shyam Box Office Collection | Day Wise | Worldwide
Budget: ₹ 300 Cr* Approx
Verdict: Not Found
India Screens: Not Found
Overseas Screens: Not Found
Worldwide Screens: Not Found
Release Date: 11th March 2022For more and the latest news aboutIndian Box Office Collection, Stay tuned to us.Disclaimer: The Box Office Data are compiled from various sources and by our own research.

Saved → RadheyShyam_raw.csv

First 5 rows:
                    day  india_net            budget    verdict india_screens  \
0    Day 1 [1st Friday]  ₹ 10.8 Cr  ₹ 300 Cr* Approx  Not Found     Not Found   
1  Day 2 [1st Saturday]  ₹ 6.65 Cr  ₹ 300 Cr* Approx  Not Found     Not Found   
2    Day 3 [1st Sunday]  ₹ 4.95 Cr  ₹ 300 Cr* Approx  Not Found     Not Found   
3    Day 4 [1st Monday]  ₹ 0.97 Cr  ₹ 300 Cr* Approx  Not Found     Not Found   
4   Day 5 [1st Tuesday]  ₹ 0.55 Cr  ₹ 300 Cr* Approx  Not Found     Not Found   

                                        release_date  
0  11th March 2022For m

CONCAT ALL PRABHAS MOVIES 

In [13]:
import pandas as pd
import glob
import os

# ------------------- AUTO FIND ALL YOUR RAW FILES -------------------
files = glob.glob("*_raw.csv")

if not files:
    print("No *_raw.csv files found!")
else:
    print(f"Found {len(files)} movies: {files}")

dfs = []

for i, file in enumerate(files, start=1):  # movie_id starts from 1
    print(f"Processing ({i}/{len(files)}): {file}")
    df = pd.read_csv(file)

    # --- Extract clean movie name ---
    raw_name = os.path.splitext(file)[0].replace("_raw", "")
    if 'movie_name' in df.columns and df['movie_name'].notna().any():
        movie_name = df['movie_name'].iloc[0].split(" Box Office")[0].strip()
    else:
        movie_name = raw_name

    # --- Add essential columns ---
    df["movie"] = movie_name
    df["movie_id"] = i  # Unique ID: 1,2,3,...

    # --- Create primary_key safely (only if 'day' exists) ---
    if 'day' in df.columns:
        df["primary_key"] = df["movie"] + " | " + df["day"].astype(str)
    else:
        df["primary_key"] = df["movie"] + " | Row_" + df.index.astype(str)

    # --- Keep only useful columns (safe even if some are missing) ---
    desired_cols = [
        "movie_id", "movie", "day", "india_net", "change_percent",
        "karnataka", "aptg", "tamil_nadu", "kerala", "rest_of_india", "day_total",
        "budget", "verdict", "india_screens", "overseas_screens",
        "worldwide_screens", "release_date", "primary_key"
    ]
    df = df[[col for col in desired_cols if col in df.columns]]

    dfs.append(df)

# ------------------- COMBINE ALL -------------------
combined_df = pd.concat(dfs, ignore_index=True)

# Drop duplicates using primary_key (now it exists in ALL dataframes)
combined_df.drop_duplicates(subset=["primary_key"], keep="first", inplace=True)

# Sort by movie_id and day number
combined_df["day_num"] = combined_df["day"].str.extract(r'(\d+)').astype(float).fillna(999)
combined_df = combined_df.sort_values(["movie_id", "day_num"]).drop(columns=["day_num"], errors="ignore")

# Final cleanup
combined_df.reset_index(drop=True, inplace=True)

# ------------------- SAVE FINAL FILES -------------------
combined_df.to_csv("TFI_MASTER_DATASET.csv", index=False)
combined_df.to_excel("TFI_MASTER_DATASET.xlsx", index=False)

print("\nSUCCESS! ALL MOVIES COMBINED")
print(f"Total rows: {len(combined_df):,}")
print(f"Total movies: {combined_df['movie'].nunique()}")
print(f"Unique movie_ids: {sorted(combined_df['movie_id'].unique())}")
print("\nFiles saved:")
print("   TFI_MASTER_DATASET.csv")
print("   TFI_MASTER_DATASET.xlsx  ← Open in Power BI!")

Found 5 movies: ['Adipurush_raw.csv', 'Kalki2898AD_raw.csv', 'RadheyShyam_raw.csv', 'Saaho_raw.csv', 'Salaar_raw.csv']
Processing (1/5): Adipurush_raw.csv
Processing (2/5): Kalki2898AD_raw.csv
Processing (3/5): RadheyShyam_raw.csv
Processing (4/5): Saaho_raw.csv
Processing (5/5): Salaar_raw.csv

SUCCESS! ALL MOVIES COMBINED
Total rows: 140
Total movies: 5
Unique movie_ids: [1, 2, 3, 4, 5]

Files saved:
   TFI_MASTER_DATASET.csv
   TFI_MASTER_DATASET.xlsx  ← Open in Power BI!


ALLU ARJUN

NAA PERU SURYA NAA ILLU INDIA

In [14]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "https://www.boxofficeandhra.com/2018/06/naa-peru-surya-total-worldwide.html"
page = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
soup = BeautifulSoup(page.text, "html.parser")

movie_name = "Naa Peru Surya Naa Illu India"
print("Movie:", movie_name)

# Find the main collections table (week-wise)
table = soup.find("table", {"class": "table table-striped"})  # Adjust class if needed
if not table:
    table = soup.find("table")  # Fallback to first table

rows = []
if table:
    for row in table.find_all("tr")[1:]:  # Skip header
        cols = [c.text.strip() for c in row.find_all("td")]
        if len(cols) >= 2:
            rows.append({
                "week": cols[0],
                "india_net": cols[1],
                "worldwide_gross": cols[2] if len(cols) > 2 else None
            })

df = pd.DataFrame(rows)
df["movie"] = movie_name
df["hero"] = "Allu Arjun"
df["budget"] = "₹ 55 Cr"
df["verdict"] = "Semi-Hit"
df["release_date"] = "27th April 2018"

df.to_csv("NPS_raw.csv", index=False)
print("Saved → NPS_raw.csv")
print(df)

Movie: Naa Peru Surya Naa Illu India
Saved → NPS_raw.csv
Empty DataFrame
Columns: [movie, hero, budget, verdict, release_date]
Index: []


In [4]:



import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# ------------------- CHANGE ONLY THIS LINE FOR OTHER MOVIES -------------------
url = "https://www.sacnilk.com/news/Ala_Vaikunthapurramloo_2020_Box_Office_Collection_Day_Wise_Worldwide"
# ------------------------------------------------------------------------------

page = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
soup = BeautifulSoup(page.text, "html.parser")

movie_name = soup.find("h1").text.strip()
print("Movie:", movie_name)

# ------------------- FIXED: GRAB BUDGET, VERDICT, SCREENS, RELEASE DATE -------------------
# Search for specific elements with keywords (handles HTML structure like <strong>)
budget = "Not Found"
verdict = "Not Found"
india_screens = "Not Found"
overseas_screens = "Not Found"
worldwide_screens = "Not Found"
release_date = "Not Found"

# Find all relevant text containers (p, div, span, etc.)
all_elements = soup.find_all(['p', 'div', 'span', 'strong'])

for elem in all_elements:
    text = elem.get_text(strip=True)
    
    # Budget
    if "Budget:" in text:
        budget_match = re.search(r'Budget:\s*(₹\s*\d+(?:\.\d+)?\s*Cr\s*\*\s*Approx)', text)
        if budget_match:
            budget = budget_match.group(1).strip()
    
    # Verdict (look for <strong>Salaar Verdict:</strong> <strong>Hit</strong>)
    if "Verdict:" in text:
        # Get the next sibling or parent text for the value
        verdict_elem = elem.find_next_sibling('strong') or elem.parent.find('strong')
        if verdict_elem:
            verdict = verdict_elem.get_text(strip=True)
        else:
            # Fallback regex for plain text
            verdict_match = re.search(r'Verdict:\s*(Hit|Flop|Blockbuster|Super Hit|Average|Disaster)', text)
            verdict = verdict_match.group(1).strip() if verdict_match else "Not Found"
    
    # Screen counts (look for lines with numbers after labels)
    if "India:" in text and re.search(r'\d+', text):
        india_match = re.search(r'India:\s*(\d+)', text)
        if india_match:
            india_screens = f"India: {india_match.group(1)}"
    if "Overseas:" in text and re.search(r'\d+', text):
        overseas_match = re.search(r'Overseas:\s*(\d+)', text)
        if overseas_match:
            overseas_screens = f"Overseas: {overseas_match.group(1)}"
    if "Worldwide total:" in text and re.search(r'\d+', text):
        worldwide_match = re.search(r'Worldwide total:\s*(\d+)', text)
        if worldwide_match:
            worldwide_screens = f"Worldwide total: {worldwide_match.group(1)}"
    
    # Release Date
    if "Release Date:" in text:
        release_match = re.search(r'Release Date:\s*(.+)', text)
        if release_match:
            release_date = release_match.group(1).strip()

print("Budget:", budget)
print("Verdict:", verdict)
print("India Screens:", india_screens)
print("Overseas Screens:", overseas_screens)
print("Worldwide Screens:", worldwide_screens)
print("Release Date:", release_date)

# ------------------- TABLE 1 (Day wise) -------------------
tables = soup.find_all("table")
daywise_table = tables[0]
rows = []

for row in daywise_table.find_all("tr")[1:]:
    cols = [c.text.strip() for c in row.find_all("td")]
    if len(cols) >= 2:
        rows.append({
            "day": cols[0],
            "india_net": cols[1],
            "change_percent": cols[2] if len(cols) > 2 else None
        })

df_daywise = pd.DataFrame(rows)

# ------------------- TABLE 2 (State wise) -------------------
if len(tables) > 1:
    state_table = tables[1]
    state_header = [th.text.strip().replace(" ", "_").lower() for th in state_table.find_all("th")]

    state_rows = []
    for row in state_table.find_all("tr")[1:]:
        cols = [c.text.strip() for c in row.find_all("td")]
        if cols:
            entry = {}
            for i, col in enumerate(cols):
                if i < len(state_header):
                    entry[state_header[i]] = col
            state_rows.append(entry)

    df_state = pd.DataFrame(state_rows)
else:
    df_state = pd.DataFrame()  # empty if no state table

# ------------------- MERGE BOTH TABLES -------------------
df = pd.merge(df_daywise, df_state, left_on="day", right_on="day", how="left")

# ------------------- ADD ALL NEW INFO -------------------
df["movie_name"] = movie_name
df["budget"] = budget
df["verdict"] = verdict
df["india_screens"] = india_screens
df["overseas_screens"] = overseas_screens
df["worldwide_screens"] = worldwide_screens
df["release_date"] = release_date

# ------------------- SAVE TO CSV -------------------
df.to_csv("AVPL_raw.csv", index=False)
print("\nSaved → AVPL_raw.csv")
print("\nFirst 5 rows:")
print(df[["day", "india_net", "budget", "verdict", "india_screens", "release_date"]].head())


Movie: Ala Vaikunthapurramloo Box Office Collection | Day Wise | Worldwide
Budget: ₹ 120 Cr * Approx
Verdict: Not Found
India Screens: Not Found
Overseas Screens: Not Found
Worldwide Screens: Not Found
Release Date: 12th January 2020For more and the latest news aboutTollywood Box Office Collection, Stay tuned to us.Disclaimer: The Box Office Data are compiled from various sources and by our own research.

Saved → AVPL_raw.csv

First 5 rows:
                     day  india_net             budget    verdict  \
0     Day 1 [1st Sunday]  ₹ 5.90 Cr  ₹ 120 Cr * Approx  Not Found   
1     Day 2 [1st Monday]  ₹ 4.00 Cr  ₹ 120 Cr * Approx  Not Found   
2    Day 3 [1st Tuesday]  ₹ 4.10 Cr  ₹ 120 Cr * Approx  Not Found   
3  Day 4 [1st Wednesday]  ₹ 4.50 Cr  ₹ 120 Cr * Approx  Not Found   
4   Day 5 [1st Thursday]  ₹ 4.10 Cr  ₹ 120 Cr * Approx  Not Found   

  india_screens                                       release_date  
0     Not Found  12th January 2020For more and the latest news ...  
1 

PAWAN KALYAN

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# ------------------- CHANGE ONLY THIS LINE FOR OTHER MOVIES -------------------
url = "https://www.sacnilk.com/news/OG_2024_Box_Office_Collection_Day_Wise_Worldwide"
# ------------------------------------------------------------------------------

page = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
soup = BeautifulSoup(page.text, "html.parser")

movie_name = soup.find("h1").text.strip()
print("Movie:", movie_name)

# ------------------- FIXED: GRAB BUDGET, VERDICT, SCREENS, RELEASE DATE -------------------
# Search for specific elements with keywords (handles HTML structure like <strong>)
budget = "Not Found"
verdict = "Not Found"
india_screens = "Not Found"
overseas_screens = "Not Found"
worldwide_screens = "Not Found"
release_date = "Not Found"

# Find all relevant text containers (p, div, span, etc.)
all_elements = soup.find_all(['p', 'div', 'span', 'strong'])

for elem in all_elements:
    text = elem.get_text(strip=True)
    
    # Budget
    if "Budget:" in text:
        budget_match = re.search(r'Budget:\s*(₹\s*\d+(?:\.\d+)?\s*Cr\s*\*\s*Approx)', text)
        if budget_match:
            budget = budget_match.group(1).strip()
    
    # Verdict (look for <strong>Salaar Verdict:</strong> <strong>Hit</strong>)
    if "Verdict:" in text:
        # Get the next sibling or parent text for the value
        verdict_elem = elem.find_next_sibling('strong') or elem.parent.find('strong')
        if verdict_elem:
            verdict = verdict_elem.get_text(strip=True)
        else:
            # Fallback regex for plain text
            verdict_match = re.search(r'Verdict:\s*(Hit|Flop|Blockbuster|Super Hit|Average|Disaster)', text)
            verdict = verdict_match.group(1).strip() if verdict_match else "Not Found"
    
    # Screen counts (look for lines with numbers after labels)
    if "India:" in text and re.search(r'\d+', text):
        india_match = re.search(r'India:\s*(\d+)', text)
        if india_match:
            india_screens = f"India: {india_match.group(1)}"
    if "Overseas:" in text and re.search(r'\d+', text):
        overseas_match = re.search(r'Overseas:\s*(\d+)', text)
        if overseas_match:
            overseas_screens = f"Overseas: {overseas_match.group(1)}"
    if "Worldwide total:" in text and re.search(r'\d+', text):
        worldwide_match = re.search(r'Worldwide total:\s*(\d+)', text)
        if worldwide_match:
            worldwide_screens = f"Worldwide total: {worldwide_match.group(1)}"
    
    # Release Date
    if "Release Date:" in text:
        release_match = re.search(r'Release Date:\s*(.+)', text)
        if release_match:
            release_date = release_match.group(1).strip()

print("Budget:", budget)
print("Verdict:", verdict)
print("India Screens:", india_screens)
print("Overseas Screens:", overseas_screens)
print("Worldwide Screens:", worldwide_screens)
print("Release Date:", release_date)

# ------------------- TABLE 1 (Day wise) -------------------
tables = soup.find_all("table")
daywise_table = tables[0]
rows = []

for row in daywise_table.find_all("tr")[1:]:
    cols = [c.text.strip() for c in row.find_all("td")]
    if len(cols) >= 2:
        rows.append({
            "day": cols[0],
            "india_net": cols[1],
            "change_percent": cols[2] if len(cols) > 2 else None
        })

df_daywise = pd.DataFrame(rows)

# ------------------- TABLE 2 (State wise) -------------------
if len(tables) > 1:
    state_table = tables[1]
    state_header = [th.text.strip().replace(" ", "_").lower() for th in state_table.find_all("th")]

    state_rows = []
    for row in state_table.find_all("tr")[1:]:
        cols = [c.text.strip() for c in row.find_all("td")]
        if cols:
            entry = {}
            for i, col in enumerate(cols):
                if i < len(state_header):
                    entry[state_header[i]] = col
            state_rows.append(entry)

    df_state = pd.DataFrame(state_rows)
else:
    df_state = pd.DataFrame()  # empty if no state table

# ------------------- MERGE BOTH TABLES -------------------
df = pd.merge(df_daywise, df_state, left_on="day", right_on="day", how="left")

# ------------------- ADD ALL NEW INFO -------------------
df["movie_name"] = movie_name
df["budget"] = budget
df["verdict"] = verdict
df["india_screens"] = india_screens
df["overseas_screens"] = overseas_screens
df["worldwide_screens"] = worldwide_screens
df["release_date"] = release_date

# ------------------- SAVE TO CSV -------------------
df.to_csv("OG_raw.csv", index=False)
print("\nSaved → OG_raw.csv")
print("\nFirst 5 rows:")
print(df[["day", "india_net", "budget", "verdict", "india_screens", "release_date"]].head())


Movie: They Call Him OG Box Office Collection | All Language | Day Wise | Worldwide
Budget: Not Found
Verdict: Not Found
India Screens: Not Found
Overseas Screens: Not Found
Worldwide Screens: Not Found
Release Date: 25th September 2025For more and the latest news aboutTollywood Box Office Collection, Stay tuned to us.Disclaimer: The Box Office Data are compiled from various sources and by our own research.

Saved → OG_raw.csv

First 5 rows:
                    day india_net     budget    verdict india_screens  \
0    Day 0 [ Wednesday]  ₹ 2.7 Cr  Not Found  Not Found     Not Found   
1  Day 1 [1st Thursday]  ₹ 6.8 Cr  Not Found  Not Found     Not Found   
2    Day 2 [1st Friday]    ₹ 2 Cr  Not Found  Not Found     Not Found   
3  Day 3 [1st Saturday]  ₹ 2.6 Cr  Not Found  Not Found     Not Found   
4    Day 4 [1st Sunday]  ₹ 2.1 Cr  Not Found  Not Found     Not Found   

                                        release_date  
0  25th September 2025For more and the latest new...  
1  25

In [21]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# ------------------- CHANGE ONLY THIS LINE FOR OTHER MOVIES -------------------
url = "https://www.sacnilk.com/news/PKSDT_2023_Box_Office_Collection_Day_Wise_Worldwide"
# ------------------------------------------------------------------------------

page = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
soup = BeautifulSoup(page.text, "html.parser")

movie_name = soup.find("h1").text.strip()
print("Movie:", movie_name)

# ------------------- FIXED: GRAB BUDGET, VERDICT, SCREENS, RELEASE DATE -------------------
# Search for specific elements with keywords (handles HTML structure like <strong>)
budget = "Not Found"
verdict = "Not Found"
india_screens = "Not Found"
overseas_screens = "Not Found"
worldwide_screens = "Not Found"
release_date = "Not Found"

# Find all relevant text containers (p, div, span, etc.)
all_elements = soup.find_all(['p', 'div', 'span', 'strong'])

for elem in all_elements:
    text = elem.get_text(strip=True)
    
    # Budget
    if "Budget:" in text:
        budget_match = re.search(r'Budget:\s*(₹\s*\d+(?:\.\d+)?\s*Cr\s*\*\s*Approx)', text)
        if budget_match:
            budget = budget_match.group(1).strip()
    
    # Verdict (look for <strong>Salaar Verdict:</strong> <strong>Hit</strong>)
    if "Verdict:" in text:
        # Get the next sibling or parent text for the value
        verdict_elem = elem.find_next_sibling('strong') or elem.parent.find('strong')
        if verdict_elem:
            verdict = verdict_elem.get_text(strip=True)
        else:
            # Fallback regex for plain text
            verdict_match = re.search(r'Verdict:\s*(Hit|Flop|Blockbuster|Super Hit|Average|Disaster)', text)
            verdict = verdict_match.group(1).strip() if verdict_match else "Not Found"
    
    # Screen counts (look for lines with numbers after labels)
    if "India:" in text and re.search(r'\d+', text):
        india_match = re.search(r'India:\s*(\d+)', text)
        if india_match:
            india_screens = f"India: {india_match.group(1)}"
    if "Overseas:" in text and re.search(r'\d+', text):
        overseas_match = re.search(r'Overseas:\s*(\d+)', text)
        if overseas_match:
            overseas_screens = f"Overseas: {overseas_match.group(1)}"
    if "Worldwide total:" in text and re.search(r'\d+', text):
        worldwide_match = re.search(r'Worldwide total:\s*(\d+)', text)
        if worldwide_match:
            worldwide_screens = f"Worldwide total: {worldwide_match.group(1)}"
    
    # Release Date
    if "Release Date:" in text:
        release_match = re.search(r'Release Date:\s*(.+)', text)
        if release_match:
            release_date = release_match.group(1).strip()

print("Budget:", budget)
print("Verdict:", verdict)
print("India Screens:", india_screens)
print("Overseas Screens:", overseas_screens)
print("Worldwide Screens:", worldwide_screens)
print("Release Date:", release_date)

# ------------------- TABLE 1 (Day wise) -------------------
tables = soup.find_all("table")
daywise_table = tables[0]
rows = []

for row in daywise_table.find_all("tr")[1:]:
    cols = [c.text.strip() for c in row.find_all("td")]
    if len(cols) >= 2:
        rows.append({
            "day": cols[0],
            "india_net": cols[1],
            "change_percent": cols[2] if len(cols) > 2 else None
        })

df_daywise = pd.DataFrame(rows)

# ------------------- TABLE 2 (State wise) -------------------
if len(tables) > 1:
    state_table = tables[1]
    state_header = [th.text.strip().replace(" ", "_").lower() for th in state_table.find_all("th")]

    state_rows = []
    for row in state_table.find_all("tr")[1:]:
        cols = [c.text.strip() for c in row.find_all("td")]
        if cols:
            entry = {}
            for i, col in enumerate(cols):
                if i < len(state_header):
                    entry[state_header[i]] = col
            state_rows.append(entry)

    df_state = pd.DataFrame(state_rows)
else:
    df_state = pd.DataFrame()  # empty if no state table

# ------------------- MERGE BOTH TABLES -------------------
df = pd.merge(df_daywise, df_state, left_on="day", right_on="day", how="left")

# ------------------- ADD ALL NEW INFO -------------------
df["movie_name"] = movie_name
df["budget"] = budget
df["verdict"] = verdict
df["india_screens"] = india_screens
df["overseas_screens"] = overseas_screens
df["worldwide_screens"] = worldwide_screens
df["release_date"] = release_date

# ------------------- SAVE TO CSV -------------------
df.to_csv("BRO_raw.csv", index=False)
print("\nSaved → BRO_raw.csv")
print("\nFirst 5 rows:")
print(df[["day", "india_net", "budget", "verdict", "india_screens", "release_date"]].head())


Movie: Bro Box Office Collection | All Language | Day Wise | Worldwide
Budget: Not Found
Verdict: Not Found
India Screens: Not Found
Overseas Screens: Not Found
Worldwide Screens: Not Found
Release Date: 28th July 2023For more and the latest news aboutTollywood Box Office Collection, Stay tuned to us.Disclaimer: The Box Office Data are compiled from various sources and by our own research.

Saved → BRO_raw.csv

First 5 rows:
                    day  india_net     budget    verdict india_screens  \
0    Day 1 [1st Friday]  ₹ 8.45 Cr  Not Found  Not Found     Not Found   
1  Day 2 [1st Saturday]   ₹ 4.8 Cr  Not Found  Not Found     Not Found   
2    Day 3 [1st Sunday]   ₹ 4.2 Cr  Not Found  Not Found     Not Found   
3    Day 4 [1st Monday]  ₹ 0.99 Cr  Not Found  Not Found     Not Found   
4   Day 5 [1st Tuesday]  ₹ 0.72 Cr  Not Found  Not Found     Not Found   

                                        release_date  
0  28th July 2023For more and the latest news abo...  
1  28th July 202

In [22]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# ------------------- CHANGE ONLY THIS LINE FOR OTHER MOVIES -------------------
url = "https://www.sacnilk.com/news/PSPK_Rana_Movie_2022_Box_Office_Collection_Day_Wise_Worldwide"
# ------------------------------------------------------------------------------

page = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
soup = BeautifulSoup(page.text, "html.parser")

movie_name = soup.find("h1").text.strip()
print("Movie:", movie_name)

# ------------------- FIXED: GRAB BUDGET, VERDICT, SCREENS, RELEASE DATE -------------------
# Search for specific elements with keywords (handles HTML structure like <strong>)
budget = "Not Found"
verdict = "Not Found"
india_screens = "Not Found"
overseas_screens = "Not Found"
worldwide_screens = "Not Found"
release_date = "Not Found"

# Find all relevant text containers (p, div, span, etc.)
all_elements = soup.find_all(['p', 'div', 'span', 'strong'])

for elem in all_elements:
    text = elem.get_text(strip=True)
    
    # Budget
    if "Budget:" in text:
        budget_match = re.search(r'Budget:\s*(₹\s*\d+(?:\.\d+)?\s*Cr\s*\*\s*Approx)', text)
        if budget_match:
            budget = budget_match.group(1).strip()
    
    # Verdict (look for <strong>Salaar Verdict:</strong> <strong>Hit</strong>)
    if "Verdict:" in text:
        # Get the next sibling or parent text for the value
        verdict_elem = elem.find_next_sibling('strong') or elem.parent.find('strong')
        if verdict_elem:
            verdict = verdict_elem.get_text(strip=True)
        else:
            # Fallback regex for plain text
            verdict_match = re.search(r'Verdict:\s*(Hit|Flop|Blockbuster|Super Hit|Average|Disaster)', text)
            verdict = verdict_match.group(1).strip() if verdict_match else "Not Found"
    
    # Screen counts (look for lines with numbers after labels)
    if "India:" in text and re.search(r'\d+', text):
        india_match = re.search(r'India:\s*(\d+)', text)
        if india_match:
            india_screens = f"India: {india_match.group(1)}"
    if "Overseas:" in text and re.search(r'\d+', text):
        overseas_match = re.search(r'Overseas:\s*(\d+)', text)
        if overseas_match:
            overseas_screens = f"Overseas: {overseas_match.group(1)}"
    if "Worldwide total:" in text and re.search(r'\d+', text):
        worldwide_match = re.search(r'Worldwide total:\s*(\d+)', text)
        if worldwide_match:
            worldwide_screens = f"Worldwide total: {worldwide_match.group(1)}"
    
    # Release Date
    if "Release Date:" in text:
        release_match = re.search(r'Release Date:\s*(.+)', text)
        if release_match:
            release_date = release_match.group(1).strip()

print("Budget:", budget)
print("Verdict:", verdict)
print("India Screens:", india_screens)
print("Overseas Screens:", overseas_screens)
print("Worldwide Screens:", worldwide_screens)
print("Release Date:", release_date)

# ------------------- TABLE 1 (Day wise) -------------------
tables = soup.find_all("table")
daywise_table = tables[0]
rows = []

for row in daywise_table.find_all("tr")[1:]:
    cols = [c.text.strip() for c in row.find_all("td")]
    if len(cols) >= 2:
        rows.append({
            "day": cols[0],
            "india_net": cols[1],
            "change_percent": cols[2] if len(cols) > 2 else None
        })

df_daywise = pd.DataFrame(rows)

# ------------------- TABLE 2 (State wise) -------------------
if len(tables) > 1:
    state_table = tables[1]
    state_header = [th.text.strip().replace(" ", "_").lower() for th in state_table.find_all("th")]

    state_rows = []
    for row in state_table.find_all("tr")[1:]:
        cols = [c.text.strip() for c in row.find_all("td")]
        if cols:
            entry = {}
            for i, col in enumerate(cols):
                if i < len(state_header):
                    entry[state_header[i]] = col
            state_rows.append(entry)

    df_state = pd.DataFrame(state_rows)
else:
    df_state = pd.DataFrame()  # empty if no state table

# ------------------- MERGE BOTH TABLES -------------------
df = pd.merge(df_daywise, df_state, left_on="day", right_on="day", how="left")

# ------------------- ADD ALL NEW INFO -------------------
df["movie_name"] = movie_name
df["budget"] = budget
df["verdict"] = verdict
df["india_screens"] = india_screens
df["overseas_screens"] = overseas_screens
df["worldwide_screens"] = worldwide_screens
df["release_date"] = release_date

# ------------------- SAVE TO CSV -------------------
df.to_csv("BN_raw.csv", index=False)
print("\nSaved → BN_raw.csv")
print("\nFirst 5 rows:")
print(df[["day", "india_net", "budget", "verdict", "india_screens", "release_date"]].head())


Movie: Bheemla Nayak Box Office Collection | All Language | Day Wise | Worldwide
Budget: ₹ 80 Cr * Approx
Verdict: Not Found
India Screens: Not Found
Overseas Screens: Not Found
Worldwide Screens: Not Found
Release Date: 25 Feb 2022For more and the latest news aboutTollywood Box Office Collection, Stay tuned to us.Disclaimer: The Box Office Data are compiled from various sources and by our own research.

Saved → BN_raw.csv

First 5 rows:
                    day   india_net            budget    verdict  \
0    Day 1 [1st Friday]  ₹ 11.85 Cr  ₹ 80 Cr * Approx  Not Found   
1  Day 2 [1st Saturday]    ₹ 7.5 Cr  ₹ 80 Cr * Approx  Not Found   
2    Day 3 [1st Sunday]   ₹ 6.55 Cr  ₹ 80 Cr * Approx  Not Found   
3    Day 4 [1st Monday]   ₹ 2.39 Cr  ₹ 80 Cr * Approx  Not Found   
4   Day 5 [1st Tuesday]   ₹ 2.93 Cr  ₹ 80 Cr * Approx  Not Found   

  india_screens                                       release_date  
0     Not Found  25 Feb 2022For more and the latest news aboutT...  
1     Not F

In [26]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# ------------------- CHANGE ONLY THIS LINE FOR OTHER MOVIES -------------------
url = "https://www.sacnilk.com/news/PSPK_26_2020_Box_Office_Collection_Day_Wise_Worldwide"
# ------------------------------------------------------------------------------

page = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
soup = BeautifulSoup(page.text, "html.parser")

movie_name = soup.find("h1").text.strip()
print("Movie:", movie_name)

# ------------------- FIXED: GRAB BUDGET, VERDICT, SCREENS, RELEASE DATE -------------------
# Search for specific elements with keywords (handles HTML structure like <strong>)
budget = "Not Found"
verdict = "Not Found"
india_screens = "Not Found"
overseas_screens = "Not Found"
worldwide_screens = "Not Found"
release_date = "Not Found"

# Find all relevant text containers (p, div, span, etc.)
all_elements = soup.find_all(['p', 'div', 'span', 'strong'])

for elem in all_elements:
    text = elem.get_text(strip=True)
    
    # Budget
    if "Budget:" in text:
        budget_match = re.search(r'Budget:\s*(₹\s*\d+(?:\.\d+)?\s*Cr\s*\*\s*Approx)', text)
        if budget_match:
            budget = budget_match.group(1).strip()
    
    # Verdict (look for <strong>Salaar Verdict:</strong> <strong>Hit</strong>)
    if "Verdict:" in text:
        # Get the next sibling or parent text for the value
        verdict_elem = elem.find_next_sibling('strong') or elem.parent.find('strong')
        if verdict_elem:
            verdict = verdict_elem.get_text(strip=True)
        else:
            # Fallback regex for plain text
            verdict_match = re.search(r'Verdict:\s*(Hit|Flop|Blockbuster|Super Hit|Average|Disaster)', text)
            verdict = verdict_match.group(1).strip() if verdict_match else "Not Found"
    
    # Screen counts (look for lines with numbers after labels)
    if "India:" in text and re.search(r'\d+', text):
        india_match = re.search(r'India:\s*(\d+)', text)
        if india_match:
            india_screens = f"India: {india_match.group(1)}"
    if "Overseas:" in text and re.search(r'\d+', text):
        overseas_match = re.search(r'Overseas:\s*(\d+)', text)
        if overseas_match:
            overseas_screens = f"Overseas: {overseas_match.group(1)}"
    if "Worldwide total:" in text and re.search(r'\d+', text):
        worldwide_match = re.search(r'Worldwide total:\s*(\d+)', text)
        if worldwide_match:
            worldwide_screens = f"Worldwide total: {worldwide_match.group(1)}"
    
    # Release Date
    if "Release Date:" in text:
        release_match = re.search(r'Release Date:\s*(.+)', text)
        if release_match:
            release_date = release_match.group(1).strip()

print("Budget:", budget)
print("Verdict:", verdict)
print("India Screens:", india_screens)
print("Overseas Screens:", overseas_screens)
print("Worldwide Screens:", worldwide_screens)
print("Release Date:", release_date)

# ------------------- TABLE 1 (Day wise) -------------------
tables = soup.find_all("table")
daywise_table = tables[0]
rows = []

for row in daywise_table.find_all("tr")[1:]:
    cols = [c.text.strip() for c in row.find_all("td")]
    if len(cols) >= 2:
        rows.append({
            "day": cols[0],
            "india_net": cols[1],
            "change_percent": cols[2] if len(cols) > 2 else None
        })

df_daywise = pd.DataFrame(rows)

# ------------------- TABLE 2 (State wise) -------------------
if len(tables) > 1:
    state_table = tables[1]
    state_header = [th.text.strip().replace(" ", "_").lower() for th in state_table.find_all("th")]

    state_rows = []
    for row in state_table.find_all("tr")[1:]:
        cols = [c.text.strip() for c in row.find_all("td")]
        if cols:
            entry = {}
            for i, col in enumerate(cols):
                if i < len(state_header):
                    entry[state_header[i]] = col
            state_rows.append(entry)

    df_state = pd.DataFrame(state_rows)
else:
    df_state = pd.DataFrame()  # empty if no state table

# ------------------- MERGE BOTH TABLES -------------------
df = pd.merge(df_daywise, df_state, left_on="day", right_on="day", how="left")

# ------------------- ADD ALL NEW INFO -------------------
df["movie_name"] = movie_name
df["budget"] = budget
df["verdict"] = verdict
df["india_screens"] = india_screens
df["overseas_screens"] = overseas_screens
df["worldwide_screens"] = worldwide_screens
df["release_date"] = release_date

# ------------------- SAVE TO CSV -------------------
df.to_csv("VK_raw.csv", index=False)
print("\nSaved → VK_raw.csv")
print("\nFirst 5 rows:")
print(df[["day", "india_net", "budget", "verdict", "india_screens", "release_date"]].head())


Movie: Vakeel Saab Box Office Collection | Day Wise | Worldwide
Budget: Not Found
Verdict: Not Found
India Screens: India: 1500
Overseas Screens: Overseas: 700
Worldwide Screens: Worldwide total: 2200
Release Date: 9th April 2021For more and the latest news aboutTollywood Box Office Collection, Stay tuned to us.Disclaimer: The Box Office Data are compiled from various sources and by our own research.

Saved → VK_raw.csv

First 5 rows:
                    day  india_net     budget    verdict india_screens  \
0    Day 1 [1st Friday]  ₹ 8.75 Cr  Not Found  Not Found   India: 1500   
1  Day 2 [1st Saturday]   ₹ 3.6 Cr  Not Found  Not Found   India: 1500   
2    Day 3 [1st Sunday]   ₹ 3.9 Cr  Not Found  Not Found   India: 1500   
3    Day 4 [1st Monday]   ₹ 1.1 Cr  Not Found  Not Found   India: 1500   
4   Day 5 [1st Tuesday]   ₹ 2.9 Cr  Not Found  Not Found   India: 1500   

                                        release_date  
0  9th April 2021For more and the latest news abo...  
1  9th

MAHESH BABU

In [27]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# ------------------- CHANGE ONLY THIS LINE FOR OTHER MOVIES -------------------
url = "https://www.sacnilk.com/news/SSMB28_2022_Box_Office_Collection_Day_Wise_Worldwide"
# ------------------------------------------------------------------------------

page = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
soup = BeautifulSoup(page.text, "html.parser")

movie_name = soup.find("h1").text.strip()
print("Movie:", movie_name)

# ------------------- FIXED: GRAB BUDGET, VERDICT, SCREENS, RELEASE DATE -------------------
# Search for specific elements with keywords (handles HTML structure like <strong>)
budget = "Not Found"
verdict = "Not Found"
india_screens = "Not Found"
overseas_screens = "Not Found"
worldwide_screens = "Not Found"
release_date = "Not Found"

# Find all relevant text containers (p, div, span, etc.)
all_elements = soup.find_all(['p', 'div', 'span', 'strong'])

for elem in all_elements:
    text = elem.get_text(strip=True)
    
    # Budget
    if "Budget:" in text:
        budget_match = re.search(r'Budget:\s*(₹\s*\d+(?:\.\d+)?\s*Cr\s*\*\s*Approx)', text)
        if budget_match:
            budget = budget_match.group(1).strip()
    
    # Verdict (look for <strong>Salaar Verdict:</strong> <strong>Hit</strong>)
    if "Verdict:" in text:
        # Get the next sibling or parent text for the value
        verdict_elem = elem.find_next_sibling('strong') or elem.parent.find('strong')
        if verdict_elem:
            verdict = verdict_elem.get_text(strip=True)
        else:
            # Fallback regex for plain text
            verdict_match = re.search(r'Verdict:\s*(Hit|Flop|Blockbuster|Super Hit|Average|Disaster)', text)
            verdict = verdict_match.group(1).strip() if verdict_match else "Not Found"
    
    # Screen counts (look for lines with numbers after labels)
    if "India:" in text and re.search(r'\d+', text):
        india_match = re.search(r'India:\s*(\d+)', text)
        if india_match:
            india_screens = f"India: {india_match.group(1)}"
    if "Overseas:" in text and re.search(r'\d+', text):
        overseas_match = re.search(r'Overseas:\s*(\d+)', text)
        if overseas_match:
            overseas_screens = f"Overseas: {overseas_match.group(1)}"
    if "Worldwide total:" in text and re.search(r'\d+', text):
        worldwide_match = re.search(r'Worldwide total:\s*(\d+)', text)
        if worldwide_match:
            worldwide_screens = f"Worldwide total: {worldwide_match.group(1)}"
    
    # Release Date
    if "Release Date:" in text:
        release_match = re.search(r'Release Date:\s*(.+)', text)
        if release_match:
            release_date = release_match.group(1).strip()

print("Budget:", budget)
print("Verdict:", verdict)
print("India Screens:", india_screens)
print("Overseas Screens:", overseas_screens)
print("Worldwide Screens:", worldwide_screens)
print("Release Date:", release_date)

# ------------------- TABLE 1 (Day wise) -------------------
tables = soup.find_all("table")
daywise_table = tables[0]
rows = []

for row in daywise_table.find_all("tr")[1:]:
    cols = [c.text.strip() for c in row.find_all("td")]
    if len(cols) >= 2:
        rows.append({
            "day": cols[0],
            "india_net": cols[1],
            "change_percent": cols[2] if len(cols) > 2 else None
        })

df_daywise = pd.DataFrame(rows)

# ------------------- TABLE 2 (State wise) -------------------
if len(tables) > 1:
    state_table = tables[1]
    state_header = [th.text.strip().replace(" ", "_").lower() for th in state_table.find_all("th")]

    state_rows = []
    for row in state_table.find_all("tr")[1:]:
        cols = [c.text.strip() for c in row.find_all("td")]
        if cols:
            entry = {}
            for i, col in enumerate(cols):
                if i < len(state_header):
                    entry[state_header[i]] = col
            state_rows.append(entry)

    df_state = pd.DataFrame(state_rows)
else:
    df_state = pd.DataFrame()  # empty if no state table

# ------------------- MERGE BOTH TABLES -------------------
df = pd.merge(df_daywise, df_state, left_on="day", right_on="day", how="left")

# ------------------- ADD ALL NEW INFO -------------------
df["movie_name"] = movie_name
df["budget"] = budget
df["verdict"] = verdict
df["india_screens"] = india_screens
df["overseas_screens"] = overseas_screens
df["worldwide_screens"] = worldwide_screens
df["release_date"] = release_date

# ------------------- SAVE TO CSV -------------------
df.to_csv("GK_raw.csv", index=False)
print("\nSaved → GK_raw.csv")
print("\nFirst 5 rows:")
print(df[["day", "india_net", "budget", "verdict", "india_screens", "release_date"]].head())


Movie: Guntur Kaaram Box Office Collection | All Language | Day Wise | Worldwide
Budget: ₹ 150 Cr * Approx
Verdict: Average
India Screens: Not Found
Overseas Screens: Not Found
Worldwide Screens: Not Found
Release Date: 12th January 2024For more and the latest news aboutTollywood Box Office Collection, Stay tuned to us.Disclaimer: The Box Office Data are compiled from various sources and by our own research.

Saved → GK_raw.csv

First 5 rows:
                    day  india_net             budget  verdict india_screens  \
0    Day 1 [1st Friday]   ₹ 4.2 Cr  ₹ 150 Cr * Approx  Average     Not Found   
1  Day 2 [1st Saturday]  ₹ 2.05 Cr  ₹ 150 Cr * Approx  Average     Not Found   
2    Day 3 [1st Sunday]   ₹ 1.6 Cr  ₹ 150 Cr * Approx  Average     Not Found   
3    Day 4 [1st Monday]  ₹ 1.05 Cr  ₹ 150 Cr * Approx  Average     Not Found   
4   Day 5 [1st Tuesday]  ₹ 0.52 Cr  ₹ 150 Cr * Approx  Average     Not Found   

                                        release_date  
0  12th January 2

In [28]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# ------------------- CHANGE ONLY THIS LINE FOR OTHER MOVIES -------------------
url = "https://www.sacnilk.com/news/Sarkaru_Vari_Pata_2021_Box_Office_Collection_Day_Wise_Worldwide"
# ------------------------------------------------------------------------------

page = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
soup = BeautifulSoup(page.text, "html.parser")

movie_name = soup.find("h1").text.strip()
print("Movie:", movie_name)

# ------------------- FIXED: GRAB BUDGET, VERDICT, SCREENS, RELEASE DATE -------------------
# Search for specific elements with keywords (handles HTML structure like <strong>)
budget = "Not Found"
verdict = "Not Found"
india_screens = "Not Found"
overseas_screens = "Not Found"
worldwide_screens = "Not Found"
release_date = "Not Found"

# Find all relevant text containers (p, div, span, etc.)
all_elements = soup.find_all(['p', 'div', 'span', 'strong'])

for elem in all_elements:
    text = elem.get_text(strip=True)
    
    # Budget
    if "Budget:" in text:
        budget_match = re.search(r'Budget:\s*(₹\s*\d+(?:\.\d+)?\s*Cr\s*\*\s*Approx)', text)
        if budget_match:
            budget = budget_match.group(1).strip()
    
    # Verdict (look for <strong>Salaar Verdict:</strong> <strong>Hit</strong>)
    if "Verdict:" in text:
        # Get the next sibling or parent text for the value
        verdict_elem = elem.find_next_sibling('strong') or elem.parent.find('strong')
        if verdict_elem:
            verdict = verdict_elem.get_text(strip=True)
        else:
            # Fallback regex for plain text
            verdict_match = re.search(r'Verdict:\s*(Hit|Flop|Blockbuster|Super Hit|Average|Disaster)', text)
            verdict = verdict_match.group(1).strip() if verdict_match else "Not Found"
    
    # Screen counts (look for lines with numbers after labels)
    if "India:" in text and re.search(r'\d+', text):
        india_match = re.search(r'India:\s*(\d+)', text)
        if india_match:
            india_screens = f"India: {india_match.group(1)}"
    if "Overseas:" in text and re.search(r'\d+', text):
        overseas_match = re.search(r'Overseas:\s*(\d+)', text)
        if overseas_match:
            overseas_screens = f"Overseas: {overseas_match.group(1)}"
    if "Worldwide total:" in text and re.search(r'\d+', text):
        worldwide_match = re.search(r'Worldwide total:\s*(\d+)', text)
        if worldwide_match:
            worldwide_screens = f"Worldwide total: {worldwide_match.group(1)}"
    
    # Release Date
    if "Release Date:" in text:
        release_match = re.search(r'Release Date:\s*(.+)', text)
        if release_match:
            release_date = release_match.group(1).strip()

print("Budget:", budget)
print("Verdict:", verdict)
print("India Screens:", india_screens)
print("Overseas Screens:", overseas_screens)
print("Worldwide Screens:", worldwide_screens)
print("Release Date:", release_date)

# ------------------- TABLE 1 (Day wise) -------------------
tables = soup.find_all("table")
daywise_table = tables[0]
rows = []

for row in daywise_table.find_all("tr")[1:]:
    cols = [c.text.strip() for c in row.find_all("td")]
    if len(cols) >= 2:
        rows.append({
            "day": cols[0],
            "india_net": cols[1],
            "change_percent": cols[2] if len(cols) > 2 else None
        })

df_daywise = pd.DataFrame(rows)

# ------------------- TABLE 2 (State wise) -------------------
if len(tables) > 1:
    state_table = tables[1]
    state_header = [th.text.strip().replace(" ", "_").lower() for th in state_table.find_all("th")]

    state_rows = []
    for row in state_table.find_all("tr")[1:]:
        cols = [c.text.strip() for c in row.find_all("td")]
        if cols:
            entry = {}
            for i, col in enumerate(cols):
                if i < len(state_header):
                    entry[state_header[i]] = col
            state_rows.append(entry)

    df_state = pd.DataFrame(state_rows)
else:
    df_state = pd.DataFrame()  # empty if no state table

# ------------------- MERGE BOTH TABLES -------------------
df = pd.merge(df_daywise, df_state, left_on="day", right_on="day", how="left")

# ------------------- ADD ALL NEW INFO -------------------
df["movie_name"] = movie_name
df["budget"] = budget
df["verdict"] = verdict
df["india_screens"] = india_screens
df["overseas_screens"] = overseas_screens
df["worldwide_screens"] = worldwide_screens
df["release_date"] = release_date

# ------------------- SAVE TO CSV -------------------
df.to_csv("SVP_raw.csv", index=False)
print("\nSaved → SVP_raw.csv")
print("\nFirst 5 rows:")
print(df[["day", "india_net", "budget", "verdict", "india_screens", "release_date"]].head())


Movie: Sarkaru Vaari Paata Box Office Collection | Day Wise | Worldwide
Budget: Not Found
Verdict: Not Found
India Screens: Not Found
Overseas Screens: Not Found
Worldwide Screens: Not Found
Release Date: 12th May 2022For more and the latest news aboutTollywood Box Office Collection, Stay tuned to us.Disclaimer: The Box Office Data are compiled from various sources and by our own research.

Saved → SVP_raw.csv

First 5 rows:
                    day india_net     budget    verdict india_screens  \
0  Day 1 [1st Thursday]  ₹ 3.6 Cr  Not Found  Not Found     Not Found   
1    Day 2 [1st Friday]  ₹ 1.6 Cr  Not Found  Not Found     Not Found   
2  Day 3 [1st Saturday]  ₹ 2.2 Cr  Not Found  Not Found     Not Found   
3    Day 4 [1st Sunday]    ₹ 2 Cr  Not Found  Not Found     Not Found   
4    Day 5 [1st Monday]  ₹ 0.4 Cr  Not Found  Not Found     Not Found   

                                        release_date  
0  12th May 2022For more and the latest news abou...  
1  12th May 2022For mo

In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# ------------------- CHANGE ONLY THIS LINE FOR OTHER MOVIES -------------------
url = "https://www.sacnilk.com/news/Sarileru_Neekevvaru_2020_Box_Office_Collection_Day_Wise_Worldwide"
# ------------------------------------------------------------------------------

page = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
soup = BeautifulSoup(page.text, "html.parser")

movie_name = soup.find("h1").text.strip()
print("Movie:", movie_name)

# ------------------- FIXED: GRAB BUDGET, VERDICT, SCREENS, RELEASE DATE -------------------
# Search for specific elements with keywords (handles HTML structure like <strong>)
budget = "Not Found"
verdict = "Not Found"
india_screens = "Not Found"
overseas_screens = "Not Found"
worldwide_screens = "Not Found"
release_date = "Not Found"

# Find all relevant text containers (p, div, span, etc.)
all_elements = soup.find_all(['p', 'div', 'span', 'strong'])

for elem in all_elements:
    text = elem.get_text(strip=True)
    
    # Budget
    if "Budget:" in text:
        budget_match = re.search(r'Budget:\s*(₹\s*\d+(?:\.\d+)?\s*Cr\s*\*\s*Approx)', text)
        if budget_match:
            budget = budget_match.group(1).strip()
    
    # Verdict (look for <strong>Salaar Verdict:</strong> <strong>Hit</strong>)
    if "Verdict:" in text:
        # Get the next sibling or parent text for the value
        verdict_elem = elem.find_next_sibling('strong') or elem.parent.find('strong')
        if verdict_elem:
            verdict = verdict_elem.get_text(strip=True)
        else:
            # Fallback regex for plain text
            verdict_match = re.search(r'Verdict:\s*(Hit|Flop|Blockbuster|Super Hit|Average|Disaster)', text)
            verdict = verdict_match.group(1).strip() if verdict_match else "Not Found"
    
    # Screen counts (look for lines with numbers after labels)
    if "India:" in text and re.search(r'\d+', text):
        india_match = re.search(r'India:\s*(\d+)', text)
        if india_match:
            india_screens = f"India: {india_match.group(1)}"
    if "Overseas:" in text and re.search(r'\d+', text):
        overseas_match = re.search(r'Overseas:\s*(\d+)', text)
        if overseas_match:
            overseas_screens = f"Overseas: {overseas_match.group(1)}"
    if "Worldwide total:" in text and re.search(r'\d+', text):
        worldwide_match = re.search(r'Worldwide total:\s*(\d+)', text)
        if worldwide_match:
            worldwide_screens = f"Worldwide total: {worldwide_match.group(1)}"
    
    # Release Date
    if "Release Date:" in text:
        release_match = re.search(r'Release Date:\s*(.+)', text)
        if release_match:
            release_date = release_match.group(1).strip()

print("Budget:", budget)
print("Verdict:", verdict)
print("India Screens:", india_screens)
print("Overseas Screens:", overseas_screens)
print("Worldwide Screens:", worldwide_screens)
print("Release Date:", release_date)

# ------------------- TABLE 1 (Day wise) -------------------
tables = soup.find_all("table")
daywise_table = tables[0]
rows = []

for row in daywise_table.find_all("tr")[1:]:
    cols = [c.text.strip() for c in row.find_all("td")]
    if len(cols) >= 2:
        rows.append({
            "day": cols[0],
            "india_net": cols[1],
            "change_percent": cols[2] if len(cols) > 2 else None
        })

df_daywise = pd.DataFrame(rows)

# ------------------- TABLE 2 (State wise) -------------------
if len(tables) > 1:
    state_table = tables[1]
    state_header = [th.text.strip().replace(" ", "_").lower() for th in state_table.find_all("th")]

    state_rows = []
    for row in state_table.find_all("tr")[1:]:
        cols = [c.text.strip() for c in row.find_all("td")]
        if cols:
            entry = {}
            for i, col in enumerate(cols):
                if i < len(state_header):
                    entry[state_header[i]] = col
            state_rows.append(entry)

    df_state = pd.DataFrame(state_rows)
else:
    df_state = pd.DataFrame()  # empty if no state table

# ------------------- MERGE BOTH TABLES -------------------
df = pd.merge(df_daywise, df_state, left_on="day", right_on="day", how="left")

# ------------------- ADD ALL NEW INFO -------------------
df["movie_name"] = movie_name
df["budget"] = budget
df["verdict"] = verdict
df["india_screens"] = india_screens
df["overseas_screens"] = overseas_screens
df["worldwide_screens"] = worldwide_screens
df["release_date"] = release_date

# ------------------- SAVE TO CSV -------------------
df.to_csv("SLN_raw.csv", index=False)
print("\nSaved → SLN_raw.csv")
print("\nFirst 5 rows:")
print(df[["day", "india_net", "budget", "verdict", "india_screens", "release_date"]].head())


Movie: Sarileru Neekevvaru Box Office Collection | Day Wise | Worldwide
Budget: ₹ 100.00 Cr * Approx
Verdict: Blockbuster
India Screens: Not Found
Overseas Screens: Not Found
Worldwide Screens: Not Found
Release Date: 11th January 2020For more and the latest news aboutTollywood Box Office Collection, Stay tuned to us.Disclaimer: The Box Office Data are compiled from various sources and by our own research.

Saved → SLN_raw.csv

First 5 rows:
                     day  india_net                budget      verdict  \
0   Day 1 [1st Saturday]  ₹ 8.66 Cr  ₹ 100.00 Cr * Approx  Blockbuster   
1     Day 2 [1st Sunday]  ₹ 4.02 Cr  ₹ 100.00 Cr * Approx  Blockbuster   
2     Day 3 [1st Monday]  ₹ 3.30 Cr  ₹ 100.00 Cr * Approx  Blockbuster   
3    Day 4 [1st Tuesday]  ₹ 3.40 Cr  ₹ 100.00 Cr * Approx  Blockbuster   
4  Day 5 [1st Wednesday]  ₹ 3.12 Cr  ₹ 100.00 Cr * Approx  Blockbuster   

  india_screens                                       release_date  
0     Not Found  11th January 2020For mo

In [30]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# ------------------- CHANGE ONLY THIS LINE FOR OTHER MOVIES -------------------
url = "https://www.sacnilk.com/news/Maharshi_Box_Office_Collection_Telugu_Worldwide"    
# ------------------------------------------------------------------------------

page = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
soup = BeautifulSoup(page.text, "html.parser")

movie_name = soup.find("h1").text.strip()
print("Movie:", movie_name)

# ------------------- FIXED: GRAB BUDGET, VERDICT, SCREENS, RELEASE DATE -------------------
# Search for specific elements with keywords (handles HTML structure like <strong>)
budget = "Not Found"
verdict = "Not Found"
india_screens = "Not Found"
overseas_screens = "Not Found"
worldwide_screens = "Not Found"
release_date = "Not Found"

# Find all relevant text containers (p, div, span, etc.)
all_elements = soup.find_all(['p', 'div', 'span', 'strong'])

for elem in all_elements:
    text = elem.get_text(strip=True)
    
    # Budget
    if "Budget:" in text:
        budget_match = re.search(r'Budget:\s*(₹\s*\d+(?:\.\d+)?\s*Cr\s*\*\s*Approx)', text)
        if budget_match:
            budget = budget_match.group(1).strip()
    
    # Verdict (look for <strong>Salaar Verdict:</strong> <strong>Hit</strong>)
    if "Verdict:" in text:
        # Get the next sibling or parent text for the value
        verdict_elem = elem.find_next_sibling('strong') or elem.parent.find('strong')
        if verdict_elem:
            verdict = verdict_elem.get_text(strip=True)
        else:
            # Fallback regex for plain text
            verdict_match = re.search(r'Verdict:\s*(Hit|Flop|Blockbuster|Super Hit|Average|Disaster)', text)
            verdict = verdict_match.group(1).strip() if verdict_match else "Not Found"
    
    # Screen counts (look for lines with numbers after labels)
    if "India:" in text and re.search(r'\d+', text):
        india_match = re.search(r'India:\s*(\d+)', text)
        if india_match:
            india_screens = f"India: {india_match.group(1)}"
    if "Overseas:" in text and re.search(r'\d+', text):
        overseas_match = re.search(r'Overseas:\s*(\d+)', text)
        if overseas_match:
            overseas_screens = f"Overseas: {overseas_match.group(1)}"
    if "Worldwide total:" in text and re.search(r'\d+', text):
        worldwide_match = re.search(r'Worldwide total:\s*(\d+)', text)
        if worldwide_match:
            worldwide_screens = f"Worldwide total: {worldwide_match.group(1)}"
    
    # Release Date
    if "Release Date:" in text:
        release_match = re.search(r'Release Date:\s*(.+)', text)
        if release_match:
            release_date = release_match.group(1).strip()

print("Budget:", budget)
print("Verdict:", verdict)
print("India Screens:", india_screens)
print("Overseas Screens:", overseas_screens)
print("Worldwide Screens:", worldwide_screens)
print("Release Date:", release_date)

# ------------------- TABLE 1 (Day wise) -------------------
tables = soup.find_all("table")
daywise_table = tables[0]
rows = []

for row in daywise_table.find_all("tr")[1:]:
    cols = [c.text.strip() for c in row.find_all("td")]
    if len(cols) >= 2:
        rows.append({
            "day": cols[0],
            "india_net": cols[1],
            "change_percent": cols[2] if len(cols) > 2 else None
        })

df_daywise = pd.DataFrame(rows)

# ------------------- TABLE 2 (State wise) -------------------
if len(tables) > 1:
    state_table = tables[1]
    state_header = [th.text.strip().replace(" ", "_").lower() for th in state_table.find_all("th")]

    state_rows = []
    for row in state_table.find_all("tr")[1:]:
        cols = [c.text.strip() for c in row.find_all("td")]
        if cols:
            entry = {}
            for i, col in enumerate(cols):
                if i < len(state_header):
                    entry[state_header[i]] = col
            state_rows.append(entry)

    df_state = pd.DataFrame(state_rows)
else:
    df_state = pd.DataFrame()  # empty if no state table

# ------------------- MERGE BOTH TABLES -------------------
df = pd.merge(df_daywise, df_state, left_on="day", right_on="day", how="left")

# ------------------- ADD ALL NEW INFO -------------------
df["movie_name"] = movie_name
df["budget"] = budget
df["verdict"] = verdict
df["india_screens"] = india_screens
df["overseas_screens"] = overseas_screens
df["worldwide_screens"] = worldwide_screens
df["release_date"] = release_date

# ------------------- SAVE TO CSV -------------------
df.to_csv("Maharshi_raw.csv", index=False)
print("\nSaved → Maharshi_raw.csv")
print("\nFirst 5 rows:")
print(df[["day", "india_net", "budget", "verdict", "india_screens", "release_date"]].head())


Movie: Maharshi Box Office Collection | Telugu | Worldwide
Budget: Not Found
Verdict: Not Found
India Screens: Not Found
Overseas Screens: Not Found
Worldwide Screens: Not Found
Release Date: 9th May 2019Maharshi Records- Highest Pre-release business for Mahesh Babu.-Note: Data as of 9th May 2019Story Line-Rishi, a millionaire businessman, returns to his homeland, where he becomes the champion of poor and downtrodden farmers. Directed by Vamsi Paidipally, Maharshi features Mahesh Babu and Pooja Hegde in central roles.For more and the latest news aboutTollywood Box Office Collection, Stay tuned to us.Disclaimer: The Box Office Data are compiled from various sources and by our own research.

Saved → Maharshi_raw.csv

First 5 rows:
                    day   india_net     budget    verdict india_screens  \
0  Day 1 [1st Thursday]  ₹ 34.75 Cr  Not Found  Not Found     Not Found   
1    Day 2 [1st Friday]  ₹ 12.00 Cr  Not Found  Not Found     Not Found   
2  Day 3 [1st Saturday]  ₹ 12.40 Cr 

In [31]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# ------------------- CHANGE ONLY THIS LINE FOR OTHER MOVIES -------------------
url = "https://www.sacnilk.com/news/War_2_2024_Box_Office_Collection_Day_Wise_Worldwide"
# ------------------------------------------------------------------------------

page = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
soup = BeautifulSoup(page.text, "html.parser")

movie_name = soup.find("h1").text.strip()
print("Movie:", movie_name)

# ------------------- FIXED: GRAB BUDGET, VERDICT, SCREENS, RELEASE DATE -------------------
# Search for specific elements with keywords (handles HTML structure like <strong>)
budget = "Not Found"
verdict = "Not Found"
india_screens = "Not Found"
overseas_screens = "Not Found"
worldwide_screens = "Not Found"
release_date = "Not Found"

# Find all relevant text containers (p, div, span, etc.)
all_elements = soup.find_all(['p', 'div', 'span', 'strong'])

for elem in all_elements:
    text = elem.get_text(strip=True)
    
    # Budget
    if "Budget:" in text:
        budget_match = re.search(r'Budget:\s*(₹\s*\d+(?:\.\d+)?\s*Cr\s*\*\s*Approx)', text)
        if budget_match:
            budget = budget_match.group(1).strip()
    
    # Verdict (look for <strong>Salaar Verdict:</strong> <strong>Hit</strong>)
    if "Verdict:" in text:
        # Get the next sibling or parent text for the value
        verdict_elem = elem.find_next_sibling('strong') or elem.parent.find('strong')
        if verdict_elem:
            verdict = verdict_elem.get_text(strip=True)
        else:
            # Fallback regex for plain text
            verdict_match = re.search(r'Verdict:\s*(Hit|Flop|Blockbuster|Super Hit|Average|Disaster)', text)
            verdict = verdict_match.group(1).strip() if verdict_match else "Not Found"
    
    # Screen counts (look for lines with numbers after labels)
    if "India:" in text and re.search(r'\d+', text):
        india_match = re.search(r'India:\s*(\d+)', text)
        if india_match:
            india_screens = f"India: {india_match.group(1)}"
    if "Overseas:" in text and re.search(r'\d+', text):
        overseas_match = re.search(r'Overseas:\s*(\d+)', text)
        if overseas_match:
            overseas_screens = f"Overseas: {overseas_match.group(1)}"
    if "Worldwide total:" in text and re.search(r'\d+', text):
        worldwide_match = re.search(r'Worldwide total:\s*(\d+)', text)
        if worldwide_match:
            worldwide_screens = f"Worldwide total: {worldwide_match.group(1)}"
    
    # Release Date
    if "Release Date:" in text:
        release_match = re.search(r'Release Date:\s*(.+)', text)
        if release_match:
            release_date = release_match.group(1).strip()

print("Budget:", budget)
print("Verdict:", verdict)
print("India Screens:", india_screens)
print("Overseas Screens:", overseas_screens)
print("Worldwide Screens:", worldwide_screens)
print("Release Date:", release_date)

# ------------------- TABLE 1 (Day wise) -------------------
tables = soup.find_all("table")
daywise_table = tables[0]
rows = []

for row in daywise_table.find_all("tr")[1:]:
    cols = [c.text.strip() for c in row.find_all("td")]
    if len(cols) >= 2:
        rows.append({
            "day": cols[0],
            "india_net": cols[1],
            "change_percent": cols[2] if len(cols) > 2 else None
        })

df_daywise = pd.DataFrame(rows)

# ------------------- TABLE 2 (State wise) -------------------
if len(tables) > 1:
    state_table = tables[1]
    state_header = [th.text.strip().replace(" ", "_").lower() for th in state_table.find_all("th")]

    state_rows = []
    for row in state_table.find_all("tr")[1:]:
        cols = [c.text.strip() for c in row.find_all("td")]
        if cols:
            entry = {}
            for i, col in enumerate(cols):
                if i < len(state_header):
                    entry[state_header[i]] = col
            state_rows.append(entry)

    df_state = pd.DataFrame(state_rows)
else:
    df_state = pd.DataFrame()  # empty if no state table

# ------------------- MERGE BOTH TABLES -------------------
df = pd.merge(df_daywise, df_state, left_on="day", right_on="day", how="left")

# ------------------- ADD ALL NEW INFO -------------------
df["movie_name"] = movie_name
df["budget"] = budget
df["verdict"] = verdict
df["india_screens"] = india_screens
df["overseas_screens"] = overseas_screens
df["worldwide_screens"] = worldwide_screens
df["release_date"] = release_date

# ------------------- SAVE TO CSV -------------------
df.to_csv("WAR2_raw.csv", index=False)
print("\nSaved → WAR2_raw.csv")
print("\nFirst 5 rows:")
print(df[["day", "india_net", "budget", "verdict", "india_screens", "release_date"]].head())


Movie: War 2 Box Office Collection | All Language | Day Wise | Worldwide
Budget: ₹ 400 Cr * Approx
Verdict: Not Found
India Screens: Not Found
Overseas Screens: Not Found
Worldwide Screens: Not Found
Release Date: 14th August 2025For more and the latest news aboutBollywood Box Office Collection, Stay tuned to us.War 2 State Wise Gross CollectionDayKarnatakaAPTGTamil NaduKeralaRest Of IndiaDay TotalDay 1 [1st Thursday]₹ 4 Cr₹ 27 Cr₹ 1.1 Cr₹ 0.5 Cr₹ 30 Cr₹ 62.6 CrDay 2 [1st Friday]₹ 4.6 Cr₹ 14.5 Cr₹ 1.25 Cr₹ 0.45 Cr₹ 48 Cr₹ 68.8 CrDay 3 [1st Saturday]₹ 3.1 Cr₹ 8.25 Cr₹ 1.1 Cr₹ 0.25 Cr₹ 26.8 Cr₹ 39.5 CrDay 4 [1st Sunday]₹ 2.75 Cr₹ 7 Cr₹ 1.02 Cr₹ 0.33 Cr₹ 27.75 Cr₹ 38.85 CrDay 5 [1st Monday]₹ 0.75 Cr₹ 1.75 Cr₹ 0.4 Cr₹ 0.15 Cr₹ 7.5 Cr₹ 10.55 CrDay 6 [1st Tuesday]₹ 0.75 Cr₹ 1.5 Cr₹ 0.22 Cr₹ 0.13 Cr₹ 8.2 Cr₹ 10.8 CrDay 7 [1st Wednesday]₹ 0.45 Cr₹ 1.3 Cr₹ 0.25 Cr₹ 0.05 Cr₹ 4.85 Cr₹ 6.9 CrDay 8 [2nd Thursday]₹ 0.4 Cr₹ 1.1 Cr₹ 0.22 Cr₹ 0.03 Cr₹ 4.1 Cr₹ 5.85 CrWeek 1 Gross₹ 16.8 Cr₹ 62.4 Cr₹ 5.56

In [33]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# ------------------- CHANGE ONLY THIS LINE FOR OTHER MOVIES -------------------
url = "https://www.sacnilk.com/news/NTR_30_2022_Box_Office_Collection_Day_Wise_Worldwide"
from bs4 import BeautifulSoup
import pandas as pd
import re

# ------------------- CHANGE ONLY THIS LINE FOR OTHER MOVIES -------------------
url = "https://www.sacnilk.com/news/NTR_30_2022_Box_Office_Collection_Day_Wise_Worldwide"
# ------------------------------------------------------------------------------

page = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
soup = BeautifulSoup(page.text, "html.parser")

movie_name = soup.find("h1").text.strip()
print("Movie:", movie_name)

# ------------------- FIXED: GRAB BUDGET, VERDICT, SCREENS, RELEASE DATE -------------------
# Search for specific elements with keywords (handles HTML structure like <strong>)
budget = "Not Found"
verdict = "Not Found"
india_screens = "Not Found"
overseas_screens = "Not Found"
worldwide_screens = "Not Found"
release_date = "Not Found"

# Find all relevant text containers (p, div, span, etc.)
all_elements = soup.find_all(['p', 'div', 'span', 'strong'])

for elem in all_elements:
    text = elem.get_text(strip=True)
    
    # Budget
    if "Budget:" in text:
        budget_match = re.search(r'Budget:\s*(₹\s*\d+(?:\.\d+)?\s*Cr\s*\*\s*Approx)', text)
        if budget_match:
            budget = budget_match.group(1).strip()
    
    # Verdict (look for <strong>Salaar Verdict:</strong> <strong>Hit</strong>)
    if "Verdict:" in text:
        # Get the next sibling or parent text for the value
        verdict_elem = elem.find_next_sibling('strong') or elem.parent.find('strong')
        if verdict_elem:
            verdict = verdict_elem.get_text(strip=True)
        else:
            # Fallback regex for plain text
            verdict_match = re.search(r'Verdict:\s*(Hit|Flop|Blockbuster|Super Hit|Average|Disaster)', text)
            verdict = verdict_match.group(1).strip() if verdict_match else "Not Found"
    
    # Screen counts (look for lines with numbers after labels)
    if "India:" in text and re.search(r'\d+', text):
        india_match = re.search(r'India:\s*(\d+)', text)
        if india_match:
            india_screens = f"India: {india_match.group(1)}"
    if "Overseas:" in text and re.search(r'\d+', text):
        overseas_match = re.search(r'Overseas:\s*(\d+)', text)
        if overseas_match:
            overseas_screens = f"Overseas: {overseas_match.group(1)}"
    if "Worldwide total:" in text and re.search(r'\d+', text):
        worldwide_match = re.search(r'Worldwide total:\s*(\d+)', text)
        if worldwide_match:
            worldwide_screens = f"Worldwide total: {worldwide_match.group(1)}"
    
    # Release Date
    if "Release Date:" in text:
        release_match = re.search(r'Release Date:\s*(.+)', text)
        if release_match:
            release_date = release_match.group(1).strip()

print("Budget:", budget)
print("Verdict:", verdict)
print("India Screens:", india_screens)
print("Overseas Screens:", overseas_screens)
print("Worldwide Screens:", worldwide_screens)
print("Release Date:", release_date)

# ------------------- TABLE 1 (Day wise) -------------------
tables = soup.find_all("table")
daywise_table = tables[0]
rows = []

for row in daywise_table.find_all("tr")[1:]:
    cols = [c.text.strip() for c in row.find_all("td")]
    if len(cols) >= 2:
        rows.append({
            "day": cols[0],
            "india_net": cols[1],
            "change_percent": cols[2] if len(cols) > 2 else None
        })

df_daywise = pd.DataFrame(rows)

# ------------------- TABLE 2 (State wise) -------------------
if len(tables) > 1:
    state_table = tables[1]
    state_header = [th.text.strip().replace(" ", "_").lower() for th in state_table.find_all("th")]

    state_rows = []
    for row in state_table.find_all("tr")[1:]:
        cols = [c.text.strip() for c in row.find_all("td")]
        if cols:
            entry = {}
            for i, col in enumerate(cols):
                if i < len(state_header):
                    entry[state_header[i]] = col
            state_rows.append(entry)

    df_state = pd.DataFrame(state_rows)
else:
    df_state = pd.DataFrame()  # empty if no state table

# ------------------- MERGE BOTH TABLES -------------------
df = pd.merge(df_daywise, df_state, left_on="day", right_on="day", how="left")

# ------------------- ADD ALL NEW INFO -------------------
df["movie_name"] = movie_name
df["budget"] = budget
df["verdict"] = verdict
df["india_screens"] = india_screens
df["overseas_screens"] = overseas_screens
df["worldwide_screens"] = worldwide_screens
df["release_date"] = release_date

# ------------------- SAVE TO CSV -------------------
df.to_csv("SLN_raw.csv", index=False)
print("\nSaved → SLM_raw.csv")
print("\nFirst 5 rows:")
print(df[["day", "india_net", "budget", "verdict", "india_screens", "release_date"]].head())

# ------------------------------------------------------------------------------

page = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
soup = BeautifulSoup(page.text, "html.parser")

movie_name = soup.find("h1").text.strip()
print("Movie:", movie_name)

# ------------------- FIXED: GRAB BUDGET, VERDICT, SCREENS, RELEASE DATE -------------------
# Search for specific elements with keywords (handles HTML structure like <strong>)
budget = "Not Found"
verdict = "Not Found"
india_screens = "Not Found"
overseas_screens = "Not Found"
worldwide_screens = "Not Found"
release_date = "Not Found"

# Find all relevant text containers (p, div, span, etc.)
all_elements = soup.find_all(['p', 'div', 'span', 'strong'])

for elem in all_elements:
    text = elem.get_text(strip=True)
    
    # Budget
    if "Budget:" in text:
        budget_match = re.search(r'Budget:\s*(₹\s*\d+(?:\.\d+)?\s*Cr\s*\*\s*Approx)', text)
        if budget_match:
            budget = budget_match.group(1).strip()
    
    # Verdict (look for <strong>Salaar Verdict:</strong> <strong>Hit</strong>)
    if "Verdict:" in text:
        # Get the next sibling or parent text for the value
        verdict_elem = elem.find_next_sibling('strong') or elem.parent.find('strong')
        if verdict_elem:
            verdict = verdict_elem.get_text(strip=True)
        else:
            # Fallback regex for plain text
            verdict_match = re.search(r'Verdict:\s*(Hit|Flop|Blockbuster|Super Hit|Average|Disaster)', text)
            verdict = verdict_match.group(1).strip() if verdict_match else "Not Found"
    
    # Screen counts (look for lines with numbers after labels)
    if "India:" in text and re.search(r'\d+', text):
        india_match = re.search(r'India:\s*(\d+)', text)
        if india_match:
            india_screens = f"India: {india_match.group(1)}"
    if "Overseas:" in text and re.search(r'\d+', text):
        overseas_match = re.search(r'Overseas:\s*(\d+)', text)
        if overseas_match:
            overseas_screens = f"Overseas: {overseas_match.group(1)}"
    if "Worldwide total:" in text and re.search(r'\d+', text):
        worldwide_match = re.search(r'Worldwide total:\s*(\d+)', text)
        if worldwide_match:
            worldwide_screens = f"Worldwide total: {worldwide_match.group(1)}"
    
    # Release Date
    if "Release Date:" in text:
        release_match = re.search(r'Release Date:\s*(.+)', text)
        if release_match:
            release_date = release_match.group(1).strip()

print("Budget:", budget)
print("Verdict:", verdict)
print("India Screens:", india_screens)
print("Overseas Screens:", overseas_screens)
print("Worldwide Screens:", worldwide_screens)
print("Release Date:", release_date)

# ------------------- TABLE 1 (Day wise) -------------------
tables = soup.find_all("table")
daywise_table = tables[0]
rows = []

for row in daywise_table.find_all("tr")[1:]:
    cols = [c.text.strip() for c in row.find_all("td")]
    if len(cols) >= 2:
        rows.append({
            "day": cols[0],
            "india_net": cols[1],
            "change_percent": cols[2] if len(cols) > 2 else None
        })

df_daywise = pd.DataFrame(rows)

# ------------------- TABLE 2 (State wise) -------------------
if len(tables) > 1:
    state_table = tables[1]
    state_header = [th.text.strip().replace(" ", "_").lower() for th in state_table.find_all("th")]

    state_rows = []
    for row in state_table.find_all("tr")[1:]:
        cols = [c.text.strip() for c in row.find_all("td")]
        if cols:
            entry = {}
            for i, col in enumerate(cols):
                if i < len(state_header):
                    entry[state_header[i]] = col
            state_rows.append(entry)

    df_state = pd.DataFrame(state_rows)
else:
    df_state = pd.DataFrame()  # empty if no state table

# ------------------- MERGE BOTH TABLES -------------------
df = pd.merge(df_daywise, df_state, left_on="day", right_on="day", how="left")

# ------------------- ADD ALL NEW INFO -------------------
df["movie_name"] = movie_name
df["budget"] = budget
df["verdict"] = verdict
df["india_screens"] = india_screens
df["overseas_screens"] = overseas_screens
df["worldwide_screens"] = worldwide_screens
df["release_date"] = release_date

# ------------------- SAVE TO CSV -------------------
df.to_csv("devara_raw.csv", index=False)
print("\nSaved → devara_raw.csv")
print("\nFirst 5 rows:")
print(df[["day", "india_net", "budget", "verdict", "india_screens", "release_date"]].head())


Movie: Devara - Part 1 Box Office Collection | All Language | Day Wise | Worldwide
Budget: Not Found
Verdict: Not Found
India Screens: Not Found
Overseas Screens: Not Found
Worldwide Screens: Not Found
Release Date: 27th September 2024For more and the latest news aboutTollywood Box Office Collection, Stay tuned to us.Disclaimer: The Box Office Data are compiled from various sources and by our own research.

Saved → SLM_raw.csv

First 5 rows:
                    day  india_net     budget    verdict india_screens  \
0    Day 1 [1st Friday]  ₹ 10.5 Cr  Not Found  Not Found     Not Found   
1  Day 2 [1st Saturday]   ₹ 5.6 Cr  Not Found  Not Found     Not Found   
2    Day 3 [1st Sunday]   ₹ 4.8 Cr  Not Found  Not Found     Not Found   
3    Day 4 [1st Monday]  ₹ 1.55 Cr  Not Found  Not Found     Not Found   
4   Day 5 [1st Tuesday]   ₹ 1.8 Cr  Not Found  Not Found     Not Found   

                                        release_date  
0  27th September 2024For more and the latest new...  

In [34]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# ------------------- CHANGE ONLY THIS LINE FOR OTHER MOVIES -------------------
url = "https://www.sacnilk.com/news/RRR_2020_Box_Office_Collection_Day_Wise_Worldwide"
# ------------------------------------------------------------------------------

page = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
soup = BeautifulSoup(page.text, "html.parser")

movie_name = soup.find("h1").text.strip()
print("Movie:", movie_name)

# ------------------- FIXED: GRAB BUDGET, VERDICT, SCREENS, RELEASE DATE -------------------
# Search for specific elements with keywords (handles HTML structure like <strong>)
budget = "Not Found"
verdict = "Not Found"
india_screens = "Not Found"
overseas_screens = "Not Found"
worldwide_screens = "Not Found"
release_date = "Not Found"

# Find all relevant text containers (p, div, span, etc.)
all_elements = soup.find_all(['p', 'div', 'span', 'strong'])

for elem in all_elements:
    text = elem.get_text(strip=True)
    
    # Budget
    if "Budget:" in text:
        budget_match = re.search(r'Budget:\s*(₹\s*\d+(?:\.\d+)?\s*Cr\s*\*\s*Approx)', text)
        if budget_match:
            budget = budget_match.group(1).strip()
    
    # Verdict (look for <strong>Salaar Verdict:</strong> <strong>Hit</strong>)
    if "Verdict:" in text:
        # Get the next sibling or parent text for the value
        verdict_elem = elem.find_next_sibling('strong') or elem.parent.find('strong')
        if verdict_elem:
            verdict = verdict_elem.get_text(strip=True)
        else:
            # Fallback regex for plain text
            verdict_match = re.search(r'Verdict:\s*(Hit|Flop|Blockbuster|Super Hit|Average|Disaster)', text)
            verdict = verdict_match.group(1).strip() if verdict_match else "Not Found"
    
    # Screen counts (look for lines with numbers after labels)
    if "India:" in text and re.search(r'\d+', text):
        india_match = re.search(r'India:\s*(\d+)', text)
        if india_match:
            india_screens = f"India: {india_match.group(1)}"
    if "Overseas:" in text and re.search(r'\d+', text):
        overseas_match = re.search(r'Overseas:\s*(\d+)', text)
        if overseas_match:
            overseas_screens = f"Overseas: {overseas_match.group(1)}"
    if "Worldwide total:" in text and re.search(r'\d+', text):
        worldwide_match = re.search(r'Worldwide total:\s*(\d+)', text)
        if worldwide_match:
            worldwide_screens = f"Worldwide total: {worldwide_match.group(1)}"
    
    # Release Date
    if "Release Date:" in text:
        release_match = re.search(r'Release Date:\s*(.+)', text)
        if release_match:
            release_date = release_match.group(1).strip()

print("Budget:", budget)
print("Verdict:", verdict)
print("India Screens:", india_screens)
print("Overseas Screens:", overseas_screens)
print("Worldwide Screens:", worldwide_screens)
print("Release Date:", release_date)

# ------------------- TABLE 1 (Day wise) -------------------
tables = soup.find_all("table")
daywise_table = tables[0]
rows = []

for row in daywise_table.find_all("tr")[1:]:
    cols = [c.text.strip() for c in row.find_all("td")]
    if len(cols) >= 2:
        rows.append({
            "day": cols[0],
            "india_net": cols[1],
            "change_percent": cols[2] if len(cols) > 2 else None
        })

df_daywise = pd.DataFrame(rows)

# ------------------- TABLE 2 (State wise) -------------------
if len(tables) > 1:
    state_table = tables[1]
    state_header = [th.text.strip().replace(" ", "_").lower() for th in state_table.find_all("th")]

    state_rows = []
    for row in state_table.find_all("tr")[1:]:
        cols = [c.text.strip() for c in row.find_all("td")]
        if cols:
            entry = {}
            for i, col in enumerate(cols):
                if i < len(state_header):
                    entry[state_header[i]] = col
            state_rows.append(entry)

    df_state = pd.DataFrame(state_rows)
else:
    df_state = pd.DataFrame()  # empty if no state table

# ------------------- MERGE BOTH TABLES -------------------
df = pd.merge(df_daywise, df_state, left_on="day", right_on="day", how="left")

# ------------------- ADD ALL NEW INFO -------------------
df["movie_name"] = movie_name
df["budget"] = budget
df["verdict"] = verdict
df["india_screens"] = india_screens
df["overseas_screens"] = overseas_screens
df["worldwide_screens"] = worldwide_screens
df["release_date"] = release_date

# ------------------- SAVE TO CSV -------------------
df.to_csv("RRR_raw.csv", index=False)
print("\nSaved → RRR_raw.csv")
print("\nFirst 5 rows:")
print(df[["day", "india_net", "budget", "verdict", "india_screens", "release_date"]].head())


Movie: RRR Box Office Collection | All Language | Day Wise | Worldwide
Budget: Not Found
Verdict: Blockbuster
India Screens: India: 9000
Overseas Screens: Overseas: 6000
Worldwide Screens: Worldwide total: 15000
Release Date: 25th March 2022For more and the latest news aboutTollywood Box Office Collection, Stay tuned to us.Disclaimer: The Box Office Data are compiled from various sources and by our own research.

Saved → RRR_raw.csv

First 5 rows:
                    day   india_net     budget      verdict india_screens  \
0    Day 1 [1st Friday]  ₹ 23.35 Cr  Not Found  Blockbuster   India: 9000   
1  Day 2 [1st Saturday]   ₹ 15.1 Cr  Not Found  Blockbuster   India: 9000   
2    Day 3 [1st Sunday]  ₹ 15.05 Cr  Not Found  Blockbuster   India: 9000   
3    Day 4 [1st Monday]   ₹ 8.15 Cr  Not Found  Blockbuster   India: 9000   
4   Day 5 [1st Tuesday]    ₹ 6.7 Cr  Not Found  Blockbuster   India: 9000   

                                        release_date  
0  25th March 2022For more and

RAMCHARAN

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# ------------------- CHANGE ONLY THIS LINE FOR OTHER MOVIES -------------------
url = "https://www.sacnilk.com/news/RC15_2022_Box_Office_Collection_Day_Wise_Worldwide"
# ------------------------------------------------------------------------------

page = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
soup = BeautifulSoup(page.text, "html.parser")

movie_name = soup.find("h1").text.strip()
print("Movie:", movie_name)

# ------------------- FIXED: GRAB BUDGET, VERDICT, SCREENS, RELEASE DATE -------------------
# Search for specific elements with keywords (handles HTML structure like <strong>)
budget = "Not Found"
verdict = "Not Found"
india_screens = "Not Found"
overseas_screens = "Not Found"
worldwide_screens = "Not Found"
release_date = "Not Found"

# Find all relevant text containers (p, div, span, etc.)
all_elements = soup.find_all(['p', 'div', 'span', 'strong'])

for elem in all_elements:
    text = elem.get_text(strip=True)
    
    # Budget
    if "Budget:" in text:
        budget_match = re.search(r'Budget:\s*(₹\s*\d+(?:\.\d+)?\s*Cr\s*\*\s*Approx)', text)
        if budget_match:
            budget = budget_match.group(1).strip()
    
    # Verdict (look for <strong>Salaar Verdict:</strong> <strong>Hit</strong>)
    if "Verdict:" in text:
        # Get the next sibling or parent text for the value
        verdict_elem = elem.find_next_sibling('strong') or elem.parent.find('strong')
        if verdict_elem:
            verdict = verdict_elem.get_text(strip=True)
        else:
            # Fallback regex for plain text
            verdict_match = re.search(r'Verdict:\s*(Hit|Flop|Blockbuster|Super Hit|Average|Disaster)', text)
            verdict = verdict_match.group(1).strip() if verdict_match else "Not Found"
    
    # Screen counts (look for lines with numbers after labels)
    if "India:" in text and re.search(r'\d+', text):
        india_match = re.search(r'India:\s*(\d+)', text)
        if india_match:
            india_screens = f"India: {india_match.group(1)}"
    if "Overseas:" in text and re.search(r'\d+', text):
        overseas_match = re.search(r'Overseas:\s*(\d+)', text)
        if overseas_match:
            overseas_screens = f"Overseas: {overseas_match.group(1)}"
    if "Worldwide total:" in text and re.search(r'\d+', text):
        worldwide_match = re.search(r'Worldwide total:\s*(\d+)', text)
        if worldwide_match:
            worldwide_screens = f"Worldwide total: {worldwide_match.group(1)}"
    
    # Release Date
    if "Release Date:" in text:
        release_match = re.search(r'Release Date:\s*(.+)', text)
        if release_match:
            release_date = release_match.group(1).strip()

print("Budget:", budget)
print("Verdict:", verdict)
print("India Screens:", india_screens)
print("Overseas Screens:", overseas_screens)
print("Worldwide Screens:", worldwide_screens)
print("Release Date:", release_date)

# ------------------- TABLE 1 (Day wise) -------------------
tables = soup.find_all("table")
daywise_table = tables[0]
rows = []

for row in daywise_table.find_all("tr")[1:]:
    cols = [c.text.strip() for c in row.find_all("td")]
    if len(cols) >= 2:
        rows.append({
            "day": cols[0],
            "india_net": cols[1],
            "change_percent": cols[2] if len(cols) > 2 else None
        })

df_daywise = pd.DataFrame(rows)

# ------------------- TABLE 2 (State wise) -------------------
if len(tables) > 1:
    state_table = tables[1]
    state_header = [th.text.strip().replace(" ", "_").lower() for th in state_table.find_all("th")]

    state_rows = []
    for row in state_table.find_all("tr")[1:]:
        cols = [c.text.strip() for c in row.find_all("td")]
        if cols:
            entry = {}
            for i, col in enumerate(cols):
                if i < len(state_header):
                    entry[state_header[i]] = col
            state_rows.append(entry)

    df_state = pd.DataFrame(state_rows)
else:
    df_state = pd.DataFrame()  # empty if no state table

# ------------------- MERGE BOTH TABLES -------------------
df = pd.merge(df_daywise, df_state, left_on="day", right_on="day", how="left")

# ------------------- ADD ALL NEW INFO -------------------
df["movie_name"] = movie_name
df["budget"] = budget
df["verdict"] = verdict
df["india_screens"] = india_screens
df["overseas_screens"] = overseas_screens
df["worldwide_screens"] = worldwide_screens
df["release_date"] = release_date

# ------------------- SAVE TO CSV -------------------
df.to_csv("GC.csv", index=False)
print("\nSaved → GC.csv")
print("\nFirst 5 rows:")
print(df[["day", "india_net", "budget", "verdict", "india_screens", "release_date"]].head())


Movie: Game Changer Box Office Collection | All Language | Day Wise | Worldwide
Budget: Not Found
Verdict: Not Found
India Screens: Not Found
Overseas Screens: Not Found
Worldwide Screens: Not Found
Release Date: 10th January 2025For more and the latest news aboutTollywood Box Office Collection, Stay tuned to us.Disclaimer: The Box Office Data are compiled from various sources and by our own research.

Saved → GC.csv

First 5 rows:
                    day  india_net     budget    verdict india_screens  \
0    Day 1 [1st Friday]   ₹ 4.5 Cr  Not Found  Not Found     Not Found   
1  Day 2 [1st Saturday]   ₹ 2.2 Cr  Not Found  Not Found     Not Found   
2    Day 3 [1st Sunday]   ₹ 1.3 Cr  Not Found  Not Found     Not Found   
3    Day 4 [1st Monday]  ₹ 0.65 Cr  Not Found  Not Found     Not Found   
4   Day 5 [1st Tuesday]  ₹ 0.85 Cr  Not Found  Not Found     Not Found   

                                        release_date  
0  10th January 2025For more and the latest news ...  
1  10th J

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# ------------------- CHANGE ONLY THIS LINE FOR OTHER MOVIES -------------------
url = "https://www.sacnilk.com/news/Vinaya_Vidheya_Rama_Box_Office_Collection_Day_Wise_Worldwide"
# ------------------------------------------------------------------------------

page = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
soup = BeautifulSoup(page.text, "html.parser")

movie_name = soup.find("h1").text.strip()
print("Movie:", movie_name)

# ------------------- FIXED: GRAB BUDGET, VERDICT, SCREENS, RELEASE DATE -------------------
# Search for specific elements with keywords (handles HTML structure like <strong>)
budget = "Not Found"
verdict = "Not Found"
india_screens = "Not Found"
overseas_screens = "Not Found"
worldwide_screens = "Not Found"
release_date = "Not Found"

# Find all relevant text containers (p, div, span, etc.)
all_elements = soup.find_all(['p', 'div', 'span', 'strong'])

for elem in all_elements:
    text = elem.get_text(strip=True)
    
    # Budget
    if "Budget:" in text:
        budget_match = re.search(r'Budget:\s*(₹\s*\d+(?:\.\d+)?\s*Cr\s*\*\s*Approx)', text)
        if budget_match:
            budget = budget_match.group(1).strip()
    
    # Verdict (look for <strong>Salaar Verdict:</strong> <strong>Hit</strong>)
    if "Verdict:" in text:
        # Get the next sibling or parent text for the value
        verdict_elem = elem.find_next_sibling('strong') or elem.parent.find('strong')
        if verdict_elem:
            verdict = verdict_elem.get_text(strip=True)
        else:
            # Fallback regex for plain text
            verdict_match = re.search(r'Verdict:\s*(Hit|Flop|Blockbuster|Super Hit|Average|Disaster)', text)
            verdict = verdict_match.group(1).strip() if verdict_match else "Not Found"
    
    # Screen counts (look for lines with numbers after labels)
    if "India:" in text and re.search(r'\d+', text):
        india_match = re.search(r'India:\s*(\d+)', text)
        if india_match:
            india_screens = f"India: {india_match.group(1)}"
    if "Overseas:" in text and re.search(r'\d+', text):
        overseas_match = re.search(r'Overseas:\s*(\d+)', text)
        if overseas_match:
            overseas_screens = f"Overseas: {overseas_match.group(1)}"
    if "Worldwide total:" in text and re.search(r'\d+', text):
        worldwide_match = re.search(r'Worldwide total:\s*(\d+)', text)
        if worldwide_match:
            worldwide_screens = f"Worldwide total: {worldwide_match.group(1)}"
    
    # Release Date
    if "Release Date:" in text:
        release_match = re.search(r'Release Date:\s*(.+)', text)
        if release_match:
            release_date = release_match.group(1).strip()

print("Budget:", budget)
print("Verdict:", verdict)
print("India Screens:", india_screens)
print("Overseas Screens:", overseas_screens)
print("Worldwide Screens:", worldwide_screens)
print("Release Date:", release_date)

# ------------------- TABLE 1 (Day wise) -------------------
tables = soup.find_all("table")
daywise_table = tables[0]
rows = []

for row in daywise_table.find_all("tr")[1:]:
    cols = [c.text.strip() for c in row.find_all("td")]
    if len(cols) >= 2:
        rows.append({
            "day": cols[0],
            "india_net": cols[1],
            "change_percent": cols[2] if len(cols) > 2 else None
        })

df_daywise = pd.DataFrame(rows)

# ------------------- TABLE 2 (State wise) -------------------
if len(tables) > 1:
    state_table = tables[1]
    state_header = [th.text.strip().replace(" ", "_").lower() for th in state_table.find_all("th")]

    state_rows = []
    for row in state_table.find_all("tr")[1:]:
        cols = [c.text.strip() for c in row.find_all("td")]
        if cols:
            entry = {}
            for i, col in enumerate(cols):
                if i < len(state_header):
                    entry[state_header[i]] = col
            state_rows.append(entry)

    df_state = pd.DataFrame(state_rows)
else:
    df_state = pd.DataFrame()  # empty if no state table

# ------------------- MERGE BOTH TABLES -------------------
df = pd.merge(df_daywise, df_state, left_on="day", right_on="day", how="left")

# ------------------- ADD ALL NEW INFO -------------------
df["movie_name"] = movie_name
df["budget"] = budget
df["verdict"] = verdict
df["india_screens"] = india_screens
df["overseas_screens"] = overseas_screens
df["worldwide_screens"] = worldwide_screens
df["release_date"] = release_date

# ------------------- SAVE TO CSV -------------------
df.to_csv("VVR_raw.csv", index=False)
print("\nSaved → VVR_raw.csv")
print("\nFirst 5 rows:")
print(df[["day", "india_net", "budget", "verdict", "india_screens", "release_date"]].head())


Movie: Vinaya Vidheya Rama Box Office Collection | Day Wise | Worldwide
Budget: Not Found
Verdict: Flop
India Screens: Not Found
Overseas Screens: Not Found
Worldwide Screens: Not Found
Release Date: 11th January 2019Vinaya Vidheya Rama Records- Third highest opening in Telugu after Baahubali 2 and Agnyathavasi. [all time record]

Saved → VVR_raw.csv

First 5 rows:
                    day  india_net     budget verdict india_screens  \
0    Day 1 [1st Friday]    ₹ 34 Cr  Not Found    Flop     Not Found   
1  Day 2 [1st Saturday]  ₹ 6.20 Cr  Not Found    Flop     Not Found   
2    Day 3 [1st Sunday]  ₹ 4.50 Cr  Not Found    Flop     Not Found   
3    Day 4 [1st Monday]  ₹ 5.20 Cr  Not Found    Flop     Not Found   
4   Day 5 [1st Tuesday]  ₹ 7.20 Cr  Not Found    Flop     Not Found   

                                        release_date  
0  11th January 2019Vinaya Vidheya Rama Records- ...  
1  11th January 2019Vinaya Vidheya Rama Records- ...  
2  11th January 2019Vinaya Vidheya Rama 

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# ------------------- CHANGE ONLY THIS LINE FOR OTHER MOVIES -------------------
url = "https://www.sacnilk.com/news/Pushpa_The_Rule_2021_Box_Office_Collection_Day_Wise_Worldwide"
# ------------------------------------------------------------------------------

page = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
soup = BeautifulSoup(page.text, "html.parser")

movie_name = soup.find("h1").text.strip()
print("Movie:", movie_name)

# ------------------- FIXED: GRAB BUDGET, VERDICT, SCREENS, RELEASE DATE -------------------
# Search for specific elements with keywords (handles HTML structure like <strong>)
budget = "Not Found"
verdict = "Not Found"
india_screens = "Not Found"
overseas_screens = "Not Found"
worldwide_screens = "Not Found"
release_date = "Not Found"

# Find all relevant text containers (p, div, span, etc.)
all_elements = soup.find_all(['p', 'div', 'span', 'strong'])

for elem in all_elements:
    text = elem.get_text(strip=True)
    
    # Budget
    if "Budget:" in text:
        budget_match = re.search(r'Budget:\s*(₹\s*\d+(?:\.\d+)?\s*Cr\s*\*\s*Approx)', text)
        if budget_match:
            budget = budget_match.group(1).strip()
    
    # Verdict (look for <strong>Salaar Verdict:</strong> <strong>Hit</strong>)
    if "Verdict:" in text:
        # Get the next sibling or parent text for the value
        verdict_elem = elem.find_next_sibling('strong') or elem.parent.find('strong')
        if verdict_elem:
            verdict = verdict_elem.get_text(strip=True)
        else:
            # Fallback regex for plain text
            verdict_match = re.search(r'Verdict:\s*(Hit|Flop|Blockbuster|Super Hit|Average|Disaster)', text)
            verdict = verdict_match.group(1).strip() if verdict_match else "Not Found"
    
    # Screen counts (look for lines with numbers after labels)
    if "India:" in text and re.search(r'\d+', text):
        india_match = re.search(r'India:\s*(\d+)', text)
        if india_match:
            india_screens = f"India: {india_match.group(1)}"
    if "Overseas:" in text and re.search(r'\d+', text):
        overseas_match = re.search(r'Overseas:\s*(\d+)', text)
        if overseas_match:
            overseas_screens = f"Overseas: {overseas_match.group(1)}"
    if "Worldwide total:" in text and re.search(r'\d+', text):
        worldwide_match = re.search(r'Worldwide total:\s*(\d+)', text)
        if worldwide_match:
            worldwide_screens = f"Worldwide total: {worldwide_match.group(1)}"
    
    # Release Date
    if "Release Date:" in text:
        release_match = re.search(r'Release Date:\s*(.+)', text)
        if release_match:
            release_date = release_match.group(1).strip()

print("Budget:", budget)
print("Verdict:", verdict)
print("India Screens:", india_screens)
print("Overseas Screens:", overseas_screens)
print("Worldwide Screens:", worldwide_screens)
print("Release Date:", release_date)

# ------------------- TABLE 1 (Day wise) -------------------
tables = soup.find_all("table")
daywise_table = tables[0]
rows = []

for row in daywise_table.find_all("tr")[1:]:
    cols = [c.text.strip() for c in row.find_all("td")]
    if len(cols) >= 2:
        rows.append({
            "day": cols[0],
            "india_net": cols[1],
            "change_percent": cols[2] if len(cols) > 2 else None
        })

df_daywise = pd.DataFrame(rows)

# ------------------- TABLE 2 (State wise) -------------------
if len(tables) > 1:
    state_table = tables[1]
    state_header = [th.text.strip().replace(" ", "_").lower() for th in state_table.find_all("th")]

    state_rows = []
    for row in state_table.find_all("tr")[1:]:
        cols = [c.text.strip() for c in row.find_all("td")]
        if cols:
            entry = {}
            for i, col in enumerate(cols):
                if i < len(state_header):
                    entry[state_header[i]] = col
            state_rows.append(entry)

    df_state = pd.DataFrame(state_rows)
else:
    df_state = pd.DataFrame()  # empty if no state table

# ------------------- MERGE BOTH TABLES -------------------
df = pd.merge(df_daywise, df_state, left_on="day", right_on="day", how="left")

# ------------------- ADD ALL NEW INFO -------------------
df["movie_name"] = movie_name
df["budget"] = budget
df["verdict"] = verdict
df["india_screens"] = india_screens
df["overseas_screens"] = overseas_screens
df["worldwide_screens"] = worldwide_screens
df["release_date"] = release_date

# ------------------- SAVE TO CSV -------------------
df.to_csv("p2_raw.csv", index=False)
print("\nSaved → p2_raw.csv")
print("\nFirst 5 rows:")
print(df[["day", "india_net", "budget", "verdict", "india_screens", "release_date"]].head())


Movie: Pushpa: The Rule - Part 2 Box Office Collection | All Language | Day Wise | Worldwide
Budget: Not Found
Verdict: Not Found
India Screens: Not Found
Overseas Screens: Not Found
Worldwide Screens: Not Found
Release Date: 5th December 2024For more and the latest news aboutTollywood Box Office Collection, Stay tuned to us.Disclaimer: The Box Office Data are compiled from various sources and by our own research.

Saved → p2_raw.csv

First 5 rows:
                    day  india_net     budget    verdict india_screens  \
0    Day 0 [ Wednesday]  ₹ 2.35 Cr  Not Found  Not Found     Not Found   
1  Day 1 [1st Thursday]  ₹ 16.9 Cr  Not Found  Not Found     Not Found   
2    Day 2 [1st Friday]  ₹ 8.55 Cr  Not Found  Not Found     Not Found   
3  Day 3 [1st Saturday]  ₹ 11.7 Cr  Not Found  Not Found     Not Found   
4    Day 4 [1st Sunday]  ₹ 13.4 Cr  Not Found  Not Found     Not Found   

                                        release_date  
0  5th December 2024For more and the latest new

In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# ------------------- CHANGE ONLY THIS LINE FOR OTHER MOVIES -------------------
url = "https://www.sacnilk.com/news/Pushpa_2020_Box_Office_Collection_Day_Wise_Worldwide"
# ------------------------------------------------------------------------------

page = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
soup = BeautifulSoup(page.text, "html.parser")

movie_name = soup.find("h1").text.strip()
print("Movie:", movie_name)

# ------------------- FIXED: GRAB BUDGET, VERDICT, SCREENS, RELEASE DATE -------------------
# Search for specific elements with keywords (handles HTML structure like <strong>)
budget = "Not Found"
verdict = "Not Found"
india_screens = "Not Found"
overseas_screens = "Not Found"
worldwide_screens = "Not Found"
release_date = "Not Found"

# Find all relevant text containers (p, div, span, etc.)
all_elements = soup.find_all(['p', 'div', 'span', 'strong'])

for elem in all_elements:
    text = elem.get_text(strip=True)
    
    # Budget
    if "Budget:" in text:
        budget_match = re.search(r'Budget:\s*(₹\s*\d+(?:\.\d+)?\s*Cr\s*\*\s*Approx)', text)
        if budget_match:
            budget = budget_match.group(1).strip()
    
    # Verdict (look for <strong>Salaar Verdict:</strong> <strong>Hit</strong>)
    if "Verdict:" in text:
        # Get the next sibling or parent text for the value
        verdict_elem = elem.find_next_sibling('strong') or elem.parent.find('strong')
        if verdict_elem:
            verdict = verdict_elem.get_text(strip=True)
        else:
            # Fallback regex for plain text
            verdict_match = re.search(r'Verdict:\s*(Hit|Flop|Blockbuster|Super Hit|Average|Disaster)', text)
            verdict = verdict_match.group(1).strip() if verdict_match else "Not Found"
    
    # Screen counts (look for lines with numbers after labels)
    if "India:" in text and re.search(r'\d+', text):
        india_match = re.search(r'India:\s*(\d+)', text)
        if india_match:
            india_screens = f"India: {india_match.group(1)}"
    if "Overseas:" in text and re.search(r'\d+', text):
        overseas_match = re.search(r'Overseas:\s*(\d+)', text)
        if overseas_match:
            overseas_screens = f"Overseas: {overseas_match.group(1)}"
    if "Worldwide total:" in text and re.search(r'\d+', text):
        worldwide_match = re.search(r'Worldwide total:\s*(\d+)', text)
        if worldwide_match:
            worldwide_screens = f"Worldwide total: {worldwide_match.group(1)}"
    
    # Release Date
    if "Release Date:" in text:
        release_match = re.search(r'Release Date:\s*(.+)', text)
        if release_match:
            release_date = release_match.group(1).strip()

print("Budget:", budget)
print("Verdict:", verdict)
print("India Screens:", india_screens)
print("Overseas Screens:", overseas_screens)
print("Worldwide Screens:", worldwide_screens)
print("Release Date:", release_date)

# ------------------- TABLE 1 (Day wise) -------------------
tables = soup.find_all("table")
daywise_table = tables[0]
rows = []

for row in daywise_table.find_all("tr")[1:]:
    cols = [c.text.strip() for c in row.find_all("td")]
    if len(cols) >= 2:
        rows.append({
            "day": cols[0],
            "india_net": cols[1],
            "change_percent": cols[2] if len(cols) > 2 else None
        })

df_daywise = pd.DataFrame(rows)

# ------------------- TABLE 2 (State wise) -------------------
if len(tables) > 1:
    state_table = tables[1]
    state_header = [th.text.strip().replace(" ", "_").lower() for th in state_table.find_all("th")]

    state_rows = []
    for row in state_table.find_all("tr")[1:]:
        cols = [c.text.strip() for c in row.find_all("td")]
        if cols:
            entry = {}
            for i, col in enumerate(cols):
                if i < len(state_header):
                    entry[state_header[i]] = col
            state_rows.append(entry)

    df_state = pd.DataFrame(state_rows)
else:
    df_state = pd.DataFrame()  # empty if no state table

# ------------------- MERGE BOTH TABLES -------------------
df = pd.merge(df_daywise, df_state, left_on="day", right_on="day", how="left")

# ------------------- ADD ALL NEW INFO -------------------
df["movie_name"] = movie_name
df["budget"] = budget
df["verdict"] = verdict
df["india_screens"] = india_screens
df["overseas_screens"] = overseas_screens
df["worldwide_screens"] = worldwide_screens
df["release_date"] = release_date

# ------------------- SAVE TO CSV -------------------
df.to_csv("p1_raw.csv", index=False)
print("\nSaved → p1_raw.csv")
print("\nFirst 5 rows:")
print(df[["day", "india_net", "budget", "verdict", "india_screens", "release_date"]].head())


Movie: Pushpa - The Rise Box Office Collection | Day Wise | Worldwide
Budget: ₹ 150 Cr * Approx
Verdict: Not Found
India Screens: India: 3000
Overseas Screens: Overseas: 1000
Worldwide Screens: Worldwide total: 4000
Release Date: 17th Dec 2021For more and the latest news aboutTollywood Box Office Collection, Stay tuned to us.Disclaimer: The Box Office Data are compiled from various sources and by our own research.

Saved → p1_raw.csv

First 5 rows:
                    day   india_net             budget    verdict  \
0    Day 1 [1st Friday]  ₹ 11.44 Cr  ₹ 150 Cr * Approx  Not Found   
1  Day 2 [1st Saturday]    ₹ 7.4 Cr  ₹ 150 Cr * Approx  Not Found   
2    Day 3 [1st Sunday]    ₹ 7.1 Cr  ₹ 150 Cr * Approx  Not Found   
3    Day 4 [1st Monday]   ₹ 3.35 Cr  ₹ 150 Cr * Approx  Not Found   
4   Day 5 [1st Tuesday]    ₹ 1.8 Cr  ₹ 150 Cr * Approx  Not Found   

  india_screens                                       release_date  
0   India: 3000  17th Dec 2021For more and the latest news abou

In [1]:
import requests
from bs4 import BeautifulSoup
import re
import csv
from io import StringIO

def scrape_movie_data(url):
    """
    Scrapes key box office data from the AndhraBoxOffice info page.
    Returns a dictionary of the extracted data.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    data = {
        "movie_name": "N/A",
        "total_WW_cls": "N/A",
        "hero_name": "N/A",
        "verdict": "N/A",
        "day1_in_cr": "N/A"
    }

    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # --- 1. Basic Info (Movie Name, Hero Name, Verdict) ---
        
        # Movie Name (from <h1> tag)
        h1 = soup.find('h1')
        if h1:
            # Cleans up text like "Dhruva Final Total WW Collections" -> "Dhruva"
            match = re.search(r'(.+?)\s(Final Total WW Collections|Collections)', h1.text.strip())
            data["movie_name"] = match.group(1).strip() if match else h1.text.strip()
        
        # Hero Name (lbl_cast_name)
        hero_span = soup.find('span', id=re.compile(r'lbl_cast_name'))
        if hero_span:
            cast_list = hero_span.text.strip()
            # Assumes the first name is the main hero/star
            data["hero_name"] = cast_list.split(',')[0].strip() if ',' in cast_list else cast_list

        # Verdict (lbl_verdict)
        verdict_span = soup.find('span', id=re.compile(r'lbl_verdict'))
        if verdict_span:
            data["verdict"] = verdict_span.text.strip()

        # --- 2. Collection Data (Total WW Share, Day 1 Share) ---
        
        # Collection numbers are in a table structure, usually label/value in <td>s.
        all_tds = soup.find_all('td')
        for td in all_tds:
            text = td.text.strip()
            
            # total_WW_cls (Worldwide Closing Share)
            if 'Worldwide Closing Share' in text and data['total_WW_cls'] == 'N/A':
                # The value is in the next sibling <td>
                value_td = td.find_next_sibling('td')
                if value_td:
                    data['total_WW_cls'] = value_td.text.strip()
            
            # day1_in_cr (Day 1 Share)
            if 'Day 1 Share' in text and data['day1_in_cr'] == 'N/A':
                value_td = td.find_next_sibling('td')
                if value_td:
                    data['day1_in_cr'] = value_td.text.strip()

    except requests.exceptions.RequestException as e:
        print(f"Error retrieving data: {e}")
        return None
    
    return data

def format_to_csv_string(data_dict):
    """
    Formats the extracted dictionary data into a CSV string.
    """
    if not data_dict:
        return ""

    # Define the order of columns for the CSV
    fieldnames = ['movie_name', 'total_WW_cls', 'hero_name', 'verdict', 'day1_in_cr']
    
    # Use StringIO to capture the CSV output in memory
    output = StringIO()
    writer = csv.DictWriter(output, fieldnames=fieldnames)

    # Write header
    writer.writeheader()
    # Write data row
    writer.writerow(data_dict)

    return output.getvalue()

# --- RUN SCRAPER ---
url = "http://andhraboxoffice.com/info.aspx?id=2000&cid=6&fid=4540"
extracted_data = scrape_movie_data(url)

# Format the extracted data into a CSV string
csv_output = format_to_csv_string(extracted_data)

print(csv_output)

movie_name,total_WW_cls,hero_name,verdict,day1_in_cr
N/A,N/A,N/A,N/A,N/A



In [3]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

HEADERS = {
    "User-Agent": "Mozilla/5.0"
}

def scrape_andhra_boxoffice(url):
    r = requests.get(url, headers=HEADERS, timeout=20)
    soup = BeautifulSoup(r.text, "lxml")

    # ---------------- MOVIE NAME ----------------
    try:
        movie_name = soup.find("span", id="lblMovieName").text.strip()
    except:
        movie_name = "Unknown"

    hero_name = "NA"
    verdict = "NA"
    total_ww = "NA"
    day1 = "NA"

    # ---------------- INFO TABLE ----------------
    tables = soup.find_all("table")

    for table in tables:
        rows = table.find_all("tr")
        for row in rows:
            cols = [c.text.strip() for c in row.find_all("td")]

            if len(cols) < 2:
                continue

            label = cols[0].lower()
            value = cols[1]

            if "hero" in label:
                hero_name = value

            elif "verdict" in label:
                verdict = value

            elif "world wide" in label:
                total_ww = value.replace("Cr", "").strip()

            elif "first day" in label:
                day1 = value.replace("Cr", "").strip()

    # ---------------- FINAL STRUCTURE ----------------
    data = {
        "movie_name": movie_name,
        "total_WW_cls": total_ww,
        "hero_name": hero_name,
        "verdict": verdict,
        "day1_in_cr": day1
    }

    return pd.DataFrame([data])


# ---------------- RUN ----------------
url = "http://andhraboxoffice.com/info.aspx?id=2000&cid=6&fid=4540"

df = scrape_andhra_boxoffice(url)

df.to_csv("andhra_boxoffice_movie.csv", index=False)
print(df)


  movie_name total_WW_cls hero_name verdict day1_in_cr
0    Unknown           NA        NA      NA         NA


In [5]:
import requests
import pandas as pd
import time
from bs4 import BeautifulSoup
from tqdm import tqdm

HEADERS = {
    "User-Agent": "Mozilla/5.0"
}

def scrape_movie(movie_id, cid=6):
    url = f"http://andhraboxoffice.com/info.aspx?id={movie_id}&cid={cid}"
    
    try:
        r = requests.get(url, headers=HEADERS, timeout=15)
        if r.status_code != 200:
            return None

        soup = BeautifulSoup(r.text, "lxml")

        # ---------- MOVIE NAME ----------
        try:
            movie_name = soup.find("span", id="lblMovieName").text.strip()
        except:
            return None   # invalid movie page

        hero = verdict = ww = day1 = "NA"

        # ---------- INFO TABLE ----------
        for table in soup.find_all("table"):
            for row in table.find_all("tr"):
                cols = [c.text.strip() for c in row.find_all("td")]
                if len(cols) < 2:
                    continue

                key = cols[0].lower()
                val = cols[1]

                if "hero" in key:
                    hero = val
                elif "verdict" in key:
                    verdict = val
                elif "world wide" in key:
                    ww = val.replace("Cr", "").strip()
                elif "first day" in key:
                    day1 = val.replace("Cr", "").strip()

        return {
            "movie_id": movie_id,
            "movie_name": movie_name,
            "hero_name": hero,
            "verdict": verdict,
            "total_WW_cls": ww,
            "day1_in_cr": day1
        }

    except:
        return None


# ---------------- BULK RUN ----------------
movies = []
START_ID = 1
END_ID = 2500   # you can increase later

for movie_id in tqdm(range(START_ID, END_ID + 1)):
    data = scrape_movie(movie_id)
    if data:
        movies.append(data)
    time.sleep(1.5)   # VERY IMPORTANT (avoid ban)

df = pd.DataFrame(movies)

df.to_csv("andhra_boxoffice_bulk.csv", index=False)
print("Saved:", len(df), "movies")


  0%|          | 0/2500 [00:00<?, ?it/s]

 17%|█▋        | 437/2500 [12:03<56:55,  1.66s/it]  


KeyboardInterrupt: 

In [10]:
import requests
import pandas as pd
import time
from bs4 import BeautifulSoup
from tqdm import tqdm

HEADERS = {"User-Agent": "Mozilla/5.0"}

TOP_HEROES = [
    "Mahesh Babu",
    "Pawan Kalyan",
    "Prabhas",
    "Allu Arjun",
    "Jr NTR",
    "Ram Charan"
]

def scrape_movie(movie_id):
    url = f"http://andhraboxoffice.com/info.aspx?id={movie_id}&cid=6"

    try:
        r = requests.get(url, headers=HEADERS, timeout=15)
        if r.status_code != 200:
            return None

        soup = BeautifulSoup(r.text, "lxml")

        movie_name = soup.find("span", id="lblMovieName").text.strip()

        hero = verdict = ww = day1 = "NA"

        for table in soup.find_all("table"):
            for row in table.find_all("tr"):
                cols = [c.text.strip() for c in row.find_all("td")]
                if len(cols) < 2:
                    continue

                key = cols[0].lower()
                val = cols[1]

                if "hero" in key:
                    hero = val
                elif "verdict" in key:
                    verdict = val
                elif "world wide" in key:
                    ww = val.replace("Cr", "").strip()
                elif "first day" in key:
                    day1 = val.replace("Cr", "").strip()

        if hero not in TOP_HEROES:
            return None

        return {
            "movie_id": movie_id,
            "movie_name": movie_name,
            "hero_name": hero,
            "day1_in_cr": day1,
            "total_WW_cls": ww,
            "verdict": verdict
        }

    except:
        return None


# -------- BULK RUN --------
movies = []

for mid in tqdm(range(1, 2600)):
    data = scrape_movie(mid)
    if data:
        movies.append(data)
    time.sleep(1.5)

df_movies = pd.DataFrame(movies)
df_movies.to_csv("top6_heroes_movies.csv", index=False)

print("Movies scraped:", len(df_movies))
def clean_number(x):
    try:
        return float(str(x).replace(",", "").strip())
    except:
        return None

df_movies["day1_in_cr"] = df_movies["day1_in_cr"].apply(clean_number)
df_movies["total_WW_cls"] = df_movies["total_WW_cls"].apply(clean_number)
def is_hit(verdict):
    verdict = str(verdict).lower()
    return any(x in verdict for x in ["hit", "blockbuster", "super hit", "industry hit"])

career = df_movies.groupby("hero_name").agg(
    total_movies=("movie_name", "count"),
    hits=("verdict", lambda x: sum(is_hit(v) for v in x)),
    avg_day1=("day1_in_cr", "mean"),
    total_ww=("total_WW_cls", "sum")
).reset_index()

career["flops"] = career["total_movies"] - career["hits"]

career.to_csv("top6_heroes_career_summary.csv", index=False)
print(career)


 14%|█▍        | 366/2599 [09:59<1:00:56,  1.64s/it]


KeyboardInterrupt: 

In [9]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import time

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}

def scrape_specific_movie(movie_name, hero_name=None):
    """
    Scrape box office data for a SPECIFIC Telugu movie from multiple sources
    Returns: movie_name, day1_WW, total_WW, verdict
    """
    
    # Priority sources for Telugu movies (most reliable)
    sources = [
        f"https://www.sacnilk.com/news/{movie_name.replace(' ', '_').lower()}_box_office_collection_day_wise_worldwide",
        f"https://www.koimoi.com/box-office/{movie_name.lower().replace(' ', '-')}_box_office_collection",
        f"https://en.wikipedia.org/w/index.php?search={movie_name}+box+office+telugu"
    ]
    
    data = {'movie_name': movie_name, 'day1_WW': 'N/A', 'total_WW': 'N/A', 'verdict': 'N/A', 'source': 'None'}
    
    for url in sources:
        try:
            response = requests.get(url, headers=headers, timeout=10)
            soup = BeautifulSoup(response.text, 'html.parser')
            
            # Try Sacnilk/Koimoi patterns first (most reliable for Day 1)
            day1_patterns = [
                r'Day 1.*?₹?([\d,.]+)\s*cr',
                r'Opening Day.*?₹?([\d,.]+)\s*cr',
                r'1st Day.*?₹?([\d,.]+).*?Worldwide'
            ]
            
            total_patterns = [
                r'Worldwide.*?₹?([\d,.]+)\s*cr',
                r'Total.*?₹?([\d,.]+)\s*cr',
                r'Final.*?₹?([\d,.]+)\s*cr'
            ]
            
            text = soup.get_text()
            
            # Extract Day 1
            for pattern in day1_patterns:
                match = re.search(pattern, text, re.IGNORECASE)
                if match:
                    data['day1_WW'] = match.group(1).replace(',', '')
                    data['source'] = url
                    break
            
            # Extract Total WW
            for pattern in total_patterns:
                match = re.search(pattern, text, re.IGNORECASE)
                if match:
                    data['total_WW'] = match.group(1).replace(',', '')
                    break
            
            # Extract Verdict
            verdict_patterns = [r'\b(Super Hit|Blockbuster|Hit|Flop|Disaster|Average)\b']
            for pattern in verdict_patterns:
                match = re.search(pattern, text, re.IGNORECASE)
                if match:
                    data['verdict'] = match.group(1).title()
                    break
            
            if data['day1_WW'] != 'N/A':
                break  # Found data, stop searching
                
        except Exception:
            continue
    
    time.sleep(1)  # Rate limit
    return data

def main():
    # Test with specific movies from your list
    movies_to_scrape = [
        'Baahubali 2 The Conclusion',
        'Pushpa 2 The Rule', 
        'Kalki 2898 AD',
        'RRR',
        'Salaar',
        'Dhurva',
        'Saaho'
    ]
    
    all_data = []
    
    for movie in movies_to_scrape:
        print(f"Scraping: {movie}")
        result = scrape_specific_movie(movie)
        all_data.append(result)
        print(f"Day 1: {result['day1_WW']} Cr | Total: {result['total_WW']} Cr")
        time.sleep(2)
    
    # Save to CSV (your exact format)
    df = pd.DataFrame(all_data)
    df = df[['movie_name', 'total_WW', 'verdict', 'day1_WW']]
    df.columns = ['movie_name', 'total_WW_cls', 'verdict', 'day1_WW_Gross_cr']
    
    df.to_csv('specific_movie_data.csv', index=False)
    print("\n✅ Saved to specific_movie_data.csv")
    print(df)

if __name__ == "__main__":
    main()


Scraping: Baahubali 2 The Conclusion
Day 1: N/A Cr | Total: 1700 Cr
Scraping: Pushpa 2 The Rule
Day 1: N/A Cr | Total: 200 Cr
Scraping: Kalki 2898 AD
Day 1: N/A Cr | Total: 200 Cr
Scraping: RRR
Day 1: 6 Cr | Total: 200 Cr
Scraping: Salaar
Day 1: N/A Cr | Total: 200 Cr
Scraping: Dhurva
Day 1: N/A Cr | Total: 200 Cr
Scraping: Saaho
Day 1: N/A Cr | Total: 200 Cr

✅ Saved to specific_movie_data.csv
                   movie_name total_WW_cls      verdict day1_WW_Gross_cr
0  Baahubali 2 The Conclusion         1700  Blockbuster              N/A
1           Pushpa 2 The Rule          200  Blockbuster              N/A
2               Kalki 2898 AD          200  Blockbuster              N/A
3                         RRR          200  Blockbuster                6
4                      Salaar          200  Blockbuster              N/A
5                      Dhurva          200  Blockbuster              N/A
6                       Saaho          200          Hit              N/A


In [11]:
import requests
from bs4 import BeautifulSoup
import re

def scrape_andhraboxoffice(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    try:
        r = requests.get(url, headers=headers, timeout=10)
        r.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching page: {e}")
        return {"error": "Failed to fetch page"}
    
    soup = BeautifulSoup(r.text, "html.parser")
    
    # Movie name - from title tag, clean it
    try:
        title_text = soup.find("title").text.strip()
        movie_name = re.sub(r" Final Total WW Collections.*", "", title_text).strip()
    except:
        movie_name = "Unknown"
    print(f"Scraping: {movie_name}")
    
    # Hero name - search for common patterns, e.g., in meta or text
    hero_name = "Unknown"
    page_text = soup.get_text().lower()
    possible_heroes = ["ram charan", "chiranjeevi", "pawan kalyan", "mahesh babu", "prabhas"]  # Add more as needed
    for hero in possible_heroes:
        if hero in page_text:
            hero_name = hero.capitalize()
            break
    # Alternatively, if structured, e.g., find span or div with cast
    
    # Verdict - search for "Verdict :" or similar
    verdict = "Unknown"
    verdict_match = re.search(r"Verdict\s*:\s*([^\n]+)", soup.get_text(), re.IGNORECASE)
    if verdict_match:
        verdict = verdict_match.group(1).strip()
    
    # Collections - find table(s)
    total_ww_cls = "N/A"
    day1_in_cr = "N/A"  # Day 1 might not be on final page; assume from text or separate
    area_data = {}
    
    tables = soup.find_all("table")
    if tables:
        for table in tables:
            rows = table.find_all("tr")
            for row in rows:
                cols = [td.text.strip() for td in row.find_all("td")]
                if len(cols) >= 2:
                    area = cols[0].strip()
                    value = cols[1].strip()
                    area_data[area] = value
                    if "worldwide" in area.lower() or "total ww" in area.lower():
                        total_ww_cls = value
                    if "day 1" in area.lower() or "opening day" in area.lower():
                        day1_in_cr = value
    
    # If no explicit day1, perhaps estimate or note absence
    if day1_in_cr == "N/A" and "day1" in page_text.lower():
        # More advanced parsing if needed
        day1_match = re.search(r"Day 1\s*:\s*([\d.]+)\s*Cr", soup.get_text(), re.IGNORECASE)
        if day1_match:
            day1_in_cr = day1_match.group(1) + " Cr"
    
    return {
        "movie_name": movie_name,
        "hero_name": hero_name,
        "verdict": verdict,
        "total_WW_cls": total_ww_cls,
        "day1_in_cr": day1_in_cr,
        "area_data": area_data  # Extra for robustness
    }

# Example usage
url = "http://andhraboxoffice.com/info.aspx?id=2000&cid=6&fid=4540"
data = scrape_andhraboxoffice(url)
print(data)

Scraping: Dhruva
{'movie_name': 'Dhruva', 'hero_name': 'Unknown', 'verdict': 'Unknown', 'total_WW_cls': '', 'day1_in_cr': 'N/A', 'area_data': {'Previous\nNext\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nNEWS\n\n\n\n\n\nGALLERY\n\n\n\n\n\nTRADE\n\n\n\n\n\nCOLLECTIONS\n\n\n\n\n\nCENTRES\n\n\n\n\n\nREVIEWS\n\n\n\n\n\nVIDEOS\n\n\n\n\n\n\n\n\nFORUM\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\r\n            \xa0\xa0\xa0\xa0\xa0\xa0Collections\r\n            \n\n\n\n01/09/2017\n\n\nDhruva Final Total WW Collections\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nShareOnFB \n\n\n\n\nTweet\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nDhruva Final Total WW Collections\n\n\n\n\n\n\n\n\nDhruva 10 Days Total WW Collections\n\n\n\n\n\n\n\n\nDhruva 1st Week Total WW Collections\n\n\n\n\n\n\n\n\nDhruva 1st Weekend Total WW Collections\n\n\n\n\n\n\n\n\nDhruva 5 Days Collections\n\n\n\n\n\n\n\n\nDhruva 1st Day Total WW Collections\n\n\n\n\n\n\n\n\n\n\n\n\n\xa0\n\n\n\n\n\n\n\n\n\n\n

In [13]:
import requests
from bs4 import BeautifulSoup

url = "http://andhraboxoffice.com/info.aspx?id=2000&cid=6&fid=4540"

headers = {"User-Agent": "Mozilla/5.0"}
r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.text, "lxml")

# ---------------- MOVIE NAME (SAFE) ----------------
movie_name = "Unknown Movie"

# try 1
span = soup.find("span", id="lblMovieName")
if span:
    movie_name = span.text.strip()
else:
    # try 2 (fallback)
    h1 = soup.find("h1")
    if h1:
        movie_name = h1.text.strip()
    else:
        # try 3 (last fallback)
        movie_name = soup.title.text.strip()

# ---------------- DEFAULT VALUES ----------------
hero_name = "NA"
verdict = "NA"
total_ww = "NA"
day1 = "NA"

# ---------------- SCRAPE TABLE DATA ----------------
for table in soup.find_all("table"):
    for row in table.find_all("tr"):
        cols = [td.text.strip() for td in row.find_all("td")]

        if len(cols) < 2:
            continue

        label = cols[0].lower()
        value = cols[1]

        if "hero" in label:
            hero_name = value

        elif "verdict" in label:
            verdict = value

        elif "world wide" in label:
            total_ww = value.replace("Cr", "").strip()

        elif "first day" in label:
            day1 = value.replace("Cr", "").strip()

# ---------------- RESULT ----------------
print("Movie Name :", movie_name)
print("Hero       :", hero_name)
print("Verdict    :", verdict)
print("Day 1 (Cr) :", day1)
print("WW Total   :", total_ww)



Movie Name : Dhruva Final Total WW Collections| AndhraBoxOffice.com
Hero       : NA
Verdict    : NA
Day 1 (Cr) : NA
WW Total   : NA
