#### Scrape code for fetching number of sequels etc. per year

In [None]:

import requests
from bs4 import BeautifulSoup
import pandas as pd

# IMDb base URL for advanced search
BASE_URL = "https://www.imdb.com/search/title/"

def fetch_total_hits(year, keyword):
    """
    Fetch the total number of hits for a given year and keyword on IMDb.

    The search is restricted to feature films released within the given
    calendar year. The total is read from the result counter on the page
    (text like "1-50 of 86" or "1-50 of 1,234").

    Parameters:
        year (int): The year to search for.
        keyword (str): The keyword to include in the search.

    Returns:
        int: Total number of hits for the specified year and keyword, or 0
        when the response status is not 200 or no counter element is found.

    Raises:
        requests.exceptions.RequestException: If the request fails at the
        network level or exceeds the timeout.
    """
    params = {
        "title_type": "feature",
        "release_date": f"{year}-01-01,{year}-12-31",  # Restrict to the given year
        "keywords": keyword,
        "sort": "boxoffice_gross_us,desc"  # Sort by US box office gross
    }

    headers = {"User-Agent": "Mozilla/5.0"}
    # A timeout keeps one stalled connection from hanging the whole scraping loop
    response = requests.get(BASE_URL, headers=headers, params=params, timeout=30)

    # Report failed requests instead of silently counting them as "no hits"
    if response.status_code != 200:
        print(f"Failed to fetch data for {year} with keyword '{keyword}': Status code {response.status_code}")
        return 0

    soup = BeautifulSoup(response.content, 'html.parser')

    # Locate the element containing the total number of hits
    # NOTE(review): this class name looks build-generated and may change — confirm it still matches
    total_hits_tag = soup.find("div", class_="sc-13add9d7-3 fwjHEn")
    if not total_hits_tag:
        return 0

    total_hits_text = total_hits_tag.text.strip()  # Text like "1-50 of 86"
    count_text = total_hits_text.split("of")[-1].strip()  # Keep the part after "of"
    # Drop the thousands separator so counts such as "1,234" convert cleanly
    return int(count_text.replace(",", ""))


# Search terms whose yearly hit counts are collected
keywords = [
    "prequel",
    "sequel",
    "remake",
]

# Release years covered by the scrape: 1924 through 2024 inclusive
years = range(1924, 2024 + 1)  # Adjust the range as needed

# Accumulates one record per (year, keyword) request
data = []

# The driver below is wrapped in a string literal, so it does not run.
''' for year in years:
    for keyword in keywords:
        print(f"Fetching data for year {year} and keyword '{keyword}'...")
        total_hits = fetch_total_hits(year, keyword)
        data.append({"Year": year, "Keyword": keyword, "Hits": total_hits})

# Convert the data to a DataFrame
hits_df = pd.DataFrame(data)

hits_df.to_csv("qqq.csv", index=False) '''

#### Scrape code for fetching number of movies made each year


In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# IMDb base URL for advanced search
BASE_URL = "https://www.imdb.com/search/title/"

def fetch_total_hits(year):
    """
    Fetch the total number of hits for a given year on IMDb.

    The search is restricted to feature films released within the given
    calendar year. The total is read from the result counter on the page
    (text like "1-50 of 86" or "1-50 of 1,234").

    Parameters:
        year (int): The year to search for.

    Returns:
        int: Total number of hits for the specified year, or 0 when the
        response status is not 200 or no counter element is found.

    Raises:
        requests.exceptions.RequestException: If the request fails at the
        network level or exceeds the timeout.
    """
    params = {
        "title_type": "feature",
        "release_date": f"{year}-01-01,{year}-12-31",  # Restrict to the given year
        "sort": "boxoffice_gross_us,desc"  # Sort by US box office gross
    }

    headers = {"User-Agent": "Mozilla/5.0"}
    # A timeout keeps one stalled connection from hanging the whole scraping loop
    response = requests.get(BASE_URL, headers=headers, params=params, timeout=30)

    # Report failed requests instead of silently counting them as "no hits"
    if response.status_code != 200:
        print(f"Failed to fetch data for {year}: Status code {response.status_code}")
        return 0

    soup = BeautifulSoup(response.content, 'html.parser')

    # Locate the element containing the total number of hits
    # NOTE(review): this class name looks build-generated and may change — confirm it still matches
    total_hits_tag = soup.find("div", class_="sc-13add9d7-3 fwjHEn")
    if not total_hits_tag:
        return 0

    total_hits_text = total_hits_tag.text.strip()  # Text like "1-50 of 86"
    count_text = total_hits_text.split("of")[-1].strip()  # Keep the part after "of"
    # Drop the thousands separator "," before converting to int
    return int(count_text.replace(",", ""))


# Release years covered by the scrape: 1924 through 2024 inclusive
years = range(1924, 2024 + 1)  # Adjust the range as needed

# Accumulates one record per requested year
data = []

# The driver below is wrapped in a string literal, so it does not run.
'''for year in years:
    print(f"Fetching data for year {year}")
    total_hits = fetch_total_hits(year)
    data.append({"Year": year, "Hits": total_hits})

# Convert the data to a DataFrame
allhits_df = pd.DataFrame(data)

allhits_df.to_csv("imdb_hits_years.csv", index=False) '''

#### Scrape code for fetching top 25 movies per year


In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# IMDb base URL for advanced search
BASE_URL = "https://www.imdb.com/search/title/"

def fetch_top_movies(year):
    """
    Fetch the titles and IMDb ratings of a given year's top-grossing movies.

    The search is sorted by US box office gross and only the first results
    page is read, so the number of movies returned is whatever that page
    lists (intended to be the top 25 — the function itself does not cap it).

    Parameters:
        year (int): The year to fetch data for.

    Returns:
        list: One dictionary per listed movie with the keys "Year", "Title"
        and "Rating". "Title" and "Rating" are None when the matching element
        is missing. An empty list is returned when the response status is not
        200 or no movie list is found.

    Raises:
        requests.exceptions.RequestException: If the request fails at the
        network level or exceeds the timeout.
    """
    params = {
        "title_type": "feature",
        "release_date": f"{year}-01-01,{year}-12-31",  # Restrict to the given year
        "sort": "boxoffice_gross_us,desc"  # Sort by US box office gross
    }

    headers = {"User-Agent": "Mozilla/5.0"}
    # A timeout keeps one stalled connection from hanging the whole scraping loop
    response = requests.get(BASE_URL, headers=headers, params=params, timeout=30)
    if response.status_code != 200:
        print(f"Failed to fetch data for {year}: Status code {response.status_code}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    # Locate the container for the movie list
    movie_list = soup.select_one("ul.ipc-metadata-list")
    if not movie_list:
        print(f"No movie list found for {year}")
        return []

    # Select all <li> elements within the <ul> container
    list_items = movie_list.select("li.ipc-metadata-list-summary-item")
    print(f"Year {year}: Found {len(list_items)} <li> elements.")  # Debugging output

    movies = []
    for item in list_items:
        # Title and rating are optional per item; a missing element yields None
        title_tag = item.select_one("h3.ipc-title__text")
        title = title_tag.text.strip() if title_tag else None

        rating_tag = item.select_one("span.ipc-rating-star--rating")
        rating = float(rating_tag.text.strip()) if rating_tag else None

        movies.append({
            "Year": year,
            "Title": title,
            "Rating": rating
        })

    return movies


# Disabled driver: the code below is wrapped in a string literal, so it does not run.
# If re-enabled it would call fetch_top_movies for each year in the range and
# write the combined rows to "imdb_top_25_movies.csv".
'''
# Fetch top 25 movies for multiple years
years = range(1924, 2025)  # Adjust the range as needed
all_movies = []

for year in years:
    print(f"Fetching movies for year {year}")
    all_movies.extend(fetch_top_movies(year))


movies_df = pd.DataFrame(all_movies)

movies_df.to_csv("imdb_top_25_movies.csv", index=False)
'''


#### Scrape code for top 25 movies each year (with keyword search)


In [None]:
#### Scrape code for top 25 movies each year (with keyword search)
import requests
from bs4 import BeautifulSoup
import pandas as pd

# IMDb base URL for advanced search
BASE_URL = "https://www.imdb.com/search/title/"

def fetch_top_movies(year, keyword=None):
    """
    Fetch the titles and IMDb ratings of a given year's top-grossing movies,
    filtered by keyword if specified.

    The search is sorted by US box office gross and only the first results
    page is read, so the number of movies returned is whatever that page
    lists (intended to be the top 25 — the function itself does not cap it).

    Parameters:
        year (int): The year to fetch data for.
        keyword (str): The keyword to filter the movies (e.g., "sequel", "remake").
            When omitted or empty, no keyword filter is sent.

    Returns:
        list: One dictionary per listed movie with the keys "Year", "Title",
        "Rating" and "Keyword". "Title" and "Rating" are None when the
        matching element is missing. An empty list is returned when the
        response status is not 200 or no movie list is found.

    Raises:
        requests.exceptions.RequestException: If the request fails at the
        network level or exceeds the timeout.
    """
    params = {
        "title_type": "feature",
        "release_date": f"{year}-01-01,{year}-12-31",  # Restrict to the given year
        "sort": "boxoffice_gross_us,desc",  # Sort by US box office gross
    }

    # Add keyword to the parameters if specified
    if keyword:
        params["keywords"] = keyword

    headers = {"User-Agent": "Mozilla/5.0"}
    # A timeout keeps one stalled connection from hanging the whole scraping loop
    response = requests.get(BASE_URL, headers=headers, params=params, timeout=30)

    # Check for successful response
    if response.status_code != 200:
        print(f"Failed to fetch data for {year} with keyword '{keyword}': Status code {response.status_code}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    # Locate the container for the movie list
    movie_list = soup.select_one("ul.ipc-metadata-list")
    if not movie_list:
        print(f"No movie list found for {year} with keyword '{keyword}'")
        return []

    # Select all <li> elements within the <ul> container
    list_items = movie_list.select("li.ipc-metadata-list-summary-item")
    print(f"Year {year}, Keyword '{keyword}': Found {len(list_items)} <li> elements.")  # Debugging output

    movies = []
    for item in list_items:
        # Title and rating are optional per item; a missing element yields None
        title_tag = item.select_one("h3.ipc-title__text")
        title = title_tag.text.strip() if title_tag else None

        rating_tag = item.select_one("span.ipc-rating-star--rating")
        rating = float(rating_tag.text.strip()) if rating_tag else None

        movies.append({
            "Year": year,
            "Title": title,
            "Rating": rating,
            "Keyword": keyword
        })

    return movies

# Release years covered by the scrape: 1924 through 2024 inclusive
years = range(1924, 2024 + 1)  # Adjust the range as needed

# Keywords to filter by
keywords = [
    "sequel",
    "remake",
    "prequel",
]

# Accumulates the movie records returned for every (keyword, year) pair
all_movies = []

# The driver below is wrapped in a string literal, so it does not run.
'''
for keyword in keywords:
    for year in years:
        print(f"Fetching movies for year {year} with keyword '{keyword}'")
        all_movies.extend(fetch_top_movies(year, keyword=keyword))

# Convert to DataFrame
movies_df = pd.DataFrame(all_movies)

movies_df

#Save to CSV
movies_df.to_csv("imdb_top_25_movies_with_keywords.csv", index=False)'''

#### Graphics code

In [None]:
# Graphics code for the figure "Prequels, Sequels, and Remakes Over the Years". (Needs access to the dataframes in the Project file to run.)

import matplotlib.pyplot as plt

# Stacked bar chart of the yearly prequel, sequel and remake counts in imdb_hits_p
plt.figure(figsize=(12, 6))

# Prequels form the base segment of every bar
plt.bar(
    imdb_hits_p["Year"],
    imdb_hits_p["prequel"],
    label="Prequels",
    color="blue",
    edgecolor="black",
)
# Sequels start where the prequel segment ends
plt.bar(
    imdb_hits_p["Year"],
    imdb_hits_p["sequel"],
    bottom=imdb_hits_p["prequel"],
    label="Sequels",
    color="green",
    edgecolor="black",
)
# Remakes sit on top of prequels plus sequels
plt.bar(
    imdb_hits_p["Year"],
    imdb_hits_p["remake"],
    bottom=imdb_hits_p["prequel"] + imdb_hits_p["sequel"],
    label="Remakes",
    color="orange",
    edgecolor="black",
)

# Slant the x-axis tick labels
plt.xticks(rotation=45)
# Dashed red reference line marking the year 2005
plt.axvline(x=2005, color="red", linestyle="--", linewidth=1, label="2005")

# Title, axis labels, legend and horizontal gridlines
plt.title("Prequels, Sequels, and Remakes Over the Years", fontsize=16)
plt.xlabel("Year", fontsize=14)
plt.ylabel("Frequency", fontsize=14)
plt.legend()
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()

# Saving is disabled: the code below is wrapped in a string literal, so it does not run.
'''
# Save the plot
plot_path = "imdb_hits_keyword_plot.png"  # File path to save the plot
plt.savefig(plot_path)
plt.close()  # Close the plot to avoid displaying it twice '''


# Graphics code for the total number of movies released over the years and the share of those movies that were sequels etc. (Needs access to the dataframes in the Project file to run.)

import matplotlib.pyplot as plt

# One figure with two y-axes: total hits as bars (left) and share as a line (right)
fig, ax1 = plt.subplots(figsize=(12, 6))

# Left axis: total hits per year, styled in blue
bar1 = ax1.bar(
    imdb_hits_merged["Year"],
    imdb_hits_merged["Hits"],
    color="blue",
    edgecolor="black",
    label="Total Hits",
)
ax1.set_xlabel("Year", fontsize=14)
ax1.set_ylabel("Total Hits", fontsize=14, color="blue")
ax1.tick_params(axis="y", labelcolor="blue")
ax1.grid(axis="y", linestyle="--", alpha=0.7)  # Horizontal gridlines follow the left axis

# Right axis sharing the same x-axis: the "Share" column, styled in red
ax2 = ax1.twinx()
(line2,) = ax2.plot(
    imdb_hits_merged["Year"],
    imdb_hits_merged["Share"],
    color="red",
    label="Share of Sequels etc.",
    linewidth=2,
    marker="o",
)
ax2.set_ylabel("Share", fontsize=14, color="red")
ax2.tick_params(axis="y", labelcolor="red")

# Figure title
plt.title("Movies Released and Share of Sequels etc. Over the Years", fontsize=16)

# Single legend combining the bar series and the line
lines = [bar1, line2]
labels = [handle.get_label() for handle in lines]
plt.legend(lines, labels, loc="upper left", fontsize=12)

# Tight layout for better spacing
plt.tight_layout()

# Saving is disabled: the code below is wrapped in a string literal, so it does not run.
'''
# Save the plot
plot_path = "imdb_total_and_share_plot.png"  # File path to save the plot
plt.savefig(plot_path)
plt.close()  # Close the plot to avoid displaying it twice'''


#Graphics code for the title percentage graph (Needs access to the dataframes in the Project file to run)

import matplotlib.pyplot as plt

# Add a centered 7-row moving average of Title_Percentage as a new column
title_counts_df["Moving_Average"] = (
    title_counts_df["Title_Percentage"]
    .rolling(window=7, center=True)
    .mean()
)

# Raw yearly percentage as a marked line
plt.figure(figsize=(12, 6))
plt.plot(
    title_counts_df["Year"],
    title_counts_df["Title_Percentage"],
    marker="o",
    label="Sequels etc.",
    color="blue",
)

# Smoothed series drawn over the raw one
plt.plot(
    title_counts_df["Year"],
    title_counts_df["Moving_Average"],
    label="Moving Average (7 Years)",
    color="orange",
    linewidth=2,
)

# Dashed red reference line marking the year 2005
plt.axvline(x=2005, color="red", linestyle="--", label="Year 2005 Marker")

# Title, axis labels, gridlines, slanted tick labels and legend
plt.title("Percentage Sequels etc. over the years in the top 25 movies by US box office", fontsize=16)
plt.xlabel("Year", fontsize=14)
plt.ylabel("Percentage (%)", fontsize=14)
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.xticks(rotation=45)
plt.legend()
plt.tight_layout()

# Saving is disabled: the code below is wrapped in a string literal, so it does not run.
'''
# Save the plot
plot_path = "perc_sequels.png"  # File path to save the plot
plt.savefig(plot_path)
plt.close()  # Close the plot to avoid displaying it twice
'''
