## Search Terms

In [14]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Base URL pattern for Google Trends Year in Search
base_url = "https://trends.withgoogle.com/year-in-search/{year}/us/"

# List to store all data
all_data = []

# Loop through each year from 2015 to 2025
for year in range(2015, 2026):
    url = base_url.format(year=year)
    
    try:
        # Fetch the webpage content
        headers = {"User-Agent": "Mozilla/5.0"}
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.text, "html.parser")

        # Find all category containers
        categories = soup.find_all("div", class_="card-inner-container")

        for category in categories:
            # Extract the category title
            title_tag = category.find("h4", class_="glue-headline--headline-4")
            if title_tag:
                category_title = title_tag.text.strip()
            else:
                continue  # Skip if no title found

            # Extract the list items within the category
            items = category.find_all("li", class_="glue-card__list-item")
            for rank, item in enumerate(items, start=1):
                # Extract only the text inside the search term
                term_tag = item.find("h4")  # The actual trending search term is inside <h4>
                if term_tag:
                    search_term = term_tag.text.strip()
                else:
                    search_term = item.get_text(strip=True).split("Search it")[0].strip()  # Remove "Search it" text

                all_data.append([year, category_title, rank, search_term])
    
        print(f"✅ Successfully scraped {year}")
    
    except Exception as e:
        print(f"❌ Failed to scrape {year}: {e}")

# Create a DataFrame
df_trending_searches = pd.DataFrame(all_data, columns=["Year", "Category", "Rank", "Search Term"])


✅ Successfully scraped 2015
✅ Successfully scraped 2016
✅ Successfully scraped 2017
✅ Successfully scraped 2018
✅ Successfully scraped 2019
✅ Successfully scraped 2020
✅ Successfully scraped 2021
✅ Successfully scraped 2022
✅ Successfully scraped 2023
✅ Successfully scraped 2024
✅ Successfully scraped 2025


In [14]:
# import pandas as pd
# from IPython.display import display, HTML  # Corrected import

# # Save DataFrame as a CSV file
# csv_filename = "google_trending_searches_2015_2025.csv"
# df_trending_searches.to_csv(csv_filename, index=False)
# print(f"✅ Data saved as '{csv_filename}'")

# # Generate HTML table with scrollable container
# scrollable_table_html = f"""
# <style>
#     .scrollable-table {{
#         overflow-y: auto;
#         height: 500px;
#         display: block;
#         border: 1px solid #ddd;
#     }}
#     table {{
#         border-collapse: collapse;
#         width: 100%;
#     }}
#     th, td {{
#         border: 1px solid #ddd;
#         padding: 8px;
#         text-align: left;
#     }}
#     th {{
#         background-color: #f4f4f4;
#         position: sticky;
#         top: 0;
#     }}
# </style>
# <div class="scrollable-table">
#     {df_trending_searches.to_html(index=False, classes='table table-striped table-hover')}
# </div>
# """

# # Display the scrollable table
# display(HTML(scrollable_table_html))

## Weekly Search Interest

In [15]:
from pytrends.request import TrendReq
import pandas as pd
import time
import os

# Load processed data if it exists
clean_data_file = "clean_trends_data.csv"
failed_data_file = "skipped_failed_terms.csv"

# Load existing clean data
if os.path.exists(clean_data_file):
    df_existing = pd.read_csv(clean_data_file)

    # Track completed terms by (search term, year) tuple
    completed_terms = set(zip(df_existing["Search Term"], df_existing["Year"].astype(str)))

    print(f"✅ Found existing data: {len(completed_terms)} search term-year combinations already processed.")
else:
    df_existing = pd.DataFrame()
    completed_terms = set()

# Load input CSV
input_csv = "google_trending_searches_2015_2025_x.csv"
df_trending_searches_processed = pd.read_csv(input_csv)

# Ask the user whether to start from the top or bottom
start_from = input("📌 Enter 'top' to start from the beginning or 'bottom' to start from the end: ").strip().lower()

if start_from == "bottom":
    df_trending_searches_processed = df_trending_searches_processed[::-1]  # Reverse the order

# Initialize Pytrends
pytrends = TrendReq(hl='en-US', tz=360, retries=3)

# Prepare storage for results, skipped terms, and failed terms
all_trends = []
skipped_terms = []  # Terms with no data
failed_terms = []  # Terms that failed after max retries

# Extract search terms that still need to be processed
pending_terms = [
    (row["Search Term"], str(row["Year"])) for _, row in df_trending_searches_processed.iterrows()
    if (row["Search Term"], str(row["Year"])) not in completed_terms  # Ensure it checks BOTH term & year
]

max_retries = 3  # Maximum retries per search term

while pending_terms:
    new_failed_terms = []  # Reset failed terms list in each loop

    for search_term, year in pending_terms:
        attempt = 0
        success = False

        while attempt < max_retries and not success:
            try:
                # Build payload for a single search term
                pytrends.build_payload([search_term], timeframe=f"{year}-01-01 {year}-12-31", geo='US')

                # Get interest over time
                df_trends = pytrends.interest_over_time()

                # Check if the response is empty before processing
                if df_trends.empty:
                    print(f"⚠️ No data found for {search_term} ({year}) - Skipping.")
                    skipped_terms.append((search_term, year, "No Data Available"))  # Ensure it's a tuple
                    success = True  # Mark as success so it doesn't keep retrying
                    continue  # Skip to next search term

                # Remove 'isPartial' column if present
                if 'isPartial' in df_trends.columns:
                    df_trends = df_trends.drop(columns=['isPartial'])

                # Convert data to long format (Tidy Data)
                df_trends = df_trends.reset_index().melt(id_vars=["date"], var_name="Search Term", value_name="Interest")

                # Add metadata columns
                df_trends["Year"] = year

                # Append results
                all_trends.append(df_trends)

                print(f"✅ Extracted: {search_term} ({year})")
                success = True  # Mark as success

                # Save progress after every search term
                pd.concat(all_trends + [df_existing], ignore_index=True).to_csv(clean_data_file, index=False)

            except Exception as e:
                attempt += 1
                print(f"❌ Attempt {attempt}/{max_retries} failed for {search_term} ({year}): {e}")
                time.sleep(5 * attempt)  # Exponential backoff (5s, 10s, 15s)

        if not success:  # If all retries failed, add to failed list
            new_failed_terms.append((search_term, year, "Max Retries Reached"))  # Ensure it's a tuple

    # Update pending terms for the next loop (only failed ones)
    pending_terms = [(term[0], term[1]) for term in new_failed_terms]  # Ensure it's a list of tuples

    if pending_terms:
        print(f"🔄 Retrying {len(pending_terms)} failed terms...")

# Save skipped and failed terms to CSV for review
if skipped_terms or failed_terms:
    df_skipped_failed = pd.DataFrame(skipped_terms + failed_terms, columns=["Search Term", "Year", "Reason"])
    df_skipped_failed.to_csv(failed_data_file, index=False)
    print(f"❌ Skipped & failed terms saved to {failed_data_file}")
else:
    print("✅ No skipped or failed terms!")

print("✅ All terms successfully processed! Check clean_trends_data.csv")


✅ Found existing data: 2023 search term-year combinations already processed.
✅ Extracted: Donald Trump (2015)
✅ Extracted: Bernie Sanders (2015)
✅ Extracted: Ben Carson (2015)
✅ Extracted: Ted Cruz (2015)
✅ Extracted: Deez Nuts (2015)
✅ Extracted: John Boehner (2015)
✅ Extracted: Marco Rubio (2015)
✅ Extracted: Donald Trump (2015)
✅ Extracted: 8 minute back workout (2016)
⚠️ No data found for StrawberryDaiquiri (2016) - Skipping.
✅ Extracted: Zdeno Chára (2016)
⚠️ No data found for drivers license, Olivia Rodrigo (2021) - Skipping.
⚠️ No data found for All Too Well, Taylor Swift (2021) - Skipping.
✅ Extracted: Wants and Needs, Drake (2021)
⚠️ No data found for Poker Face, Lady Gaga (2021) - Skipping.
⚠️ No data found for Fancy Like, Walker Hayes (2021) - Skipping.
⚠️ No data found for deja vu, Olivia Rodrigo (2021) - Skipping.
✅ Extracted: Fruit Roll-Ups trend (2023)
⚠️ No data found for Ella Baila Sola - Eslabon Armado and Peso Pluma (2023) - Skipping.
⚠️ No data found for Boy's a lia

In [21]:
import pandas as pd

# Load the original dataset
original_file = "google_trending_searches_2015_2025_x.csv"
df_original = pd.read_csv(original_file)

# Load the processed data
processed_file = "clean_trends_data.csv"
df_processed = pd.read_csv(processed_file)

# Load the skipped/failed terms
skipped_failed_file = "skipped_failed_terms.csv"
df_skipped_failed = pd.read_csv(skipped_failed_file)

# Combine processed and skipped/failed data
df_combined = pd.concat([df_processed[['Search Term', 'Year']], df_skipped_failed[['Search Term', 'Year']]])

# Drop duplicates in case a term appears multiple times in the same year
df_combined = df_combined.drop_duplicates()

# Merge original data with combined data to find missing terms
df_merged = pd.merge(df_original, df_combined, on=['Search Term', 'Year'], how='left', indicator=True)

# Terms present in original but not in combined are the missing ones
df_missing = df_merged[df_merged['_merge'] == 'left_only']

# Display the missing terms
print(f"Total missing terms: {len(df_missing)}")
print(df_missing[['Search Term', 'Year']])

Total missing terms: 0
Empty DataFrame
Columns: [Search Term, Year]
Index: []


In [1]:
from pytrends.request import TrendReq
import pandas as pd

# Initialize Pytrends
pytrends = TrendReq(hl='en-US', tz=360)

# Define Keywords for comparison
keywords = ["Donald Trump", "Wikipedia"]

# Build Payload
pytrends.build_payload(keywords, timeframe='2017-01-01 2017-12-31', geo='US')

# Get weekly interest over time
df_trends = pytrends.interest_over_time()

# Remove 'isPartial' column if present
if 'isPartial' in df_trends.columns:
    df_trends = df_trends.drop(columns=['isPartial'])

# Reset index to include date as a column
df_trends.reset_index(inplace=True)

print(df_trends.head())  # Show first few rows


        date  Donald Trump  Wikipedia
0 2017-01-01            26         17
1 2017-01-08            45         18
2 2017-01-15           100         18
3 2017-01-22            77         20
4 2017-01-29            63         19


In [2]:
from pytrends.request import TrendReq
import pandas as pd

# Initialize Pytrends
pytrends = TrendReq(hl='en-US', tz=360)

# Define Keywords for comparison
keywords = ["Donald Trump", "Wikipedia"]

# Build Payload
pytrends.build_payload(keywords, timeframe='2017-01-01 2017-12-31', geo='US')

# Get weekly interest over time
df_trends = pytrends.interest_over_time()

# Remove 'isPartial' column if present
if 'isPartial' in df_trends.columns:
    df_trends = df_trends.drop(columns=['isPartial'])

# Reset index to include date as a column
df_trends.reset_index(inplace=True)

print(df_trends.head())  # Show first few rows


        date  Donald Trump  Wikipedia
0 2017-01-01            26         17
1 2017-01-08            45         18
2 2017-01-15           100         18
3 2017-01-22            77         20
4 2017-01-29            63         19


In [4]:
from pytrends.request import TrendReq
import pandas as pd

# Initialize Pytrends
pytrends = TrendReq(hl='en-US', tz=360)

# Define Keywords for comparison
keywords = ["Bernie Sanders"]

# Build Payload
pytrends.build_payload(keywords, timeframe='2015-01-01 2015-12-31', geo='US')

# Get weekly interest over time
df_trends = pytrends.interest_over_time()

# Remove 'isPartial' column if present
if 'isPartial' in df_trends.columns:
    df_trends = df_trends.drop(columns=['isPartial'])

# Reset index to include date as a column
df_trends.reset_index(inplace=True)

print(df_trends.head())  # Show first few rows


        date  Bernie Sanders
0 2014-12-28               1
1 2015-01-04               1
2 2015-01-11               1
3 2015-01-18               1
4 2015-01-25               1


## Annualised Comp to Wikipedia

In [16]:
from pytrends.request import TrendReq
import pandas as pd
import time
import os

# Load input dataset
input_file = "clean_trends_data.csv"
df_clean_trends = pd.read_csv(input_file)

# Step 1: Extract unique search term-year combinations into a new DataFrame
clean_trends_data_processed = df_clean_trends[["Search Term", "Year"]].drop_duplicates()

# Save to file for reference
clean_trends_data_processed.to_csv("clean_trends_data_processed.csv", index=False)

print(f"✅ Extracted {len(clean_trends_data_processed)} unique search term-year combinations.")

# Load processed data if it exists
output_file = "trends_with_wikipedia.csv"
failed_file = "failed_trends_with_wikipedia.csv"

# Load existing processed data
if os.path.exists(output_file):
    df_existing = pd.read_csv(output_file)
    completed_terms = set(zip(df_existing["Search Term"], df_existing["Year"].astype(str)))
    print(f"✅ Found existing data: {len(completed_terms)} search term-year combinations already processed.")
else:
    df_existing = pd.DataFrame(columns=["Search Term", "Average Interest", "Wikipedia Interest", "Year"])
    completed_terms = set()

# Initialize Pytrends
pytrends = TrendReq(hl='en-US', tz=360, retries=3)

# Prepare storage for failed terms
failed_terms = []

# Extract search terms that still need to be processed
pending_terms = [
    (row["Search Term"], str(row["Year"])) for _, row in clean_trends_data_processed.iterrows()
    if (row["Search Term"], str(row["Year"])) not in completed_terms
]

max_retries = 3  # Maximum retries per search term

while pending_terms:
    new_failed_terms = []  # Reset failed terms list in each loop

    for search_term, year in pending_terms:
        attempt = 0
        success = False

        while attempt < max_retries and not success:
            try:
                # Build payload for search term + Wikipedia
                pytrends.build_payload([search_term, "Wikipedia"], timeframe=f"{year}-01-01 {year}-12-31", geo='US')

                # Get interest over time
                df_trends = pytrends.interest_over_time()

                # Check if the response is empty before processing
                if df_trends.empty:
                    print(f"⚠️ No data found for {search_term} ({year}) - Skipping.")
                    failed_terms.append({"Search Term": search_term, "Year": year, "Reason": "No Data Available"})
                    success = True  # Mark as success so it doesn't keep retrying
                    continue  # Skip to next search term

                # Remove 'isPartial' column if present
                if 'isPartial' in df_trends.columns:
                    df_trends = df_trends.drop(columns=['isPartial'])

                # Compute the yearly average interest
                avg_interest = df_trends.mean()

                # Extract search term's average interest
                search_interest = avg_interest.get(search_term, None)
                wikipedia_interest = avg_interest.get("Wikipedia", None)

                # If either value is None, skip and log as failed
                if search_interest is None or wikipedia_interest is None:
                    print(f"⚠️ Missing data for {search_term} ({year}) - Skipping.")
                    failed_terms.append({"Search Term": search_term, "Year": year, "Reason": "Missing Data"})
                    success = True  # Mark as success
                    continue

                # Create a structured DataFrame
                df_result = pd.DataFrame({
                    "Search Term": [search_term],
                    "Average Interest": [search_interest],
                    "Wikipedia Interest": [wikipedia_interest],
                    "Year": [year]
                })

                # Save results **immediately** to prevent duplication
                df_result.to_csv(output_file, mode='a', header=not os.path.exists(output_file), index=False)

                print(f"✅ Extracted: {search_term} ({year})")
                success = True  # Mark as success

            except Exception as e:
                attempt += 1
                print(f"❌ Attempt {attempt}/{max_retries} failed for {search_term} ({year}): {e}")
                time.sleep(5 * attempt)  # Exponential backoff (5s, 10s, 15s)

        if not success:  # If all retries failed, add to failed list
            new_failed_terms.append((search_term, year))

    # Update pending terms for the next loop (only failed ones)
    pending_terms = new_failed_terms

    if pending_terms:
        print(f"🔄 Retrying {len(pending_terms)} failed terms...")

# Save failed terms to CSV for review
if failed_terms:
    df_failed = pd.DataFrame(failed_terms)
    df_failed.to_csv(failed_file, index=False)
    print(f"❌ Failed terms saved to {failed_file}")
else:
    print("✅ No failed terms!")

print("✅ All terms successfully processed! Check trends_with_wikipedia.csv")


✅ Extracted 2038 unique search term-year combinations.
✅ Found existing data: 2030 search term-year combinations already processed.
✅ Extracted: 8 minute back workout (2016)
✅ Extracted: Zdeno Chára (2016)
✅ Extracted: Wants and Needs, Drake (2021)
✅ Extracted: Fruit Roll-Ups trend (2023)
✅ Extracted: Bzrp Music Sessions, Vol. 53 - Shakira (2023)
✅ Extracted: Red, White & Royal Blue - Casey McQuiston (2023)
✅ Extracted: American Prometheus - Kai Bird and Martin J. Sherwin (2023)
✅ Extracted: Dill pickle chicken salad recipe (2024)
✅ No failed terms!
✅ All terms successfully processed! Check trends_with_wikipedia.csv


In [64]:
from pytrends.request import TrendReq
import pandas as pd

# Initialize Pytrends
pytrends = TrendReq(hl='en-US', tz=360)

# Define Keywords for comparison
keywords = ["Donald Trump", "Wikipedia"]

# Define Year
year = "2017"

# Build Payload
pytrends.build_payload(keywords, timeframe=f"{year}-01-01 {year}-12-31", geo='US')

# Get weekly interest over time
df_trends = pytrends.interest_over_time()

# Remove 'isPartial' column if present
if 'isPartial' in df_trends.columns:
    df_trends = df_trends.drop(columns=['isPartial'])

# Compute the yearly average interest
df_average_interest = df_trends.mean().reset_index()
df_average_interest.columns = ["Search Term", "Average Interest"]

# Add Year column
df_average_interest["Year"] = year

# Display the DataFrame
print(df_average_interest)


    Search Term  Average Interest  Year
0  Donald Trump         27.000000  2017
1     Wikipedia         17.660377  2017


## Extract State Comparison

In [17]:
from pytrends.request import TrendReq
import pandas as pd
import time
import os

# Load processed data if it exists
output_file = "state_interest_comparison.csv"
failed_file = "failed_state_interest.csv"

# Load existing processed data
if os.path.exists(output_file):
    df_existing = pd.read_csv(output_file)
    completed_terms = set(zip(df_existing["Search Term"], df_existing["Year"].astype(str)))
    print(f"✅ Found existing data: {len(completed_terms)} search term-year combinations already processed.")
else:
    df_existing = pd.DataFrame(columns=["Search Term", "Year", "State", "Search Term %", "Wikipedia %"])
    completed_terms = set()

# Load input dataset
input_file = "clean_trends_data_processed.csv"
df_clean_trends = pd.read_csv(input_file)

# Initialize Pytrends
pytrends = TrendReq(hl='en-US', tz=360, retries=3)

# Prepare storage for failed terms
failed_terms = []

# Extract search terms that still need to be processed
pending_terms = [
    (row["Search Term"], str(row["Year"])) for _, row in df_clean_trends.iterrows()
    if (row["Search Term"], str(row["Year"])) not in completed_terms
]

max_retries = 3  # Maximum retries per search term

while pending_terms:
    new_failed_terms = []  # Reset failed terms list in each loop

    for search_term, year in pending_terms:
        attempt = 0
        success = False

        while attempt < max_retries and not success:
            try:
                # Build payload for search term + Wikipedia
                pytrends.build_payload([search_term, "Wikipedia"], timeframe=f"{year}-01-01 {year}-12-31", geo='US')

                # Extract Regional Interest
                df_regions = pytrends.interest_by_region(resolution='REGION')

                # Check if the response is empty before processing
                if df_regions.empty:
                    print(f"⚠️ No regional data found for {search_term} ({year}) - Skipping.")
                    failed_terms.append({"Search Term": search_term, "Year": year, "Reason": "No Regional Data"})
                    success = True  # Mark as success so it doesn't keep retrying
                    continue  # Skip to next search term

                # Normalize as percentages
                df_regions["Total"] = df_regions.sum(axis=1)
                df_regions[f"{search_term} %"] = (df_regions[search_term] / df_regions["Total"]) * 100
                df_regions["Wikipedia %"] = (df_regions["Wikipedia"] / df_regions["Total"]) * 100

                # Keep only relevant columns
                df_regions = df_regions[[f"{search_term} %", "Wikipedia %"]]

                # Reset index to get state names
                df_regions.reset_index(inplace=True)
                df_regions.rename(columns={"geoName": "State"}, inplace=True)

                # Add metadata columns
                df_regions["Search Term"] = search_term
                df_regions["Year"] = year

                # Reorder columns for clarity
                df_regions = df_regions[["Search Term", "Year", "State", f"{search_term} %", "Wikipedia %"]]

                # Save results **immediately** to prevent duplication
                df_regions.to_csv(output_file, mode='a', header=not os.path.exists(output_file), index=False)

                print(f"✅ Extracted regional data for: {search_term} ({year})")
                success = True  # Mark as success

            except Exception as e:
                attempt += 1
                print(f"❌ Attempt {attempt}/{max_retries} failed for {search_term} ({year}): {e}")
                time.sleep(5 * attempt)  # Exponential backoff (5s, 10s, 15s)

        if not success:  # If all retries failed, add to failed list
            new_failed_terms.append((search_term, year))

    # Update pending terms for the next loop (only failed ones)
    pending_terms = new_failed_terms

    if pending_terms:
        print(f"🔄 Retrying {len(pending_terms)} failed terms...")

# Save failed terms to CSV for review
if failed_terms:
    df_failed = pd.DataFrame(failed_terms)
    df_failed.to_csv(failed_file, index=False)
    print(f"❌ Failed terms saved to {failed_file}")
else:
    print("✅ No failed terms!")

print("✅ All regional data successfully processed! Check state_interest_comparison.csv")


✅ Found existing data: 2030 search term-year combinations already processed.
✅ Extracted regional data for: 8 minute back workout (2016)
✅ Extracted regional data for: Zdeno Chára (2016)
✅ Extracted regional data for: Wants and Needs, Drake (2021)
✅ Extracted regional data for: Fruit Roll-Ups trend (2023)
✅ Extracted regional data for: Bzrp Music Sessions, Vol. 53 - Shakira (2023)
✅ Extracted regional data for: Red, White & Royal Blue - Casey McQuiston (2023)
✅ Extracted regional data for: American Prometheus - Kai Bird and Martin J. Sherwin (2023)
✅ Extracted regional data for: Dill pickle chicken salad recipe (2024)
✅ No failed terms!
✅ All regional data successfully processed! Check state_interest_comparison.csv


In [1]:
from pytrends.request import TrendReq
import pandas as pd

# Initialize Pytrends
pytrends = TrendReq(hl='en-US', tz=360)

# Define Keywords
keywords = ["Kevin Spacey", "Wikipedia"]

# Build Payload for comparison
pytrends.build_payload(keywords, timeframe='2017-01-01 2017-12-31', geo='US')

# Extract Regional Interest
df_regions = pytrends.interest_by_region(resolution='REGION')

# Normalize as percentages
df_regions["Total"] = df_regions.sum(axis=1)
df_regions["Kevin Spacey %"] = (df_regions["Kevin Spacey"] / df_regions["Total"]) * 100
df_regions["Wikipedia %"] = (df_regions["Wikipedia"] / df_regions["Total"]) * 100

# Keep only percentage columns
df_regions = df_regions[["Kevin Spacey %", "Wikipedia %"]]

# Save to CSV
df_regions.to_csv("state_interest_comparison.csv")

In [2]:
from pytrends.request import TrendReq
import pandas as pd

pytrends = TrendReq(hl='en-US', tz=360)
pytrends.build_payload(["Google"], timeframe="now 1-d", geo='US')

df_trends = pytrends.interest_over_time()
print(df_trends.head())

                     Google  isPartial
date                                  
2025-02-28 18:56:00     100      False
2025-02-28 19:04:00     100      False
2025-02-28 19:12:00      96      False
2025-02-28 19:20:00      93      False
2025-02-28 19:28:00      94      False


  df = df.fillna(False)


In [18]:
import pandas as pd
from IPython.core.display import display, HTML

# Define file paths
file_search_trends = "clean_trends_data.csv"
file_wiki_comp_annual = "trends_with_wikipedia.csv"
file_wiki_comp_state = "state_interest_comparison.csv"

# Load data into DataFrames
d_search_trends = pd.read_csv(file_search_trends)
d_wiki_comp_annual = pd.read_csv(file_wiki_comp_annual)
d_wiki_comp_state = pd.read_csv(file_wiki_comp_state)

# Rename columns for consistency
d_search_trends.columns = ["date", "search_term", "interest", "year"]
d_wiki_comp_annual.columns = ["search_term", "search_term_interest", "wiki_interest", "year"]
d_wiki_comp_state.columns = ["search_term", "year", "state", "term_perc", "wiki_perc"]

# Convert DataFrames to scrollable HTML tables
def display_scrollable_dataframe(df, title):
    """Displays a DataFrame as a scrollable HTML table in Jupyter Notebook."""
    display(HTML(f"<h3>{title}</h3>"))
    display(HTML(df.to_html(notebook=True, escape=False)))

# Display tables as scrollable HTML
display_scrollable_dataframe(d_search_trends.head(20), "Search Trends Data")
display_scrollable_dataframe(d_wiki_comp_annual.head(20), "Annual Wikipedia Comparison")
display_scrollable_dataframe(d_wiki_comp_state.head(20), "State-Level Wikipedia Comparison")


  from IPython.core.display import display, HTML


Unnamed: 0,date,search_term,interest,year
0,2014-12-28 00:00:00,Donald Trump,2,2015
1,2015-01-04 00:00:00,Donald Trump,4,2015
2,2015-01-11 00:00:00,Donald Trump,3,2015
3,2015-01-18 00:00:00,Donald Trump,3,2015
4,2015-01-25 00:00:00,Donald Trump,5,2015
5,2015-02-01 00:00:00,Donald Trump,3,2015
6,2015-02-08 00:00:00,Donald Trump,2,2015
7,2015-02-15 00:00:00,Donald Trump,4,2015
8,2015-02-22 00:00:00,Donald Trump,3,2015
9,2015-03-01 00:00:00,Donald Trump,2,2015


Unnamed: 0,search_term,search_term_interest,wiki_interest,year
0,Boston Red Sox,14.886792,85.962264,2021
1,Los Angeles Lakers,7.037736,90.867925,2021
2,Refrigerator pickle brine recipe,0.0,89.641509,2024
3,Pickled cucumber salad recipe,0.0,89.641509,2024
4,Justin Timberlake,8.396226,42.283019,2024
5,Drake,20.490566,16.90566,2024
6,Kendrick Lamar,12.679245,39.433962,2024
7,Cleveland Guardians,13.584906,91.509434,2024
8,San Diego Padres,5.339623,91.509434,2024
9,Indiana Fever,19.150943,91.509434,2024


Unnamed: 0,search_term,year,state,term_perc,wiki_perc
0,Boston Red Sox,2021,Alabama,8.0,92.0
1,Boston Red Sox,2021,Alaska,11.0,89.0
2,Boston Red Sox,2021,Arizona,10.0,90.0
3,Boston Red Sox,2021,Arkansas,6.0,94.0
4,Boston Red Sox,2021,California,7.0,93.0
5,Boston Red Sox,2021,Colorado,8.0,92.0
6,Boston Red Sox,2021,Connecticut,40.0,60.0
7,Boston Red Sox,2021,Delaware,11.0,89.0
8,Boston Red Sox,2021,District of Columbia,6.0,94.0
9,Boston Red Sox,2021,Florida,16.0,84.0


In [20]:
# Calculate absolute interest
d_wiki_comp_annual["absolute_interest"] = d_wiki_comp_annual["search_term_interest"] / d_wiki_comp_annual["wiki_interest"]

# Sort by absolute interest in descending order
d_wiki_comp_annual = d_wiki_comp_annual.sort_values(by="absolute_interest", ascending=False)

# Display as a scrollable table
import IPython.core.display as display

display.display(display.HTML("<h3>Annual Wikipedia Comparison (Ordered by Absolute Interest)</h3>"))
display.display(display.HTML(d_wiki_comp_annual.to_html(notebook=True, escape=False)))


  display.display(display.HTML("<h3>Annual Wikipedia Comparison (Ordered by Absolute Interest)</h3>"))


  display.display(display.HTML(d_wiki_comp_annual.to_html(notebook=True, escape=False)))


Unnamed: 0,search_term,search_term_interest,wiki_interest,year,absolute_interest
1959,Election results,2.169811,0.0,2020,inf
1577,IT,74.754717,4.150943,2017,18.009091
420,Coronavirus,16.0,1.0,2020,16.0
643,Wordle,52.754717,5.018868,2022,10.511278
1216,Trump,15.566038,2.0,2016,7.783019
496,NBA,41.169811,5.301887,2021,7.765125
781,Ohio,68.207547,9.226415,2023,7.392638
338,Unemployment,41.09434,5.698113,2020,7.211921
857,Football,33.283019,5.09434,2015,6.533333
1547,Ford,88.320755,17.830189,2017,4.953439
