## Search Terms

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Scrape the "Year in Search" pages (one page per year) into a single table
# with columns Year / Category / Rank / Search Term, then write it to CSV.

# Base URL pattern for Google Trends Year in Search
base_url = "https://trends.withgoogle.com/year-in-search/{year}/us/"

# Output file. Defined here so the cell runs on a fresh kernel; previously the
# final print referenced a name that only existed as leftover kernel state.
csv_filename = "google_trends_year_in_search.csv"

# Request settings are loop-invariant, so build them once.
headers = {"User-Agent": "Mozilla/5.0"}
request_timeout = 30  # seconds; without a timeout a stalled connection hangs the cell

# List to store all data
all_data = []

# Loop through each year from 2015 to 2025
for year in range(2015, 2026):
    url = base_url.format(year=year)

    try:
        # Fetch the webpage content; raise on 4xx/5xx so an error page is
        # reported as a failure instead of being parsed as an empty result.
        response = requests.get(url, headers=headers, timeout=request_timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        rows_before = len(all_data)

        # Each category card holds a title and a ranked list of terms
        for category in soup.find_all("div", class_="card-inner-container"):
            title_tag = category.find("h4", class_="glue-headline--headline-4")
            if not title_tag:
                continue  # Skip cards without a title
            category_title = title_tag.text.strip()

            items = category.find_all("li", class_="glue-card__list-item")
            for rank, item in enumerate(items, start=1):
                # The search term normally sits in an <h4> inside the list item
                term_tag = item.find("h4")
                if term_tag:
                    search_term = term_tag.text.strip()
                else:
                    # Fallback: use the item's text and cut off the trailing "Search it" label
                    search_term = item.get_text(strip=True).split("Search it")[0].strip()

                all_data.append([year, category_title, rank, search_term])

        if len(all_data) == rows_before:
            # The request succeeded but none of the expected elements were found
            print(f"⚠️ No search terms found for {year}")
        else:
            print(f"✅ Successfully scraped {year}")

    except Exception as e:
        # Best effort: report the failing year and carry on with the next one
        print(f"❌ Failed to scrape {year}: {e}")

# Create a DataFrame
df_trending_searches = pd.DataFrame(all_data, columns=["Year", "Category", "Rank", "Search Term"])

# Actually write the file that the message below announces
df_trending_searches.to_csv(csv_filename, index=False)

print(f"✅ Data saved to {csv_filename}")




✅ Successfully scraped 2015
✅ Successfully scraped 2016
✅ Successfully scraped 2017
✅ Successfully scraped 2018
✅ Successfully scraped 2019
✅ Successfully scraped 2020
✅ Successfully scraped 2021
✅ Successfully scraped 2022
✅ Successfully scraped 2023
✅ Successfully scraped 2024
✅ Successfully scraped 2025
✅ Data saved to google_trends_year_in_search.csv


In [8]:
# import pandas as pd
# from IPython.display import display, HTML  # Corrected import

# # Save DataFrame as a CSV file
# csv_filename = "google_trending_searches_2015_2025.csv"
# df_trending_searches.to_csv(csv_filename, index=False)
# print(f"✅ Data saved as '{csv_filename}'")

# # Generate HTML table with scrollable container
# scrollable_table_html = f"""
# <style>
#     .scrollable-table {{
#         overflow-y: auto;
#         height: 500px;
#         display: block;
#         border: 1px solid #ddd;
#     }}
#     table {{
#         border-collapse: collapse;
#         width: 100%;
#     }}
#     th, td {{
#         border: 1px solid #ddd;
#         padding: 8px;
#         text-align: left;
#     }}
#     th {{
#         background-color: #f4f4f4;
#         position: sticky;
#         top: 0;
#     }}
# </style>
# <div class="scrollable-table">
#     {df_trending_searches.to_html(index=False, classes='table table-striped table-hover')}
# </div>
# """

# # Display the scrollable table
# display(HTML(scrollable_table_html))

## Weekly Search Interest

In [3]:
from pytrends.request import TrendReq
import pandas as pd
import time
import os

# Fetch weekly Google Trends interest for every (search term, year) pair in the
# input CSV, checkpointing to clean_data_file after each success so that an
# interrupted run can be resumed.

# Checkpoint of collected results, and the list of terms that could not be fetched
clean_data_file = "clean_trends_data.csv"
failed_data_file = "failed_terms.csv"

# Load existing clean data
if os.path.exists(clean_data_file):
    df_existing = pd.read_csv(clean_data_file)
    completed_terms = set(df_existing["Search Term"].unique())
    # Resume on (term, year): the same term can be listed for several years and
    # each year needs its own request. Year is cast to str because it is read
    # back from the CSV as a number but compared against str(year) below.
    completed_pairs = set(zip(df_existing["Search Term"], df_existing["Year"].astype(str)))
    print(f"✅ Found existing data: {len(completed_terms)} search terms already processed.")
else:
    df_existing = pd.DataFrame()
    completed_terms = set()
    completed_pairs = set()

# Load the full list of trending search terms
input_csv = "google_trending_searches_2015_2025_x.csv"
df_trending_searches_processed = pd.read_csv(input_csv)

# Initialize Pytrends
pytrends = TrendReq(hl='en-US', tz=360, retries=3)

# Prepare storage for results
all_trends = []

# Extract (term, year) pairs that still need to be processed.
# dict.fromkeys de-duplicates while keeping the original order, so a pair that
# appears more than once in the input is requested only once.
pending_terms = list(dict.fromkeys(
    (term, str(year))
    for term, year in zip(
        df_trending_searches_processed["Search Term"],
        df_trending_searches_processed["Year"],
    )
    if (term, str(year)) not in completed_pairs
))

max_retries = 3    # Maximum retries per search term within one round
max_rounds = 3     # Maximum passes over the still-failing terms
request_pause = 2  # Seconds to wait after each successful request

round_number = 0
# Bounded: persistent failures (e.g. HTTP 429) would otherwise loop forever
while pending_terms and round_number < max_rounds:
    round_number += 1
    failed_terms = []  # Reset failed terms list in each round

    for search_term, year in pending_terms:
        attempt = 0
        success = False

        while attempt < max_retries and not success:
            try:
                # Build payload for a single search term
                pytrends.build_payload([search_term], timeframe=f"{year}-01-01 {year}-12-31", geo='US')

                # Get interest over time
                df_trends = pytrends.interest_over_time()

                if df_trends.empty:
                    # An empty frame has no "date" index, so the melt below
                    # would raise; retrying cannot help when there is no data.
                    print(f"⚠️ No data returned for {search_term} ({year})")
                    success = True
                else:
                    # Remove 'isPartial' column if present
                    if 'isPartial' in df_trends.columns:
                        df_trends = df_trends.drop(columns=['isPartial'])

                    # Convert data to long format (Tidy Data)
                    df_trends = df_trends.reset_index().melt(id_vars=["date"], var_name="Search Term", value_name="Interest")

                    # Add metadata columns
                    df_trends["Year"] = year

                    # Append results
                    all_trends.append(df_trends)

                    print(f"✅ Extracted: {search_term} ({year})")
                    success = True  # Mark as success

                    # Save progress after every search term
                    pd.concat(all_trends + [df_existing], ignore_index=True).to_csv(clean_data_file, index=False)

                # Pause between requests to reduce the chance of being rate limited
                time.sleep(request_pause)

            except Exception as e:
                attempt += 1
                print(f"❌ Attempt {attempt}/{max_retries} failed for {search_term} ({year}): {e}")
                time.sleep(5 * attempt)  # Linear backoff (5s, 10s, 15s)

        if not success:  # If all retries failed, add to failed list
            failed_terms.append((search_term, year))

    # Update pending terms for the next round (only failed ones)
    pending_terms = failed_terms

    if pending_terms and round_number < max_rounds:
        print(f"🔄 Retrying {len(pending_terms)} failed terms...")

if pending_terms:
    # Record what is still missing instead of claiming success
    pd.DataFrame(pending_terms, columns=["Search Term", "Year"]).to_csv(failed_data_file, index=False)
    print(f"⚠️ {len(pending_terms)} terms still failing after {max_rounds} rounds; saved to {failed_data_file}")
else:
    # If all terms are processed, remove the failed_terms.csv
    if os.path.exists(failed_data_file):
        os.remove(failed_data_file)

    print("✅ All terms successfully processed! Check clean_trends_data.csv")


✅ Found existing data: 1171 search terms already processed.
❌ Attempt 1/3 failed for Duke Blue Devils Men's Basketball (2015): The request failed: Google returned a response with code 429
❌ Attempt 2/3 failed for Duke Blue Devils Men's Basketball (2015): The request failed: Google returned a response with code 429
❌ Attempt 3/3 failed for Duke Blue Devils Men's Basketball (2015): The request failed: Google returned a response with code 429
❌ Attempt 1/3 failed for Lenny Kravitz (2015): The request failed: Google returned a response with code 429
❌ Attempt 2/3 failed for Lenny Kravitz (2015): The request failed: Google returned a response with code 429
❌ Attempt 3/3 failed for Lenny Kravitz (2015): The request failed: Google returned a response with code 429
❌ Attempt 1/3 failed for Nicole Curtis (2015): The request failed: Google returned a response with code 429
❌ Attempt 2/3 failed for Nicole Curtis (2015): The request failed: Google returned a response with code 429
❌ Attempt 3/3 fa

KeyboardInterrupt: 

In [58]:
from pytrends.request import TrendReq
import pandas as pd

# Compare 2017 US search interest for two keywords.

# Trends client
pytrends = TrendReq(hl='en-US', tz=360)

# Terms to compare against each other
keywords = ["Donald Trump", "Wikipedia"]

# Register the query: calendar year 2017, United States
pytrends.build_payload(keywords, timeframe='2017-01-01 2017-12-31', geo='US')

# Fetch interest over time, discard the 'isPartial' flag column when it is
# there, and turn the date index into a regular column
df_trends = (
    pytrends.interest_over_time()
    .drop(columns=['isPartial'], errors='ignore')
    .reset_index()
)

print(df_trends.head())  # Show first few rows


        date  Donald Trump  Wikipedia
0 2017-01-01            27         17
1 2017-01-08            48         18
2 2017-01-15           100         18
3 2017-01-22            79         20
4 2017-01-29            64         19


In [11]:
from pytrends.request import TrendReq
import pandas as pd

# Same two-keyword comparison for 2017 (US), run again in a separate cell.

# Trends client
pytrends = TrendReq(hl='en-US', tz=360)

# Terms to compare against each other
keywords = ["Donald Trump", "Wikipedia"]

# Register the query: calendar year 2017, United States
pytrends.build_payload(keywords, timeframe='2017-01-01 2017-12-31', geo='US')

# Interest over time for the registered query
df_trends = pytrends.interest_over_time()

# Keep every column except the 'isPartial' flag (a no-op when it is absent)
df_trends = df_trends[[col for col in df_trends.columns if col != 'isPartial']]

# Move the date index into an ordinary column
df_trends = df_trends.reset_index()

print(df_trends.head())  # Show first few rows


TooManyRequestsError: The request failed: Google returned a response with code 429

In [7]:
from pytrends.request import TrendReq
import pandas as pd

# Interest over time for a single keyword across 2015 (US).

# Trends client
pytrends = TrendReq(hl='en-US', tz=360)

# Single term to look up
keywords = ["Bernie Sanders"]

# Register the query: calendar year 2015, United States
pytrends.build_payload(keywords, timeframe='2015-01-01 2015-12-31', geo='US')

# Interest over time for the registered query
df_trends = pytrends.interest_over_time()

# Mask out the 'isPartial' flag column (if any), then expose the date index
# as a regular column
df_trends = df_trends.loc[:, df_trends.columns != 'isPartial'].reset_index()

print(df_trends.head())  # Show first few rows


TooManyRequestsError: The request failed: Google returned a response with code 429

## Annualised Comparison to Wikipedia

In [64]:
from pytrends.request import TrendReq
import pandas as pd

# Average the 2017 interest values per keyword into one row per search term.

# Trends client
pytrends = TrendReq(hl='en-US', tz=360)

# Terms to compare against each other
keywords = ["Donald Trump", "Wikipedia"]

# Year under study (kept as a string; it is interpolated into the timeframe)
year = "2017"

# Register the query for the whole calendar year, United States
pytrends.build_payload(keywords, timeframe=f"{year}-01-01 {year}-12-31", geo='US')

# Interest over time without the 'isPartial' flag column
df_trends = pytrends.interest_over_time().drop(columns=['isPartial'], errors='ignore')

# Column-wise mean -> one row per keyword, tagged with the year
df_average_interest = (
    df_trends.mean()
    .rename_axis("Search Term")
    .reset_index(name="Average Interest")
    .assign(Year=year)
)

# Display the DataFrame
print(df_average_interest)


    Search Term  Average Interest  Year
0  Donald Trump         27.000000  2017
1     Wikipedia         17.660377  2017


## Extract State Comparison

In [None]:
from pytrends.request import TrendReq
import pandas as pd

# Per-region share of interest between two keywords for 2017 (US), saved to CSV.

# Trends client
pytrends = TrendReq(hl='en-US', tz=360)

# Terms to compare against each other
keywords = ["Kevin Spacey", "Wikipedia"]

# Register the query: calendar year 2017, United States
pytrends.build_payload(keywords, timeframe='2017-01-01 2017-12-31', geo='US')

# Interest broken down by region
df_regions = pytrends.interest_by_region(resolution='REGION')

# Row total across the returned columns, then each keyword's share of it in percent
df_regions["Total"] = df_regions.sum(axis=1)
for keyword in keywords:
    df_regions[f"{keyword} %"] = (df_regions[keyword] / df_regions["Total"]) * 100

# Drop the raw values and the helper total; keep only the percentage columns
df_regions = df_regions[[f"{keyword} %" for keyword in keywords]]

# Write the result (region index included) to disk
df_regions.to_csv("state_interest_comparison.csv")

In [9]:
from pytrends.request import TrendReq
import pandas as pd

# Minimal request for a single keyword over the last day (US).
pytrends = TrendReq(hl='en-US', tz=360)
pytrends.build_payload(kw_list=["Google"], timeframe="now 1-d", geo='US')

# Fetch the series and show its first rows
df_trends = pytrends.interest_over_time()
print(df_trends.head(5))

TooManyRequestsError: The request failed: Google returned a response with code 429