# AI-Powered Content Audit - Using Google's Helpful Content Guidelines

In [20]:
# Standard library
import json
import os

# Third-party
import advertools as adv
import openai
import pandas as pd

# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option("display.max_columns", None)

# The API key is read from the environment so it is never stored in the notebook;
# a missing OPENAI_API_KEY raises KeyError on this line.
client = openai.OpenAI(api_key=os.environ['OPENAI_API_KEY'])


# Load the guideline questions and keep only the first 20 for the audit.
questions_df = pd.read_csv("data/quality_guidelines.csv")
questions = questions_df['question'].head(20).tolist()
# Preview the first five questions.
questions[:5]

['Does the content provide original information, reporting, research, or analysis?',
 'Does the content provide a substantial, complete, or comprehensive description of the topic?',
 'Does the content provide insightful analysis or interesting information that is beyond the obvious?',
 'If the content draws on other sources, does it avoid simply copying or rewriting those sources, and instead provide substantial additional value and originality?',
 'Does the main heading or page title provide a descriptive, helpful summary of the content?']

## Crawl a website

In [2]:
# Crawl the site only when no crawl file exists yet, so the notebook can be run
# top-to-bottom on a fresh checkout while an existing crawl is reused as-is.
# NOTE(review): advertools is understood to append to an existing output file, so
# skipping the call also avoids duplicated rows on re-runs — confirm against its docs.
if not os.path.exists("data/seoweek.jsonl"):
    adv.crawl("https://seoweek.org/", "data/seoweek.jsonl", follow_links=True)

## Read the crawled file into a DataFrame

In [22]:
# The crawl file is read with lines=True, i.e. one JSON object per line.
seoweek = pd.read_json(
    path_or_buf='data/seoweek.jsonl',
    lines=True,
)

## Filter the desired pages

In [4]:
# Keep the pages whose body text mentions "transcript" (case-insensitive) and only the
# columns used for the evaluation. na=False counts pages with missing body_text as
# non-matches; without it the mask contains NaN and boolean indexing raises.
podcasts = seoweek.loc[
    seoweek['body_text'].str.contains('transcript', case=False, na=False),
    ['url', 'title', 'body_text'],
]

In [5]:
# Preview the first five selected pages.
podcasts.head(n=5)

Unnamed: 0,url,title,body_text
1,https://seoweek.org/devin-bramhall/,Stop Doing Marketing Featuring Devin Bramhall,\n\t\t \n \n X \n \n\t\t HOME \n AGENDA \n \t\...
2,https://seoweek.org/crystal-carter/,Diving into Deepseek Generative Search Optimiz...,\n\t\t \n \n X \n \n\t\t HOME \n AGENDA \n \t\...
3,https://seoweek.org/jeff-coyle/,"Authoritative Intelligence: Evolving IR, NLP, ...",\n\t\t \n \n X \n \n\t\t HOME \n AGENDA \n \t\...
6,https://seoweek.org/bianca-anderson/,F$%@ Traffic: Prioritizing Conversions Over Va...,\n\t\t \n \n X \n \n\t\t HOME \n AGENDA \n \t\...
7,https://seoweek.org/wil-reynolds/,"The ""Other"" AI That is Hurting Your Organic Tr...",\n\t\t \n \n X \n \n\t\t HOME \n AGENDA \n \t\...


In [6]:
# Instruction header placed at the top of every evaluation prompt.
# Four lines joined by newlines, with no trailing newline.
prompt_intro = (
    "Please answer the following questions about this article.\n"
    "Respond in JSON where questions are keys and answers are values.\n"
    "Send the JSON string only.\n"
    "Answers should be boolean only."
)

## Loop through the pages and make one request per article, evaluating it against all questions

In [15]:
# One chat-completion request per selected page. The prompt carries the shared
# instructions, the full question list, and that page's title and body text.
responses = []

for url, title, body in podcasts.itertuples(index=False, name=None):
    # The indentation inside the literal is part of the prompt text sent to the model.
    user_prompt = f"""
            {prompt_intro}

            Questions: {questions}

            -------------
            Article Title: {title}

            -------------
            Article Text: {body}

"""
    response = client.chat.completions.create(
        model="gpt-4o",
        temperature=0,
        response_format={"type": "json_object"},
        messages=[{"role": "user", "content": user_prompt}],
    )
    # Keep the raw response next to its page so it can be parsed in a later cell.
    responses.append((url, title, response))

## Merge all responses into one DataFrame

In [27]:
# Flatten every model reply into one (url, title, question, answer) row per question.
# Collecting plain records and constructing the frame once gives final_df a unique
# 0..n-1 index (no index labels repeated per page) and also works when `responses`
# is empty, where concatenating an empty list of frames would raise.
records = []
for url, title, evaluation in responses:
    # The reply content is parsed as a JSON object; its keys are used as the
    # questions and its values as the answers.
    answers = json.loads(evaluation.choices[0].message.content)
    records.extend((url, title, question, answer) for question, answer in answers.items())

final_df = pd.DataFrame(records, columns=['url', 'title', 'question', 'answer'])
# Persist the results (without the index) to CSV.
final_df.to_csv("data/evaluations.csv", index=False)

In [28]:
# Load the saved evaluations from the CSV file.
evaluations_df = pd.read_csv(filepath_or_buffer="data/evaluations.csv")

In [29]:
# Preview a random sample of rows. A fixed random_state keeps the displayed sample the
# same across re-runs, and capping n avoids a ValueError when fewer than 15 rows exist.
evaluations_df.sample(n=min(15, len(evaluations_df)), random_state=0)

Unnamed: 0,url,title,question,answer
191,https://seoweek.org/tom-critchlow/,Executive Presence: How To Get Buy In and Budg...,Is the content carefully crafted with individu...,True
210,https://seoweek.org/ross-hudgens/,The Evolving Content Marketing Playbook Featur...,"Is the content well-produced, appearing polish...",True
159,https://seoweek.org/cindy-krum/,Word to Your MUM Featuring Cindy Krum,Will someone reading your content leave feelin...,True
3,https://seoweek.org/devin-bramhall/,Stop Doing Marketing Featuring Devin Bramhall,"If the content draws on other sources, does it...",True
300,https://seoweek.org/elias-dabbas/,The Rise of the SEO Data Scientist Featuring E...,"Does the content provide original information,...",True
252,https://seoweek.org/dan-petrovic/,Beyond Rank Tracking: Analyzing Brand Percepti...,Does the content present information in a way ...,True
155,https://seoweek.org/cindy-krum/,Word to Your MUM Featuring Cindy Krum,Is the content free of easily-verified factual...,True
9,https://seoweek.org/devin-bramhall/,Stop Doing Marketing Featuring Devin Bramhall,Is the content free of spelling or stylistic i...,True
204,https://seoweek.org/ross-hudgens/,The Evolving Content Marketing Playbook Featur...,Does the main heading or page title provide a ...,True
103,https://seoweek.org/jori-ford/,Hybrid Engine Optimization: A Crawler Driven A...,"If the content draws on other sources, does it...",True


In [30]:
# Mean of `answer` per page (the share of True answers when the column is boolean),
# rendered with a background gradient on the score column.
(
    evaluations_df
    .groupby('url')['answer']
    .mean()
    .reset_index()
    .style
    .background_gradient(subset=['answer'])
)

Unnamed: 0,url,answer
0,https://seoweek.org/aleyda-solis/,1.0
1,https://seoweek.org/bianca-anderson/,0.95
2,https://seoweek.org/cindy-krum/,1.0
3,https://seoweek.org/crystal-carter/,1.0
4,https://seoweek.org/dan-petrovic/,1.0
5,https://seoweek.org/devin-bramhall/,1.0
6,https://seoweek.org/elias-dabbas/,1.0
7,https://seoweek.org/jeff-coyle/,1.0
8,https://seoweek.org/jori-ford/,1.0
9,https://seoweek.org/nick-eubanks/,1.0


In [31]:
# Mean of `answer` per question across all pages (the share of True answers when the
# column is boolean), rendered with a background gradient on the score column.
(
    evaluations_df
    .groupby('question')['answer']
    .mean()
    .reset_index()
    .style
    .background_gradient(subset=['answer'])
)

Unnamed: 0,question,answer
0,"After reading your content, will someone leave feeling they've learned enough about a topic to help achieve their goal?",1.0
1,Do you have an existing or intended audience for your business or site that would find the content useful if they came directly to you?,1.0
2,"Does the content present information in a way that makes you want to trust it, such as clear sourcing, evidence of the expertise involved, background about the author or the site that publishes it, such as through links to an author page or a site's About page?",1.0
3,"Does the content provide a substantial, complete, or comprehensive description of the topic?",1.0
4,Does the content provide insightful analysis or interesting information that is beyond the obvious?,1.0
5,"Does the content provide original information, reporting, research, or analysis?",1.0
6,Does the content provide substantial value when compared to other pages in search results?,1.0
7,Does the main heading or page title avoid exaggerating or being shocking in nature?,0.9375
8,"Does the main heading or page title provide a descriptive, helpful summary of the content?",1.0
9,"Does your content clearly demonstrate first-hand expertise and a depth of knowledge (for example, expertise that comes from having actually used a product or service, or visiting a place)?",1.0
