# AI-Powered Content Audit - Using Google's Helpful Content Guidelines

In [None]:
!python3 -m pip install advertools openai

In [4]:
import json
import os
import advertools as adv
import pandas as pd
# Show every column when displaying DataFrames instead of truncating wide frames.
pd.options.display.max_columns = None
import openai
# The API key is read from the environment so it is never hardcoded in the
# notebook; this raises KeyError if OPENAI_API_KEY is not set.
client = openai.OpenAI(api_key=os.environ['OPENAI_API_KEY'])


# Load the guideline questions and keep only the first 20 of them.
questions_df = pd.read_csv("data/quality_guidelines.csv")
questions = questions_df['question'].head(20).tolist()
# Preview the first five questions.
questions[:5]

['Does the content provide original information, reporting, research, or analysis?',
 'Does the content provide a substantial, complete, or comprehensive description of the topic?',
 'Does the content provide insightful analysis or interesting information that is beyond the obvious?',
 'If the content draws on other sources, does it avoid simply copying or rewriting those sources, and instead provide substantial additional value and originality?',
 'Does the main heading or page title provide a descriptive, helpful summary of the content?']

## Crawl a website

In [2]:
# adv.crawl appends to an existing output file, so re-running this cell would
# duplicate every crawled page in seoweek.jsonl. Only crawl when no crawl file
# exists yet; delete seoweek.jsonl to force a fresh crawl.
if not os.path.exists("seoweek.jsonl"):
    adv.crawl(url_list="https://seoweek.org/", output_file="seoweek.jsonl", follow_links=True)

## Read the crawl file into a DataFrame

In [5]:
# The crawl file is JSON Lines: one JSON object per line.
seoweek = pd.read_json(path_or_buf='seoweek.jsonl', lines=True)

## Filter the desired pages

In [6]:
# Keep only the pages whose body text mentions "transcript" (case-insensitive).
# na=False treats rows with no body_text (NaN) as non-matches; without it the
# mask contains NaN and boolean indexing raises a ValueError.
has_transcript = seoweek['body_text'].str.contains('transcript', case=False, na=False)
podcasts = seoweek.loc[has_transcript, ['url', 'title', 'body_text']]

In [7]:
# Preview the first five filtered pages.
podcasts.head(n=5)

Unnamed: 0,url,title,body_text
1,https://seoweek.org/bianca-anderson/,F$%@ Traffic: Prioritizing Conversions Over Va...,\n\t\t \n \n X \n \n\t\t HOME \n AGENDA \n \t\...
2,https://seoweek.org/talia-wolf/,Stop Chasing Conversions: Win More Customers w...,\n\t\t \n \n X \n \n\t\t HOME \n AGENDA \n \t\...
3,https://seoweek.org/ross-simmonds/,"Search, AI & UGC: Navigating the Future of Goo...",\n\t\t \n \n X \n \n\t\t HOME \n AGENDA \n \t\...
4,https://seoweek.org/ross-hudgens/,The Evolving Content Marketing Playbook Featur...,\n\t\t \n \n X \n \n\t\t HOME \n AGENDA \n \t\...
5,https://seoweek.org/cindy-krum/,Word to Your MUM Featuring Cindy Krum,\n\t\t \n \n X \n \n\t\t HOME \n AGENDA \n \t\...


In [8]:
# Instructions placed at the start of every evaluation prompt, one per line.
prompt_intro = "\n".join([
    "Please answer the following questions about this article.",
    "Respond in JSON where questions are keys and answers are values.",
    "Send the JSON string only.",
    "Answers should be boolean only.",
])

## Loop through the pages and request an evaluation of each article against all questions

In [15]:
# One chat-completion request per page; each request carries the full list of
# questions plus the page title and body text. Results are collected as
# (url, title, raw response) tuples.
responses = []

for url, title, body in podcasts.itertuples(index=False, name=None):
    prompt = f"""
            {prompt_intro}

            Questions: {questions}

            -------------
            Article Title: {title}

            -------------
            Article Text: {body}

"""
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
        # Ask the API for a JSON object so the reply can be parsed with json.loads.
        response_format={"type": "json_object"},
    )
    responses.append((url, title, response))

## Merge all responses into one DataFrame

In [10]:
# Flatten every response into one record per (page, question) and build the
# DataFrame once. Compared with concatenating one frame per page this:
#   - still works when `responses` is empty (pd.concat([]) raises ValueError),
#     producing an empty frame with the expected columns;
#   - gives final_df a unique 0..n-1 index instead of a repeated per-page index.
records = []
for url, title, evaluation in responses:
    # The message content is expected to be a JSON object mapping each
    # question to its answer; json.loads raises if it is not valid JSON.
    answers = json.loads(evaluation.choices[0].message.content)
    records.extend(
        {'url': url, 'title': title, 'question': question, 'answer': answer}
        for question, answer in answers.items()
    )
final_df = pd.DataFrame(records, columns=['url', 'title', 'question', 'answer'])
# Persist the evaluations to CSV without the index column.
final_df.to_csv("data/evaluations.csv", index=False)

In [11]:
# Load the saved evaluations from the CSV file.
evaluations_df = pd.read_csv(filepath_or_buffer="data/evaluations.csv")

In [12]:
# Fixed random_state so the displayed sample is the same on every run; the
# sample size is capped so a frame with fewer than 15 rows does not raise.
evaluations_df.sample(n=min(15, len(evaluations_df)), random_state=42)

Unnamed: 0,url,title,question,answer
332,https://seoweek.org/tom-critchlow/,Executive Presence: How To Get Buy In and Budg...,Does the content present information in a way ...,True
167,https://seoweek.org/jeff-coyle/,"Authoritative Intelligence: Evolving IR, NLP, ...",Would you expect to see this content in or ref...,True
187,https://seoweek.org/wil-reynolds/,"The ""Other"" AI That is Hurting Your Organic Tr...",Would you expect to see this content in or ref...,True
365,https://seoweek.org/zack-notes/,LLM Experience Gain: How We Climbed the Animal...,Does the main heading or page title avoid exag...,True
480,https://seoweek.org/phil-nottingham/,YouTube Keyword Research 2.0 Featuring Phil No...,"Does the content provide original information,...",True
308,https://seoweek.org/devin-bramhall/,Stop Doing Marketing Featuring Devin Bramhall,Does the content provide substantial value whe...,True
373,https://seoweek.org/zack-notes/,LLM Experience Gain: How We Climbed the Animal...,If someone researched the site producing the c...,True
173,https://seoweek.org/jeff-coyle/,"Authoritative Intelligence: Evolving IR, NLP, ...",If someone researched the site producing the c...,True
485,https://seoweek.org/phil-nottingham/,YouTube Keyword Research 2.0 Featuring Phil No...,Does the main heading or page title avoid exag...,True
447,https://seoweek.org/dawn-anderson/,From Villains to Heroes Featuring Dawn Anderson,Would you expect to see this content in or ref...,True


## Evaluations by page

In [13]:
# Mean of `answer` per URL (the share of True answers, assuming the column is
# boolean), shown as a percentage with a colour gradient.
share_true_by_url = evaluations_df.groupby('url')['answer'].mean().reset_index()
(share_true_by_url.style
 .format({'answer': "{:.0%}"})
 .background_gradient(subset=['answer']))

Unnamed: 0,url,answer
0,https://seoweek.org/aleyda-solis/,100%
1,https://seoweek.org/andrew-prince/,100%
2,https://seoweek.org/bianca-anderson/,95%
3,https://seoweek.org/brie-anderson/,100%
4,https://seoweek.org/cindy-krum/,100%
5,https://seoweek.org/crystal-carter/,100%
6,https://seoweek.org/dan-petrovic/,100%
7,https://seoweek.org/dawn-anderson/,100%
8,https://seoweek.org/devin-bramhall/,100%
9,https://seoweek.org/elias-dabbas/,100%


## Evaluations by question

In [14]:
# Mean of `answer` per question (the share of True answers, assuming the column
# is boolean), shown as a percentage with a colour gradient.
share_true_by_question = evaluations_df.groupby('question')['answer'].mean().reset_index()
(share_true_by_question.style
 .format({'answer': "{:.0%}"})
 .background_gradient(subset=['answer']))

Unnamed: 0,question,answer
0,"After reading your content, will someone leave feeling they've learned enough about a topic to help achieve their goal?",100%
1,Do you have an existing or intended audience for your business or site that would find the content useful if they came directly to you?,100%
2,"Does the content present information in a way that makes you want to trust it, such as clear sourcing, evidence of the expertise involved, background about the author or the site that publishes it, such as through links to an author page or a site's About page?",100%
3,"Does the content provide a substantial, complete, or comprehensive description of the topic?",100%
4,Does the content provide insightful analysis or interesting information that is beyond the obvious?,100%
5,"Does the content provide original information, reporting, research, or analysis?",100%
6,Does the content provide substantial value when compared to other pages in search results?,100%
7,Does the main heading or page title avoid exaggerating or being shocking in nature?,96%
8,"Does the main heading or page title provide a descriptive, helpful summary of the content?",100%
9,"Does your content clearly demonstrate first-hand expertise and a depth of knowledge (for example, expertise that comes from having actually used a product or service, or visiting a place)?",100%
