In [44]:
import os
import json
import re
from groq import Groq
from textstat import flesch_kincaid_grade, flesch_reading_ease
import asyncio
import nest_asyncio

# NOTE(review): presumably required so that event-loop calls work inside the
# notebook's already-running loop — confirm against the nest_asyncio docs.
nest_asyncio.apply()

# Shared Groq client used by every analyze_* function below; the API key is
# read from the GROQ_API_KEY environment variable (None when it is unset).
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

In [45]:
def get_readability_scores(text):
    """Compute Flesch-Kincaid grade level and Flesch reading ease for `text`.

    Returns a dict with the keys "grade_level" and "reading_ease". Falsy
    input (empty string, None) short-circuits to zeros for both scores
    without calling the scoring functions.
    """
    if text:
        grade = flesch_kincaid_grade(text)
        ease = flesch_reading_ease(text)
        return {"grade_level": grade, "reading_ease": ease}

    # nothing to score
    return {"grade_level": 0, "reading_ease": 0}

async def analyze_readability_for_marketers(text, title, max_chars=2000):
    """Ask the chat model how readable the article is for a non-technical marketer.

    Args:
        text: Full article text. Falsy values are treated as an empty string.
        title: Article title, included in the prompt.
        max_chars: Maximum number of characters of `text` sent to the model.
            Pass None to send the text untruncated.

    Returns:
        The model's reply text. The prompt asks for "readability:",
        "main issues:" and "suggestions:" lines, but the reply is returned
        unparsed.
    """
    # keep it short for the api call: previously the whole article was sent,
    # despite this comment, so long articles could blow past the request limits
    content_snippet = (text or "")[:max_chars]

    prompt = f"""
    hey analyze this documentation for someone who's not technical (marketer):
    
    title: {title}
    content: {content_snippet}
    
    is this easy to read for marketers? what sucks about it?
    give me 3 ways to make it better
    
    format:
    readability: good/fair/poor
    main issues: [issues here]
    suggestions: [3 things]
    """

    # NOTE(review): this is a synchronous client call inside an async function,
    # so it blocks the event loop until the reply arrives.
    response = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model="llama-3.3-70b-versatile",
        temperature=0.3
    )

    return response.choices[0].message.content

In [46]:
def analyze_structure(sections, html_content, min_sections=3, max_avg_words=300):
    """Compute simple structural metrics for an article.

    Args:
        sections: List of dicts; each section's text is read from its
            'content' key (missing or None counts as empty).
        html_content: Raw HTML of the article; None is treated as empty.
        min_sections: Fewer sections than this is reported as an issue.
        max_avg_words: An average section length above this many words is
            reported as an issue.

    Returns:
        Dict with "total_sections", "avg_section_length" (words),
        "has_lists", "has_code" and "issues" (list of strings).
    """
    html = html_content or ''

    # Match the opening tag name rather than the exact string '<ul>' so that
    # tags carrying attributes (<ul class="...">) or upper-case tags still count.
    has_lists = re.search(r'<(?:ul|ol)\b', html, re.IGNORECASE) is not None
    has_code = re.search(r'<(?:code|pre)\b', html, re.IGNORECASE) is not None

    section_lengths = [len((section.get('content') or '').split()) for section in sections]
    avg_length = sum(section_lengths) / len(section_lengths) if section_lengths else 0

    issues = []
    if len(sections) < min_sections:
        issues.append("not enough sections")
    if avg_length > max_avg_words:
        issues.append("sections way too long")
    if not has_lists:
        issues.append("missing lists/bullets")

    return {
        "total_sections": len(sections),
        "avg_section_length": avg_length,
        "has_lists": has_lists,
        "has_code": has_code,
        "issues": issues
    }

async def analyze_flow(sections, title):
    """Ask the chat model whether the article's section order makes sense.

    Only the headings of the first five sections are sent (sections without
    a 'heading' key appear as 'untitled'). Returns the model's reply text.
    """
    headings = []
    for entry in sections[:5]:
        headings.append(entry.get('heading', 'untitled'))

    flow_prompt = f"""
    article: {title}
    sections: {headings}
    
    does this flow make sense? are things in the right order?
    how would you reorganize this?
    
    give me 2 specific fixes
    """

    user_message = {"role": "user", "content": flow_prompt}
    completion = client.chat.completions.create(
        model="llama-3.3-70b-versatile",
        messages=[user_message],
    )

    first_choice = completion.choices[0]
    return first_choice.message.content

In [47]:
def check_completeness(sections, html_content):
    """Count example phrases, step markers, code blocks and words in an article.

    Example and step phrases are searched case-insensitively in the joined
    section 'content' text; code markers (<code>, <pre>, ```) are counted in
    `html_content`. Returns a dict with "examples", "steps", "code_blocks"
    and "word_count".
    """
    section_bodies = [entry.get('content', '') for entry in sections]
    combined = ' '.join(section_bodies)

    def occurrences(pattern, haystack, flags=0):
        # number of non-overlapping matches of pattern in haystack
        return len(re.findall(pattern, haystack, flags))

    return {
        "examples": occurrences(r'example|for instance|such as', combined, re.IGNORECASE),
        "steps": occurrences(r'step \d+|follow these steps', combined, re.IGNORECASE),
        "code_blocks": occurrences(r'<code>|<pre>|```', html_content),
        "word_count": len(combined.split())
    }

async def analyze_completeness(sections, title):
    """Ask the chat model what is missing for someone following the article.

    The prompt carries a preview built from the first three sections: each
    heading followed by the first 200 characters of its content. Returns the
    model's reply text.
    """
    preview = "".join(
        f"{entry.get('heading', '')}: {entry.get('content', '')[:200]}...\n"
        for entry in sections[:3]
    )

    gap_prompt = f"""
    article: {title}
    content preview: {preview}
    
    what's missing if someone actually wants to do this?
    what examples or steps are needed?
    
    give me 3 things that are missing
    """

    user_message = {"role": "user", "content": gap_prompt}
    completion = client.chat.completions.create(
        model="llama-3.3-70b-versatile",
        messages=[user_message],
    )

    first_choice = completion.choices[0]
    return first_choice.message.content

In [48]:
async def analyze_style(text, title):
    """Ask the chat model to review the article against basic style-guide points.

    Only the first 2000 characters of `text` are included in the prompt.
    Returns the model's reply text.
    """
    # cap the excerpt so the request stays within api limits
    excerpt = text[:2000]

    style_prompt = f"""
    check this doc against microsoft style guide basics:
    
    title: {title}
    content: {excerpt}
    
    check these 3 things:
    1. voice/tone - customer focused and clear?
    2. clarity - sentences too complex?
    3. action oriented - actually helps users?
    
    give specific examples of what's wrong and how to fix
    """

    user_message = {"role": "user", "content": style_prompt}
    completion = client.chat.completions.create(
        model="llama-3.3-70b-versatile",
        messages=[user_message],
    )

    first_choice = completion.choices[0]
    return first_choice.message.content

def basic_style_check(text):
    """Compute rough style metrics for `text` using regular expressions.

    Returns a dict with the average sentence length in words (sentences are
    split on runs of '.', '!' or '?'), a count of "is/are/was/were <word>ed"
    patterns as a passive-voice heuristic, and case-insensitive counts of
    the whole words "you" and "user".
    """
    fragments = (piece.strip() for piece in re.split(r'[.!?]+', text))
    sentences = [fragment for fragment in fragments if fragment]

    # look for passive voice patterns
    passive_hits = re.findall(r'\b(is|are|was|were)\s+\w+ed\b', text)
    you_hits = re.findall(r'\byou\b', text, re.IGNORECASE)
    user_hits = re.findall(r'\buser\b', text, re.IGNORECASE)

    if sentences:
        word_total = 0
        for sentence in sentences:
            word_total += len(sentence.split())
        mean_words = word_total / len(sentences)
    else:
        mean_words = 0

    return {
        "avg_sentence_length": mean_words,
        "passive_voice_count": len(passive_hits),
        "you_references": len(you_hits),
        "user_references": len(user_hits)
    }

In [49]:
async def analyze_article(article_data):
    """Run every metric and LLM check on one article and bundle the results.

    Reads 'url', 'title', 'sections', 'fullText' and 'htmlContent' from
    `article_data` (each defaulting to an empty value) and returns a nested
    report dict with "readability", "structure", "completeness" and "style"
    entries, each pairing computed metrics with the model's analysis text.
    """
    url, title = article_data.get('url', ''), article_data.get('title', '')
    sections = article_data.get('sections', [])
    full_text = article_data.get('fullText', '')
    html_content = article_data.get('htmlContent', '')

    print(f"analyzing: {title}")

    # each group computes the local metrics first, then awaits the model call
    readability = {
        "scores": get_readability_scores(full_text),
        "analysis": await analyze_readability_for_marketers(full_text, title),
    }
    structure = {
        "metrics": analyze_structure(sections, html_content),
        "flow_analysis": await analyze_flow(sections, title),
    }
    completeness = {
        "metrics": check_completeness(sections, html_content),
        "analysis": await analyze_completeness(sections, title),
    }
    style = {
        "metrics": basic_style_check(full_text),
        "analysis": await analyze_style(full_text, title),
    }

    return {
        "url": url,
        "title": title,
        "readability": readability,
        "structure": structure,
        "completeness": completeness,
        "style": style,
    }

In [50]:
def load_articles(filename):
    """Load articles from a UTF-8 JSON file and keep only the usable ones.

    The file may hold the article list directly or wrap it under an
    'articles' key. An article is kept when both its 'success' and
    'fullText' values are truthy.
    """
    with open(filename, 'r', encoding='utf-8') as handle:
        payload = json.load(handle)

    # handle different json structures
    candidates = payload['articles'] if 'articles' in payload else payload

    usable = []
    for article in candidates:
        # only keep articles that actually have content
        if article.get('success') and article.get('fullText'):
            usable.append(article)
    return usable

# load up the data (load_articles keeps only articles flagged successful that
# have non-empty text)
articles = load_articles('extracted_articles_complete.json')
print(f"loaded {len(articles)} articles")

# test on first one; relies on the notebook's top-level await support and
# raises IndexError if no articles were loaded
test_result = await analyze_article(articles[0])
print("\nanalysis done!")

loaded 879 articles
analyzing: Raise a Support Ticket Through MoEngage Dashboard

analysis done!


In [52]:
def show_results(result):
    print("=" * 80)
    print(f"ANALYSIS REPORT")
    print("=" * 80)
    print(f"title: {result['title']}")
    print(f"url: {result['url']}")
    print()
    
    # readability stuff
    print("READABILITY FOR MARKETERS")
    print("-" * 40)
    scores = result['readability']['scores']
    print(f"grade level: {scores['grade_level']:.1f}")
    print(f"reading ease: {scores['reading_ease']:.1f}")
    
    # extract the actual assessment
    analysis = result['readability']['analysis']
    if "readability:" in analysis.lower():
        readability_score = analysis.split('\n')[0].replace("readability:", "").strip()
        print(f"marketer friendly: {readability_score}")
    
    # get the suggestions - safer approach
    if "suggestions:" in analysis.lower():
        suggestions_split = analysis.split("suggestions:")
        if len(suggestions_split) > 1:
            suggestions_part = suggestions_split[1]
            suggestions = [s.strip() for s in suggestions_part.split('\n') if s.strip() and len(s.strip()) > 0 and s.strip()[0].isdigit()]
            if suggestions:
                print("top suggestions:")
                for i, suggestion in enumerate(suggestions[:2], 1):
                    clean_suggestion = suggestion.split('.', 1)[1].strip() if '.' in suggestion else suggestion
                    print(f"  {i}. {clean_suggestion[:100]}...")
    print()
    
    # structure analysis
    print("STRUCTURE AND FLOW")
    print("-" * 40)
    metrics = result['structure']['metrics']
    print(f"total sections: {metrics['total_sections']}")
    print(f"avg section length: {metrics['avg_section_length']:.0f} words")
    print(f"has lists: {'yes' if metrics['has_lists'] else 'no'}")
    print(f"has code examples: {'yes' if metrics['has_code'] else 'no'}")
    
    if metrics['issues']:
        print("issues found:")
        for issue in metrics['issues']:
            print(f"  - {issue}")
    
    # flow improvements - safer approach
    flow_analysis = result['structure']['flow_analysis']
    if "suggestions" in flow_analysis.lower() or "improve" in flow_analysis.lower():
        print("flow improvements:")
        lines = flow_analysis.split('\n')
        suggestion_count = 0
        for line in lines:
            if line.strip() and (line.strip()[0].isdigit() or '**' in line or 'add' in line.lower() or 'include' in line.lower()):
                clean_line = line.replace('**', '').strip()
                if clean_line and suggestion_count < 3:
                    print(f"  - {clean_line[:80]}...")
                    suggestion_count += 1
    print()
    
    # completeness check
    print("COMPLETENESS AND EXAMPLES")
    print("-" * 40)
    comp_metrics = result['completeness']['metrics']
    print(f"examples found: {comp_metrics['examples']}")
    print(f"step-by-step instructions: {comp_metrics['steps']}")
    print(f"code blocks: {comp_metrics['code_blocks']}")
    print(f"total words: {comp_metrics['word_count']}")
    
    # what's missing - safer approach
    comp_analysis = result['completeness']['analysis']
    print("missing elements:")
    lines = comp_analysis.split('\n')
    missing_count = 0
    for line in lines:
        if line.strip() and (line.strip()[0].isdigit() or '**' in line or 'missing' in line.lower()):
            clean_line = line.replace('**', '').replace('*', '').strip()
            if clean_line and len(clean_line) > 10 and missing_count < 3:
                print(f"  - {clean_line[:80]}...")
                missing_count += 1
    print()
    
    # style check
    print("STYLE GUIDELINES")
    print("-" * 40)
    style_metrics = result['style']['metrics']
    print(f"avg sentence length: {style_metrics['avg_sentence_length']:.1f} words")
    print(f"passive voice count: {style_metrics['passive_voice_count']}")
    print(f"'you' references: {style_metrics['you_references']}")
    print(f"'user' references: {style_metrics['user_references']}")
    
    # style issues - safer approach
    style_analysis = result['style']['analysis']
    if "problem" in style_analysis.lower() or "issue" in style_analysis.lower():
        print("style issues found:")
        # just look for any numbered items or bullet points
        lines = style_analysis.split('\n')
        issue_count = 0
        for line in lines:
            if line.strip() and (line.strip()[0].isdigit() or line.startswith('-') or '**' in line):
                clean_line = line.replace('**', '').replace('*', '').strip()
                if clean_line and len(clean_line) > 10 and issue_count < 3:
                    print(f"  - {clean_line[:80]}...")
                    issue_count += 1
    
    print("=" * 80)

# show the results: print the full report for the single test article
show_results(test_result)

ANALYSIS REPORT
title: Raise a Support Ticket Through MoEngage Dashboard
url: https://help.moengage.com/hc/en-us/articles/19708702327572-Raise-a-Support-Ticket-Through-MoEngage-Dashboard

READABILITY FOR MARKETERS
----------------------------------------
grade level: 8.7
reading ease: 52.3
marketer friendly: Readability: Fair

STRUCTURE AND FLOW
----------------------------------------
total sections: 1
avg section length: 388 words
has lists: yes
has code examples: no
issues found:
  - not enough sections
  - sections way too long
flow improvements:
  - 1. Add prerequisite sections: Before diving into the 'Introduction', consider ad...
  - 2. Outline the support ticket process: After the 'Introduction', consider adding...

COMPLETENESS AND EXAMPLES
----------------------------------------
examples found: 1
step-by-step instructions: 0
code blocks: 0
total words: 388
missing elements:
  - Based on the provided content preview, here are 3 things that are missing for so...
  - 1. Login i

In [53]:
def show_summary(result):
    """Print a short overview of one analysis result.

    The readability label comes from the reading-ease score (>= 60 "good",
    >= 30 "fair", otherwise "poor"); the remaining lines report the number
    of structure issues, the example and step counts, and the passive-voice
    and 'you' counts.
    """
    divider = "=" * 60
    print(f"QUICK SUMMARY: {result['title']}")
    print(divider)

    # overall readability
    readability = result['readability']['scores']
    grade, ease = readability['grade_level'], readability['reading_ease']

    # figure out readability level
    readability_level = "good" if ease >= 60 else ("fair" if ease >= 30 else "poor")
    print(f"readability: {readability_level} (grade {grade:.1f})")

    # structure problems
    structure_issues = result['structure']['metrics']['issues']
    print(f"structure issues: {len(structure_issues)} found")

    # content completeness
    completeness = result['completeness']['metrics']
    print(f"examples: {completeness['examples']}, steps: {completeness['steps']}")

    # writing style
    style = result['style']['metrics']
    print(f"writing style: {style['passive_voice_count']} passive voice, {style['you_references']} 'you' references")

    print(divider)

# quick overview: print the condensed summary for the same test article
show_summary(test_result)

QUICK SUMMARY: Raise a Support Ticket Through MoEngage Dashboard
readability: fair (grade 8.7)
structure issues: 2 found
examples: 1, steps: 0
writing style: 5 passive voice, 7 'you' references


In [55]:
import datetime

def create_markdown_report(result):
    md_content = f"""# documentation analysis report

## article info
- **title:** {result['title']}
- **url:** {result['url']}
- **analyzed:** {datetime.datetime.now().strftime('%Y-%m-%d %H:%M')}

---

## readability analysis

### scores
- **grade level:** {result['readability']['scores']['grade_level']:.1f}
- **reading ease:** {result['readability']['scores']['reading_ease']:.1f}

### assessment for marketers
"""
    
    # extract readability stuff - safer approach
    analysis = result['readability']['analysis']
    if "readability:" in analysis.lower():
        first_line = analysis.split('\n')[0] if '\n' in analysis else analysis
        readability_score = first_line.replace("readability:", "").strip()
        md_content += f"**overall:** {readability_score}\n\n"
    
    # main issues - safer parsing
    if "main issues:" in analysis.lower():
        issues_split = analysis.lower().split("main issues:")
        if len(issues_split) > 1:
            # find the end of issues section
            issues_text = issues_split[1]
            if "suggestions:" in issues_text:
                issues_text = issues_text.split("suggestions:")[0]
            
            md_content += "### main issues\n"
            for line in issues_text.split('\n'):
                if line.strip() and (line.strip().startswith('-') or 'issue' in line.lower()):
                    md_content += f"{line.strip()}\n"
            md_content += "\n"
    
    # suggestions - safer parsing
    if "suggestions:" in analysis.lower():
        suggestions_split = analysis.lower().split("suggestions:")
        if len(suggestions_split) > 1:
            suggestions_text = suggestions_split[1]
            md_content += "### improvement suggestions\n"
            for line in suggestions_text.split('\n'):
                if line.strip() and (any(line.strip().startswith(str(i)) for i in range(1, 6)) or line.strip().startswith('-')):
                    md_content += f"{line.strip()}\n"
            md_content += "\n"
    
    # structure section
    md_content += """---

## structure and flow analysis

### metrics
"""
    metrics = result['structure']['metrics']
    md_content += f"- **total sections:** {metrics['total_sections']}\n"
    md_content += f"- **avg section length:** {metrics['avg_section_length']:.0f} words\n"
    md_content += f"- **has lists:** {'yes' if metrics['has_lists'] else 'no'}\n"
    md_content += f"- **has code examples:** {'yes' if metrics['has_code'] else 'no'}\n\n"
    
    if metrics['issues']:
        md_content += "### issues found\n"
        for issue in metrics['issues']:
            md_content += f"- {issue}\n"
        md_content += "\n"
    
    # flow analysis - simpler approach
    flow_analysis = result['structure']['flow_analysis']
    md_content += "### flow assessment\n"
    md_content += f"{flow_analysis[:200]}...\n\n"
    
    # extract any numbered suggestions from flow analysis
    md_content += "### suggestions\n"
    lines = flow_analysis.split('\n')
    for line in lines:
        if line.strip() and (any(line.strip().startswith(str(i)) for i in range(1, 6)) or line.strip().startswith('-')):
            clean_line = line.replace('**', '').strip()
            if clean_line:
                md_content += f"{clean_line}\n"
    
    # completeness section
    md_content += """

---

## completeness analysis

### current state
"""
    comp_metrics = result['completeness']['metrics']
    md_content += f"- **examples found:** {comp_metrics['examples']}\n"
    md_content += f"- **step-by-step instructions:** {comp_metrics['steps']}\n"
    md_content += f"- **code blocks:** {comp_metrics['code_blocks']}\n"
    md_content += f"- **total word count:** {comp_metrics['word_count']}\n\n"
    
    # missing stuff - simpler approach
    comp_analysis = result['completeness']['analysis']
    md_content += "### missing elements\n"
    lines = comp_analysis.split('\n')
    for line in lines:
        if line.strip() and (any(line.strip().startswith(str(i)) for i in range(1, 6)) or 'missing' in line.lower() or line.strip().startswith('-')):
            clean_line = line.replace('**', '').replace('*', '').strip()
            if clean_line and len(clean_line) > 5:
                md_content += f"{clean_line}\n"
    
    # style section
    md_content += """

---

## style guidelines analysis

### metrics
"""
    style_metrics = result['style']['metrics']
    md_content += f"- **avg sentence length:** {style_metrics['avg_sentence_length']:.1f} words\n"
    md_content += f"- **passive voice count:** {style_metrics['passive_voice_count']}\n"
    md_content += f"- **'you' references:** {style_metrics['you_references']}\n"
    md_content += f"- **'user' references:** {style_metrics['user_references']}\n\n"
    
    # style assessment - simplified
    style_analysis = result['style']['analysis']
    md_content += "### assessment\n"
    md_content += f"{style_analysis[:300]}...\n\n"
    
    # extract any specific issues
    if "problem" in style_analysis.lower() or "issue" in style_analysis.lower():
        md_content += "### specific issues found\n"
        lines = style_analysis.split('\n')
        for line in lines:
            if line.strip() and ('problem' in line.lower() or 'issue' in line.lower() or line.strip().startswith('-')):
                clean_line = line.replace('**', '').strip()
                if clean_line and len(clean_line) > 10:
                    md_content += f"- {clean_line}\n"
        md_content += "\n"
    
    md_content += """---

## summary
this analysis covers readability for marketers, document structure, content completeness, and style guidelines. focus on the high-priority suggestions to improve user experience.
"""
    
    return md_content

def save_markdown_report(result, filename=None):
    """Write the markdown report for `result` to disk.

    When `filename` is falsy, a name of the form "analysis_<title>.md" is
    derived from the first 30 sanitised characters of the title. Returns the
    filename on success; on any exception the error is printed and None is
    returned.
    """
    target = filename
    if not target:
        # make a safe filename: drop punctuation, then collapse dashes/whitespace
        stem = re.sub(r'[^\w\s-]', '', result['title'])[:30]
        stem = re.sub(r'[-\s]+', '_', stem)
        target = f"analysis_{stem}.md"

    try:
        report_text = create_markdown_report(result)

        with open(target, 'w', encoding='utf-8') as out_file:
            out_file.write(report_text)

        print(f"markdown report saved to: {target}")
        return target
    except Exception as e:
        print(f"error creating markdown report: {e}")
        return None

# create and save the report; markdown_file is the written path, or None if
# saving failed
markdown_file = save_markdown_report(test_result)

markdown report saved to: analysis_Raise_a_Support_Ticket_Through.md
