In [21]:
import re
from textstat import *
import os

print(os.getcwd())

/data/Documents/DataScience/Courses/Dissertation/src/evaluation


## Analyse Documentation


In [22]:
def analyze_docs_content(content):
    def clean_rst_content(content):
        content = re.sub(r'::[\s\S]*?\n\n', '', content) # code blocks
        content = re.sub(r'.. [a-z]+::.*?\n', '', content) # rst commands
        content = re.sub(r':[a-z]+:`.*?`', '', content) # inline
        content = re.sub(r'`[^`\n]+`_', '', content) # links
        return content
    cleaned_content = clean_rst_content(content)
    return {
        'flesch_reading_ease': textstat.flesch_reading_ease(cleaned_content),
        'flesch_kincaid_grade': textstat.flesch_kincaid_grade(cleaned_content),
        'smog_index': textstat.smog_index(cleaned_content),
        'coleman_liau_index': textstat.coleman_liau_index(cleaned_content),
        'automated_readability_index': textstat.automated_readability_index(cleaned_content),
        'difficult_words': textstat.difficult_words(cleaned_content),
        #'text_standard': textstat.text_standard(cleaned_content)
    }

In [23]:
def analyze_sphinx_docs(docs_source_dir):
    results = {}
    for root, dirs, files in os.walk(docs_source_dir):
        for file in files:
            if file.endswith('.rst'):
                file_path = os.path.join(root, file)
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
                results[file_path] = analyze_docs_content(content)
    return results

In [24]:
def print_results(results):
    for file_path, metrics in results.items():
        print(f"\nAnalysis for {file_path}:")
        for metric, value in metrics.items():
            print(f"  {metric}: {value}")

In [25]:
docs_source_dir = '../../docs/source'
results = analyze_sphinx_docs(docs_source_dir)
print_results(results)


Analysis for ../../docs/source/api_reference.rst:
  flesch_reading_ease: -46.13
  flesch_kincaid_grade: 21.6
  smog_index: 0.0
  coleman_liau_index: 35.63
  automated_readability_index: 32.2
  difficult_words: 11

Analysis for ../../docs/source/candidate_generator.rst:
  flesch_reading_ease: -233.77
  flesch_kincaid_grade: 48.1
  smog_index: 0.0
  coleman_liau_index: 70.5
  automated_readability_index: 110.8
  difficult_words: 12

Analysis for ../../docs/source/config_manager.rst:
  flesch_reading_ease: -121.76
  flesch_kincaid_grade: 32.0
  smog_index: 0.0
  coleman_liau_index: 53.37
  automated_readability_index: 83.7
  difficult_words: 10

Analysis for ../../docs/source/index.rst:
  flesch_reading_ease: 13.91
  flesch_kincaid_grade: 13.0
  smog_index: 10.5
  coleman_liau_index: 24.13
  automated_readability_index: 33.7
  difficult_words: 12

Analysis for ../../docs/source/installation.rst:
  flesch_reading_ease: 48.16
  flesch_kincaid_grade: 8.1
  smog_index: 9.4
  coleman_liau_ind

In [26]:
avg_scores = {metric: sum(doc[metric] for doc in results.values()) / len(results)
              for metric in next(iter(results.values()))}

print("\nAverage scores across all documents:")
for metric, value in avg_scores.items():
    print(f"  {metric}: {value:.2f}")



Average scores across all documents:
  flesch_reading_ease: -52.06
  flesch_kincaid_grade: 22.92
  smog_index: 7.03
  coleman_liau_index: 36.97
  automated_readability_index: 50.63
  difficult_words: 24.11
