In [2]:
import json
import os
from unskript import nbparams
from unskript.secrets import ENV_MODE, ENV_MODE_AWS
from unskript.fwk.workflow import Task, Workflow

# Environment description handed to Workflow below and merged into the
# notebook parameters.
env = {
    "ENV_MODE": "ENV_MODE_AWS",
    "TENANT_ID": "982dba5f-d9df-48ae-a5bf-ec1fc94d4882",
    "PROXY_ID": "1499f27c-6406-4fbd-bd1b-c6f92800018f",
    "TENANT_URL": "https://tenant-staging.alpha.unskript.io",
    "AWS_REGION": "us-west-2",
}

# Secret-store settings handed to Workflow below and merged into the
# notebook parameters.
secret_store_cfg = {
    "SECRET_STORE_TYPE": "SECRET_STORE_TYPE_AWS",
    "AWS_SECRET_PREFIX": "staging",
    "AWS_REGION": "us-west-2",
}

# Process-level settings exported through environment variables.
# NOTE(review): UNSKRIPT_TOKEN appears to be a credential hard-coded in the
# notebook -- consider injecting it from outside the file instead.
os.environ.update({
    "UNSKRIPT_REDIS_HOST": "redis-master.redis.svc.cluster.local",
    "UNSKRIPT_TOKEN": "5c4a5754-0600-11ec-9a03-0242ac130003",
    "UNSKRIPT_SIDECAR_URL": "http://sidecar.sidecar.svc.cluster.local",
    "TENANT_URL": env["TENANT_URL"],
})

unSkriptOutputParamDict = {}

# Notebook parameters: the user-supplied input first, then the environment
# and secret-store entries (later entries win on duplicate keys).
paramDict = {
    "sitemap": "https://docs.unskript.com/unskript-product-documentation/sitemap.xml",
    **env,
    **secret_store_cfg,
}
paramsJson = json.dumps(paramDict)
nbParamsObj = nbparams.NBParams(paramsJson)

# URL of the sitemap to analyse, read back through the parameter object.
sitemap = nbParamsObj.get('sitemap')


# The workflow receives this notebook's globals, so the names above must
# exist before it is constructed.
w = Workflow(env, secret_store_cfg, None, global_vars=globals(), check_uuids=None)



In [3]:
pip install xmltodict textstat bs4

In [4]:
import requests
import xmltodict
import json

# This Action reads in the sitemap, converts the XML to a dictionary,
# and then extracts every page URL (<loc>) into the list `urlList`.

# Upper bound (seconds) on the HTTP call so a stalled server cannot hang the run.
SITEMAP_TIMEOUT_SECONDS = 30

response = requests.get(sitemap, timeout=SITEMAP_TIMEOUT_SECONDS)
# Fail fast on 4xx/5xx instead of trying to parse an error page as a sitemap.
response.raise_for_status()
contents = response.text

# Parse the XML data to a dictionary
xml_dict = xmltodict.parse(contents)

# An empty <urlset> parses to None and a <urlset> without <url> children has
# no 'url' key; both cases yield an empty list instead of an exception.
urlset = xml_dict['urlset'] or {}
url_entries = urlset.get('url') or []
# xmltodict yields a single dict (not a list) when there is exactly one <url>.
if isinstance(url_entries, dict):
    url_entries = [url_entries]

urlList = [entry['loc'] for entry in url_entries]

print("sitemap read in, list of urls created")


In [5]:
import requests
import textstat
from bs4 import BeautifulSoup


# Score the visible text of every sitemap page with several readability
# formulas. The result is `readability`: one dict per successfully scored page.

# Upper bound (seconds) per page fetch so one stalled page cannot hang the run.
PAGE_TIMEOUT_SECONDS = 30

urls = urlList
readability = []

for url in urls:
    # Fetch the page. A page that cannot be fetched (network error, timeout,
    # 4xx/5xx status) is reported and skipped so that it neither aborts the
    # whole crawl nor gets its error page scored as documentation.
    try:
        response = requests.get(url, timeout=PAGE_TIMEOUT_SECONDS)
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"skipping {url}: {exc}")
        continue

    soup = BeautifulSoup(response.content, "html.parser")

    # Remove script and style elements
    for tag in soup(["script", "style"]):
        tag.decompose()

    # Join text nodes with a space: the default empty separator glues the
    # text of adjacent elements together ("<p>a</p><p>b</p>" -> "ab"), which
    # distorts word and sentence counts.
    text = soup.get_text(separator=" ", strip=True)
    if not text:
        # Nothing to score on a page without visible text.
        print(f"skipping {url}: no readable text")
        continue

    # Analyze the text for readability
    readability.append({
        'url': url,
        "fk_reading_ease": textstat.flesch_reading_ease(text),
        "fk_grade": textstat.flesch_kincaid_grade(text),
        "gf": textstat.gunning_fog(text),
        "automated_readability": textstat.automated_readability_index(text),
    })
print("readability stats collected for each page")

In [6]:
#take the list into a dataframe
import pandas as pd
import matplotlib.pyplot as plt


from IPython.display import display

# Create a DataFrame from the list of dictionaries (one row per page)
df = pd.DataFrame(readability)
# Preview the last rows. A bare `df.tail()` in the middle of a cell is
# evaluated and thrown away, so it has to be displayed explicitly.
display(df.tail())

# Convert the 'fk_grade' column to integers so pages can be bucketed by
# whole grade level
df['fk_grade'] = df['fk_grade'].astype(int)

# Group the DataFrame by 'fk_grade' and count the number of URLs in each group
counts = df.groupby('fk_grade')['url'].count()

# Create a bar chart of the counts
fig, ax = plt.subplots()
counts.plot(kind='bar', ax=ax)
ax.set_xlabel('fk_grade')
ax.set_ylabel('Number of URLs')
ax.set_title('URL Count by fk grade level')
plt.show()


In [13]:
import ipywidgets as widgets
from IPython.display import display

# Pages ordered by grade level; this is the table the slider filters.
selected_rows = df.sort_values('fk_grade')

# Create a slider widget
slider = widgets.IntRangeSlider(
    min=0,
    max=80,
    step=1,
    description='Score Range:',
    continuous_update=True  # Set to True for continuous update while sliding
)

# Dedicated output area for the table, so each slider change replaces the
# previous table instead of depending on where the frontend routes output
# produced inside a widget callback.
table_output = widgets.Output()


def update_df_range(*args):
    """Show the pages whose fk_grade lies inside the slider's range.

    Registered as an observer of the slider's ``value`` trait; the arguments
    passed by the widget machinery are accepted and ignored.
    """
    # Get the slider value
    min_score, max_score = slider.value

    # Keep rows inside the range (both ends inclusive) and only the page URL
    # and grade columns. The column is named 'url' (lower case).
    in_range = selected_rows['fk_grade'].between(min_score, max_score)
    filtered_df = selected_rows.loc[in_range, ['url', 'fk_grade']]

    # wait=True defers clearing until the new table arrives, avoiding flicker.
    table_output.clear_output(wait=True)
    with table_output:
        display(filtered_df)


# Attach the function to the slider's value change event
slider.observe(update_df_range, 'value')

# Display the initial (unfiltered) DataFrame and the slider widget
with table_output:
    display(selected_rows)
display(table_output)
display(slider)