In [4]:
import stanza
import ipywidgets as widgets
import pandas as pd
from IPython.display import display
import os

The list of Stanza models with their language codes is available at  
https://stanfordnlp.github.io/stanza/performance.html  
Download the model the first time that you use it (see the next cell).  
The rest of the notebook will run for whatever language you set in `language_code`.

In [5]:
language_code = "it" # Stanza language code of the texts to analyse; change it here

# Models are stored in (and loaded from) the current working directory;
# point `directory` somewhere else to keep the models in a different place
directory = os.getcwd()

# Remove the hashtag on the line below to download the model (only needed the first time you use a language)
# stanza.download(language_code, model_dir=directory)

Function to calculate <u>diversity</u> (Shannon's entropy) of a sentence

In [6]:
import scipy.stats as stats

def sentence_diversity_calc(tags):
  """Calculates the diversity of a sentence as the Shannon entropy of its POS tag bigrams.

  Adjacent tags are paired up -- (tags[0], tags[1]), (tags[1], tags[2]), ... --
  and the entropy of the relative frequencies of those pairs is returned.

  Args:
    tags: A list of POS tags for one sentence, in sentence order.

  Returns:
    The Shannon entropy of the tag bigrams, in bits. Returns 0.0 when the
    sentence has fewer than two tags, because no bigram can be formed.
  """

  # Fewer than two tags means there are no bigrams at all; answer explicitly
  # instead of relying on how SciPy treats an empty distribution.
  if len(tags) < 2:
    return 0.0

  # Count occurrences of each adjacent tag pair
  pair_counts = {}
  for pair in zip(tags, tags[1:]):
    pair_counts[pair] = pair_counts.get(pair, 0) + 1

  # Turn the counts into relative frequencies
  total_pairs = sum(pair_counts.values())
  probabilities = [count / total_pairs for count in pair_counts.values()]

  # Calculate entropy using SciPy's entropy function (base 2 for bits)
  return stats.entropy(probabilities, base=2)

Function to calculate the <u>productivity</u> of each sentence

In [7]:
import scipy.stats as stats

def sentence_productivity_calc(words, tags):
  """Computes the productivity of a sentence: the conditional entropy H(W | T) plus one.

  Args:
    words: A list of words in the sentence.
    tags: A list of POS tags aligned with `words`. If the two lists differ in
      length, the joint entropy only uses the pairs up to the shorter length,
      while the tag entropy still uses every tag.

  Returns:
    H(W, T) - H(T) + 1, with both entropies measured in bits.
  """

  def entropy_bits(items):
    # Shannon entropy (base 2) of the relative frequencies of `items`
    frequencies = {}
    for item in items:
      frequencies[item] = frequencies.get(item, 0) + 1
    n_items = sum(frequencies.values())
    shares = [freq / n_items for freq in frequencies.values()]
    return stats.entropy(shares, base=2)

  # Joint entropy H(W, T) over (word, tag) pairs and marginal entropy H(T)
  joint_entropy = entropy_bits(zip(words, tags))
  tag_entropy = entropy_bits(tags)

  # Chain rule: H(W | T) = H(W, T) - H(T)
  conditional_entropy = joint_entropy - tag_entropy

  # Productivity is the conditional entropy shifted by one
  return conditional_entropy + 1

Function to calculate the document complexity

In [8]:
def document_complexity_calc(sentences, doc):
    """Calculates the document complexity, diversity, and productivity.

    Each sentence is reduced to its non-punctuation words (lower-cased text)
    and their xpos tags. The sentence diversity D(s_i) and productivity P(s_i)
    are computed from those, and the document scores are the means over all
    sentences.

    Args:
        sentences: A list of sentence objects. Each must expose a `.words`
            sequence whose items have `.text`, `.upos` and `.xpos` attributes
            (e.g. `doc.sentences` as passed in by `process_input`).
        doc: The entire document object. Not used in the calculation; kept in
            the signature so existing callers keep working.

    Returns:
        A tuple containing:
        - Document complexity: mean of D(s_i) * P(s_i) over the sentences
        - Average document diversity
        - Average document productivity
        All three are 0.0 when `sentences` is empty.
    """

    N = len(sentences)

    # A document without sentences (e.g. whitespace-only input or an empty file)
    # has nothing to average over; return zeros instead of dividing by zero below.
    if N == 0:
        return 0.0, 0.0, 0.0

    total_complexity = 0
    total_diversity = 0
    total_productivity = 0

    for sentence in sentences:
        # Punctuation is excluded from both the word list and the tag list
        content_words = [word for word in sentence.words if word.upos != "PUNCT"]
        sen_words = [word.text.lower() for word in content_words]
        sen_pos = [word.xpos for word in content_words]

        # Calculate D(si) and P(si) for the current sentence
        diversity = sentence_diversity_calc(sen_pos)
        productivity = sentence_productivity_calc(sen_words, sen_pos)

        # Update totals
        total_complexity += diversity * productivity
        total_diversity += diversity
        total_productivity += productivity

    # Calculate averages
    document_complexity = total_complexity / N
    average_diversity = total_diversity / N
    average_productivity = total_productivity / N

    return document_complexity, average_diversity, average_productivity

Input the text(s) and process it using ipywidgets buttons

In [9]:
# Initialize the Stanza pipeline for the selected language, loading models from `directory`.
# NOTE(review): no `processors` argument is passed, so every default processor is loaded
# (the load log printed by this cell lists tokenize, mwt, pos, lemma, constituency,
# depparse and ner), while the scoring code only reads word.text, word.upos and word.xpos.
# Restricting the processor list would presumably shorten loading and processing time;
# confirm the right processor names for each language against the Stanza documentation.
nlp = stanza.Pipeline(language_code, model_dir=directory)

# Title shown at the top of the interface
title_widget = widgets.HTML(value="<h1>Construction Complexity Calculator</h1>")

# Short usage instructions covering both input routes (text box and file upload)
instructions_widget = widgets.HTML(value="Input text in the box and click 'Process Text Input', or click 'Upload' to upload and process .txt files.")

# Text box for pasted input; its value is read when the 'Process Text Input' button is clicked
text_box_widget = widgets.Textarea(
    placeholder='Copy and paste text here',
    description='Input text:',
    layout=widgets.Layout(height='300px', width='600px')
)

# File upload widget; a change of its value triggers processing of the uploaded files
file_upload_widget = widgets.FileUpload(
    accept=".txt",  # Accepted file extension e.g. '.txt', '.pdf', 'image/*', 'image/*,.pdf'
    multiple=True  # True to accept multiple files upload else False
)

# Output area that process_input appends its status and result messages to
output_widget = widgets.Output()

# Progress bar; process_input resets `max` to the number of items being processed
progress_bar = widgets.IntProgress(
    value=0,
    min=0,
    max=100,
    step=1,
    description='Progress:',
    bar_style='',  # 'success', 'info', 'warning', 'danger' or ''
    orientation='horizontal'
)

# Define the function to process the input files and create a CSV file
def process_input(files=None, text=None):
    """Scores uploaded files or pasted text and reports the result in the output widget.

    Uploaded files take precedence over `text`. For files, one row per file
    (filename, complexity, diversity, productivity) is written to
    'complexity_scores.csv' in the current working directory, replacing any
    previous file of that name. For text, the three scores are appended to the
    output widget.

    Args:
        files: The value of the file upload widget: a sequence of items that
            can be indexed by 'name' and 'content' (the file's bytes). May be
            None or empty.
        text: The text to analyse. May be None, empty or whitespace-only.

    Returns:
        None. Results are reported through the output widget (and the CSV file).
    """
    has_text = bool(text and text.strip())

    # Nothing to process (e.g. the button was clicked with an empty text box, or the
    # upload widget's value was cleared). Without this guard the final message would
    # reference scores that were never computed and raise a NameError.
    if not files and not has_text:
        output_widget.append_stdout("\nNo input to process: enter some text or upload a .txt file.\n")
        return

    output_widget.append_stdout("\nProcessing...this may take some time!\n")

    if files:
        results = []
        progress_bar.max = len(files)

        for i, uploaded_file in enumerate(files):
            # 'utf-8-sig' decodes plain UTF-8 and additionally drops a leading
            # byte-order mark, which plain 'utf-8' would leave in the text.
            content = bytes(uploaded_file['content']).decode('utf-8-sig')
            doc = nlp(content)
            complexity, avg_diversity, avg_productivity = document_complexity_calc(doc.sentences, doc)
            results.append({'filename': uploaded_file['name'],
                            'complexity': complexity,
                            'diversity': avg_diversity,
                            'productivity': avg_productivity})

            # Update progress bar
            progress_bar.value = i + 1

        # One CSV row per uploaded file
        csv_filename = 'complexity_scores.csv'
        pd.DataFrame(results).to_csv(csv_filename, index=False)

        progress_bar.value = 0  # Reset progress bar

        message = f"\nComplexity scores saved to {csv_filename}"

    else:
        progress_bar.max = 1
        progress_bar.value = 0

        doc = nlp(text)
        complexity, avg_diversity, avg_productivity = document_complexity_calc(doc.sentences, doc)

        progress_bar.value = 1  # Complete progress bar
        progress_bar.value = 0  # Reset progress bar

        message = f"""
        Complexity score: {complexity}
        Diversity: {avg_diversity}
        Productivity: {avg_productivity}
        """

    # Add the output message
    output_widget.append_stdout(message)

# Button that sends the current contents of the text box through process_input
process_text_button = widgets.Button(description="Process Text Input")

def on_process_text_click(_button):
    """Click handler: scores whatever is currently typed in the text box."""
    process_input(text=text_box_widget.value)

process_text_button.on_click(on_process_text_click)

# Uploaded files are processed as soon as the upload widget's value changes
def on_file_upload(change):
    """Observer for the upload widget: processes the newly uploaded files."""
    uploaded = change['new']
    process_input(files=uploaded)

file_upload_widget.observe(on_file_upload, names='value')

# Render the interface, top to bottom
display(
    title_widget,
    instructions_widget,
    text_box_widget,
    process_text_button,
    file_upload_widget,
    output_widget,
    progress_bar,
)


2025-01-30 20:02:30 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

2025-01-30 20:02:30 INFO: Downloaded file to c:\Users\admin\Documents\Research\Nelson_Complexity\other_languages\Merlin_Corpus\resources.json
2025-01-30 20:02:31 INFO: Loading these models for language: it (Italian):
| Processor    | Package           |
------------------------------------
| tokenize     | combined          |
| mwt          | combined          |
| pos          | combined_charlm   |
| lemma        | combined_nocharlm |
| constituency | vit_charlm        |
| depparse     | combined_charlm   |
| ner          | fbk               |

2025-01-30 20:02:31 INFO: Using device: cpu
2025-01-30 20:02:31 INFO: Loading: tokenize
  checkpoint = torch.load(filename, lambda storage, loc: storage)
2025-01-30 20:02:32 INFO: Loading: mwt
  checkpoint = torch.load(filename, lambda storage, loc: storage)
2025-01-30 20:02:32 INFO: Loading: pos
  checkpoint = torch.load(filename, lambda storage, loc: storage)
  data = torch.load(self.filename, lambda storage, loc: storage)
  state = torch.load

HTML(value='<h1>Construction Complexity Calculator</h1>')

HTML(value="Input text in the box and click 'Process Text Input', or click 'Upload' to upload and process .txt…

Textarea(value='', description='Input text:', layout=Layout(height='300px', width='600px'), placeholder='Copy …

Button(description='Process Text Input', style=ButtonStyle())

FileUpload(value=(), accept='.txt', description='Upload', multiple=True)

Output()

IntProgress(value=0, description='Progress:')