# Calculating Entropy
Starting from the raw data, calculate the occurrence of each type of wordID-head pair.
Example: the NOUN-VERB pair.

In [None]:
# Install necessary packages
# NOTE(review): none of these installs pin a version, so results may change
# as new releases appear — consider pinning.
# torch/torchvision: `-f` adds the PyTorch wheel listing as an extra place to look for wheels.
!pip install torch torchvision -f https://download.pytorch.org/whl/torch_stable.html
# stanza: provides the parsing pipeline used in the next cell.
!pip install stanza
# python-docx: provides the `docx` module (`from docx import Document`).
!pip install python-docx
# scipy: provides `scipy.stats.entropy`.
!pip install scipy


Looking in links: https://download.pytorch.org/whl/torch_stable.html
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_cura

In [None]:
import stanza
import pandas as pd
from itertools import product
from collections import Counter
import numpy as np
from scipy.stats import entropy
from google.colab import files
from docx import Document
import math

# Initialize Stanza pipeline for Hindi (language code 'hi')
stanza.download('hi')
nlp = stanza.Pipeline('hi')

# Function to extract dependency pairs from a parsed document
def extract_dependency_pairs(doc):
    """Collect one (dependent UPOS, head UPOS) tuple per headed word.

    Args:
        doc: Parsed document exposing ``sentences``; each sentence exposes
            ``words``, and each word exposes ``upos`` and ``head``.

    Returns:
        list[tuple]: ``(word.upos, head_word.upos)`` for every word whose
        ``head`` is non-zero, in document order.
    """
    # `head` is used as a 1-based position within the same sentence, hence
    # the `- 1`; words with head == 0 (presumably the sentence root in
    # Stanza's convention) have no head word and are skipped.
    return [
        (token.upos, sent.words[token.head - 1].upos)
        for sent in doc.sentences
        for token in sent.words
        if token.head != 0
    ]

# Function to calculate entropy for each combination of dependency pairs
def calculate_entropy(dependency_pairs):
    """Map every distinct pair to ``-log2`` of its relative frequency.

    The value stored per pair is its information content (surprisal) in
    bits, not the entropy of the whole distribution.

    Args:
        dependency_pairs: Iterable of hashable pairs, e.g. (UPOS, UPOS) tuples.

    Returns:
        dict: ``{pair: -log2(count(pair) / total)}`` in first-seen order;
        empty when no pairs are given.
    """
    frequency = Counter(dependency_pairs)
    n_observations = sum(frequency.values())
    # Relative frequency of each distinct pair, aligned with `frequency`'s key order.
    relative = np.array(list(frequency.values())) / n_observations
    surprisal = {}
    for pair, share in zip(frequency, relative):
        surprisal[pair] = -math.log2(share)
    return surprisal

# Function to generate an entropy table
def generate_entropy_table(pair_entropies):
    """Build a Subject x Head table of per-pair entropy values.

    Every tag seen in the first position is crossed with every tag seen in
    the second position, so the table also contains combinations that never
    occurred in the data.

    Args:
        pair_entropies: dict mapping ``(subject_tag, head_tag)`` to a number.

    Returns:
        pandas.DataFrame with columns ``['Subject', 'Head', 'Entropy']``,
        ordered by subject tag and then head tag.
    """
    # Sort the tags: iterating raw sets of strings gives an order that can
    # differ from one interpreter run to the next, which made the table (and
    # the exported document) non-reproducible. key=str keeps the sort
    # well-defined even if a tag is missing (None).
    subjects = sorted({pair[0] for pair in pair_entropies}, key=str)
    heads = sorted({pair[1] for pair in pair_entropies}, key=str)

    # NOTE(review): combinations that were never observed get 0, which is
    # indistinguishable from an observed pair with probability 1 — confirm
    # that this placeholder is intended.
    table_data = [
        (subject, head, pair_entropies.get((subject, head), 0))
        for subject, head in product(subjects, heads)
    ]
    return pd.DataFrame(table_data, columns=['Subject', 'Head', 'Entropy'])

# Upload the raw text file
# `files` is google.colab.files, so this step only works inside Colab.
uploaded = files.upload()

# Get the uploaded file name
# Only the first key of the upload result is used; any additional uploaded
# files are ignored. Raises IndexError if nothing was uploaded.
filename = list(uploaded.keys())[0]

# Read the content of the file
# The file is decoded as UTF-8.
with open(filename, 'r', encoding='utf-8') as file:
    text = file.read()

# Process the text with Stanza
# The whole file is passed to the pipeline in a single call.
doc_stanza = nlp(text)

# Extract dependency pairs from the parsed document
# Result: list of (word UPOS, head UPOS) tuples.
dependency_pairs = extract_dependency_pairs(doc_stanza)

# Calculate entropy for each combination of dependency pairs
# `pair_entropies` is reused by later cells without being recomputed.
pair_entropies = calculate_entropy(dependency_pairs)

# Generate entropy table
entropy_table = generate_entropy_table(pair_entropies)

# Save the entropy data to a Word document
# The file name is relative, so the document is written to the current working directory.
docx_filename = 'entropy_data.docx'
document = Document()

# Add a title
document.add_heading('Entropy Data', level=1)

# Add the entropy table
# Start with a single row that holds the DataFrame's column names as headers.
table = document.add_table(rows=1, cols=len(entropy_table.columns))
hdr_cells = table.rows[0].cells
for i, column in enumerate(entropy_table.columns):
    hdr_cells[i].text = column

# Append one table row per DataFrame row; every value is written as str(value).
for _, row in entropy_table.iterrows():
    row_cells = table.add_row().cells
    for i, value in enumerate(row):
        row_cells[i].text = str(value)

# Save the document
document.save(docx_filename)

# Download the Word document
# Uses google.colab.files, so this step only works inside Colab.
files.download(docx_filename)


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: hi (Hindi) ...


Downloading https://huggingface.co/stanfordnlp/stanza-hi/resolve/v1.8.0/models/default.zip:   0%|          | 0…

INFO:stanza:Downloaded file to /root/stanza_resources/hi/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: hi (Hindi):
| Processor | Package       |
-----------------------------
| tokenize  | hdtb          |
| pos       | hdtb_charlm   |
| lemma     | hdtb_nocharlm |
| depparse  | hdtb_charlm   |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Loading: depparse
INFO:stanza:Done loading processors!


Saving hi-1872-ocr.txt to hi-1872-ocr (2).txt


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
def calculate_entropy(dependency_pairs):
    """Map every distinct pair to ``-log2`` of its relative frequency (bits).

    This redefinition replaces the earlier one in the notebook, so it must
    produce the same per-pair values. ``scipy.stats.entropy`` is not suitable
    here: it measures a whole distribution, and when handed one scalar
    probability at a time it cannot return a meaningful per-pair number (a
    lone value is treated as a one-outcome distribution).

    Args:
        dependency_pairs: Iterable of hashable pairs, e.g. (UPOS, UPOS) tuples.

    Returns:
        dict: ``{pair: -log2(count(pair) / total)}`` in first-seen order;
        empty when no pairs are given.
    """
    pair_counts = Counter(dependency_pairs)
    total_pairs = sum(pair_counts.values())
    # Every count is >= 1, so the ratio is in (0, 1] and log2 is always defined;
    # with no input the comprehension is empty and no division happens.
    return {
        pair: -math.log2(count / total_pairs)
        for pair, count in pair_counts.items()
    }


In [None]:
def generate_entropy_table(pair_counts, pair_entropies):
    """Tabulate each observed pair with its count and entropy value.

    Note: this definition shadows the earlier one-argument
    ``generate_entropy_table`` once the cell has been run.

    Args:
        pair_counts: Mapping from ``(subject_tag, head_tag)`` to its count.
        pair_entropies: Mapping from the same pairs to a number; pairs
            missing from it get 0.

    Returns:
        pandas.DataFrame with columns ``['Subject', 'Head', 'Count',
        'Entropy']``, one row per key of ``pair_counts`` in its own order.
    """
    records = []
    for pair, occurrences in pair_counts.items():
        records.append((pair[0], pair[1], occurrences, pair_entropies.get(pair, 0)))
    return pd.DataFrame(records, columns=['Subject', 'Head', 'Count', 'Entropy'])

# Calculate counts of each dependency pair
# `dependency_pairs` comes from the earlier parsing cell.
pair_counts = Counter(dependency_pairs)

# Generate entropy table with counts
# `pair_entropies` is the dict computed in the earlier cell; it is NOT
# recomputed here, so redefining calculate_entropy has no effect on this
# table unless pair_entropies is recalculated first.
entropy_table_with_counts = generate_entropy_table(pair_counts, pair_entropies)

# Print the entropy table with counts
# print() gives pandas' plain-text repr, which elides the middle rows of long frames.
print(entropy_table_with_counts)


   Subject   Head  Count    Entropy
0    PROPN  PROPN     47   4.673027
1    PROPN   NOUN     30   5.320725
2    PUNCT  PROPN     23   5.704054
3     PART  PROPN      4   8.227616
4     NOUN  PROPN     11   6.768184
..     ...    ...    ...        ...
67     NUM  PROPN      1  10.227616
68    VERB  CCONJ      2   9.227616
69    PRON    NUM      2   9.227616
70     ADP    NUM      1  10.227616
71    NOUN    NUM      2   9.227616

[72 rows x 4 columns]
