# English Parser
This code processes raw English text files, converting them into a tabular format for easier comprehension and manipulation, with words segmented accordingly. It allows for extraction in either conllu or doc format, depending on requirements.

In [1]:
!pip install torch torchvision -f https://download.pytorch.org/whl/torch_stable.html
!pip install stanza


Looking in links: https://download.pytorch.org/whl/torch_stable.html
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_cura

In [None]:
import stanza


In [None]:
!pip install python-docx

Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/244.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━[0m [32m204.8/244.3 kB[0m [31m5.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: python-docx
Successfully installed python-docx-1.1.2


In [None]:
from docx import Document
from docx.shared import Pt
from google.colab import files
import time

In [None]:
# Upload the text file
uploaded = files.upload()

start_time = time.time()

# Get the uploaded file name
filename = list(uploaded.keys())[0]

# Read the content of the file
with open(filename, 'r', encoding='utf-8') as file:
    paragraph = file.read()

print("File read in:", time.time() - start_time, "seconds")

# Download English model if not already downloaded
stanza.download('en', verbose=True)

# Initialize Stanza pipeline
nlp = stanza.Pipeline('en', processors='tokenize,mwt,pos,lemma,depparse')

print("Stanza pipeline initialized in:", time.time() - start_time, "seconds")

# Process the text
doc = nlp(paragraph)

print("Text processed in:", time.time() - start_time, "seconds")


Saving trial.txt to trial.txt
File read in: 0.0004229545593261719 seconds


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: en (English) ...


Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.8.0/models/default.zip:   0%|          | 0…

INFO:stanza:Downloaded file to /root/stanza_resources/en/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: en (English):
| Processor | Package           |
---------------------------------
| tokenize  | combined          |
| mwt       | combined          |
| pos       | combined_charlm   |
| lemma     | combined_nocharlm |
| depparse  | combined_charlm   |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Loading: depparse
INFO:stanza:Done loading processors!


Stanza pipeline initialized in: 14.145387172698975 seconds
Text processed in: 16.01706624031067 seconds


## For conllu format parsed file

In [None]:
# in conll format
conll_format = ""
for sentence in doc.sentences:
    for word in sentence.words:
        conll_format += "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
            word.id,       # ID
            word.text,     # Text
            word.lemma,    # Lemma
            word.upos,     # UPOS
            word.xpos,     # XPOS
            word.head,     # Head
            word.deprel,   # Deprel
            word.start_char,  # Start Char
            word.end_char,    # End Char
            abs(word.id - word.head)  # Dependency Length
        )
     conll_format += "\n"

# Save the CoNLL format text to a file
conll_filename = 'parsed_data.conll'
with open(conll_filename, 'w', encoding='utf-8') as file:
    file.write(conll_format)

# Download the file
files.download(conll_filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## For docx format parsed file

In [None]:
#in docx format
docx_filename = 'parsed_data.docx'
document = Document()

# Add a title
document.add_heading('Parsed Data', level=1)

# Add a table with an additional column for Dependency Length
num_columns = 10
table = document.add_table(rows=1, cols=num_columns)
hdr_cells = table.rows[0].cells
hdr_cells[0].text = 'ID'
hdr_cells[1].text = 'Text'
hdr_cells[2].text = 'Lemma'
hdr_cells[3].text = 'UPOS'
hdr_cells[4].text = 'XPOS'
hdr_cells[5].text = 'Head'
hdr_cells[6].text = 'Deprel'
hdr_cells[7].text = 'Start Char'
hdr_cells[8].text = 'End Char'
hdr_cells[9].text = 'Dependency Length'

# Fill the table with data
for sentence in doc.sentences:
    for word in sentence.words:
        row_cells = table.add_row().cells
        row_cells[0].text = str(word.id)
        row_cells[1].text = word.text
        row_cells[2].text = word.lemma
        row_cells[3].text = word.upos
        row_cells[4].text = word.xpos
        row_cells[5].text = str(word.head)
        row_cells[6].text = word.deprel
        row_cells[7].text = str(word.start_char)
        row_cells[8].text = str(word.end_char)
        if word.head == 0:
            dep_length = 0
        else:
            dep_length = abs(word.id - word.head)
        row_cells[9].text = str(dep_length)

# Save the document
document.save(docx_filename)

# Download the file
files.download(docx_filename)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>