# Data Parsing | Docx | Practice

Installing required libraries

In [None]:
!pip install langchain-community
!pip install openai
!pip install python-dotenv
!pip install "unstructured[all-docs]"
!pip install jq

Importing required libraries

In [1]:
import pandas as pd
import numpy as np
import requests
import json

In [5]:
from itertools import islice
from PIL import Image
from io import BytesIO
from langchain.document_loaders import CSVLoader, TextLoader, JSONLoader
from unstructured.partition.docx import partition_docx
from IPython.core.display import display, HTML

## Unstructured - `partition_docx()`

In [6]:
word_doc = partition_docx(filename = '/content/docx/Data Enablement Service.docx')

General information function

In [11]:
def general_info(doc):
  print(f'Type of document: {type(doc)}')
  print(f'Number of elements in document: {len(doc)}')
  print(f'Type of First element in document: {type(doc[0])}')
  print(f'First element in document\n {doc[0]}')
  print(f'Type of Last element in document: {type(doc[-1])}')
  print(f'Last element in document\n {doc[-1]}')

In [12]:
general_info(word_doc)

Type of document: <class 'list'>
Number of elements in document: 423
Type of First element in document: <class 'unstructured.documents.elements.Title'>
First element in document
                                              Data Enablement platform
Type of Last element in document: <class 'unstructured.documents.elements.NarrativeText'>
Last element in document
 Use ElasticSearch client library elasticsearch-py for Python to interact with ElasticSearch from the product application.


In [14]:
element_types = set([type(item) for item in word_doc])
print(len(element_types))
for typ in element_types:
  print(typ)

6
<class 'unstructured.documents.elements.PageBreak'>
<class 'unstructured.documents.elements.Text'>
<class 'unstructured.documents.elements.Title'>
<class 'unstructured.documents.elements.ListItem'>
<class 'unstructured.documents.elements.NarrativeText'>
<class 'unstructured.documents.elements.Table'>


In [19]:
tables = [el for el in word_doc if str(type(el)) == "<class 'unstructured.documents.elements.Table'>"]
print(len(tables))

3


In [20]:
tables

[<unstructured.documents.elements.Table at 0x7c946e7062f0>,
 <unstructured.documents.elements.Table at 0x7c946e705540>,
 <unstructured.documents.elements.Table at 0x7c946f8e4bb0>]

In [21]:
for table in tables:
  print('#### Table ###')
  print(table.text)

#### Table ###
Hit Rate	​ Proportion of queries for which the system retrieves relevant chunks​ Mean Reciprocal Rank (MRR)​ MRR is the average of the reciprocal ranks of the first relevant item retrieved for each query. It emphasizes the ranking of the first relevant item.​ Precision@K​ Precision@K measures the proportion of retrieved items in the top-K rankings that are relevant to the query. Example: Let's say you retrieve 5 items for a query, out of which only 3 are relevant, then Precision @ 5 would be 3/5 or 0.6.​ Normalized Discounted Cumulative Gain (NDCG)​ NDCG measures the ranking quality by considering both relevance and rank position of retrieved items. It discounts the relevance of items appearing lower in the list​
#### Table ###
Contextual Relevance ​ This guardrail ensures that the responses generated by the language model are appropriate and related to the given context.​ Answer Relevance​ This guardrail focuses on evaluating generated responses that the provided inform

In [32]:
from unstructured.partition.docx import partition_docx
from unstructured.documents.elements import Table
from typing import List

elements = partition_docx(filename='/content/docx/Data Enablement Service.docx')

markdown_tables = []

for element in elements:

    if isinstance(element, Table):
        markdown_table = []

        # Get the table data
        if element.text:
            rows = element.text.strip().split('\n')
            if rows:
                # Process header
                header = [cell.strip() for cell in rows[0].split('\t')]
                markdown_table.append("| " + " | ".join(header) + " |")

                # Add separator line
                separator = "|" + "|".join([" :--- " for _ in header]) + "|"
                markdown_table.append(separator)

                # Process data rows
                for row in rows[1:]:
                    cells = [cell.strip() for cell in row.split('\t')]
                    # Replace any | characters with HTML entity
                    cells = [cell.replace("|", "&#124;") for cell in cells]
                    markdown_table.append("| " + " | ".join(cells) + " |")

                markdown_tables.append("\n".join(markdown_table))

markdown_tables

["| Hit Rate | \u200b Proportion of queries for which the system retrieves relevant chunks\u200b Mean Reciprocal Rank (MRR)\u200b MRR is the average of the reciprocal ranks of the first relevant item retrieved for each query. It emphasizes the ranking of the first relevant item.\u200b Precision@K\u200b Precision@K measures the proportion of retrieved items in the top-K rankings that are relevant to the query. Example: Let's say you retrieve 5 items for a query, out of which only 3 are relevant, then Precision @ 5 would be 3/5 or 0.6.\u200b Normalized Discounted Cumulative Gain (NDCG)\u200b NDCG measures the ranking quality by considering both relevance and rank position of retrieved items. It discounts the relevance of items appearing lower in the list\u200b |\n| :--- | :--- |",
 "| Contextual Relevance \u200b This guardrail ensures that the responses generated by the language model are appropriate and related to the given context.\u200b Answer Relevance\u200b This guardrail focuses on