In [17]:
pip list | grep azure

azure-ai-formrecognizer                       3.3.0b1
azure-cognitiveservices-vision-computervision 0.9.0
azure-common                                  1.1.28
azure-core                                    1.26.4

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [11]:
import json
import pandas as pd
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential


def read_json_file(file_path):
    with open(file_path, 'r') as f:
        return json.load(f)


def create_paragraph_dataframe(data):
    df_paragraph = pd.DataFrame(pd.json_normalize(data)['analyzeResult.paragraphs'][0])
    paragraph_spans = [
        {**span, 'content': row['content'], 'boundingRegions': row['boundingRegions']}
        for _, row in df_paragraph.iterrows()
        for span in row['spans']
    ]
    return pd.DataFrame(paragraph_spans)


def create_style_dataframe(data):
    styles = pd.json_normalize(data)['analyzeResult.styles']
    df_styles = pd.DataFrame(styles[0])
    style_spans = [
        {**span, 'confidence': row['confidence'], 'isHandwritten': row['isHandwritten']}
        for _, row in df_styles.iterrows()
        for span in row['spans']
    ]
    return pd.DataFrame(style_spans)


def merge_dataframes(df_paragraph_spans, df_style_spans):
    df_merged = pd.merge(df_paragraph_spans, df_style_spans, on=['offset', 'length'], how='inner')
    df_bounding_regions_normalized = pd.json_normalize(df_merged['boundingRegions'].explode()).reset_index(drop=True)
    return pd.concat([df_merged.drop(columns=['boundingRegions']), df_bounding_regions_normalized], axis=1)


def form_recognizer(file_path):
    data = read_json_file(file_path)

    df_paragraph_spans = create_paragraph_dataframe(data)
    df_style_spans = create_style_dataframe(data)
    df_handwritten_spans_merged = merge_dataframes(df_paragraph_spans, df_style_spans)

    return df_handwritten_spans_merged[df_handwritten_spans_merged['isHandwritten'] == True]





In [12]:
file_path = '/Users/David.Godinez/Downloads/7b12da4f-7664-433c-9a34-4306c35f1aab_removed.pdf.json'


In [13]:
loaded_json = read_json_file(file_path)
df_paragraph_spans = create_paragraph_dataframe(loaded_json)
df_style_spans = create_style_dataframe(loaded_json)
df_merged = merge_dataframes(df_paragraph_spans, df_style_spans)
# df_form_recognized = form_recognizer(loaded_json)

In [16]:
df_merged

Unnamed: 0,offset,length,content,confidence,isHandwritten,pageNumber,polygon
0,38,7,CR 1.03,0.95,True,1,"[6.7196, 1.3354, 7.7965, 1.3545, 7.7927, 1.565..."
1,144,1,-,0.5,True,1,"[7.9017, 3.0106, 8.2607, 3.0106, 8.2607, 3.111..."
2,438,6,74071,0.4,True,1,"[5.2934, 5.8059, 5.8007, 5.7867, 5.8007, 5.963..."
3,1326,15,Costa Rica Pire,1.0,True,2,"[6.2495, 0.0899, 7.8655, 0.052, 7.8714, 0.3032..."
4,1374,7,CR.1.03,1.0,True,2,"[6.5255, 0.9145, 7.5259, 0.9621, 7.5159, 1.170..."
5,2834,1,1,0.4,True,4,"[3.7282, 2.1302, 3.8278, 2.1302, 3.8278, 2.229..."
6,2836,1,-,0.5,True,4,"[3.5053, 2.4338, 3.6476, 2.4433, 3.6429, 2.514..."
7,3976,11,"Rec 'd 6'6""",0.6,True,4,"[6.6655, 8.2172, 7.5383, 8.2219, 7.5371, 8.435..."
8,4721,1,-,0.7,True,5,"[2.4065, 4.7055, 2.6813, 4.6913, 2.6867, 4.795..."
9,5004,1,-,0.8,True,5,"[2.4687, 5.8942, 2.6961, 5.88, 2.6961, 5.9795,..."


In [1]:
# using pdf and api 

In [15]:
import json
import re 
import os
import pandas as pd
from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import DocumentAnalysisClient, AnalyzeResult

def analyze_document(file_path):
    # Load the credentials from a JSON file and other necessary steps
    credentials_path = os.path.abspath('credentials2.json')
    with open(credentials_path, 'r') as f:
        credentials = json.load(f)

    subscription_key = credentials['API_key']
    endpoint = credentials['endpoint']

    document_analysis_client = DocumentAnalysisClient(endpoint=endpoint, credential=AzureKeyCredential(subscription_key))

    with open(file_path, "rb") as f:
        file_content = f.read()

    poller = document_analysis_client.begin_analyze_document("prebuilt-read", file_content)
    result = poller.result()
    
    return result

 
    


def create_paragraph_dataframe(analyze_result):
    data = []
    
    for paragraph in analyze_result.paragraphs:
        content = paragraph.content
        bounding_region = paragraph.bounding_regions[0]  # Assuming there's always at least one bounding region
        page_number = bounding_region.page_number
        polygon = [(point.x, point.y) for point in bounding_region.polygon]
        span = paragraph.spans[0]  # Assuming there's always at least one span
        
        data.append({
            "offset": span.offset,
            "length": span.length,
            "content": content,
            "page_number": page_number,
            "polygon": polygon
        })
        
    df_paragraph_spans = pd.DataFrame(data)

    return df_paragraph_spans



def create_style_dataframe(analyze_result):
    data = []

    for style in analyze_result.styles:
        is_handwritten = style.is_handwritten
        confidence = style.confidence
        
        for span in style.spans:
            data.append({
                "confidence": confidence,
                "is_handwritten": is_handwritten,
                "offset": span.offset,
                "length": span.length
            })

    df_style_spans = pd.DataFrame(data)

    return df_style_spans



def merge_dataframes(df_paragraph_spans, df_style_spans):
    df_merged = pd.merge(df_paragraph_spans, df_style_spans, on=['offset', 'length'], how='inner')
    return df_merged.reindex()


def merge_dataframes2(df_paragraph_spans, df_style_spans):
    df_merged = pd.merge(df_paragraph_spans, df_style_spans, on=['offset', 'length'], how='outer')
    # Replace NaN with default values if necessary
    df_merged['confidence'].fillna(0, inplace=True)
    df_merged['is_handwritten'].fillna(False, inplace=True)
    return df_merged.reindex()


def process_analyze_result(filepath: str):
    result = analyze_document(filepath)
    analyzed_result = analyze_result(result)
    df_paragraph_spans = create_paragraph_dataframe(analyzed_result['analyzeResult.paragraphs'][0])
    df_style_spans = create_style_dataframe(analyzed_result['analyzeResult.styles'][0])
    df_handwritten_spans_merged = merge_dataframes(df_paragraph_spans, df_style_spans)

    return df_handwritten_spans_merged[df_handwritten_spans_merged['isHandwritten'] == True]


## First Document

In [4]:
file_path = '/Users/David.Godinez/Desktop/BJSS/exxon3/exxonmobile/files/7b12da4f-7664-433c-9a34-4306c35f1aab_removed.pdf'

In [6]:
result = analyze_document(file_path)

In [7]:
result.api_version

'2023-02-28-preview'

In [8]:
df_paragraph_spans = create_paragraph_dataframe(result)
df_style_spans = create_style_dataframe(result)
df_merged = merge_dataframes(df_paragraph_spans, df_style_spans)
# using inner join on merge
df_merged

Unnamed: 0,offset,length,content,page_number,polygon,confidence,is_handwritten
0,36,7,CR 1.03,1,"[(6.7387, 1.3306), (7.7821, 1.345), (7.7791, 1...",1.0,True
1,1308,14,Costa Rica Pre,2,"[(6.2395, 0.0901), (7.8714, 0.0474), (7.8778, ...",1.0,True
2,1355,8,CR. 1.03,2,"[(6.5255, 0.905), (7.5254, 0.9618), (7.5136, 1...",1.0,True
3,4649,1,-,5,"[(2.4355, 3.5346), (2.7151, 3.5346), (2.7151, ...",0.4,True
4,4960,1,-,5,"[(2.426, 4.705), (2.7009, 4.705), (2.7009, 4.8...",0.7,True


In [264]:
# using outer join on merge
df_merged_outer = merge_dataframes2(df_paragraph_spans, df_style_spans)
df_merged_outer[df_merged_outer['is_handwritten'] == True].reset_index()

Unnamed: 0,index,offset,length,content,page_number,polygon,confidence,is_handwritten
0,1,36,7,CR 1.03,1.0,"[(6.7387, 1.3306), (7.7821, 1.345), (7.7791, 1...",1.0,True
1,27,1308,14,Costa Rica Pre,2.0,"[(6.2395, 0.0901), (7.8714, 0.0474), (7.8778, ...",1.0,True
2,29,1355,8,CR. 1.03,2.0,"[(6.5255, 0.905), (7.5254, 0.9618), (7.5136, 1...",1.0,True
3,124,4649,1,-,5.0,"[(2.4355, 3.5346), (2.7151, 3.5346), (2.7151, ...",0.4,True
4,132,4960,1,-,5.0,"[(2.426, 4.705), (2.7009, 4.705), (2.7009, 4.8...",0.7,True
5,140,4672,1,,,,1.0,True
6,141,3186,1,,,,0.4,True


In [9]:
df_paragraph_spans[df_paragraph_spans['offset'] == 4672]

Unnamed: 0,offset,length,content,page_number,polygon
128,4672,269,4 Reamed 4 hrs to 475'. Circ 1 hr to clean hol...,5,"[(3.2743, 3.4967), (7.7356, 3.5353), (7.7269, ..."


In [254]:
# checking for the offset in paragraph dataframe with NaN as content offset = [4672,3186]
df_paragraph_spans[df_paragraph_spans['offset'] == 3186]

Unnamed: 0,offset,length,content,page_number,polygon


In [255]:
df_style_spans[df_style_spans['offset'] == 3186]

Unnamed: 0,confidence,is_handwritten,offset,length
4,0.4,True,3186,1


| OCR Predictions        | Manual Counts     |
| ----------- | ----------- |
| 5      | 4      |

|            | Actual Positive | Actual Negative |
|------------|-----------------|-----------------|
| Predicted Positive | 3            | 4            |
| Predicted Negative | -            | -            |


In [253]:
# ============================= experimental wing =============================


In [284]:
def merge_dataframes3(df_paragraph_spans, df_style_spans):
    df_merged = pd.merge(df_paragraph_spans, df_style_spans, on=['offset', 'length'], how='outer')
    # Replace NaN with default values if necessary
    df_merged['confidence'].fillna(0, inplace=True)
    df_merged['is_handwritten'].fillna(False, inplace=True)
    return df_merged.reindex()

In [285]:
df_merged_experimental = merge_dataframes3(df_paragraph_spans, df_style_spans)
df_merged_experimental[(df_merged_experimental['is_handwritten'] == True)]

Unnamed: 0,offset,length,content,page_number,polygon,confidence,is_handwritten
1,36,7,CR 1.03,1.0,"[(6.7387, 1.3306), (7.7821, 1.345), (7.7791, 1...",1.0,True
27,1308,14,Costa Rica Pre,2.0,"[(6.2395, 0.0901), (7.8714, 0.0474), (7.8778, ...",1.0,True
29,1355,8,CR. 1.03,2.0,"[(6.5255, 0.905), (7.5254, 0.9618), (7.5136, 1...",1.0,True
124,4649,1,-,5.0,"[(2.4355, 3.5346), (2.7151, 3.5346), (2.7151, ...",0.4,True
132,4960,1,-,5.0,"[(2.426, 4.705), (2.7009, 4.705), (2.7009, 4.8...",0.7,True
140,4672,1,,,,1.0,True
141,3186,1,,,,0.4,True


In [277]:
df_merged_experimental = merge_dataframes3(df_paragraph_spans, df_style_spans)
df_merged_experimental[(df_merged_experimental['is_handwritten'] == True) 
                        & (df_merged_experimental['length']>1)
                        & (df_merged_experimental['content'].notnull()) 
                        & (df_merged_experimental['confidence'] > 0.5)].reset_index()

Unnamed: 0,index,offset,length,content,page_number,polygon,confidence,is_handwritten
0,1,36,7,CR 1.03,1.0,"[(6.7387, 1.3306), (7.7821, 1.345), (7.7791, 1...",1.0,True
1,27,1308,14,Costa Rica Pre,2.0,"[(6.2395, 0.0901), (7.8714, 0.0474), (7.8778, ...",1.0,True
2,29,1355,8,CR. 1.03,2.0,"[(6.5255, 0.905), (7.5254, 0.9618), (7.5136, 1...",1.0,True


| OCR Predictions        | Manual Counts     |
| ----------- | ----------- |
| 3      | 4      |

|            | Actual Positive | Actual Negative |
|------------|-----------------|-----------------|
| Predicted Positive | 3            | 0            |
| Predicted Negative | -            | -            |


Tradeoffs/Notes: 
- Document features: noise, faded text no signatures

In [None]:
# =================================================================

---

## Second Document

In [11]:
file_path2 = '/Users/David.Godinez/Downloads/24555770__AMERICAS__Costa-Rica_truncated_1-5.pdf'

In [12]:
result2 = analyze_document(file_path2)

In [13]:
result2.api_version

'2023-02-28-preview'

In [16]:
df_paragraph_spans2 = create_paragraph_dataframe(result2)
df_style_spans2 = create_style_dataframe(result2)
df_merged2 = merge_dataframes(df_paragraph_spans2, df_style_spans2)
# using inner merge
df_merged2

Unnamed: 0,offset,length,content,page_number,polygon,confidence,is_handwritten
0,0,8,CR. 1.05,1,"[(5.9857, 0.4293), (6.9746, 0.4073), (6.9795, ...",1.0,True
1,233,1,-,1,"[(5.0647, 2.8321), (5.2913, 2.8115), (5.3056, ...",0.95,True
2,727,7,CR.1.05,2,"[(6.6163, 0.3615), (7.7631, 0.3567), (7.7643, ...",1.0,True
3,1010,26,Report is filed in CR 3.13,2,"[(5.0446, 2.3924), (7.3884, 2.3638), (7.3968, ...",1.0,True
4,1828,7,9/2/167,2,"[(4.4021, 8.0683), (5.1628, 7.9286), (5.2008, ...",1.0,True
5,1856,20,Brussell 14 Jeffords,2,"[(5.3054, 8.2663), (7.4161, 8.1759), (7.4351, ...",1.0,True
6,1971,7,CR.1.05,3,"[(6.5367, 0.3915), (7.5033, 0.4011), (7.5012, ...",1.0,True
7,2002,36,que also demmary paleo rpt - CR-3.13,3,"[(2.841, 0.8355), (7.5605, 0.8689), (7.5585, 1...",1.0,True
8,4018,1,-,4,"[(7.2826, 6.8593), (7.4015, 6.8665), (7.3909, ...",0.95,True
9,4116,5,- 2 -,5,"[(4.0339, 0.9741), (4.504, 0.9741), (4.504, 1....",0.4,True


In [213]:
# using outer merge
df_merged_outer = merge_dataframes2(df_paragraph_spans2, df_style_spans2)
df_merged_outer[df_merged_outer['is_handwritten'] == True].reset_index()

Unnamed: 0,index,offset,length,content,page_number,polygon,confidence,is_handwritten
0,0,0,8,CR. 1.05,1.0,"[(5.9857, 0.4293), (6.9746, 0.4073), (6.9795, ...",1.0,True
1,6,233,1,-,1.0,"[(5.0647, 2.8321), (5.2913, 2.8115), (5.3056, ...",0.95,True
2,17,727,7,CR.1.05,2.0,"[(6.6163, 0.3615), (7.7631, 0.3567), (7.7643, ...",1.0,True
3,22,1010,26,Report is filed in CR 3.13,2.0,"[(5.0446, 2.3924), (7.3884, 2.3638), (7.3968, ...",1.0,True
4,30,1828,7,9/2/167,2.0,"[(4.4021, 8.0683), (5.1628, 7.9286), (5.2008, ...",1.0,True
5,32,1856,20,Brussell 14 Jeffords,2.0,"[(5.3054, 8.2663), (7.4161, 8.1759), (7.4351, ...",1.0,True
6,35,1971,7,CR.1.05,3.0,"[(6.5367, 0.3915), (7.5033, 0.4011), (7.5012, ...",1.0,True
7,37,2002,36,que also demmary paleo rpt - CR-3.13,3.0,"[(2.841, 0.8355), (7.5605, 0.8689), (7.5585, 1...",1.0,True
8,91,4018,1,-,4.0,"[(7.2826, 6.8593), (7.4015, 6.8665), (7.3909, ...",0.95,True
9,101,4116,5,- 2 -,5.0,"[(4.0339, 0.9741), (4.504, 0.9741), (4.504, 1....",0.4,True


In [10]:
# check [2774,3261, 4475, 3350]
df_paragraph_spans2[df_paragraph_spans2['offset'] == 4475]

NameError: name 'df_paragraph_spans2' is not defined

In [262]:
df_style_spans2[df_style_spans2['offset'] == 4475]

Unnamed: 0,confidence,is_handwritten,offset,length
15,0.4,True,3350,3


In [None]:
# merge on offset and drop less than length of 2; set confidence score of greater than .90
# how might we identify handwriting that wasn't captured 
# next presentation:  have confusion matrix with confidence constraints 
# look at tradeoffs - minimize false positives (we don't mind false negs rn)

| OCR Predictions        | Manual Counts     |
| ----------- | ----------- |
| 13      | 15      |

|            | Actual Positive | Actual Negative |
|------------|-----------------|-----------------|
| Predicted Positive | 10            | 3            |
| Predicted Negative | -            | -            |


In [None]:
# ========================================= experimental wing document 2 =========================================

In [286]:
# merge_dataframes3 is outer merge of dataframes merging only on 'offset'

df_merged_experimental2 = merge_dataframes3(df_paragraph_spans2, df_style_spans2)
df_merged_experimental2[(df_merged_experimental2['is_handwritten'] == True) 
                        & (df_merged_experimental2['length']>1) 
                        & (df_merged_experimental2['content'].notnull()) 
                        # discuss confidence at this point. generally, really high confidence not needed  
                        & (df_merged_experimental2['confidence'] > 0.5)].reset_index() 

Unnamed: 0,index,offset,length,content,page_number,polygon,confidence,is_handwritten
0,0,0,8,CR. 1.05,1.0,"[(5.9857, 0.4293), (6.9746, 0.4073), (6.9795, ...",1.0,True
1,17,727,7,CR.1.05,2.0,"[(6.6163, 0.3615), (7.7631, 0.3567), (7.7643, ...",1.0,True
2,22,1010,26,Report is filed in CR 3.13,2.0,"[(5.0446, 2.3924), (7.3884, 2.3638), (7.3968, ...",1.0,True
3,30,1828,7,9/2/167,2.0,"[(4.4021, 8.0683), (5.1628, 7.9286), (5.2008, ...",1.0,True
4,32,1856,20,Brussell 14 Jeffords,2.0,"[(5.3054, 8.2663), (7.4161, 8.1759), (7.4351, ...",1.0,True
5,35,1971,7,CR.1.05,3.0,"[(6.5367, 0.3915), (7.5033, 0.4011), (7.5012, ...",1.0,True
6,37,2002,36,que also demmary paleo rpt - CR-3.13,3.0,"[(2.841, 0.8355), (7.5605, 0.8689), (7.5585, 1...",1.0,True
7,117,4378,21,E.D. Ackerman by D.H.,5.0,"[(2.0817, 4.9712), (4.9165, 4.9328), (4.9213, ...",1.0,True
8,119,4406,13,"Mas. 21, 1966",5.0,"[(6.0053, 4.9856), (7.4155, 4.9856), (7.4155, ...",1.0,True
9,125,4504,12,12 Apr. 1966,5.0,"[(6.1156, 6.3819), (7.5258, 6.3867), (7.521, 6...",1.0,True


| OCR Predictions        | Manual Counts     |
| ----------- | ----------- |
| 10      | 15      |

|            | Actual Positive | Actual Negative |
|------------|-----------------|-----------------|
| Predicted Positive | 10            |    0         |
| Predicted Negative | -            | -            |


Tradeoffs/Notes:

- document features: Noise, signatures, some faded text
- filtering by len of characters appropriately takes away noise, such as dashes.
- filtering by removing NaN's in this case proved beneficial as only one of the three NaN's were actually handwritten even though 3 of them had a confidence of 1. 
- confidence was able to filter out remaining noise. 

In [None]:
# ========================================================

## Document 3

In [195]:
file_path3 = '/Users/David.Godinez/Downloads/24769174__AMERICAS__Costa-Rica_truncated1-5.pdf'

In [196]:
result3 = analyze_document(file_path3)

In [202]:
df_paragraph_spans3 = create_paragraph_dataframe(result3)
df_style_spans3 = create_style_dataframe(result3)
df_merged3 = merge_dataframes2(df_paragraph_spans3, df_style_spans3)
df_merged3

Unnamed: 0,offset,length,content,page_number,polygon,confidence,is_handwritten
0,0,35,H013014201 KATALYST DATA MANAGEMENT,1.0,"[(6.0632, 0.608), (7.2518, 0.5597), (7.2625, 0...",0.0,False
1,36,73,"GEOLOGIC RECONNAISSANCE SURVEY, PROVINCE OF GU...",1.0,"[(2.1797, 4.4258), (5.5894, 4.4545), (5.5869, ...",0.0,False
2,110,40,by W. E. Wallis & M. H. Wallace CR. 3.07,1.0,"[(2.2198, 4.9035), (5.1274, 4.6813), (5.157, 5...",0.0,False
3,151,4,1943,1.0,"[(5.3284, 5.107), (5.6587, 5.107), (5.6587, 5....",0.0,False
4,156,55,RETURN TO: ESSO EXPLORATION INC. CENTRAL FILES...,1.0,"[(2.7192, 6.117), (4.9541, 5.9982), (4.9982, 6...",0.0,False
...,...,...,...,...,...,...,...
134,3212,168,"of Puntarenas, "" by Messrs. W. E. Ballis and M...",5.0,"[(1.1819, 4.7768), (7.4071, 4.8294), (7.3998, ...",0.0,False
135,3381,32,"Yours very truly, Walter K. Link",5.0,"[(4.9989, 5.8249), (6.4453, 5.8394), (6.4402, ...",0.0,False
136,3414,196,P. S. On Plate III the Guiones Limestone-Carri...,5.0,"[(1.2441, 7.1119), (7.3593, 7.1173), (7.3589, ...",0.0,False
137,222,6,,,,0.7,True


In [240]:
df_merged3[df_merged3['is_handwritten'] == True].reset_index()

Unnamed: 0,index,offset,length,content,page_number,polygon,confidence,is_handwritten
0,0,0,8,CR. 1.05,1.0,"[(5.9857, 0.4293), (6.9746, 0.4073), (6.9795, ...",1.0,True
1,6,233,1,-,1.0,"[(5.0647, 2.8321), (5.2913, 2.8115), (5.3056, ...",0.95,True
2,17,727,7,CR.1.05,2.0,"[(6.6163, 0.3615), (7.7631, 0.3567), (7.7643, ...",1.0,True
3,22,1010,26,Report is filed in CR 3.13,2.0,"[(5.0446, 2.3924), (7.3884, 2.3638), (7.3968, ...",1.0,True
4,30,1828,7,9/2/167,2.0,"[(4.4021, 8.0683), (5.1628, 7.9286), (5.2008, ...",1.0,True
5,32,1856,20,Brussell 14 Jeffords,2.0,"[(5.3054, 8.2663), (7.4161, 8.1759), (7.4351, ...",1.0,True
6,35,1971,7,CR.1.05,3.0,"[(6.5367, 0.3915), (7.5033, 0.4011), (7.5012, ...",1.0,True
7,37,2002,36,que also demmary paleo rpt - CR-3.13,3.0,"[(2.841, 0.8355), (7.5605, 0.8689), (7.5585, 1...",1.0,True
8,91,4018,1,-,4.0,"[(7.2826, 6.8593), (7.4015, 6.8665), (7.3909, ...",0.95,True
9,101,4116,5,- 2 -,5.0,"[(4.0339, 0.9741), (4.504, 0.9741), (4.504, 1....",0.4,True


| OCR Predictions        | Manual Counts     |
| ----------- | ----------- |
| 14      | 11      |

|            | Actual Positive | Actual Negative |
|------------|-----------------|-----------------|
| Predicted Positive | 9            | 5            |
| Predicted Negative | -            | -            |


In [246]:
# ======================= experimental wing document 3 ========================

In [245]:
df_merged_experimental3 = merge_dataframes3(df_paragraph_spans3, df_style_spans3)
df_merged_experimental3[(df_merged_experimental3['is_handwritten'] == True) 
                        & (df_merged_experimental3['length']>1) 
                        & (df_merged_experimental3['content'].notnull()) 
                        & (df_merged_experimental3['confidence'] > 0.8)].reset_index()

Unnamed: 0,index,offset,length,content,page_number,polygon,confidence,is_handwritten
0,11,377,8,CR. 3.07,2.0,"[(6.5275, 0.5703), (7.5486, 0.5895), (7.544, 0...",1.0,True
1,20,602,7,1-13-66,2.0,"[(6.8536, 2.3962), (7.793, 2.3722), (7.7978, 2...",1.0,True
2,33,778,7,CR.3.07,2.0,"[(0.647, 5.2908), (1.5028, 5.1802), (1.5289, 5...",0.95,True
3,35,948,7,CR.3.05,2.0,"[(0.6556, 6.1728), (1.5145, 5.9857), (1.5584, ...",1.0,True
4,38,1269,7,CR.3.09,2.0,"[(0.7573, 6.8244), (1.5986, 6.6906), (1.6321, ...",1.0,True
5,46,1411,2,14,3.0,"[(3.4032, 1.5255), (3.7083, 1.514), (3.719, 1....",1.0,True
6,48,1430,7,CR 3.07,3.0,"[(6.2164, 1.1053), (7.3793, 1.0995), (7.3804, ...",1.0,True


| OCR Predictions        | Manual Counts     |
| ----------- | ----------- |
| 7      | 11      |

|            | Actual Positive | Actual Negative |
|------------|-----------------|-----------------|
| Predicted Positive | 7            | 0            |
| Predicted Negative | -            | -            |


In [None]:
# ====================================

## Document Four

In [287]:
file_path4 = '/Users/David.Godinez/Downloads/25769174__AMERICAS__Costa-Rica_truncated_19-end.pdf' 

In [288]:
result4 = analyze_document(file_path4)

In [292]:
df_paragraph_spans4 = create_paragraph_dataframe(result4)
df_style_spans4 = create_style_dataframe(result4)
df_merged4 = merge_dataframes3(df_paragraph_spans4, df_style_spans4)
df_merged4[df_merged4['is_handwritten'] == True]

Unnamed: 0,offset,length,content,page_number,polygon,confidence,is_handwritten
3,113,6,Sin NW,1,"[(5.474, 4.7745), (6.4502, 4.8018), (6.4432, 5...",0.7,True
11,173,1,-,1,"[(5.9343, 8.4989), (6.0804, 8.4989), (6.0804, ...",0.9,True
18,245,9,CR. 3. 07,1,"[(5.918, 11.7414), (6.8613, 11.7468), (6.8601,...",1.0,True
50,3788,9,CR. 3. 07,4,"[(6.2818, 10.5932), (7.0814, 10.598), (7.0801,...",0.95,True
58,4014,9,CR. 3. 07,5,"[(6.3917, 10.6315), (7.214, 10.6315), (7.214, ...",0.95,True
65,4239,8,CR. 3.on,6,"[(7.0993, 10.5989), (7.892, 10.6133), (7.8888,...",0.95,True


In [295]:
df_merged4[(df_merged4['is_handwritten'] == True) 
                        & (df_merged4['length']>1) 
                        & (df_merged4['content'].notnull()) 
                        & (df_merged4['confidence'] > 0.5)].reset_index()

Unnamed: 0,index,offset,length,content,page_number,polygon,confidence,is_handwritten
0,3,113,6,Sin NW,1,"[(5.474, 4.7745), (6.4502, 4.8018), (6.4432, 5...",0.7,True
1,18,245,9,CR. 3. 07,1,"[(5.918, 11.7414), (6.8613, 11.7468), (6.8601,...",1.0,True
2,50,3788,9,CR. 3. 07,4,"[(6.2818, 10.5932), (7.0814, 10.598), (7.0801,...",0.95,True
3,58,4014,9,CR. 3. 07,5,"[(6.3917, 10.6315), (7.214, 10.6315), (7.214, ...",0.95,True
4,65,4239,8,CR. 3.on,6,"[(7.0993, 10.5989), (7.892, 10.6133), (7.8888,...",0.95,True


| OCR Predictions        | Manual Counts     |
| ----------- | ----------- |
| 4      | 4      |

|            | Actual Positive | Actual Negative |
|------------|-----------------|-----------------|
| Predicted Positive | 4            | 0            |
| Predicted Negative | -            | -            |


In [None]:
# Find highest we can set confidence threshold without affecting universal true positives 
# show full confusion matrix
# can we have boxed around handwriting be a different color 

In [74]:
def create_style_dataframe(styles):
    data = []

    for style in styles[0]:
        is_handwritten = style.is_handwritten
        confidence = style.confidence

        for span in style.spans:
            data.append({
                "confidence": confidence,
                "is_handwritten": is_handwritten,
                "offset": span.offset,
                "length": span.length
            })

    df_style_spans = pd.DataFrame(data)

    return df_style_spans



In [75]:
styles = analyzed_result['analyzeResult.styles']
df_style_spans = create_style_dataframe(styles)
df_style_spans

AttributeError: 'str' object has no attribute 'is_handwritten'

In [66]:
result.styles[2]

DocumentStyle(is_handwritten=True, spans=[DocumentSpan(offset=4960, length=1)], confidence=0.7, similar_font_family=None, font_style=None, font_weight=None, color=None, background_color=None)

In [69]:
styles = analyzed_result['analyzeResult.styles']

In [73]:
styles[0]

'[DocumentStyle(is_handwritten=True, spans=[DocumentSpan(offset=0, length=8), DocumentSpan(offset=727, length=7), DocumentSpan(offset=1010, length=26), DocumentSpan(offset=1828, length=7), DocumentSpan(offset=1856, length=20), DocumentSpan(offset=1971, length=7), DocumentSpan(offset=2002, length=36), DocumentSpan(offset=2774, length=114), DocumentSpan(offset=3261, length=1), DocumentSpan(offset=4378, length=21), DocumentSpan(offset=4406, length=13), DocumentSpan(offset=4475, length=22), DocumentSpan(offset=4504, length=12)], confidence=1.0, similar_font_family=None, font_style=None, font_weight=None, color=None, background_color=None), DocumentStyle(is_handwritten=True, spans=[DocumentSpan(offset=233, length=1), DocumentSpan(offset=4018, length=1)], confidence=0.95, similar_font_family=None, font_style=None, font_weight=None, color=None, background_color=None), DocumentStyle(is_handwritten=True, spans=[DocumentSpan(offset=3350, length=3), DocumentSpan(offset=4116, length=5)], confidenc

In [78]:
result.styles

[DocumentStyle(is_handwritten=True, spans=[DocumentSpan(offset=36, length=7), DocumentSpan(offset=1308, length=14), DocumentSpan(offset=1355, length=8), DocumentSpan(offset=4672, length=1)], confidence=1.0, similar_font_family=None, font_style=None, font_weight=None, color=None, background_color=None),
 DocumentStyle(is_handwritten=True, spans=[DocumentSpan(offset=3186, length=1), DocumentSpan(offset=4649, length=1)], confidence=0.4, similar_font_family=None, font_style=None, font_weight=None, color=None, background_color=None),
 DocumentStyle(is_handwritten=True, spans=[DocumentSpan(offset=4960, length=1)], confidence=0.7, similar_font_family=None, font_style=None, font_weight=None, color=None, background_color=None)]

In [98]:
for a in range(len(result.styles)):
    b = result.styles[a].to_dict()
    print(f"confidence: {b['confidence']}: spans: {b['spans']}\n")

confidence: 1.0: spans: [{'offset': 36, 'length': 7}, {'offset': 1308, 'length': 14}, {'offset': 1355, 'length': 8}, {'offset': 4672, 'length': 1}]

confidence: 0.4: spans: [{'offset': 3186, 'length': 1}, {'offset': 4649, 'length': 1}]

confidence: 0.7: spans: [{'offset': 4960, 'length': 1}]



In [102]:
b = result.styles

DocumentStyle(is_handwritten=True, spans=[DocumentSpan(offset=36, length=7), DocumentSpan(offset=1308, length=14), DocumentSpan(offset=1355, length=8), DocumentSpan(offset=4672, length=1)], confidence=1.0, similar_font_family=None, font_style=None, font_weight=None, color=None, background_color=None)

In [117]:
for a in range(len(result2.styles)):
    b = result.styles[a].to_dict()
    print(f"confidence: {b['confidence']}: spans: {b['spans']}\n")

confidence: 1.0: spans: [{'offset': 36, 'length': 7}, {'offset': 1308, 'length': 14}, {'offset': 1355, 'length': 8}, {'offset': 4672, 'length': 1}]

confidence: 0.4: spans: [{'offset': 3186, 'length': 1}, {'offset': 4649, 'length': 1}]

confidence: 0.7: spans: [{'offset': 4960, 'length': 1}]



In [121]:
analyzed_result2['analyzeResult.paragraphs'][0]

'[DocumentParagraph(role=None, content=CR. 1.05, bounding_regions=[BoundingRegion(page_number=1, polygon=[Point(x=5.9857, y=0.4293), Point(x=6.9746, y=0.4073), Point(x=6.9795, y=0.6291), Point(x=5.9907, y=0.651)])], spans=[DocumentSpan(offset=0, length=8)]), DocumentParagraph(role=None, content=December 10, 1969, bounding_regions=[BoundingRegion(page_number=1, polygon=[Point(x=1.4711, y=1.1429), Point(x=2.9375, y=1.1429), Point(x=2.9375, y=1.3358), Point(x=1.4711, y=1.3358)])], spans=[DocumentSpan(offset=9, length=17)]), DocumentParagraph(role=None, content=Logs of Costa Rica Wells., bounding_regions=[BoundingRegion(page_number=1, polygon=[Point(x=1.4567, y=1.6638), Point(x=3.5983, y=1.6638), Point(x=3.5983, y=1.8567), Point(x=1.4567, y=1.8567)])], spans=[DocumentSpan(offset=27, length=25)]), DocumentParagraph(role=None, content=H012943680 KATALYST DATA MANAGEMENT, bounding_regions=[BoundingRegion(page_number=1, polygon=[Point(x=5.1417, y=1.3648), Point(x=6.3393, y=1.3405), Point(x=6.3

In [124]:
input_string = analyzed_result['analyzeResult.paragraphs'][0]

In [137]:
type(result)

azure.ai.formrecognizer._models.AnalyzeResult

In [142]:
import pandas as pd

def create_paragraph_dataframe(analyze_result):
    data = []
    
    for paragraph in analyze_result.paragraphs:
        content = paragraph.content
        bounding_region = paragraph.bounding_regions[0]  # Assuming there's always at least one bounding region
        page_number = bounding_region.page_number
        polygon = [(point.x, point.y) for point in bounding_region.polygon]
        span = paragraph.spans[0]  # Assuming there's always at least one span
        
        data.append({
            "offset": span.offset,
            "length": span.length,
            "content": content,
            "page_number": page_number,
            "polygon": polygon
        })
        
    df_paragraph_spans = pd.DataFrame(data)

    return df_paragraph_spans


In [143]:
a = create_paragraph_dataframe(result)

In [144]:
a

Unnamed: 0,offset,length,content,page_number,polygon
0,0,35,HØ12943623 KATALYST DATA MANAGEMENT,1,"[(1.5709, 0.6702), (2.7771, 0.6461), (2.7825, ..."
1,36,7,CR 1.03,1,"[(6.7387, 1.3306), (7.7821, 1.345), (7.7791, 1..."
2,44,14,"March 21, 1950",1,"[(3.6263, 1.7614), (4.8913, 1.7518), (4.8929, ..."
3,59,11,Filet CR-30,1,"[(5.0014, 2.2496), (6.0974, 2.2496), (6.0974, ..."
4,71,8,Subject:,1,"[(5.0014, 2.5942), (5.7145, 2.5942), (5.7145, ..."
...,...,...,...,...,...
135,5231,299,6 Circ & cond mud 10 hrs. Drilled plug & shoe ...,5,"[(3.2411, 5.8407), (7.8798, 5.9063), (7.8631, ..."
136,5531,10,Form. Nº %,5,"[(0.616, 10.0543), (1.1988, 10.0543), (1.1988,..."
137,5542,123,7 Circ & cond mud 16 hrs. Using Lime-Oil Emuls...,5,"[(3.2411, 7.2034), (7.7862, 7.2304), (7.7832, ..."
138,5666,3,79',5,"[(2.4829, 7.3631), (2.7198, 7.3631), (2.7246, ..."


In [149]:
import pandas as pd

def create_style_dataframe(analyze_result):
    data = []

    for style in analyze_result.styles:
        is_handwritten = style.is_handwritten
        confidence = style.confidence
        
        for span in style.spans:
            data.append({
                "confidence": confidence,
                "is_handwritten": is_handwritten,
                "offset": span.offset,
                "length": span.length
            })

    df_style_spans = pd.DataFrame(data)

    return df_style_spans


In [150]:
b = create_style_dataframe(result)

In [151]:
b

Unnamed: 0,confidence,is_handwritten,offset,length
0,1.0,True,36,7
1,1.0,True,1308,14
2,1.0,True,1355,8
3,1.0,True,4672,1
4,0.4,True,3186,1
5,0.4,True,4649,1
6,0.7,True,4960,1


In [148]:
result.styles

[DocumentStyle(is_handwritten=True, spans=[DocumentSpan(offset=36, length=7), DocumentSpan(offset=1308, length=14), DocumentSpan(offset=1355, length=8), DocumentSpan(offset=4672, length=1)], confidence=1.0, similar_font_family=None, font_style=None, font_weight=None, color=None, background_color=None),
 DocumentStyle(is_handwritten=True, spans=[DocumentSpan(offset=3186, length=1), DocumentSpan(offset=4649, length=1)], confidence=0.4, similar_font_family=None, font_style=None, font_weight=None, color=None, background_color=None),
 DocumentStyle(is_handwritten=True, spans=[DocumentSpan(offset=4960, length=1)], confidence=0.7, similar_font_family=None, font_style=None, font_weight=None, color=None, background_color=None)]