In [None]:
#%pip install paddlepaddle -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
#%pip install paddleocr

In [None]:
from pathlib import Path
from paddleocr import PaddleOCR 
import pandas as pd
import re

In [None]:
# Shared OCR pipeline instance; ocrDetection() reads this module-level name.
# Every optional pre-processing model is switched off.
_ocr_options = {
    "use_doc_orientation_classify": False,  # no document orientation classification model
    "use_doc_unwarping": False,             # no text image rectification model
    "use_textline_orientation": False,      # no text line orientation classification model
}
ocr = PaddleOCR(**_ocr_options)

#ocr = PaddleOCR(
#    use_doc_orientation_classify=False, # Disables document orientation classification model via this parameter
#    use_doc_unwarping=False, # Disables text image rectification model via this parameter
#    use_textline_orientation=False, # Disables text line orientation classification model via this parameter
#)

# ocr = PaddleOCR(lang="en") # Uses English model by specifying language parameter
# ocr = PaddleOCR(ocr_version="PP-OCRv4") # Uses other PP-OCR versions via version parameter
# ocr = PaddleOCR(device="gpu") # Enables GPU acceleration for model inference via device parameter
#ocr = PaddleOCR(
#        #device="gpu",
#        text_detection_model_name="PP-OCRv5_mobile_det",
#        text_recognition_model_name="PP-OCRv5_mobile_rec",
#        use_doc_orientation_classify=True,
#        use_doc_unwarping=False,
#        use_textline_orientation=True,
# ) # Switch to PP-OCRv5_mobile models

In [None]:
def ocrDetection(filePath, verbose=True):
  """Run the shared ``ocr`` pipeline on one file and tabulate the recognised texts.

  Args:
    filePath: Path of the image or PDF, passed to ``ocr.predict`` as ``input``.
    verbose: When True (the default, matching the previous behaviour) each
      result is dumped to stdout together with its keys, input path,
      detection parameters, texts and scores.

  Returns:
    A DataFrame with the columns ``input_path``, ``rec_texts`` and
    ``rec_scores`` holding one row per recognised text. The frame is empty
    (but keeps those columns) when nothing was recognised.
  """
  columns = ['input_path', 'rec_texts', 'rec_scores']
  frames = []

  # Build the rows inside the loop: predict() can hand back more than one
  # result (presumably one per page for multi-page input - confirm against the
  # PaddleOCR docs). Reading the loop variable after the loop kept only the
  # last result and raised NameError when there was none.
  for res in ocr.predict(input = filePath):
    if verbose:
      res.print()
      #res.save_to_img("output")
      #res.save_to_json("output")
      print(res.keys())
      print(f'inputPath: {res["input_path"]}')
      print(res['text_det_params'])
      print(res['rec_texts'], res['rec_scores'])

    texts = list(res['rec_texts'])
    if not texts:
      # Nothing recognised in this result; skip it so the concat below never
      # has to deal with empty frames.
      continue

    frames.append(pd.DataFrame({
        # Repeat the path explicitly so every text row carries its source file.
        'input_path': [res['input_path']] * len(texts),
        'rec_texts': texts,
        'rec_scores': list(res['rec_scores'])
    }, columns=columns))

  if not frames:
    return pd.DataFrame(columns=columns)

  return pd.concat(frames, ignore_index=True)


# Function to extract TreeID using regular expressions
def extract_tree_id(extract_list):
    """Build a TreeID string from texts that start with the ``TTT@`` marker.

    For every string item that matches ``TTT@<rest>`` the ID ``'T' + <rest>``
    is produced; non-string items and non-matching strings are ignored.

    Args:
        extract_list: An iterable of texts. A bare string is treated as a
            single text (it used to be walked character by character, which
            can never match the marker). ``None`` and other non-iterable
            values, such as the NaN pandas uses for missing cells, yield
            ``None`` instead of raising ``TypeError``.

    Returns:
        The IDs joined with ``', '``, or ``None`` when no item matched.
    """
    if extract_list is None:
        return None
    if isinstance(extract_list, str):
        # One text, not a sequence of characters.
        extract_list = [extract_list]

    try:
        items = iter(extract_list)
    except TypeError:
        # Not iterable (e.g. float NaN from a missing cell): no ID available.
        return None

    result_strings = []
    for item in items:
        if not isinstance(item, str):
            continue
        match = re.match(r'TTT@(.*)', item)  # evaluated once per item
        if match:
            # Prepend 'T' to whatever followed the marker.
            result_strings.append('T' + ''.join(match.groups()))

    return ', '.join(result_strings) if result_strings else None


def exportExcel(dfAll, dfAbnormal, folderPath):
  """Write both result tables to ``exportOCR.xlsx`` inside ``folderPath``.

  Args:
    dfAll: DataFrame written to the sheet ``dfAll``.
    dfAbnormal: DataFrame written to the sheet ``dfAbnormal``.
    folderPath: Folder in which the workbook is created.

  Neither sheet contains the DataFrame index.
  """
  excel_file_path = Path(folderPath, 'exportOCR.xlsx')

  # Write DataFrames to Excel file
  with pd.ExcelWriter(excel_file_path) as writer:
      dfAll.to_excel(writer, sheet_name='dfAll', index=False)
      dfAbnormal.to_excel(writer, sheet_name='dfAbnormal', index=False)

  # Report success only after the ``with`` block has closed the writer, so the
  # message is not printed when saving the workbook fails.
  print("Done")
   

#=== main ===
def getFilesText(folderPath):
  """OCR every supported file directly inside ``folderPath``.

  Files are picked by suffix (.jpg, .jpeg, .png, .pdf, case-insensitive) and
  processed in sorted path order so repeated runs give the same row order.

  Args:
    folderPath: Folder to scan; sub-folders are not descended into.

  Returns:
    A tuple ``(dfAll, dfAbnormal)``. ``dfAll`` has the columns ``input_path``,
    ``rec_texts``, ``rec_scores`` and ``TreeID``; ``dfAbnormal`` is the subset
    of its rows whose ``rec_texts`` contains ``', '``. Both are empty frames
    with those columns when the folder yields no recognised text.
  """
  columns = ['input_path', 'rec_texts', 'rec_scores', 'TreeID']
  valid_extensions = {'.jpg', '.jpeg', '.png', '.pdf'}

  # Collect per-file frames and concatenate once instead of growing a
  # DataFrame inside the loop.
  frames = []
  for filePath in sorted(Path(folderPath).iterdir()):
    if filePath.is_file() and filePath.suffix.lower() in valid_extensions:  # Check if it's a file
        print(f'scan filePath: {filePath}')
        df = ocrDetection(str(filePath))
        if not df.empty:
          frames.append(df)

  if not frames:
    # Nothing to post-process; indexing 'rec_texts' on a column-less frame
    # would raise KeyError.
    dfAll = pd.DataFrame(columns=columns)
    return dfAll, dfAll.copy()

  dfAll = pd.concat(frames, ignore_index = True)

  # A cell normally holds one recognised string. Joining a string directly
  # would split it into characters ('T0433' -> 'T, 0, 4, 3, 3'), so normalise
  # every cell to a list of texts first.
  texts = dfAll['rec_texts'].apply(_as_text_list)
  # Extract the TreeID from the individual texts, before they are merged.
  dfAll['TreeID'] = texts.apply(extract_tree_id)
  dfAll['rec_texts'] = texts.apply(', '.join)

  # NOTE(review): this keeps rows whose text contains ', '. Confirm that this
  # is the intended definition of "abnormal".
  dfAbnormal = dfAll[dfAll['rec_texts'].str.contains(', ', regex=False)].copy()

  return dfAll, dfAbnormal


def _as_text_list(value):
  """Return ``value`` as a list of texts; a bare string counts as one text."""
  if isinstance(value, str):
    return [value]
  return list(value)


if __name__ == "__main__":
  # Ask for the folder that contains the photos to scan; the Excel export is
  # written into the same folder.
  msgbox = input('PhotoPath')
  # Paths pasted from a file manager often arrive wrapped in quotes or with
  # stray whitespace; remove both before using the value.
  folderPath = msgbox.strip().strip('"\'')
  if not Path(folderPath).is_dir():
    # Fail here with a clear message rather than later inside the folder scan.
    raise NotADirectoryError(f'PhotoPath is not an existing folder: {folderPath!r}')
  dfAll, dfAbnormal = getFilesText(folderPath)
  exportExcel(dfAll, dfAbnormal, folderPath)



scan filePath: C:\Users\Adm\Desktop\pytest\IMG_0006.JPG


[32m{'res': {'input_path': 'C:\\Users\\Adm\\Desktop\\pytest\\IMG_0006.JPG', 'page_index': None, 'model_settings': {'use_doc_preprocessor': True, 'use_textline_orientation': False}, 'doc_preprocessor_res': {'input_path': None, 'page_index': None, 'model_settings': {'use_doc_orientation_classify': False, 'use_doc_unwarping': False}, 'angle': -1}, 'dt_polys': array([[[299, 278],
        ...,
        [299, 306]],

       ...,

       [[511, 589],
        ...,
        [511, 600]]], shape=(9, 4, 2), dtype=int16), 'text_det_params': {'limit_side_len': 64, 'limit_type': 'min', 'thresh': 0.3, 'max_side_limit': 4000, 'box_thresh': 0.6, 'unclip_ratio': 1.5}, 'text_type': 'general', 'textline_orientation_angles': array([-1, ..., -1], shape=(9,)), 'text_rec_score_thresh': 0.0, 'rec_texts': ['T0433', 'Whole', 'view', '54', '13/08/2025', '=，', 'T', '田', '更多工具'], 'rec_scores': array([0.99624634, ..., 0.86230665], shape=(9,)), 'rec_polys': array([[[299, 278],
        ...,
        [299, 306]],

       

dict_keys(['input_path', 'page_index', 'doc_preprocessor_res', 'dt_polys', 'model_settings', 'text_det_params', 'text_type', 'text_rec_score_thresh', 'rec_texts', 'rec_scores', 'rec_polys', 'vis_fonts', 'textline_orientation_angles', 'rec_boxes'])
inputPath: C:\Users\Adm\Desktop\pytest\IMG_0006.JPG
{'limit_side_len': 64, 'limit_type': 'min', 'thresh': 0.3, 'max_side_limit': 4000, 'box_thresh': 0.6, 'unclip_ratio': 1.5}
['T0433', 'Whole', 'view', '54', '13/08/2025', '=，', 'T', '田', '更多工具'] [0.996246337890625, 0.9980646967887878, 0.9950081706047058, 0.9979003667831421, 0.9996752738952637, 0.4812706708908081, 0.9987766146659851, 0.5650695562362671, 0.8623066544532776]
scan filePath: C:\Users\Adm\Desktop\pytest\IMG_0007.JPG


[32m{'res': {'input_path': 'C:\\Users\\Adm\\Desktop\\pytest\\IMG_0007.JPG', 'page_index': None, 'model_settings': {'use_doc_preprocessor': True, 'use_textline_orientation': False}, 'doc_preprocessor_res': {'input_path': None, 'page_index': None, 'model_settings': {'use_doc_orientation_classify': False, 'use_doc_unwarping': False}, 'angle': -1}, 'dt_polys': array([[[299, 535],
        ...,
        [299, 552]],

       [[385, 569],
        ...,
        [381, 580]]], shape=(2, 4, 2), dtype=int16), 'text_det_params': {'limit_side_len': 64, 'limit_type': 'min', 'thresh': 0.3, 'max_side_limit': 4000, 'box_thresh': 0.6, 'unclip_ratio': 1.5}, 'text_type': 'general', 'textline_orientation_angles': array([-1, -1], shape=(2,)), 'text_rec_score_thresh': 0.0, 'rec_texts': ['13/08/2025', 'DRILTECH'], 'rec_scores': array([0.9924835 , 0.92678946], shape=(2,)), 'rec_polys': array([[[299, 535],
        ...,
        [299, 552]],

       [[385, 569],
        ...,
        [381, 580]]], shape=(2, 4, 2), dt

dict_keys(['input_path', 'page_index', 'doc_preprocessor_res', 'dt_polys', 'model_settings', 'text_det_params', 'text_type', 'text_rec_score_thresh', 'rec_texts', 'rec_scores', 'rec_polys', 'vis_fonts', 'textline_orientation_angles', 'rec_boxes'])
inputPath: C:\Users\Adm\Desktop\pytest\IMG_0007.JPG
{'limit_side_len': 64, 'limit_type': 'min', 'thresh': 0.3, 'max_side_limit': 4000, 'box_thresh': 0.6, 'unclip_ratio': 1.5}
['13/08/2025', 'DRILTECH'] [0.9924834966659546, 0.9267894625663757]
scan filePath: C:\Users\Adm\Desktop\pytest\IMG_0008.JPG


[32m{'res': {'input_path': 'C:\\Users\\Adm\\Desktop\\pytest\\IMG_0008.JPG', 'page_index': None, 'model_settings': {'use_doc_preprocessor': True, 'use_textline_orientation': False}, 'doc_preprocessor_res': {'input_path': None, 'page_index': None, 'model_settings': {'use_doc_orientation_classify': False, 'use_doc_unwarping': False}, 'angle': -1}, 'dt_polys': array([[[183, 525],
        ...,
        [187, 536]],

       [[298, 534],
        ...,
        [298, 552]]], shape=(2, 4, 2), dtype=int16), 'text_det_params': {'limit_side_len': 64, 'limit_type': 'min', 'thresh': 0.3, 'max_side_limit': 4000, 'box_thresh': 0.6, 'unclip_ratio': 1.5}, 'text_type': 'general', 'textline_orientation_angles': array([-1, -1], shape=(2,)), 'text_rec_score_thresh': 0.0, 'rec_texts': ['DRILTECH', '13/08/2025'], 'rec_scores': array([0.8311581, 0.9978925], shape=(2,)), 'rec_polys': array([[[183, 525],
        ...,
        [187, 536]],

       [[298, 534],
        ...,
        [298, 552]]], shape=(2, 4, 2), dtyp

dict_keys(['input_path', 'page_index', 'doc_preprocessor_res', 'dt_polys', 'model_settings', 'text_det_params', 'text_type', 'text_rec_score_thresh', 'rec_texts', 'rec_scores', 'rec_polys', 'vis_fonts', 'textline_orientation_angles', 'rec_boxes'])
inputPath: C:\Users\Adm\Desktop\pytest\IMG_0008.JPG
{'limit_side_len': 64, 'limit_type': 'min', 'thresh': 0.3, 'max_side_limit': 4000, 'box_thresh': 0.6, 'unclip_ratio': 1.5}
['DRILTECH', '13/08/2025'] [0.8311581015586853, 0.9978924989700317]
scan filePath: C:\Users\Adm\Desktop\pytest\IMG_0009.JPG


[32m{'res': {'input_path': 'C:\\Users\\Adm\\Desktop\\pytest\\IMG_0009.JPG', 'page_index': None, 'model_settings': {'use_doc_preprocessor': True, 'use_textline_orientation': False}, 'doc_preprocessor_res': {'input_path': None, 'page_index': None, 'model_settings': {'use_doc_orientation_classify': False, 'use_doc_unwarping': False}, 'angle': -1}, 'dt_polys': array([[[552, 267],
        ...,
        [552, 294]],

       ...,

       [[627, 503],
        ...,
        [627, 528]]], shape=(4, 4, 2), dtype=int16), 'text_det_params': {'limit_side_len': 64, 'limit_type': 'min', 'thresh': 0.3, 'max_side_limit': 4000, 'box_thresh': 0.6, 'unclip_ratio': 1.5}, 'text_type': 'general', 'textline_orientation_angles': array([-1, ..., -1], shape=(4,)), 'text_rec_score_thresh': 0.0, 'rec_texts': ['28-06-2025', 'T0432', 'Wholeview', '13/08/2025'], 'rec_scores': array([0.99370688, ..., 0.99917108], shape=(4,)), 'rec_polys': array([[[552, 267],
        ...,
        [552, 294]],

       ...,

       [[627, 

dict_keys(['input_path', 'page_index', 'doc_preprocessor_res', 'dt_polys', 'model_settings', 'text_det_params', 'text_type', 'text_rec_score_thresh', 'rec_texts', 'rec_scores', 'rec_polys', 'vis_fonts', 'textline_orientation_angles', 'rec_boxes'])
inputPath: C:\Users\Adm\Desktop\pytest\IMG_0009.JPG
{'limit_side_len': 64, 'limit_type': 'min', 'thresh': 0.3, 'max_side_limit': 4000, 'box_thresh': 0.6, 'unclip_ratio': 1.5}
['28-06-2025', 'T0432', 'Wholeview', '13/08/2025'] [0.9937068819999695, 0.9903384447097778, 0.9917887449264526, 0.9991710782051086]
scan filePath: C:\Users\Adm\Desktop\pytest\IMG_0010.JPG


[32m{'res': {'input_path': 'C:\\Users\\Adm\\Desktop\\pytest\\IMG_0010.JPG', 'page_index': None, 'model_settings': {'use_doc_preprocessor': True, 'use_textline_orientation': False}, 'doc_preprocessor_res': {'input_path': None, 'page_index': None, 'model_settings': {'use_doc_orientation_classify': False, 'use_doc_unwarping': False}, 'angle': -1}, 'dt_polys': array([[[296, 534],
        ...,
        [297, 554]]], shape=(1, 4, 2), dtype=int16), 'text_det_params': {'limit_side_len': 64, 'limit_type': 'min', 'thresh': 0.3, 'max_side_limit': 4000, 'box_thresh': 0.6, 'unclip_ratio': 1.5}, 'text_type': 'general', 'textline_orientation_angles': array([-1]), 'text_rec_score_thresh': 0.0, 'rec_texts': ['13/08/2025'], 'rec_scores': array([0.99768716]), 'rec_polys': array([[[296, 534],
        ...,
        [297, 554]]], shape=(1, 4, 2), dtype=int16), 'rec_boxes': array([[296, ..., 554]], shape=(1, 4), dtype=int16)}}[0m


dict_keys(['input_path', 'page_index', 'doc_preprocessor_res', 'dt_polys', 'model_settings', 'text_det_params', 'text_type', 'text_rec_score_thresh', 'rec_texts', 'rec_scores', 'rec_polys', 'vis_fonts', 'textline_orientation_angles', 'rec_boxes'])
inputPath: C:\Users\Adm\Desktop\pytest\IMG_0010.JPG
{'limit_side_len': 64, 'limit_type': 'min', 'thresh': 0.3, 'max_side_limit': 4000, 'box_thresh': 0.6, 'unclip_ratio': 1.5}
['13/08/2025'] [0.9976871609687805]
scan filePath: C:\Users\Adm\Desktop\pytest\IMG_0011.JPG


[32m{'res': {'input_path': 'C:\\Users\\Adm\\Desktop\\pytest\\IMG_0011.JPG', 'page_index': None, 'model_settings': {'use_doc_preprocessor': True, 'use_textline_orientation': False}, 'doc_preprocessor_res': {'input_path': None, 'page_index': None, 'model_settings': {'use_doc_orientation_classify': False, 'use_doc_unwarping': False}, 'angle': -1}, 'dt_polys': array([[[ 78, 248],
        ...,
        [ 78, 264]],

       ...,

       [[536, 578],
        ...,
        [536, 599]]], shape=(13, 4, 2), dtype=int16), 'text_det_params': {'limit_side_len': 64, 'limit_type': 'min', 'thresh': 0.3, 'max_side_limit': 4000, 'box_thresh': 0.6, 'unclip_ratio': 1.5}, 'text_type': 'general', 'textline_orientation_angles': array([-1, ..., -1], shape=(13,)), 'text_rec_score_thresh': 0.0, 'rec_texts': ['28-06-2025', '28-06-2025', '54', 'holeview', 'T0435Wholeview', 'ADT', '13/08/2025', '註解', '螢光標示', '繪製', '文字', '填寫和簽署', '更多工具'], 'rec_scores': array([0.98068154, ..., 0.9868114 ], shape=(13,)), 'rec_polys': a

dict_keys(['input_path', 'page_index', 'doc_preprocessor_res', 'dt_polys', 'model_settings', 'text_det_params', 'text_type', 'text_rec_score_thresh', 'rec_texts', 'rec_scores', 'rec_polys', 'vis_fonts', 'textline_orientation_angles', 'rec_boxes'])
inputPath: C:\Users\Adm\Desktop\pytest\IMG_0011.JPG
{'limit_side_len': 64, 'limit_type': 'min', 'thresh': 0.3, 'max_side_limit': 4000, 'box_thresh': 0.6, 'unclip_ratio': 1.5}
['28-06-2025', '28-06-2025', '54', 'holeview', 'T0435Wholeview', 'ADT', '13/08/2025', '註解', '螢光標示', '繪製', '文字', '填寫和簽署', '更多工具'] [0.9806815385818481, 0.945273220539093, 0.9993879795074463, 0.9871870279312134, 0.9410895705223083, 0.4743160307407379, 0.9991892576217651, 0.8550659418106079, 0.8042163848876953, 0.6229471564292908, 0.9980134963989258, 0.9335204362869263, 0.9868113994598389]
scan filePath: C:\Users\Adm\Desktop\pytest\IMG_0012.JPG


[32m{'res': {'input_path': 'C:\\Users\\Adm\\Desktop\\pytest\\IMG_0012.JPG', 'page_index': None, 'model_settings': {'use_doc_preprocessor': True, 'use_textline_orientation': False}, 'doc_preprocessor_res': {'input_path': None, 'page_index': None, 'model_settings': {'use_doc_orientation_classify': False, 'use_doc_unwarping': False}, 'angle': -1}, 'dt_polys': array([[[112, 524],
        ...,
        [111, 535]],

       ...,

       [[297, 556],
        ...,
        [296, 570]]], shape=(3, 4, 2), dtype=int16), 'text_det_params': {'limit_side_len': 64, 'limit_type': 'min', 'thresh': 0.3, 'max_side_limit': 4000, 'box_thresh': 0.6, 'unclip_ratio': 1.5}, 'text_type': 'general', 'textline_orientation_angles': array([-1, ..., -1], shape=(3,)), 'text_rec_score_thresh': 0.0, 'rec_texts': ['路政署', '13/08/2025', '人内'], 'rec_scores': array([0.79953843, ..., 0.63223791], shape=(3,)), 'rec_polys': array([[[112, 524],
        ...,
        [111, 535]],

       ...,

       [[297, 556],
        ...,
    

dict_keys(['input_path', 'page_index', 'doc_preprocessor_res', 'dt_polys', 'model_settings', 'text_det_params', 'text_type', 'text_rec_score_thresh', 'rec_texts', 'rec_scores', 'rec_polys', 'vis_fonts', 'textline_orientation_angles', 'rec_boxes'])
inputPath: C:\Users\Adm\Desktop\pytest\IMG_0012.JPG
{'limit_side_len': 64, 'limit_type': 'min', 'thresh': 0.3, 'max_side_limit': 4000, 'box_thresh': 0.6, 'unclip_ratio': 1.5}
['路政署', '13/08/2025', '人内'] [0.7995384335517883, 0.99811851978302, 0.6322379112243652]
scan filePath: C:\Users\Adm\Desktop\pytest\IMG_0013.JPG


[32m{'res': {'input_path': 'C:\\Users\\Adm\\Desktop\\pytest\\IMG_0013.JPG', 'page_index': None, 'model_settings': {'use_doc_preprocessor': True, 'use_textline_orientation': False}, 'doc_preprocessor_res': {'input_path': None, 'page_index': None, 'model_settings': {'use_doc_orientation_classify': False, 'use_doc_unwarping': False}, 'angle': -1}, 'dt_polys': array([[[350, 283],
        ...,
        [350, 311]],

       [[627, 503],
        ...,
        [627, 528]]], shape=(2, 4, 2), dtype=int16), 'text_det_params': {'limit_side_len': 64, 'limit_type': 'min', 'thresh': 0.3, 'max_side_limit': 4000, 'box_thresh': 0.6, 'unclip_ratio': 1.5}, 'text_type': 'general', 'textline_orientation_angles': array([-1, -1], shape=(2,)), 'text_rec_score_thresh': 0.0, 'rec_texts': ['434', '13/08/2025'], 'rec_scores': array([0.99932653, 0.99843663], shape=(2,)), 'rec_polys': array([[[350, 283],
        ...,
        [350, 311]],

       [[627, 503],
        ...,
        [627, 528]]], shape=(2, 4, 2), dtype=i

dict_keys(['input_path', 'page_index', 'doc_preprocessor_res', 'dt_polys', 'model_settings', 'text_det_params', 'text_type', 'text_rec_score_thresh', 'rec_texts', 'rec_scores', 'rec_polys', 'vis_fonts', 'textline_orientation_angles', 'rec_boxes'])
inputPath: C:\Users\Adm\Desktop\pytest\IMG_0013.JPG
{'limit_side_len': 64, 'limit_type': 'min', 'thresh': 0.3, 'max_side_limit': 4000, 'box_thresh': 0.6, 'unclip_ratio': 1.5}
['434', '13/08/2025'] [0.9993265271186829, 0.9984366297721863]
scan filePath: C:\Users\Adm\Desktop\pytest\IMG_0014.JPG


[32m{'res': {'input_path': 'C:\\Users\\Adm\\Desktop\\pytest\\IMG_0014.JPG', 'page_index': None, 'model_settings': {'use_doc_preprocessor': True, 'use_textline_orientation': False}, 'doc_preprocessor_res': {'input_path': None, 'page_index': None, 'model_settings': {'use_doc_orientation_classify': False, 'use_doc_unwarping': False}, 'angle': -1}, 'dt_polys': array([[[  0, 398],
        ...,
        [  1, 409]],

       ...,

       [[297, 534],
        ...,
        [297, 552]]], shape=(3, 4, 2), dtype=int16), 'text_det_params': {'limit_side_len': 64, 'limit_type': 'min', 'thresh': 0.3, 'max_side_limit': 4000, 'box_thresh': 0.6, 'unclip_ratio': 1.5}, 'text_type': 'general', 'textline_orientation_angles': array([-1, ..., -1], shape=(3,)), 'text_rec_score_thresh': 0.0, 'rec_texts': ['HILTECH', '腰同心行出安全', '13/08/2025'], 'rec_scores': array([0.88492984, ..., 0.99720889], shape=(3,)), 'rec_polys': array([[[  0, 398],
        ...,
        [  1, 409]],

       ...,

       [[297, 534],
        

dict_keys(['input_path', 'page_index', 'doc_preprocessor_res', 'dt_polys', 'model_settings', 'text_det_params', 'text_type', 'text_rec_score_thresh', 'rec_texts', 'rec_scores', 'rec_polys', 'vis_fonts', 'textline_orientation_angles', 'rec_boxes'])
inputPath: C:\Users\Adm\Desktop\pytest\IMG_0014.JPG
{'limit_side_len': 64, 'limit_type': 'min', 'thresh': 0.3, 'max_side_limit': 4000, 'box_thresh': 0.6, 'unclip_ratio': 1.5}
['HILTECH', '腰同心行出安全', '13/08/2025'] [0.8849298357963562, 0.8406506180763245, 0.9972088932991028]
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
Done
