In [None]:
import os
import json

from IPython.display import Image, display
# from paddleocr import PaddleOCR,draw_ocr
from tqdm import tqdm
import re
import math

import cv2
from matplotlib import pyplot as plt
import numpy as np
from sklearn.cluster import KMeans
from collections import Counter
# ocr = PaddleOCR(use_angle_cls=True, lang='en', use_gpu=False, ocr_version="PP-OCRv3", enable_mkldnn=True) # need to run only once to download and load model into memory

## Pie Table Extraction
In this section, we develop code to extract the underlying data table from a pie chart image.
### Load the Dataset

In [None]:
# Dataset locations. The commented-out values are an alternative location of the same data.
# data_path = '/dvmm-filer2/projects/mingyang/semafor/chart_table/data/pie/annotations'
# image_path = '/dvmm-filer2/projects/mingyang/semafor/chart_table/data/pie/images/test2019'
data_path = '/Users/kevinz/Desktop/semafor/chart_table/data_download/pie/annotations'
image_path = '/Users/kevinz/Desktop/semafor/chart_table/data_download/pie/images/test2019'
test_data_path = f"{data_path}/instancesPie(1008)_test2019.json"

# Parse the whole annotation file into memory; later cells index into it.
with open(test_data_path, 'r') as annotation_file:
    test_annotation = json.load(annotation_file)
    

In [None]:
# Show the top-level sections present in the annotation file.
print(test_annotation.keys())

In [None]:
# Container type of each annotation section.
for section in ('images', 'annotations', 'categories'):
    print(type(test_annotation[section]))

# Number of images and number of annotations.
for section in ('images', 'annotations'):
    print(len(test_annotation[section]))

In [None]:
# Peek at the first record of every section to see its schema.
for section in ('images', 'annotations', 'categories'):
    print(test_annotation[section][0])

In [None]:
#Line Annotation
#good sample: [21, 22, 23]
# Pick one image of the test split to walk through the pipeline.
sample_info = test_annotation['images'][24]
sample_id = sample_info['id']

#Display the image
sample_img_path = '/'.join([image_path, sample_info['file_name']])
display(Image(filename=sample_img_path))

#Map Annotations to Image ID
# Group annotations by the image they belong to. Appending in place keeps this
# linear in the number of annotations (rebuilding the list with `+ [x]` on every
# step copies the whole per-image list each time).
AnnotationMap = {}
for x in test_annotation['annotations']:
    img_id = x['image_id']
    AnnotationMap.setdefault(img_id, []).append(x)

# All annotations of the selected image; raises KeyError if the image has none.
sample_annotation = AnnotationMap[sample_id]
print(sample_annotation)


### Detect Text Elements of the Image
Extract the text elements from the pie chart image, using the OCR results stored for that image.

1. **Extract Legend**
    - Disregard numerical values (text containing a percentage such as `25%` or `12.5%`).
    - Disregard bounding boxes that are not vertically or horizontally aligned with any other bounding box.
2. **Extract Title**
    - The detected text that is closest to the top of the image.

In [None]:
# Load the stored OCR output for the selected image (one JSON file per image id).
ocr_dir = f"/Users/kevinz/Desktop/semafor/chart_table/data_download/pie/ocr_results/test2019/{sample_id}.json"
with open(ocr_dir, "r") as ocr_file:
    ocr_texts = json.load(ocr_file)


In [None]:
# Dump the raw OCR entries for the sample image.
print(ocr_texts)

In [None]:
import re
def hv_alignment(a, b, align_thred=5):
    """Return True when two text boxes share (approximately) one edge coordinate.

    Each box is indexed as ``box[corner][axis]``. Corner 0 and corner 2 are
    used as the two opposite corners of the box (presumably top-left and
    bottom-right -- TODO confirm against the OCR output format).

    The boxes count as aligned when at least one of these differs by less
    than ``align_thred``:
      * x of corner 0  (left edges line up   -> stacked vertically)
      * y of corner 0  (top edges line up    -> laid out horizontally)
      * x of corner 2  (right edges line up)
      * y of corner 2  (bottom edges line up)

    Parameters:
        a, b: boxes given as sequences of (x, y) corner points.
        align_thred: maximum coordinate difference still treated as aligned.

    Returns:
        bool: True if any of the four edge coordinates matches.
    """
    align_diff = [
        abs(a[0][0] - b[0][0]),
        abs(a[0][1] - b[0][1]),
        # Compare the same corner on both boxes. The previous code read b[1]
        # here, which only gives the same answer for perfectly axis-aligned boxes.
        abs(a[2][0] - b[2][0]),
        abs(a[2][1] - b[2][1]),
    ]
    return bool(min(align_diff) < align_thred)


#Extract Legend
def extract_legend(ocr_texts):
    """Pick the OCR entries that most likely form the chart legend.

    Parameters:
        ocr_texts: iterable of OCR entries. Each entry is indexed as
            ``entry[0]`` -> bounding box (passed to ``hv_alignment``) and
            ``entry[1][0]`` -> recognised text string.

    Returns:
        list: the entries of the largest group of mutually aligned,
        non-percentage texts, in their original OCR order. Empty when no two
        candidates are aligned.
    """
    # Raw string: "\d" in a normal string literal is an invalid escape sequence.
    numerical_pattern = re.compile(r"(\d+(\.\d+)?%)")

    #Step 1, Get rid of Numerical Value
    legend_candidates = [
        candidate for candidate in ocr_texts
        if not numerical_pattern.search(candidate[1][0])
    ]

    #Step 2, Group Text based on their Vertical or Horizontal Alignment.
    # align_map[i] lists the later candidates (index > i) aligned with candidate i.
    align_map = {}
    for i_a, c_a in enumerate(legend_candidates):
        align_map[i_a] = [
            i_b
            for i_b, c_b in enumerate(legend_candidates[i_a + 1:], start=i_a + 1)
            if hv_alignment(c_a[0], c_b[0])
        ]

    # One group per candidate that has aligned partners: the candidate itself,
    # its partners, and the partners of those partners (one extra hop only).
    # Groups built from different seeds may overlap; only the largest is used.
    legend_groups = []
    for k, v in align_map.items():
        if not v:
            continue
        members = set(v) | {k}
        for x in v:
            members.update(align_map[x])
        # Sorted so the returned legend order is deterministic (OCR order).
        legend_groups.append(sorted(members))

    if not legend_groups:
        return []

    #Get the final legends
    #Pick the one with the maximum elements or maybe the group that has the mapped elements as the gt sectors
    # On ties the last largest group wins, matching a stable sort-by-length
    # followed by taking the final element.
    best_group = legend_groups[0]
    for group in legend_groups[1:]:
        if len(group) >= len(best_group):
            best_group = group
    return [legend_candidates[x] for x in best_group]

# Run legend extraction on the sample image's OCR entries and show the result.
detected_legends = extract_legend(ocr_texts)
print(detected_legends)

### Read Data Values
In this step, we compute the central angle of each sector and use it to estimate the fraction of the pie that the sector covers.

In [None]:
import math

def determinant(v1, v2):
    """Return the 2-D cross product (determinant) of v1 and v2.

    Only the first two components of each vector are read.
    """
    x1, y1 = v1[0], v1[1]
    x2, y2 = v2[0], v2[1]
    return x1 * y2 - y1 * x2

def dotproduct(v1, v2):
    """Return the dot product of v1 and v2 (pairs are formed with zip,
    so extra components of the longer vector are ignored)."""
    products = [a * b for a, b in zip(v1, v2)]
    return sum(products)

def length(v):
    """Return the Euclidean norm of v."""
    squared_norm = dotproduct(v, v)
    return math.sqrt(squared_norm)

def angle(v1, v2):
    """Return the signed angle from v1 to v2 in radians, in (-pi, pi].

    The sign follows the determinant: positive when v2 is counter-clockwise
    from v1 in a y-up coordinate system.
    """
    cross = determinant(v1, v2)
    dot = dotproduct(v1, v2)
    return math.atan2(cross, dot)

def compute_unit(p1, p2):
    """Return the unit vector pointing from p2 to p1, with the y axis flipped.

    The y component is negated, which turns image coordinates (y grows
    downwards) into a y-up orientation. Raises ZeroDivisionError when the two
    points coincide.
    """
    dx = p1[0] - p2[0]
    dy = -(p1[1] - p2[1])
    norm = math.sqrt(dx ** 2 + dy ** 2)
    return [dx / norm, dy / norm]

def compute_distance(p1, p2):
    """Return the SQUARED Euclidean distance between p1 and p2.

    No square root is taken here; callers that need the actual distance
    apply math.sqrt to the result.
    """
    dx = p1[0] - p2[0]
    dy = p1[1] - p2[1]
    return dx ** 2 + dy ** 2
    
# Estimate each sector's share of the pie from its central angle and store it
# on the annotation under 'value'.
# (A wrapper `estimate_value(sample_annotation)` was sketched but never enabled.)
angles = []
for sector in sample_annotation:
    sector_bbox = sector['bbox']
    assert len(sector_bbox) == 6

    # Layout read from the bbox: [p1_x, p1_y, p2_x, p2_y, center_x, center_y].
    # p1 / p2 are presumably the two end points of the sector's arc -- TODO confirm.
    sector_p1 = [sector_bbox[0], sector_bbox[1]]
    sector_p2 = [sector_bbox[2], sector_bbox[3]]
    center = [sector_bbox[4], sector_bbox[5]]

    # Unit vectors from the center towards each point (y flipped to y-up).
    v1 = compute_unit(sector_p1, center)
    v2 = compute_unit(sector_p2, center)

    # Signed angle from v2 to v1, folded into [0, 2*pi).
    current_angle = angle(v2, v1)
    if current_angle < 0:
        current_angle += 2 * math.pi

    # Fraction of the full circle covered by this sector.
    value = current_angle / (2 * math.pi)
    sector['value'] = value
    print(value)

### Map the Legend or the Column Text
In this final step, we map the legend text (or other detected text) to each sector by matching the sector's color against the pixels inside each legend text box.

In [None]:
import cv2
from matplotlib import pyplot as plt
import numpy as np
from sklearn.cluster import KMeans
from collections import Counter

def get_dominant_color(image, k=4, image_processing_size = None):
    """
    Cluster the pixels of an image with k-means and return the cluster
    centroids, most populated cluster first.

    The first element of the result is therefore the dominant color of the
    image; the remaining elements are the other cluster centers in decreasing
    order of pixel count.

    Parameters:
        image: array that is reshaped to (height * width, 3), i.e. it must
            hold exactly three channel values per pixel.
        k: number of k-means clusters.
        image_processing_size: optional size passed to cv2.resize as its
            ``dsize`` argument; working on a smaller image speeds up the
            clustering.

    Returns:
        list of k arrays, each holding the 3 channel values of one centroid.
        KMeans is created without a fixed random_state, so values can vary
        slightly between runs.

    >>> get_dominant_color(my_image, k=4, image_processing_size = (25, 25))[0]
    array([56.2423442, 34.0834233, 70.1234123])
    """
    #resize image if new dims provided
    if image_processing_size is not None:
        image = cv2.resize(image, image_processing_size,
                            interpolation = cv2.INTER_AREA)

    #reshape the image to be a list of pixels
    pixels = image.reshape((image.shape[0] * image.shape[1], 3))

    #cluster and assign labels to the pixels
    clt = KMeans(n_clusters = k)
    labels = clt.fit_predict(pixels)

    #count labels to find most popular
    # tolist() turns the labels into plain ints so they can be looked up by index.
    label_counts = Counter(labels.tolist())

    #order centroids by how many pixels were assigned to them (largest first)
    centers = clt.cluster_centers_
    ranked = sorted(range(len(centers)),
                    key=lambda idx: label_counts[idx],
                    reverse=True)
    return [centers[idx] for idx in ranked]

# Crop the image region under every detected legend text box.
# A guard "only when legends exist and their count equals the sector count"
# was considered but is currently disabled:
#if detected_legends and len(detected_legends) == len(sample_annotation):
sample_image = cv2.cvtColor(cv2.imread(sample_img_path), cv2.COLOR_BGR2RGB)

legends = {}
for i, dl in enumerate(detected_legends):
    bbox, text = dl[0], dl[1]
    # Corner 0 and corner 2 of the box are used as the crop's two opposite corners.
    start_point = (int(bbox[0][0]), int(bbox[0][1]))
    end_point = (int(bbox[2][0]), int(bbox[2][1]))
    cropped_image = sample_image[start_point[1]:end_point[1],
                                 start_point[0]:end_point[0]]
    legends[i] = {'bbox': bbox, 'text': text, 'cropped_image': cropped_image}


#Find the Color for Each Sector, locate a center point.
# For every sector: sample the color at a point halfway along the sector's
# bisector, then assign the first legend whose cropped region contains pixels
# of (roughly) that color.

# The HSV image does not depend on the sector, so convert it once.
sample_image_hsv = cv2.cvtColor(sample_image, cv2.COLOR_RGB2HSV)

for sector in sample_annotation:
    sector_bbox = sector['bbox']
    #Compute the angle again
    center = [int(sector_bbox[4]),int(sector_bbox[5])]
    sector_p1 = [int(sector_bbox[0]),int(sector_bbox[1])]
    sector_p2 = [int(sector_bbox[2]),int(sector_bbox[3])]

    v1 = compute_unit(sector_p1, center)
    v2 = compute_unit(sector_p2, center)

    current_angle = angle(v2,v1)
    if current_angle < 0:
        current_angle = 2*math.pi+current_angle

    #rotate v2 counter-clockwise by 1/2 current_angle to get the bisector direction
    half_angle = 0.5*current_angle
    rotated_v2_x = math.cos(half_angle)*v2[0] - math.sin(half_angle)*v2[1]
    rotated_v2_y = math.sin(half_angle)*v2[0] + math.cos(half_angle)*v2[1]

    # compute_distance returns the squared distance, hence the sqrt.
    radius = math.sqrt(compute_distance(sector_p1, center))
    #Remember for Pixel, the y is increased from top to bottom.
    pick_point = [int(center[0] + rotated_v2_x*0.5*radius), int(center[1] - rotated_v2_y*0.5*radius)]

    #Get the color of the pick_point
    pick_point_color = sample_image_hsv[pick_point[1],pick_point[0]]

    # Work on plain Python ints: arithmetic on uint8 scalars can wrap around
    # (e.g. 3 - 5 -> 254) depending on the NumPy promotion rules in effect,
    # which silently turns the clamps below into no-ops.
    hue, sat, val = (int(channel) for channel in pick_point_color)
    # NOTE(review): hue is circular; colors next to the 0/179 seam (reds) are
    # only matched on one side of the seam.
    pp_color_min = np.array([max(0, hue-5), max(50, sat-10), max(50, val-10)], np.uint8)
    pp_color_max = np.array([min(179, hue+5), min(255, sat+10), min(255, val+10)], np.uint8)

    # 'column_value' is the key that is written on a match and read when the
    # results are printed, so that is the key that needs a default.
    # 'mapped_column_value' is kept for anything still reading the old key.
    sector['mapped_column_value'] = None
    sector['column_value'] = None
    for k, cl in legends.items():
        cl_image = cl["cropped_image"]
        if cl_image.size == 0:
            # Degenerate (zero-area) crop: nothing to compare against.
            continue
        cl_hsv_image = cv2.cvtColor(cl_image, cv2.COLOR_RGB2HSV)
        frame_threshed = cv2.inRange(cl_hsv_image, pp_color_min, pp_color_max)

        #might satisfy two saturation

        # Accept the legend when more than 1% of its crop has the sector color.
        if np.count_nonzero(frame_threshed)/frame_threshed.size > 0.01:
            #Find a Map
            sector['column_value'] = cl['text'][0]
            plt.imshow(cl_image)
            plt.show()
            plt.imshow(frame_threshed)
            plt.show()
            break

        
        
                             
        

                             
    
#plt.imshow('Binary',thresh_img)


In [None]:
# Show the chart next to the extracted table: one "<legend text>: <share>" line per sector.
display(Image(filename=sample_img_path))
for result in sample_annotation:
    # A sector that could not be matched to a legend has no 'column_value'
    # entry; print None for it instead of failing with KeyError.
    print(f"{result.get('column_value')}: {result['value']}")