In [22]:
import numpy as np
import pandas as pd
import os
import re
from collections import Counter
from rapidfuzz import process, fuzz

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from underthesea import word_tokenize, pos_tag

import cv2
import requests
import xlsxwriter
import openpyxl


def ShowImage(ImageList, NameList=None, nRows = 1, nCols = 2, WidthSpace = 0.00, HeightSpace = 0.00):
    """Display a list of images on an ``nRows`` x ``nCols`` grid.

    Parameters
    ----------
    ImageList : sequence of array-like images
        Images to draw, placed in grid order. Anything exposing ``.shape``
        works; images with fewer than three dimensions are rendered with
        the gray colormap.
    NameList : sequence of str, optional
        One title per image. When omitted, each image is titled
        ``"Image <index>"``.
    nRows, nCols : int
        Grid dimensions. The grid must have at least ``len(ImageList)``
        cells, otherwise indexing the grid raises ``IndexError``.
    WidthSpace, HeightSpace : float
        Horizontal / vertical spacing between grid cells (GridSpec
        ``wspace`` / ``hspace``).

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    from matplotlib import pyplot as plt
    import matplotlib.gridspec as gridspec

    gs = gridspec.GridSpec(nRows, nCols)
    gs.update(wspace=WidthSpace, hspace=HeightSpace) # set the spacing between axes.
    fig = plt.figure(figsize=(40,40))

    for i, image in enumerate(ImageList):
        # Draw directly on the GridSpec axes. Creating a second axes with
        # plt.subplot(nRows, nCols, i+1) would draw on a default-spaced grid
        # and the WidthSpace/HeightSpace settings would never be visible.
        ax = fig.add_subplot(gs[i])
        ax.set_aspect('equal')

        if len(image.shape) < 3:
            ax.imshow(image, cmap=plt.cm.gray)
        else:
            ax.imshow(image)

        # Identity check: `== None` is evaluated element-wise when NameList
        # is a numpy array and cannot be used as a plain boolean.
        if NameList is None:
            ax.set_title("Image " + str(i))
        else:
            ax.set_title(NameList[i])
        ax.axis('off')

    plt.show()

# Notebook-wide pandas settings: route DataFrame.plot through plotly and
# never truncate long cell text when a frame is displayed.
pd.set_option("plotting.backend", "plotly")
pd.options.display.max_colwidth = None

In [23]:

# The sample CSV lives in a machine-specific directory: one path is used on
# Windows (os.name == "nt"), the other everywhere else.
file_name = '2k_data_Sep.csv'
file_path = (
    r"G:\My Drive\Work\ICM - GE\1. Dataset\2k_sample"
    if os.name == "nt"
    else '/Users/admin/Library/CloudStorage/GoogleDrive-huypnm@galaxy.com.vn/My Drive/Work/ICM - GE/1. Dataset/2k_sample'
)

# Read the CSV and end the cell with a summary of every column
# (numeric and non-numeric alike) so it is rendered as the cell output.
data_path = os.path.join(file_path, file_name)
data = pd.read_csv(data_path)
data.describe(include='all')

Unnamed: 0,Feedback ID,Lookup Text,Question Text,Feedback Type,Feedback Result,Human Feedback,Lookup Image,Question Image
count,2000.0,2000,2000,2000,2000,2000,2000,2000
unique,,1957,1832,2,3,2,1965,1833
top,,6. Viết tập hợp các số tự nhiên \( x \) thoả mãn mỗi điều kiện sau:\na) \( x \leq 6 \);\nb) \( 35 \leq x \leq 39 \);\nc) \( 216<x \leq 219 \),1.13. Viết thêm các số liền trước và số liền sau của hai số 3532 và 3529 để được sáu số tự nhiên rồi sắp xếp sáu số đó theo thứ tự từ bé đến lớn.,NEGATIVE,MATCH,MATCH,https://s3.icankid.io/uploads/question/66ead790-336a-11ed-afcb-abad28552b30,https://s3.stag.icankids.com.vn/uploads/question/4cd6c4f0-9e18-11ec-bad9-1d3098bb5545
freq,,6,8,1000,877,1021,6,8
mean,7161.7045,,,,,,,
std,1589.314896,,,,,,,
min,4416.0,,,,,,,
25%,5811.0,,,,,,,
50%,7125.5,,,,,,,
75%,8492.75,,,,,,,


In [24]:
# Keep only the rows whose 'Human Feedback' is MATCH and renumber them from 0,
# so later cells can address rows with data.loc[i] for i in range(len(data)).
nm = data.loc[data['Human Feedback'] == 'MATCH'].reset_index(drop=True)
data = nm

In [25]:
# ===================================================================
def crawls_image(url, path , width:int=1100, img_name='temp_1', timeout:float=30):
    """Download an image, resize it to a fixed width, and save it as a JPEG.

    Parameters
    ----------
    url : str
        Address of the image to download.
    path : str
        Directory in which ``<img_name>.jpg`` is written.
    width : int
        Width in pixels of the resized image. The height is scaled by the
        same factor and then capped at 900 pixels.
    img_name : str
        File name (without extension) of the saved image.
    timeout : float
        Seconds ``requests`` waits for the server before giving up.

    Returns
    -------
    (resized, image_path)
        The resized image array and the path of the file it was saved to.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the downloaded content cannot be decoded as an image.
    """
    image_path = os.path.join(path, f'{img_name}.jpg')

    # Request first and only then open the output file, so a failed download
    # never leaves an error page saved under a .jpg name. The context manager
    # releases the streamed connection even if writing fails.
    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()

        with open(image_path, 'wb') as handle:
            for block in response.iter_content(1024):
                if not block:
                    break

                handle.write(block)

    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread signals an unreadable file by returning None.
        raise ValueError(f'Content downloaded from {url!r} is not a readable image: {image_path}')
    (h, w) = img.shape[:2]

    r = width / float(w)
    # Height is capped at 900 (this can change the aspect ratio of tall
    # images) and kept at >= 1 so cv2.resize always gets a valid size.
    dim = (width, max(1, min(900, int(h * r))))
    resized = cv2.resize(img, dim, interpolation = cv2.INTER_AREA)
    cv2.imwrite(image_path, resized)

    return resized, image_path

# ===================================================================
def request_API(payload, post_url='https://data-checking.herokuapp.com/data-checking'):
    """POST ``payload`` under the ``image`` key to ``post_url``.

    Returns the body of the response as text. The HTTP status is not checked.
    """
    form_fields = {'image': payload}
    response = requests.post(post_url, data=form_fields)
    return response.text

In [26]:
# Directory that receives the downloaded images; one path is used on
# Windows (os.name == 'nt'), the other everywhere else.
image_path = (
    r"G:\My Drive\Work\ICM - GE\1. Dataset\temp_images"
    if os.name == 'nt'
    else '/Users/admin/Library/CloudStorage/GoogleDrive-huypnm@galaxy.com.vn/My Drive/Work/ICM - GE/1. Dataset/temp_images'
)

In [27]:
excel_name = 'MATCH_data.xlsx'
excel_path = os.path.join(file_path, excel_name)

# Build the review workbook only when it is not on disk yet; an existing
# file is left untouched.
if not os.path.isfile(excel_path):
    workbook = xlsxwriter.Workbook(excel_path)

    # Shared cell style: centred horizontally and vertically, text wrapped.
    cell_format = workbook.add_format()
    cell_format.set_align('center')
    cell_format.set_align('vcenter')
    cell_format.set_text_wrap()

    Header = ['Lookup image', 'Question image', 'not_match', 'Feedback ID', 'Lookup URL', 'Question URL']

    image_size_display = 400
    start_row = 0
    start_column = 1

    sheet_name = 'MATCH'
    worksheet = workbook.add_worksheet(sheet_name[:30])
    worksheet.write_row(start_row, start_column, Header, cell_format)

    # Columns B and C hold the two pictures and share one width.
    image_column_width = int(15 * (image_size_display / 75))
    for column_range in ("B:B", "C:C"):
        worksheet.set_column(column_range, image_column_width)

    row_height = int(90 * (image_size_display / 100))
    image_options = {'x_scale': 0.5, 'y_scale': 0.5, 'x_offset': 5, 'y_offset': 5, 'object_position': 1}

    # One worksheet row per sample: two downloaded pictures, then four values.
    for i in range(len(data)):
        sample = data.loc[i]
        start_row += 1
        worksheet.set_row(start_row, row_height)

        feedback_id = str(sample['Feedback ID'])
        lookup_content, lookup_path = crawls_image(sample['Lookup Image'], image_path, img_name=feedback_id + "_1")
        question_content, question_path = crawls_image(sample['Question Image'], image_path, img_name=feedback_id + "_2")

        # Pictures go in worksheet columns 1 and 2 (B and C).
        worksheet.insert_image(start_row, 1, lookup_path, dict(image_options))
        worksheet.insert_image(start_row, 2, question_path, dict(image_options))

        # Remaining header fields, in order: not_match (written as the
        # string '0'), Feedback ID, Lookup URL, Question URL.
        row_values = ['0', sample['Feedback ID'], sample['Lookup Image'], sample['Question Image']]
        for column, value in enumerate(row_values, start=3):
            worksheet.write(start_row, column, value, cell_format)

    # Turn the written range into a table, freeze the header row, hide the
    # gridlines, and write the file to disk.
    header = [{'header': head} for head in Header]
    worksheet.add_table(0, 1, start_row, len(Header), {'columns': header})
    worksheet.freeze_panes(1, 0)
    worksheet.hide_gridlines(2)
    workbook.close()

In [None]:
# Open the workbook to list its sheet names.
open_workbook = openpyxl.load_workbook(excel_path)
sheets = open_workbook.sheetnames

# Load the first sheet into a DataFrame through the openpyxl engine.
df_xl = pd.read_excel(excel_path, sheet_name=sheets[0], engine="openpyxl", index_col=False)

In [14]:
# Count how often each value occurs in the 'lookup_in_question' column of df_xl.
# NOTE(review): the workbook-building cell in this notebook writes a
# 'not_match' header and no 'lookup_in_question' column, and this cell's
# execution count (14) is out of sequence. It presumably ran against an
# older copy of the file — confirm which column is intended.
df_xl['lookup_in_question'].value_counts()

0    1021
Name: lookup_in_question, dtype: int64