## Import libs

In [1]:
import numpy as np
import pandas as pd
import os
import re
from collections import Counter
from thefuzz import fuzz

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff

from sklearn.metrics.pairwise import cosine_similarity
from underthesea import word_tokenize

from sklearn.metrics import confusion_matrix

from latex2sympy2 import latex2sympy

### Import dataset

In [3]:
# Location of the 2k-sample feedback CSV.
# An explicit ICM_DATA_PATH environment variable takes precedence; otherwise the
# original per-OS default is used. No default was ever configured for Windows,
# so there the variable is mandatory.
if os.name == "nt":
    default_data_path = None
else:
    default_data_path = '/Users/admin/Library/CloudStorage/GoogleDrive-huypnm@galaxy.com.vn/My Drive/Work/ICM- GE/1. Dataset/2k_sample/2k_data_Sep.csv'

data_path = os.environ.get("ICM_DATA_PATH", default_data_path)
if data_path is None:
    # Fail with an actionable message instead of letting read_csv choke on None.
    raise FileNotFoundError(
        "No dataset path configured: set the ICM_DATA_PATH environment variable "
        "to the location of 2k_data_Sep.csv."
    )

data = pd.read_csv(data_path)
data.describe(include='all')

Unnamed: 0,Feedback ID,Lookup Text,Question Text,Feedback Type,Feedback Result,Human Feedback,Lookup Image,Question Image
count,2000.0,2000,2000,2000,2000,2000,2000,2000
unique,,1957,1832,2,3,2,1965,1833
top,,6. Viết tập hợp các số tự nhiên \( x \) thoả m...,1.13. Viết thêm các số liền trước và số liền s...,NEGATIVE,MATCH,MATCH,https://s3.icankid.io/uploads/question/66ead79...,https://s3.stag.icankids.com.vn/uploads/questi...
freq,,6,8,1000,877,1021,6,8
mean,7161.7045,,,,,,,
std,1589.314896,,,,,,,
min,4416.0,,,,,,,
25%,5811.0,,,,,,,
50%,7125.5,,,,,,,
75%,8492.75,,,,,,,


In [5]:
# Preview the first five rows of the freshly loaded dataset.
data.head(5)

Unnamed: 0,Feedback ID,Lookup Text,Question Text,Feedback Type,Feedback Result,Human Feedback,Lookup Image,Question Image
0,4416.0,b) \( \left(\frac{-4}{15}-\frac{18}{19}\right)...,a) \( \frac{2}{3}+\frac{-1}{3}+\frac{7}{15} \)...,NEGATIVE,MATCH,NOT MATCH,https://s3.icankid.io/uploads/question/9a7aa76...,https://s3.icankid.io/uploads/question/41bf56d...
1,4419.0,Đề số \( 1 . \)\n1. Tính\nb) \( \sqrt{25} \)\n...,b) \( \sqrt{25} \)\nc) \( \sqrt{-36} \)\na) \(...,NEGATIVE,MATCH,MATCH,https://s3.icankid.io/uploads/question/0398020...,https://s3.icankid.io/uploads/question/5a47b89...
2,4420.0,1. Tính: \( \sqrt{\frac{25}{4}}+\left(\sqrt{\f...,Đề số \( 3 . \)\n1. Tính: \( \sqrt{\frac{25}{4...,POSITIVE,MATCH,MATCH,https://s3.icankid.io/uploads/question/a81b075...,https://s3.icankid.io/uploads/question/2d3358a...
3,4423.0,Câu 49: Cho hàm số \( y=f(x) \) có đạo hàm là ...,Câu 49. Cho hàm số \( y=f(x) \) có đạo hàm \( ...,NEGATIVE,NOT_MATCH,NOT MATCH,https://s3.icankid.io/uploads/question/a981260...,https://s3.icankid.io/uploads/question/aaa4976...
4,4425.0,b. Giải phương trình: \( 2 \sin ^{2} x+\sqrt{3...,1) \( \sin ^{2} 2 x-(2+\sqrt{3}) \sin 2 x+\sqr...,NEGATIVE,NOT_MATCH,NOT MATCH,https://s3.icankid.io/uploads/question/caa519f...,https://s3.icankid.io/uploads/question/7ea6f5f...


## TODO
[x] Separate formula and text

[x] Tokenize text

[x] Apply methods:

- [] Cosine similarity (might need full Q-A pair to replicate product line)

[x] Handle formula

[] Merge formula and text in form of ...

In [123]:
def handle_formula(t:str):
    r"""
        Split a string into its plain text and its LaTeX math expressions.

        A math expression is a span delimited by ``\(`` and ``\)``. Only spans
        that contain at least one of the operators + - * / = < > are treated
        as formulas; other delimited spans (e.g. a bare number) stay in the text.

        Args:
            t (str): a string that may mix text and inline LaTeX

        Returns:
            t (str): the input with every occurrence of each extracted formula removed
            formulas (list): the extracted formulas, delimiters included, in order of appearance
    """
    math_operators = ("+", "-", "*", "/", "=", "<", ">")

    # Non-greedy so that each opener pairs with the nearest following closer;
    # DOTALL because a formula may span several lines. An opener without a
    # closer simply yields no match.
    delimited_spans = re.findall(r"\\\(.*?\\\)", t, flags=re.DOTALL)
    formulas = [
        span for span in delimited_spans
        if any(op in span for op in math_operators)
    ]

    # Removing a formula drops all of its occurrences from the text at once.
    for formula in formulas:
        t = t.replace(formula, "")

    return t, formulas

def retrieve_text_only(s:str):
    """
        Return only the text part of a string mixing formulas and text
        (the first item produced by handle_formula).
    """
    return handle_formula(s)[0]

def retrieve_formula_only(s:str):
    """
        Return only the list of formulas found in a string mixing formulas
        and text (the second item produced by handle_formula).
    """
    return handle_formula(s)[1]

def calculate_ratio(s:str, opt='text'):
    """
        Share of text tokens, or of formulas, within a string.

        Args:
            s (str): string mixing text and formulas
            opt (str): 'text' returns the text share; any other value
                returns the formula share

        Returns:
            float: count / (text token count + formula count + 10e-6)
    """
    stripped_text, math_parts = handle_formula(s)
    n_text = len(tokenize(stripped_text))
    n_math = len(math_parts)

    # The small constant keeps the denominator non-zero when both counts are 0.
    denominator = n_text + n_math + 10e-6
    numerator = n_text if opt == 'text' else n_math
    return numerator / denominator


def text_analyze(s:str):
    """
        Classify a string by how much math it contains.

        Args:
            s (str): string mixing text and formulas

        Returns:
            str: one of
                "Text only"     - handle_formula extracted no formula
                "Math only"     - fewer text tokens than formulas
                "Math and Text" - everything else
    """
    stripped_text, math_parts = handle_formula(s)

    if not math_parts:
        return "Text only"

    # Compare the amount of remaining text against the number of formulas.
    if len(tokenize(stripped_text)) < len(math_parts):
        return 'Math only'

    return "Math and Text"


def create_dict(corpus:list):
    """
        Build a token -> index vocabulary from a corpus.

        Args:
            corpus (list): list of sentence strings

        Returns:
            dict: every distinct token (plus the special "padding" entry)
            mapped to its position in the sorted vocabulary
    """
    # "padding" is reserved for aligning two sequences of different length.
    vocabulary = {"padding"}
    for sentence in corpus:
        vocabulary.update(tokenize(sentence))

    return {word: index for index, word in enumerate(sorted(vocabulary))}

def tokenize(t:str, filters=r"\(\)"):
    """
        Tokenize a string into a list of word tokens.

        Every character listed in `filters` is replaced by a space, then the
        result is passed to `word_tokenize`.

        Args:
            t (str): text to tokenize
            filters (str): characters to blank out before tokenizing; the
                default is the backslash and both parentheses (a raw string is
                used so the backslashes are not parsed as escape sequences)

        Returns:
            list: the tokens returned by `word_tokenize`
    """
    table = str.maketrans(filters, " " * len(filters))
    return word_tokenize(t.translate(table))

def text_to_sequence(t:str, word_dict:dict):
    """
        Convert a string into the list of vocabulary indices of its text tokens.

        Formulas are stripped first (retrieve_text_only), the rest is tokenized.

        Args:
            t (str): string of words, possibly containing formulas
            word_dict (dict): token -> index vocabulary

        Returns:
            list: one entry per token; a token missing from `word_dict`
            yields None (dict.get)
    """
    tokens = tokenize(retrieve_text_only(t))
    return [word_dict.get(token) for token in tokens]

def compare_2_sentences(t_1:str, t_2:str, word_dict: dict, metric="Cosine"):
    """
        Score how similar the text parts of two strings are.

        Args:
            t_1 (str): first string (formulas are stripped before comparing)
            t_2 (str): second string (formulas are stripped before comparing)
            word_dict (dict): token -> index vocabulary containing "padding"
            metric (str): "L" returns `fuzz.ratio` of the two stripped texts;
                any other value returns the cosine similarity of the two
                index sequences

        Returns:
            The `fuzz.ratio` value, or the cosine similarity, or 0 when either
            string has no text token.
    """
    plain_1 = retrieve_text_only(t_1)
    plain_2 = retrieve_text_only(t_2)

    if metric == "L":
        return fuzz.ratio(plain_1, plain_2)

    ids_1 = text_to_sequence(plain_1, word_dict)
    ids_2 = text_to_sequence(plain_2, word_dict)

    # Nothing to compare when one side has no token at all.
    if not ids_1 or not ids_2:
        return 0

    # Right-pad the shorter sequence so both vectors have the same length.
    pad_id = word_dict["padding"]
    width = max(len(ids_1), len(ids_2))
    ids_1 = ids_1 + [pad_id] * (width - len(ids_1))
    ids_2 = ids_2 + [pad_id] * (width - len(ids_2))

    similarity = cosine_similarity(np.array([ids_1]), np.array([ids_2]))
    return similarity[0][0]

def batch_measure_2_sentence(df, s_1:str, s_2:str, word_dict: dict, metric:str):
    """
        Score every row of a frame by comparing two of its text columns.

        Args:
            df: frame (or mapping of columns) holding both text columns
            s_1 (str): name of the first text column
            s_2 (str): name of the second text column
            word_dict (dict): token -> index vocabulary, passed to compare_2_sentences
            metric (str): forwarded to compare_2_sentences ("L" for the fuzz
                ratio, anything else for cosine similarity)

        Returns:
            list: one score per row, in row order
    """
    # Previously the "L" branch left `scores` undefined (UnboundLocalError) and
    # `metric` was never forwarded; compare_2_sentences handles both metrics.
    return [
        compare_2_sentences(x, y, word_dict, metric=metric)
        for x, y in zip(df[s_1], df[s_2])
    ]

def create_cosine_matrix(df:pd.DataFrame, ):
    """
        Placeholder: building a cosine-similarity matrix is not implemented yet.

        The definition previously had no body, which made the whole cell fail
        to parse on a fresh run.

        Args:
            df (pd.DataFrame): input frame (currently unused)

        Raises:
            NotImplementedError: always
    """
    raise NotImplementedError("create_cosine_matrix is not implemented yet")

## Create dictionary

In [125]:
# The vocabulary is built from the text-only part of both question columns
# (all lookup texts first, then all question texts).
total_q = [
    retrieve_text_only(q)
    for column in ("Lookup Text", "Question Text")
    for q in data[column].tolist()
]
dictionary = create_dict(total_q)
print(len(dictionary))

11378


In [126]:
# Vocabulary-index sequence of the text part of each question.
data["token_lookup"] = data["Lookup Text"].apply(text_to_sequence, word_dict=dictionary)
data["token_question"] = data["Question Text"].apply(text_to_sequence, word_dict=dictionary)

# Number of text tokens per question.
data["word_lookup"] = data["token_lookup"].apply(len)
data["word_question"] = data["token_question"].apply(len)

In [127]:
# List of extracted formulas for each side of the pair.
data['formula_lookup'] = data['Lookup Text'].apply(retrieve_formula_only)
data['formula_question'] = data["Question Text"].apply(retrieve_formula_only)

In [128]:
# NOTE(review): the saved output of this cell lists columns (cosine_simi,
# leveshtein, lookup_kind, question_kind, Feedback Result mod) that are only
# assigned in cells further down, so it was presumably captured after running
# cells out of order — confirm with a Restart & Run All.
data.head(5)

Unnamed: 0,Feedback ID,Lookup Text,Question Text,Feedback Type,Feedback Result,Human Feedback,Lookup Image,Question Image,token_lookup,token_question,word_lookup,word_question,cosine_simi,leveshtein,lookup_kind,question_kind,Feedback Result mod,formula_lookup,formula_question
0,4416.0,b) \( \left(\frac{-4}{15}-\frac{18}{19}\right)...,a) \( \frac{2}{3}+\frac{-1}{3}+\frac{7}{15} \)...,NEGATIVE,MATCH,NOT MATCH,https://s3.icankid.io/uploads/question/9a7aa76...,https://s3.icankid.io/uploads/question/41bf56d...,[5284],"[5137, 6200, 5284, 5610, 6492]",1,5,0.991626,0.27,Math and Text,Math and Text,MATCH,[\( \left(\frac{-4}{15}-\frac{18}{19}\right)-\...,"[\( \frac{2}{3}+\frac{-1}{3}+\frac{7}{15} \), ..."
1,4419.0,Đề số \( 1 . \)\n1. Tính\nb) \( \sqrt{25} \)\n...,b) \( \sqrt{25} \)\nc) \( \sqrt{-36} \)\na) \(...,NEGATIVE,MATCH,MATCH,https://s3.icankid.io/uploads/question/0398020...,https://s3.icankid.io/uploads/question/5a47b89...,"[10808, 8938, 1230, 39, 1230, 843, 5284, 8834,...","[5284, 8834, 10365, 1817, 10478, 5610, 5137, 8...",27,21,0.868071,0.85,Math and Text,Math and Text,MATCH,"[\( \sqrt{-36} \), \( \sqrt{(-5)^{2}} \), \( \...","[\( \sqrt{-36} \), \( 0=\frac{2 s}{\partial s}..."
2,4420.0,1. Tính: \( \sqrt{\frac{25}{4}}+\left(\sqrt{\f...,Đề số \( 3 . \)\n1. Tính: \( \sqrt{\frac{25}{4...,POSITIVE,MATCH,MATCH,https://s3.icankid.io/uploads/question/a81b075...,https://s3.icankid.io/uploads/question/2d3358a...,"[1230, 843, 2430]","[10808, 8938, 1868, 39, 1230, 843, 2430]",3,7,0.258122,0.53,Math and Text,Math and Text,MATCH,[\( \sqrt{\frac{25}{4}}+\left(\sqrt{\frac{1}{2...,[\( \sqrt{\frac{25}{4}}+\left(\sqrt{\frac{1}{2...
3,4423.0,Câu 49: Cho hàm số \( y=f(x) \) có đạo hàm là ...,Câu 49. Cho hàm số \( y=f(x) \) có đạo hàm \( ...,NEGATIVE,NOT_MATCH,NOT MATCH,https://s3.icankid.io/uploads/question/a981260...,https://s3.icankid.io/uploads/question/aaa4976...,"[3350, 2125, 2430, 3263, 6964, 6034, 11060, 75...","[3350, 2125, 39, 3263, 6964, 6034, 11060, 1014...",41,38,0.825576,0.91,Math and Text,Math and Text,NOT MATCH,"[\( y=f(x) \), \( f^{\prime}(x)=(x-1)\left(x^{...","[\( y=f(x) \), \( f^{\prime}(x)=(x+1)^{2}(x+3)..."
4,4425.0,b. Giải phương trình: \( 2 \sin ^{2} x+\sqrt{3...,1) \( \sin ^{2} 2 x-(2+\sqrt{3}) \sin 2 x+\sqr...,NEGATIVE,NOT_MATCH,NOT MATCH,https://s3.icankid.io/uploads/question/caa519f...,https://s3.icankid.io/uploads/question/7ea6f5f...,"[5284, 481, 8530, 2430, 39]",[1230],5,1,0.591172,0.08,Math and Text,Math and Text,NOT MATCH,[\( 2 \sin ^{2} x+\sqrt{3} \sin 2 x+1=3(\cos x...,[\( \sin ^{2} 2 x-(2+\sqrt{3}) \sin 2 x+\sqrt{...


## Check
- Summary: average number of word tokens for MATCH vs. NOT MATCH
- Detail: distribution of the number of word tokens for MATCH vs. NOT MATCH

In [129]:
"""
    Summary
"""
# Mean token count of each column, one group of bars per human label.
fig = px.histogram(
    data,
    x='Human Feedback',
    y=['word_lookup', 'word_question'],
    histfunc='avg',
    barmode='group',
).update_layout(title_text="Average number of words token")
fig.show()

In [12]:
"""
    Detail
"""
# Rows labelled MATCH / NOT MATCH by the human reviewers.
m = data.loc[data["Human Feedback"].eq("MATCH")]
nm = data.loc[data["Human Feedback"].eq("NOT MATCH")]

In [16]:
# Token-count distributions for the MATCH rows.
fig = ff.create_distplot(
    [m[column] for column in ("word_lookup", "word_question")],
    group_labels=["word_lookup", "word_question"],
).update_layout(title_text="'Human Feedback' = MATCH")
fig.show()

In [17]:
# Token-count distributions for the NOT MATCH rows.
fig = ff.create_distplot(
    [nm[column] for column in ("word_lookup", "word_question")],
    group_labels=["word_lookup", "word_question"],
).update_layout(title_text="'Human Feedback' = NOT MATCH")
fig.show()

## Try to find two similar questions using the Cosine Similarity method

In [7]:

# Cosine similarity of the token-index sequences of each lookup/question pair.
data["cosine_simi"] = [
    compare_2_sentences(lookup, question, dictionary)
    for lookup, question in zip(data["Lookup Text"], data["Question Text"])
]
# fuzz ratio of the same pair, divided by 100.
data["leveshtein"] = [
    compare_2_sentences(lookup, question, dictionary, metric="L") / 100
    for lookup, question in zip(data["Lookup Text"], data["Question Text"])
]

# Re-split so both subsets carry the two new score columns.
m = data.loc[data["Human Feedback"].eq("MATCH")]
nm = data.loc[data["Human Feedback"].eq("NOT MATCH")]

In [35]:
# Score distributions for the MATCH rows, with a fitted normal curve.
fig = ff.create_distplot(
    [m[column] for column in ("cosine_simi", "leveshtein")],
    group_labels=["cosine_simi", "leveshtein"],
    bin_size=0.05,
    histnorm='',
    curve_type='normal',
    show_curve=True,
).update_layout(title_text="'Human Feedback' = MATCH")
fig.show()

In [36]:
# Score distributions for the NOT MATCH rows.
fig = ff.create_distplot(
    [nm[column] for column in ("cosine_simi", "leveshtein")],
    group_labels=["cosine_simi", "leveshtein"],
    bin_size=0.05,
    histnorm='',
).update_layout(title_text="'Human Feedback' = NOT MATCH")
fig.show()

## Check amount of Text only and Math & Text
- Count check

In [8]:
# Classify each side as "Text only", "Math only" or "Math and Text".
data["lookup_kind"] = data["Lookup Text"].apply(text_analyze)
data["question_kind"] = data["Question Text"].apply(text_analyze)

In [22]:
# List the distinct kinds produced by text_analyze for the lookup column.
data["lookup_kind"].unique()

array(['Math and Text', 'Text only', 'Math only'], dtype=object)

In [9]:
# Re-split so both subsets carry the kind columns.
m = data.loc[data["Human Feedback"].eq("MATCH")]
nm = data.loc[data["Human Feedback"].eq("NOT MATCH")]

In [24]:
"""
    Summary
"""
# Row count per kind, lookup and question side by side.
fig = px.histogram(
    data,
    x=["lookup_kind", "question_kind"],
    barmode='group',
).update_layout(title_text="Total number of each kind")
fig.show()

In [None]:
# Eyeball a few "Math and Text" lookups with their full, untruncated text.
text_n_math = data.loc[data["lookup_kind"].eq("Math and Text")]
# NOTE: this changes the display option for every later cell as well, and the
# sample below is unseeded, so the rows shown differ from run to run.
pd.set_option('display.max_colwidth', None)
display(text_n_math[["Lookup Text", "Question Text"]].sample(5))

### Standardize value string

In [10]:
# Fold both "NOT_MATCH" and "SKIP" into the "NOT MATCH" spelling used by the
# human labels; every other value is kept as is.
data['Feedback Result mod'] = data['Feedback Result'].mask(
    data['Feedback Result'].isin(["NOT_MATCH", "SKIP"]),
    "NOT MATCH",
)
data['Feedback Result mod'].value_counts()

NOT MATCH    1123
MATCH         877
Name: Feedback Result mod, dtype: int64

In [11]:
# Row-normalised share (in percent) of each search-engine feedback type per
# human label. Scale first and round last, so the stored percentages carry no
# floating-point residue from the multiplication.
HF_cross_SE = (
    pd.crosstab(data["Human Feedback"], data["Feedback Type"], normalize='index') * 100
).round(2)
HF_cross_SE

Feedback Type,NEGATIVE,POSITIVE
Human Feedback,Unnamed: 1_level_1,Unnamed: 2_level_1
MATCH,17.34,82.66
NOT MATCH,84.07,15.93


In [12]:
# Stacked bars of the percentages above, value printed on each bar.
fig = px.bar(HF_cross_SE, text_auto=True).update_layout(
    title_text="Human Feedback cross Search Engine(Feedback Type)"
)
fig.show()

In [56]:
# Row-normalised share (in percent) of each standardised user feedback per
# human label. Scale first and round last, so the stored percentages carry no
# floating-point residue from the multiplication.
HF_cross_UF = (
    pd.crosstab(data["Human Feedback"], data["Feedback Result mod"], normalize='index') * 100
).round(2)

In [62]:
# Stacked bars of the human-label vs. user-feedback percentages.
fig = px.bar(HF_cross_UF, text_auto=True).update_layout(
    title_text="Human Feedback cross User Feedback"
)
fig.show()

In [54]:
# Row-normalised share (in percent) of each standardised user feedback per
# search-engine feedback type. Scale first and round last, so the stored
# percentages carry no floating-point residue from the multiplication.
SE_cross_UF = (
    pd.crosstab(data["Feedback Type"], data["Feedback Result mod"], normalize='index') * 100
).round(2)
SE_cross_UF

Feedback Result mod,MATCH,NOT MATCH
Feedback Type,Unnamed: 1_level_1,Unnamed: 2_level_1
NEGATIVE,33.3,66.7
POSITIVE,54.4,45.6


In [61]:
# Stacked bars of the search-engine vs. user-feedback percentages.
# The chart title previously read "Seach Engine".
fig = px.bar(SE_cross_UF, text_auto=True)
fig.update_layout(title_text = "Search Engine cross User Feedback")
fig.show()

In [155]:
# Text share and formula share of each side of the pair (see calculate_ratio;
# any opt other than 'text' selects the formula share).
data["text_lookup_ratio"] = data["Lookup Text"].apply(calculate_ratio)
data["formula_lookup_ratio"] = data["Lookup Text"].apply(calculate_ratio, opt='f')
data["text_question_ratio"] = data["Question Text"].apply(calculate_ratio)
data["formula_question_ratio"] = data["Question Text"].apply(calculate_ratio, opt='f')

In [156]:
# Keep only the rows whose lookup side mixes math and text.
math_n_text = data.loc[data["lookup_kind"].eq("Math and Text")]

In [157]:
# Distribution of the text share on both sides, for the mixed rows only.
fig = ff.create_distplot(
    [math_n_text[column] for column in ("text_lookup_ratio", "text_question_ratio")],
    group_labels=["text_lookup_ratio", "text_question_ratio"],
    bin_size=0.05,
    histnorm='',
).update_layout(
    title_text="Text ratio in Math & Text = Token text / (Token text + Token formula)"
)
fig.show()

In [137]:
def check_list_formula(l):
    """
        Check every formula of a list with check_Latex_formula.

        The first two and last two characters of each formula are dropped
        before checking.

        Args:
            l (list): formula strings

        Returns:
            list: one boolean per formula, in the same order
    """
    return [check_Latex_formula(formula[2:-2]) for formula in l]

def check_Latex_formula(f):
    """
        Tell whether a string can be parsed as a LaTeX math expression.

        Control characters are first turned back into their backslash form
        (handle_escape_char), then the result is passed to latex2sympy.

        Returns:
            True if latex2sympy accepts the string, False if it raises
            (the exception is printed).
    """
    try:
        latex2sympy(handle_escape_char(f))
    except Exception as err:
        print(err)
        return False
    else:
        return True

def handle_escape_char(s):
    """
        Replace the control characters \\x07, \\x08 and \\x0c with the
        two-character sequences backslash-a, backslash-b and backslash-f.

        Args:
            s (str): string that may contain those control characters

        Returns:
            str: the string with each control character replaced
    """
    restore_table = str.maketrans({
        '\x07': '\\a',
        '\x08': '\\b',
        '\x0c': '\\f',
    })
    return s.translate(restore_table)

In [None]:
# Per-formula parse status (list of booleans) for each side of the pair.
data['formula_lookup_status'] = data["formula_lookup"].apply(check_list_formula)
data['formula_question_status'] = data["formula_question"].apply(check_list_formula)

In [None]:
# Narrow view for manually reviewing the formula parse results: identifiers and
# labels, then image links, then formulas, then their parse status.
check = data.loc[:, [
    'Feedback ID', 'Feedback Type', 'Feedback Result', 'Human Feedback',
    "Lookup Image", "Question Image",
    "formula_lookup", "formula_question",
    'formula_lookup_status', 'formula_question_status',
]]

display(check.head())

## TODO
- [x] Check ratio between formula and text
- [x] Confusion matrix between SE, User's feedback and Inhouse label
- [] Use a third-party library to check OCR accuracy
- [] Combine formula and text, then use Cosine & Levenshtein

### Accuracy Improvement TODO
- [] OCR accuracy
- [] Use PCA to improve accuracy, then trace back to check which text/formula part contributes to the accuracy
- [] Which features contribute to the final accuracy