In [None]:
import pandas as pd
import numpy as np
import Levenshtein as lev
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# https://towardsdatascience.com/overview-of-text-similarity-metrics-3397c4601f50
# https://towardsdatascience.com/calculating-string-similarity-in-python-276e18a7d33a
# https://www.geeksforgeeks.org/python-levenshtein-distance/
# https://towardsdatascience.com/semantic-search-measuring-meaning-from-jaccard-to-bert-a5aca61fc325

def get_jaccard_sim(str1, str2):
    """Return the Jaccard similarity of the whitespace-separated tokens of two strings.

    The score is ``|A & B| / |A | B|`` where ``A`` and ``B`` are the sets of
    tokens produced by ``str.split()``; repeated tokens count once.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        A float in ``[0.0, 1.0]``. When neither string contains a token the
        ratio is undefined (0 / 0); ``0.0`` is returned in that case instead
        of raising ``ZeroDivisionError``.
    """
    tokens_a = set(str1.split())
    tokens_b = set(str2.split())
    union_size = len(tokens_a | tokens_b)
    if union_size == 0:
        # Both inputs are empty or whitespace-only: nothing to compare.
        return 0.0
    return len(tokens_a & tokens_b) / union_size


def cosine_sim_vectors(vec1, vec2):
    """Return the cosine similarity between two vectors as a single value.

    Each input is reshaped to one row of shape ``(1, n)`` before being handed
    to ``cosine_similarity``, and the only entry of the resulting pairwise
    matrix is returned.
    """
    row_a, row_b = (vec.reshape(1, -1) for vec in (vec1, vec2))
    pairwise = cosine_similarity(row_a, row_b)
    return pairwise[0][0]

def get_cosine_sim(str1, str2):
    """Return the cosine similarity of the bag-of-words count vectors of two strings.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        The cosine similarity between the two token-count vectors.

    Raises:
        ValueError: raised by ``CountVectorizer`` when neither string yields a
            token (empty vocabulary).
    """
    str_list = [str1, str2]
    # Build the vocabulary from BOTH strings. Fitting on str1 alone silently
    # drops every token that occurs only in str2, which makes the score
    # asymmetric and too high (e.g. "a b" vs "a b c d e" would score 1.0).
    vectors = CountVectorizer().fit_transform(str_list)
    return cosine_sim_vectors(vectors[0], vectors[1])

In [None]:
# Excel workbooks to combine; each is read into a DataFrame and the frames are
# joined side by side (axis=1).
ocr_dfs = ['ocr_df_2023_retests.xlsx']

# Columns dropped from every workbook after the first, so that they appear
# only once in the combined frame.
shared_cols = ['item_id', 'item_filename', 'Transcribed', 'Trans_len', 'Type', 'url']

frames = []
for i, ocr_df in enumerate(ocr_dfs):
    df = pd.read_excel(ocr_df)
    df = df.drop(columns=['Unnamed: 0'])
    if i >= 1:
        df = df.drop(columns=shared_cols)
    frames.append(df)

# Concatenate once instead of re-copying a growing frame on every iteration;
# pd.concat raises on an empty list, so fall back to an empty frame.
comb_df = pd.concat(frames, axis=1) if frames else pd.DataFrame()
# Replace newline characters inside text cells with spaces.
comb_df = comb_df.replace(r'\n', ' ', regex=True)
comb_df

In [None]:
# TODO: vectorize this

# Columns that describe the item itself; every other column of comb_df is
# compared against the 'Transcribed' text.
meta_cols = ['Transcribed', 'item_id', 'item_filename', 'Type', 'url', 'Trans_len']
ocr_cols = [col for col in comb_df.columns if col not in meta_cols]


def _score_or_zero(metric, reference, candidate):
    """Return ``metric(reference, candidate)``, or 0 when the metric raises.

    Best-effort on purpose: a cell that cannot be scored (for example a
    missing / non-string value) counts as 0. Only ``Exception`` is caught so
    that interrupting the kernel still stops the loop.
    """
    try:
        return metric(reference, candidate)
    except Exception:
        return 0


# Collect one record per scored row and build each frame once at the end,
# rather than enlarging three DataFrames cell by cell with .loc.
row_labels = []
lev_rows, jac_rows, cos_rows = [], [], []
for index, row in comb_df.iterrows():
    # Rows whose 'Transcribed' value equals 0 are not scored.
    if row['Transcribed'] == 0:
        continue

    lev_row = {'Type': row['Type'], 'Trans_len': row['Trans_len']}
    jac_row = dict(lev_row)
    cos_row = dict(lev_row)
    for col in ocr_cols:
        lev_row[f'{col}_lev_ratio'] = _score_or_zero(lev.ratio, row['Transcribed'], row[col])
        jac_row[f'{col}_jac_dist'] = _score_or_zero(get_jaccard_sim, row['Transcribed'], row[col])
        cos_row[f'{col}_cossim'] = _score_or_zero(get_cosine_sim, row['Transcribed'], row[col])

    row_labels.append(index)
    lev_rows.append(lev_row)
    jac_rows.append(jac_row)
    cos_rows.append(cos_row)

lev_df = pd.DataFrame(lev_rows, index=row_labels)
jac_df = pd.DataFrame(jac_rows, index=row_labels)
cos_df = pd.DataFrame(cos_rows, index=row_labels)
cos_df

In [None]:
def _weighted_average(frame, col):
    """Return the mean of ``frame[col]`` weighted by ``frame['Trans_len']``.

    Returns NaN when the weights sum to zero (including an empty frame),
    where ``np.average`` would raise ``ZeroDivisionError``.
    """
    weights = frame['Trans_len']
    if weights.sum() == 0:
        return np.nan
    return np.average(a=frame[col], weights=weights)


def summarize_df(compare_df, type=''):
    """Summarize each score column as length-weighted averages.

    Every column of ``compare_df`` other than the descriptive ones
    ('Transcribed', 'item_id', 'item_filename', 'Type', 'url', 'Trans_len')
    is averaged with 'Trans_len' as the weight over three row subsets:
    all rows, rows whose 'Type' is 'H' or 'T/H', and rows whose 'Type' is
    'T' or 'T/H'.

    Args:
        compare_df: DataFrame with 'Type' and 'Trans_len' columns plus one
            column per score to summarize.
        type: Prefix for the output column names. (The name shadows the
            builtin; it is kept so existing callers keep working.)

    Returns:
        A DataFrame with one row per score column (labelled by the column
        name) and the columns ``{type}_Overall``, ``{type}_Handwritten`` and
        ``{type}_Typed``. A subset with no rows, or whose weights sum to
        zero, yields NaN.
    """
    meta_cols = {'Transcribed', 'item_id', 'item_filename', 'Type', 'url', 'Trans_len'}
    score_cols = [col for col in compare_df.columns if col not in meta_cols]

    # Row subsets, in the same order as the output columns.
    # 'T/H' rows are counted in both the handwritten and the typed subset.
    subsets = (
        compare_df,
        compare_df[compare_df['Type'].isin(['H', 'T/H'])],
        compare_df[compare_df['Type'].isin(['T', 'T/H'])],
    )

    return pd.DataFrame(
        [[_weighted_average(subset, col) for subset in subsets] for col in score_cols],
        index=[f'{col}' for col in score_cols],
        columns=[f'{type}_Overall', f'{type}_Handwritten', f'{type}_Typed'],
    )
cos_sum_df = summarize_df(cos_df, 'cos')
jac_sum_df = summarize_df(jac_df, 'jac')
lev_sum_df = summarize_df(lev_df, 'lev')

# The row labels of the three summaries carry metric-specific suffixes
# ('_cossim', '_jac_dist', '_lev_ratio'), so a label-aligned concat would put
# each metric on its own rows and fill the rest with NaN; ignore_index=True on
# axis=1 would also replace the column names with integers. Align by position
# instead, keeping the cosine row labels as an 'index' column to identify rows.
summary_df = pd.concat(
    [
        cos_sum_df.reset_index(),
        lev_sum_df.reset_index(drop=True),
        jac_sum_df.reset_index(drop=True),
    ],
    axis=1,
)

In [None]:
# Put the three per-metric summaries side by side, matched by row position.
# The cosine frame keeps its old row labels as an 'index' column; the other
# two discard theirs.
cos_sum_df_test = cos_sum_df.reset_index()
lev_sum_df_test = lev_sum_df.reset_index(drop=True)
jac_sum_df_test = jac_sum_df.reset_index(drop=True)

summary_df = pd.concat(
    [cos_sum_df_test, lev_sum_df_test, jac_sum_df_test],
    axis=1,
)
summary_df

In [None]:
# Save the combined summary table as an Excel workbook; the relative path
# resolves against the current working directory.
summary_df.to_excel('ocr_test_summary_df.xlsx')