In [1]:
import json
import pandas as pd


In [2]:
def load_jsonl(fname):
    """Read a JSON Lines file and return its records as a list.

    Args:
        fname: Path of the UTF-8 encoded file to read. Every non-blank line
            must contain exactly one JSON document.

    Returns:
        list: The decoded objects, in the order they appear in the file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    # The context manager guarantees the handle is closed, even when a line
    # fails to parse half-way through the file.
    with open(fname, encoding="utf-8") as fin:
        for line in fin:
            line = line.strip()
            # Blank lines (e.g. a stray empty line at end of file) carry no record.
            if not line:
                continue
            data.append(json.loads(line))
    return data

def save_jsonl(data, filename):
    """Write every item of ``data`` to ``filename``, one JSON document per line.

    Args:
        data: Iterable of JSON-serializable objects.
        filename: Destination path; the file is created or overwritten and
            encoded as UTF-8.

    Non-ASCII characters are written verbatim rather than as ``\\uXXXX``
    escapes (``ensure_ascii=False``).
    """
    with open(filename, "w", encoding="utf-8") as out:
        out.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in data
        )

In [3]:
# Load the test-split records (one JSON document per line).
testmisp = load_jsonl("Datasets/WisesightSentiment/test-misp.jsonl")

# Misspelling Correction

In [16]:
from collections import defaultdict
from collections import Counter
from itertools import groupby

def norm_word(word):
    """Collapse long character runs in ``word`` and tag the result.

    Any run of three or more identical consecutive characters is reduced to a
    single character; shorter runs are kept unchanged. If at least one run was
    collapsed, the marker ``"<rep>"`` is appended once to the end of the word.

    Args:
        word: The string to normalize.

    Returns:
        str: The normalized word, e.g. ``"hellooo"`` -> ``"hello<rep>"``.
    """
    kept = []
    saw_long_run = False
    for _, run in groupby(word):
        chars = list(run)
        if len(chars) < 3:
            # Single or doubled characters are preserved verbatim.
            kept.extend(chars)
            continue
        # Long run: keep only its first character and remember to tag the word.
        kept.append(chars[0])
        saw_long_run = True
    return "".join(kept) + ("<rep>" if saw_long_run else "")

def create_mispelling_correction_model(sent):
    """Build a lookup table from normalized misspellings to their correction.

    Args:
        sent: Iterable of records, each with a ``"misp_tokens"`` list whose
            items provide ``"misp"``, ``"corr"`` and ``"int"`` entries.
            # assumes "int" is a 0/1 or boolean flag (presumably "intentional")
            # — TODO confirm against the dataset schema.

    Returns:
        dict: Maps ``norm_word(misp)`` to a ``(correction, flag)`` tuple, where
        ``correction`` is the most frequent ``"corr"`` value observed for that
        key and ``flag`` is ``True`` when the mean of the ``"int"`` values for
        the key exceeds 0.5. Keys of length 0 or 1 are left out.
    """
    corrections_by_key = defaultdict(list)
    int_flags_by_key = defaultdict(list)
    for sentence in sent:
        for token in sentence["misp_tokens"]:
            key = norm_word(token["misp"])
            corrections_by_key[key].append(token["corr"])
            int_flags_by_key[key].append(token["int"])

    model = {}
    for key, corrections in corrections_by_key.items():
        # Very short keys are skipped entirely.
        if len(key) <= 1:
            continue

        counts = Counter(corrections)
        best_correction = max(counts, key=counts.get)

        flags = int_flags_by_key[key]
        majority_flag = sum(flags) / len(flags) > 0.5
        model[key] = (best_correction, majority_flag)
    return model

# Build the correction lookup table from the test split and persist it.
# The dict is wrapped in a one-element list, so save_jsonl writes it as a single line.
MC = create_mispelling_correction_model(testmisp)
save_jsonl([MC], "test_mispelling_correction.jsonl")

In [17]:
# Load the few-shot training records (one JSON document per line).
trainmisp = load_jsonl("Datasets/WisesightSentiment/few-shot/train-misp-3000.jsonl")

In [18]:
# Build the same kind of lookup table from the training records and persist it
# as a single JSON line (the dict is wrapped in a one-element list).
# NOTE(review): "dection" in the output filename looks like a typo (the test-split
# file is named "test_mispelling_correction.jsonl"); left unchanged in case other
# code reads this exact path — confirm before renaming.
MD = create_mispelling_correction_model(trainmisp)
save_jsonl([MD], "train_mispelling_dection.jsonl")