In [81]:
import os

from lexnorm.data import lexicon
from lexnorm.data import norm_dict
from lexnorm.data.normEval import loadNormData
from lexnorm.definitions import DATA_PATH

In [82]:
# Single source of truth for the training file, so the two loaders below can never
# drift apart and read different files.
TRAIN_PATH = os.path.join(DATA_PATH, "interim/train.txt")

# Two parallel objects unpacked from the loader.
# NOTE(review): presumably the raw tokens and their gold normalisations — confirm against loadNormData.
raw, norm = loadNormData(TRAIN_PATH)
# Replacement dictionary built from the same training file; `lookup` reads it as
# dictionary.get(tok, {}).keys(), i.e. raw token -> mapping keyed by normalisation.
norm_dictionary = norm_dict.from_train(TRAIN_PATH)

In [83]:
# Assemble the lexicon in one expression: base word list, plus the abbreviation list,
# then passed through lexicon.refine.
# NOTE(review): 50 and 1 are passed positionally; presumably a word-list size and a
# spelling-variant level — confirm against lexicon.build and consider naming them.
lex = lexicon.refine(
    lexicon.build(
        {"english", "american"},
        {"contractions", "proper-names", "upper", "words"},
        50,
        1,
    ).union(lexicon.build_abbreviations())
)

In [84]:
# Sanity-check the lexicon against the training data: the printed report gives the
# correlation between lexicon membership and normalisation, plus the most frequent
# in-lexicon raw tokens that were / were not normalised.
lexicon.evaluate(raw, norm, lex)

Correlation of lexicon with normalisation: 0.43
Most common un-normalised raw alphanumeric tokens in lexicon: [(('i', 'i'), 648), (('the', 'the'), 631), (('to', 'to'), 534), (('a', 'a'), 479), (('and', 'and'), 411), (('you', 'you'), 340), (('in', 'in'), 326), (('for', 'for'), 320), (('is', 'is'), 318), (('me', 'me'), 281), (('my', 'my'), 278), (('lol', 'lol'), 272), (('on', 'on'), 271), (('of', 'of'), 249), (('it', 'it'), 203), (('with', 'with'), 186), (('that', 'that'), 185), (('this', 'this'), 180), (('so', 'so'), 180), (('be', 'be'), 159)]
Most common normalised raw alphanumeric tokens in lexicon: [(('nigga', 'nigger'), 57), (('niggas', 'niggers'), 52), (('ur', 'your'), 33), (('gonna', 'going to'), 29), (('bout', 'about'), 21), (('wit', 'with'), 17), (('tho', 'though'), 17), (('cause', 'because'), 16), (('wanna', 'want to'), 12), (('cant', "can't"), 12), (('ur', "you're"), 12), (('yo', 'you'), 11), (('nd', 'and'), 11), (('ill', "i'll"), 9), (('dis', 'this'), 9), (('yo', 'your'), 8),

In [85]:
def original_token(tok):
    """Propose the token itself as a candidate (MoNoise-style module).

    Needed when the detection step is skipped: every token is then replaced by
    one of its candidates, so the unchanged token has to be among them.

    Args:
        tok: the raw token.

    Returns:
        A one-element set containing ``tok``.
    """
    unchanged = set()
    unchanged.add(tok)
    return unchanged

In [86]:
def word_embeddings(tok):
    """Placeholder for the embedding-based candidate module (MoNoise-style).

    TODO: implement with word2vec. The Twitter embeddings from van der Goot
    could be used; following the distributional hypothesis, they would surface
    tokens with similar semantics.

    Args:
        tok: the raw token (currently ignored).

    Returns:
        An empty set until the module is implemented.
    """
    no_candidates = set()
    return no_candidates

In [87]:
def aspell(tok):
    """Placeholder for the Aspell-based candidate module (MoNoise-style).

    TODO: implement. The intended module uses a weighted character edit
    distance together with the double metaphone algorithm, to find words that
    look and sound similar.

    Args:
        tok: the raw token (currently ignored).

    Returns:
        An empty set until the module is implemented.
    """
    no_candidates = set()
    return no_candidates

In [88]:
def lookup(tok, dictionary):
    """Collect every normalisation recorded for ``tok`` (MoNoise-style module).

    TODO: external norm dicts?

    Args:
        tok: the raw token to look up.
        dictionary: mapping from raw token to a mapping keyed by the
            normalisations seen for it (replacement pairs from the training
            data, and possibly external sources later).

    Returns:
        The set of all normalisations stored for ``tok``; empty if ``tok`` is
        not in ``dictionary``.
    """
    replacements = dictionary.get(tok, {})
    return set(replacements.keys())

In [89]:
def clipping(tok, lex, min_length=2):
    """Return every lexicon word that has ``tok`` as a prefix (MoNoise-style module).

    Captures abbreviation by clipping, e.g. "bangin" -> "banging".

    Always returns a ``set``, like the other candidate generators, rather than
    a list on the main path and a set on the early return.

    Args:
        tok: the raw token.
        lex: iterable of lexicon words (strings).
        min_length: tokens shorter than this produce no candidates. Defaults
            to 2, the threshold previously hard-coded here.

    Returns:
        The set of words in ``lex`` that start with ``tok``. This includes
        ``tok`` itself when it is in the lexicon.
    """
    # Very short prefixes match a huge share of the lexicon, so skip them.
    if len(tok) < min_length:
        return set()
    # TODO: prune generated (only some degree of clipping allowed w.r.t. edit distance)?
    return {word for word in lex if word.startswith(tok)}

In [90]:
# TODO: number of candidates on average generated by each module and with all modules
# TODO: which modules contribute the most / most unique correct candidates
# TODO: for modules and whole, percentage of correct vs incorrect candidates

In [91]:
def split(tok, lex):
    """Propose two-word splits of ``tok`` (MoNoise-style module).

    Hypothesises a split at every internal position and keeps those where both
    halves are in the lexicon.

    TODO: recursive candidate generation on each half, or more than one split?
    Probably not. TODO: length threshold?

    Args:
        tok: the raw token.
        lex: container of lexicon words supporting ``in``.

    Returns:
        A set of "left right" strings; empty when ``tok`` is shorter than
        three characters or no split is valid.
    """
    if len(tok) < 3:
        return set()
    # Every way of cutting the token into two non-empty pieces.
    halves = ((tok[:cut], tok[cut:]) for cut in range(1, len(tok)))
    return {
        " ".join((head, tail))
        for head, tail in halves
        if head in lex and tail in lex
    }

In [92]:
# TODO: contextual features?

In [110]:
# def generate_candidates(tweet):
#     for token in tweet:
#         ...
def generate_candidates(tok):
    """Pool the candidates proposed by every generation module for ``tok``.

    Reads the notebook-level ``norm_dictionary`` and ``lex`` built in earlier
    cells, so those cells must have been run first.

    Args:
        tok: the raw token.

    Returns:
        The union, as a set, of the outputs of all modules.
    """
    # Arguments are evaluated left to right, so modules run in the same order as before.
    return set().union(
        original_token(tok),
        word_embeddings(tok),
        aspell(tok),
        lookup(tok, norm_dictionary),
        clipping(tok, lex),
        split(tok, lex),
    )

# Smoke test on one informal token; the output below shows the pooled candidates.
generate_candidates("bangin")

{'ban gin', 'bang in', 'bangin', 'banging'}