In [1]:
import os

from lexnorm.data import lexicon
from lexnorm.data import norm_dict
from lexnorm.data.normEval import loadNormData
from lexnorm.definitions import DATA_PATH

In [2]:
# Load the training split. `raw` and `norm` are presumably the raw tokens and their
# normalised counterparts in parallel — TODO confirm against loadNormData.
raw, norm = loadNormData(os.path.join(DATA_PATH, "interim/train.txt"))
# Build the replacement dictionary from the same training file; it is later queried
# with a raw token to obtain the normalisations seen for it.
norm_dictionary = norm_dict.from_train(os.path.join(DATA_PATH, "interim/train.txt"))

In [3]:
# Build the base lexicon. The two sets and two numbers are passed positionally to
# lexicon.build; presumably they select word-list variants, categories and size/variant
# settings — TODO confirm their meaning and name them.
lex = lexicon.build(
        {"english", "american"},
        {"contractions", "proper-names", "upper", "words"},
        50,
        1,
    )

# Merge in the abbreviation entries, then pass the combined lexicon through lexicon.refine.
lex = lexicon.refine(lex.union(lexicon.build_abbreviations()))

In [4]:
# Report how lexicon membership of raw tokens relates to whether they were normalised.
lexicon.evaluate(raw, norm, lex)

Correlation of lexicon with normalisation: 0.43
Most common un-normalised raw alphanumeric tokens in lexicon: [(('i', 'i'), 648), (('the', 'the'), 631), (('to', 'to'), 534), (('a', 'a'), 479), (('and', 'and'), 411), (('you', 'you'), 340), (('in', 'in'), 326), (('for', 'for'), 320), (('is', 'is'), 318), (('me', 'me'), 281), (('my', 'my'), 278), (('lol', 'lol'), 272), (('on', 'on'), 271), (('of', 'of'), 249), (('it', 'it'), 203), (('with', 'with'), 186), (('that', 'that'), 185), (('this', 'this'), 180), (('so', 'so'), 180), (('be', 'be'), 159)]
Most common normalised raw alphanumeric tokens in lexicon: [(('nigga', 'nigger'), 57), (('niggas', 'niggers'), 52), (('ur', 'your'), 33), (('gonna', 'going to'), 29), (('bout', 'about'), 21), (('wit', 'with'), 17), (('tho', 'though'), 17), (('cause', 'because'), 16), (('wanna', 'want to'), 12), (('cant', "can't"), 12), (('ur', "you're"), 12), (('yo', 'you'), 11), (('nd', 'and'), 11), (('ill', "i'll"), 9), (('dis', 'this'), 9), (('yo', 'your'), 8),

In [5]:
def original_token(tok):
    """Return the token itself as its only candidate.

    MONOISE module. Required when the detection step is skipped: every token is
    then replaced by one of its candidates, so the unchanged token must be among
    them.

    Args:
        tok: the raw token.

    Returns:
        A one-element set containing ``tok``.
    """
    unchanged = set()
    unchanged.add(tok)
    return unchanged

In [6]:
import gensim

# Load the pre-trained Twitter word2vec embeddings distributed with MoNoise.
w2v_vectors = gensim.models.KeyedVectors.load_word2vec_format(
        os.path.join(DATA_PATH, "external/monoise_data/w2v.bin"),
        binary=True,
        unicode_errors="ignore",
    )
# Embeddings from van der Goot, trained with: -size 100 -window 5 -cbow 0 -binary 1 -threads 45
# The file contains Unicode incompatibilities, so decoding errors must be ignored.

In [7]:
# Train word2vec on the raw training tokens, using the same size and window as the
# pre-trained vectors.
# NOTE(review): `model` is bound as a second alias for the same object; nothing visible
# here uses it — confirm before removing.
# NOTE(review): the pre-trained vectors were built with -cbow 0, while no `sg` argument
# is passed here — confirm the two training modes are meant to differ.
train_model = model = gensim.models.Word2Vec(sentences=raw, vector_size=100, window=5)
# Training is finished, so only the keyed vectors are needed from the model.
train_vectors = train_model.wv
# Add the vectors learned from the training data to the pre-trained vectors.
w2v_vectors.add_vectors(train_vectors.index_to_key, train_vectors.vectors)

In [8]:
from lexnorm.models.annotation import list_eligible


def word_embeddings(tok, vectors, threshold=0):
    """Return the nearest embedding neighbours of ``tok`` that are eligible candidates.

    MONOISE module. Based on the distributional hypothesis: tokens with similar
    semantics occur in similar contexts, so the most similar entries in the
    embeddings (e.g. the Twitter embeddings from van der Goot) are proposed as
    candidates. The similarity score could also serve as a selection feature.

    ISSUE: antonyms are also often present in the same contexts.

    Args:
        tok: the raw token to find neighbours for.
        vectors: embedding store supporting ``in`` and ``most_similar``.
        threshold: minimum score a neighbour needs in order to be kept.

    Returns:
        The set of neighbour strings for which ``list_eligible([word]) == [1]`` and
        whose score is at least ``threshold``; an empty set when ``tok`` is not in
        ``vectors``.
    """
    # TODO uni, bigram freqs?
    # TODO implement word2vec with keras. Use newer embeddings. Experiment with
    #  different no. of candidates generated. Could even create twitter embeddings
    #  myself? Could clean up as VDG did before creating train embeddings.
    #  Cosine similarity threshold?
    if tok not in vectors:
        return set()
    neighbours = set(vectors.most_similar([tok]))
    return {
        word
        for word, score in neighbours
        if list_eligible([word]) == [1] and score >= threshold
    }

# Almost every token in the training set is present in the van der Goot Twitter
# embeddings, but this cannot be assumed in general:
# it is possible that `tok` will not be in `vectors`.
word_embeddings("lol", w2v_vectors)

  dists = dot(self.vectors[clip_start:clip_end], mean) / self.norms[clip_start:clip_end]


{'lmao', 'lmaooo', 'lmfao', 'smh'}

In [9]:
def lookup(tok, dictionary):
    """Return every normalisation recorded for ``tok`` in ``dictionary``.

    MONOISE module. ``dictionary`` holds the replacement pairs found in the
    training data (and external sources?); all normalised tokens whose raw token
    is ``tok`` become candidates.

    Args:
        tok: the raw token to look up.
        dictionary: mapping from a raw token to a mapping keyed by its normalisations.

    Returns:
        The set of normalisations for ``tok``; an empty set when ``tok`` is absent.
    """
    # TODO: external norm dicts?
    replacements = dictionary.get(tok, {})
    return set(replacements.keys())

In [10]:
def clipping(tok, lex):
    """Return every lexicon word that has ``tok`` as a prefix.

    MONOISE module. Captures abbreviation by clipping, where the raw token is the
    start of the intended word. Tokens shorter than two characters produce no
    candidates, since they would match a large part of the lexicon.

    Args:
        tok: the raw token.
        lex: iterable of lexicon words (strings).

    Returns:
        A set of the words in ``lex`` that start with ``tok``. The result is always
        a set, so it has the same type on every path and matches the other
        candidate modules.
    """
    # TODO: length threshold? (May only consider for tok length above 2?)
    # TODO: prune generated (only some degree of clipping allowed w.r.t. edit distance)?
    if len(tok) < 2:
        return set()
    return {word for word in lex if word.startswith(tok)}

In [11]:
# TODO: number of candidates on average generated by each module and with all modules
# TODO: which modules contribute the most / most unique correct candidates
# TODO: for modules and whole, percentage of correct vs incorrect candidates

In [12]:
def split(tok, lex):
    """Return the two-word splits of ``tok`` whose halves are both in the lexicon.

    MONOISE module. Every position inside the token is tried as a split point;
    a split is kept when the left and the right part are both members of ``lex``.
    Tokens shorter than three characters are not split.

    Args:
        tok: the raw token.
        lex: container of lexicon words supporting ``in``.

    Returns:
        A set of ``"left right"`` strings, one per valid split point.
    """
    # TODO: recursive candidate generation on each left and right? Probably not...
    #  More than one split? Probably not either...
    # TODO: length threshold? (May only consider tok length above 3?)
    if len(tok) < 3:
        return set()
    halves = ((tok[:cut], tok[cut:]) for cut in range(1, len(tok)))
    return {
        head + " " + tail
        for head, tail in halves
        if head in lex and tail in lex
    }

In [13]:
from spylls.hunspell import Dictionary

# Hunspell dictionaries already loaded in this session, keyed by dictionary name.
_HUNSPELL_DICTIONARIES = {}


def _hunspell_dictionary(name='en_US'):
    """Load the named Hunspell dictionary on first use and reuse it afterwards."""
    if name not in _HUNSPELL_DICTIONARIES:
        _HUNSPELL_DICTIONARIES[name] = Dictionary.from_files(name)
    return _HUNSPELL_DICTIONARIES[name]


def spellcheck(tok):
    """Return the Hunspell spelling suggestions for ``tok``.

    The dictionary is loaded once and cached rather than re-read from disk on
    every call; this function is called once per token during candidate
    generation, so reloading dominated its cost.

    Args:
        tok: the raw token.

    Returns:
        A set of the suggestions produced by the ``en_US`` dictionary.
    """
    # TODO: no control over this - can I change in source code? Try and load in custom lexicon.
    return set(_hunspell_dictionary().suggest(tok))

In [14]:
# TODO: contextual features?
# def generate_candidates(tweet):
#     for token in tweet:
#         ...

In [15]:
def generate_candidates(tok):
    """Return the union of the candidates proposed by every generation module.

    The modules are run in a fixed order: the original token, embedding
    neighbours, spell-check suggestions, normalisation-dictionary lookup,
    clipping and splitting. They read the notebook-level ``w2v_vectors``,
    ``norm_dictionary`` and ``lex``.

    Args:
        tok: the raw token.

    Returns:
        A set of candidate normalisation strings for ``tok``.
    """
    return set().union(
        original_token(tok),
        word_embeddings(tok, w2v_vectors),
        spellcheck(tok),
        lookup(tok, norm_dictionary),
        clipping(tok, lex),
        split(tok, lex),
    )

In [18]:
# Try the full candidate pipeline on a single token and inspect the combined output.
generate_candidates("peng")

{'Deng',
 'Peking',
 'blick',
 'butters',
 'butterz',
 'clapt',
 'minging',
 'neeky',
 'pang',
 'peeing',
 'peen',
 'peg',
 'pen',
 'pen g',
 'pen-g',
 'pend',
 'peng',
 'penger',
 'pengggggg',
 'penguin',
 "penguin's",
 'penguins',
 'pens',
 'pent',
 'peonage',
 'pieing',
 'ping',
 'pong'}