In [14]:
import os

import numpy as np

from lexnorm.data import lexicon
from lexnorm.data import norm_dict
from lexnorm.data.normEval import loadNormData
from lexnorm.definitions import DATA_PATH
from lexnorm.models.normalise import load_candidates
import pickle

In [7]:
# Load the combined corpus file; loadNormData returns two values, bound here to `raw` and `norm`
# (presumably parallel raw / normalised tweets — confirm against normEval).
raw, norm = loadNormData(os.path.join(DATA_PATH, "processed/combined.txt"))
# Build the replacement dictionary from the loaded pair.
# NOTE(review): a later cell calls norm_dict.construct with a single file path instead of
# (raw, norm); one of the two call forms is probably stale — confirm.
norm_dictionary = norm_dict.construct(raw, norm)

In [31]:
# Load the candidate table stored in hpc/fixed_dev_ngrams.norm and display every
# row whose index label is "thessaloníki".
cands = load_candidates(os.path.join(DATA_PATH, "hpc/fixed_dev_ngrams.norm"))
cands.loc["thessaloníki"]

# Disabled check: unpickle the feature lexicon and test whether "lol" is in it.
# with open(os.path.join(DATA_PATH, "processed/feature_lexicon.txt"), "rb") as f:
#     lex = pickle.load(f)
# "lol" in lex

Unnamed: 0,cosine_to_orig,frac_norms_seen,from_clipping,from_embeddings,from_original_token,from_spellcheck,from_split,norms_seen,spellcheck_score,length,...,twitter_bi_next,wiki_uni,wiki_bi_prev,wiki_bi_next,orig_twitter_uni,orig_twitter_bi_prev,orig_twitter_bi_next,orig_wiki_uni,orig_wiki_bi_prev,orig_wiki_bi_next
thessaloníki,,,1.0,,,,,,-24,12,...,0.0,5.767776e-09,0.0,0.0,0.018497,0.045914,0.256350,0.070067,0.037588,0.102629
thessaloníki,,,1.0,,,,,,-24,12,...,0.0,5.767776e-09,0.0,0.0,0.018497,0.026903,0.031517,0.070067,0.031906,0.060369
thessaloníki,,,1.0,,,,,,-24,12,...,0.0,5.767776e-09,0.0,0.0,0.018497,0.019594,0.029688,0.070067,0.000000,0.185942
thessaloníki,,,1.0,,,,,,-24,12,...,0.0,5.767776e-09,0.0,0.0,0.018497,0.212503,0.064734,0.070067,0.227539,0.092017
thessaloníki,,,1.0,,,,,,-24,12,...,0.0,5.767776e-09,0.0,0.0,0.018497,0.065076,0.600666,0.070067,0.082351,0.442959
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
thessaloníki,,,1.0,,,,,,-24,12,...,0.0,5.767776e-09,0.0,0.0,0.018497,0.012056,0.108169,0.070067,0.029170,0.000000
thessaloníki,,,1.0,,,,,,-24,12,...,0.0,5.767776e-09,0.0,0.0,0.018497,0.151509,0.093742,0.070067,0.114203,0.067757
thessaloníki,,,1.0,,,,,,-24,12,...,0.0,5.767776e-09,0.0,0.0,0.018497,0.008760,0.072405,0.070067,0.161784,0.000000
thessaloníki,,,1.0,,,,,,-24,12,...,0.0,5.767776e-09,0.0,0.0,0.018497,0.043378,0.072405,0.070067,0.145342,0.000000


In [8]:
def original_token(tok):
    """Return the token itself as a one-element candidate set (MoNoise module).

    Needed when the detection step is skipped: every token is then replaced by
    one of its candidates, so the unchanged token has to be among them.
    """
    unchanged = set()
    unchanged.add(tok)
    return unchanged

In [12]:
from lexnorm.data import word2vec
# Obtain word vectors from the project's word2vec helper, passing `raw`.
vectors = word2vec.get_vectors(raw)
# Load embeddings from van der Goot. Parameters used:
#   -size 100 -window 5 -cbow 0 -binary 1 -threads 45
# There are also 400-dimensional embeddings which supposedly give a slight performance
# improvement, but they are quite slow.
# Unicode incompatibilities are present, so decoding errors must be ignored when loading.

In [19]:
from lexnorm.generate_extract.filtering import is_eligible

def word_embeddings(tok, vectors, threshold=0):
    """Propose the embedding-space neighbours of ``tok`` as candidates (MoNoise module).

    Relies on the distributional hypothesis: tokens that occur in similar
    contexts (here, in the twitter embeddings from van der Goot) tend to have
    similar semantics. Known issue: antonyms also often share contexts.

    Args:
        tok: token to find neighbours for.
        vectors: embedding store supporting ``tok in vectors`` and
            ``similar_by_vector``, which yields ``(word, similarity)`` entries.
        threshold: minimum similarity an entry needs to be kept.

    Returns:
        Set of lower-cased neighbour words that pass ``is_eligible`` and meet
        the threshold; empty when ``tok`` is not in ``vectors``.
    """
    # TODO uni, bigram freqs?
    # TODO implement word2vec with keras. Use newer embeddings. Experiment with
    #  different no. of candidates generated. Could even create twitter embeddings
    #  myself? Could clean up as VDG did before creating train embeddings.
    #  Cosine similarity threshold?
    # Could the cosine similarity also serve as a feature for selection? Here it
    # is only used to pick the most similar candidates.
    if tok not in vectors:
        return set()
    neighbours = set(vectors.similar_by_vector(tok))
    selected = set()
    for entry in neighbours:
        word = entry[0]
        if is_eligible(word) and entry[1] >= threshold:
            selected.add(word.lower())
    return selected

# Almost every token in the train set is covered by the twitter embeddings from VDG,
# but this cannot be assumed: tok may be missing from vectors.
vectors.similar_by_vector("yo")
# Performance of train_vectors is pretty terrible, as expected with so little data.
# The external set contains almost every word anyway and will be much more accurate,
# so it could simply be used instead.

  dists = dot(self.vectors[clip_start:clip_end], mean) / self.norms[clip_start:clip_end]


[('ya', 0.8311921954154968),
 ('yo’', 0.8208641409873962),
 ('nigga’s', 0.7973958849906921),
 ('citch', 0.7568634748458862),
 ('shordy', 0.7551469206809998),
 ('😂😂🚮', 0.7489941716194153),
 ('😭like', 0.7452959418296814),
 ('nigga', 0.7426594495773315),
 ('hoe', 0.741933286190033),
 ('bitch🗣', 0.7406516671180725)]

In [26]:
def lookup(tok, dictionary):
    """List everything the replacement dictionary records for ``tok`` (MoNoise module).

    ``dictionary`` maps a raw token to an inner mapping built from the
    replacement pairs found in the training data (and external sources?).
    Every entry of that inner mapping is included.

    Returns:
        The inner mapping's ``(key, value)`` pairs as a list, or ``[]`` when
        ``tok`` has no entry.
    """
    # TODO: external norm dicts?
    entries = dictionary.get(tok, {})
    return list(entries.items())

In [15]:
def clipping(tok, lex):
    """Return every lexicon word that has ``tok`` as a prefix (MoNoise module).

    Captures abbreviation by clipping. Tokens shorter than two characters
    produce no candidates.

    Args:
        tok: the (possibly clipped) token.
        lex: iterable of lexicon words.

    Returns:
        A set of matching words. A set is returned in both branches so the
        result type does not depend on the token length and matches the other
        candidate generators.
    """
    # TODO: length threshold? prune generated (only some degree of clipping
    #  allowed w.r.t. edit distance)? May only consider for tok length above 2?
    if len(tok) < 2:
        return set()
    return {word for word in lex if word.startswith(tok)}

In [16]:
# TODO: number of candidates on average generated by each module and with all modules
# TODO: which modules contribute the most / most unique correct candidates
# TODO: for modules and whole, percentage of correct vs incorrect candidates

In [17]:
def split(tok, lex):
    """Propose two-word splits of ``tok`` whose halves are both in ``lex`` (MoNoise module).

    Every interior position is tried as a split point. Tokens shorter than
    three characters produce no candidates.

    Returns:
        Set of ``"left right"`` strings, one per split point where both parts
        are found in the lexicon.
    """
    # TODO: recursive candidate generation on each left and right? Probably not...
    #  More than one split? Probably not either...
    # TODO: length threshold? May only consider tok length above 3?
    if len(tok) < 3:
        return set()
    return {
        tok[:cut] + " " + tok[cut:]
        for cut in range(1, len(tok))
        if tok[:cut] in lex and tok[cut:] in lex
    }

In [18]:
from spylls.hunspell import Dictionary

def spellcheck(tok, dictionary=None):
    """Return the lower-cased spelling suggestions for ``tok``.

    Args:
        tok: token to look up.
        dictionary: object exposing ``suggest(word)`` (e.g. a spylls
            ``Dictionary``). When omitted, the ``en_US`` dictionary is loaded
            on the first call and reused afterwards, rather than being re-read
            from disk for every token.

    Returns:
        Set of suggestions, each lower-cased.
    """
    # TODO: no control over this - can I change in source code? Try and load in custom lexicon.
    if dictionary is None:
        # Cache the default on the function object so the load happens only once.
        dictionary = getattr(spellcheck, "_default_dictionary", None)
        if dictionary is None:
            dictionary = Dictionary.from_files('en_US')
            spellcheck._default_dictionary = dictionary
    return {suggestion.lower() for suggestion in dictionary.suggest(tok)}

In [19]:
# TODO: contextual features?
# def generate_candidates(tweet):
#     for token in tweet:
#         ...

In [20]:
def generate_candidates(tok):
    """Pool the candidates produced by every generation module for ``tok``.

    Reads ``w2v_vectors`` and ``lex`` from the notebook's global namespace.

    Returns:
        The union of the original token, embedding neighbours, spellcheck
        suggestions, clipping matches and split candidates.
    """
    pool = set(original_token(tok))
    pool.update(word_embeddings(tok, w2v_vectors))
    pool.update(spellcheck(tok))
    # obviously lookup on the train set will always give the correct answer!
    # pool.update(lookup(tok, norm_dictionary))
    pool.update(clipping(tok, lex))
    pool.update(split(tok, lex))
    return pool

In [27]:
from lexnorm.evaluation import condition_normalisation
from lexnorm.data import normEval
import os
from lexnorm.definitions import DATA_PATH
from lexnorm.data import norm_dict
from lexnorm.models.filtering import is_eligible
from random import sample

# Rebuild the replacement dictionary, this time from the interim training file.
norm_dictionary = norm_dict.construct(os.path.join(DATA_PATH, "interim/train.txt"))
# keys = sample(list(norm_dictionary.keys()), 100)
# assert condition_normalisation.contingency(
#     raw, norm, lambda x: x[0] == "a", True
# ) == condition_normalisation.contingency_from_dict(
#     norm_dictionary, lambda x: x[0][0] == "a"
# )
# Contingency counts for the condition "the second element of the pair is not
# among the candidates generated for the first element".
a, b, c, d = condition_normalisation.contingency_from_dict(
    norm_dictionary,
    lambda pair: pair[1] not in generate_candidates(pair[0]),
)

# print(sum(a.values()) + sum(b.values()) + sum(c.values()) + sum(d.values()))

# a2, b2, c2, d2 = condition_normalisation.contingency(raw, norm, lambda x: x[0] == "a", True)

# print(sum(a.values()) + sum(b.values()) + sum(c.values()) + sum(d.values()))

In [22]:
# TODO merge module that checks some tokens ahead of current token (perhaps only one)

In [39]:
import lexnorm.models.candidate_generation as candidate_generation
import importlib
import numpy as np
from spylls.hunspell import Dictionary
# Reload candidate_generation so that edits to the module are picked up in this kernel.
importlib.reload(candidate_generation)
# Load the en_US spylls Hunspell dictionary once; it is passed to candidates_from_token below.
dictionary = Dictionary.from_files("en_US")

In [29]:
# NOTE(review): `pd`, `w2v_vectors` and `lex` are not bound by any cell visible above this
# one (`import pandas as pd` only appears in a later cell) — this cell relies on prior
# kernel state and will fail on a fresh Restart & Run All unless they are defined elsewhere.
train_data = pd.DataFrame()
# Generate the candidate table for the single token "bruh" and display it.
candidates = candidate_generation.candidates_from_token("bruh", w2v_vectors, norm_dictionary, lex, dictionary)
# candidates.cosine_to_orig = candidates.index.map(lambda x: w2v_vectors.similarity(x, "lol") if x in w2v_vectors else 0)
# w2v_vectors.similarity(candidates.index, "lol") if indexes in w2v_vectors else 0
# candidates.index.to_series()
candidates

In [None]:
# Two small single-column frames for trying out DataFrame.combine_first:
# d1 holds a missing value under "key"; d2 holds a value for "key" plus an extra row "key2".
d1 = pd.DataFrame(columns=["feature"])
d2 = pd.DataFrame(columns=["feature"])
d1.loc["key"] = {"feature": np.nan}
d2.loc["key"] = {"feature": 1}
d2.loc["key2"] = {"feature": 3}

In [30]:
# Display d1 with its missing entries filled in from d2.
d1.combine_first(d2)

In [4]:
import pandas as pd
# Read hpc/dev_pipeline.txt (under DATA_PATH) into df, using the first column as the index.
df = pd.read_csv(os.path.join(DATA_PATH, "hpc/dev_pipeline.txt"), index_col=0)

In [161]:
# Read ../hpc/candidates.txt (one level above DATA_PATH) with the first column as the index.
# NOTE(review): this rebinds `df`, replacing any frame previously loaded under that name.
df = pd.read_csv(os.path.join(DATA_PATH, "../hpc/candidates.txt"), index_col=0)

In [167]:
# Get all tokens for which candidate generation did not produce the correct normalisation:
# keep only the raw_tok_index groups whose "correct" column sums to zero.
# Only that column is summed; summing the whole group would also add up the non-numeric
# columns just to read a single value.
filtered = df.groupby("raw_tok_index").filter(lambda group: group["correct"].sum() == 0)
# Of the remaining rows keep those flagged from_original_token and take their gold value.
# A single .loc[rows, column] lookup avoids chained indexing.
ungenerated = filtered.loc[filtered["from_original_token"] == 1.0, "gold"]

In [95]:
# Reload only the first return value from the dev file; the second is discarded,
# so `norm` keeps whatever an earlier cell bound to it.
raw, _ = loadNormData(os.path.join(DATA_PATH, "raw/dev.norm"))

In [96]:
# Count the tokens in `raw` that pass is_eligible.
# Deliberately not zipped with `norm`: the count depends on raw tokens only, and `norm`
# may come from a different file than `raw` (the dev reload rebinds only `raw`), in which
# case zip would silently truncate tweets and tokens.
count = sum(1 for raw_tweet in raw for raw_tok in raw_tweet if is_eligible(raw_tok))
count
# as expected - perhaps can be used to test candidate_generation

6876

In [174]:
from collections import Counter

# Tally each (index label, gold normalisation) pair among the ungenerated tokens, most frequent first.
Counter(zip(ungenerated.index, ungenerated)).most_common()
# Very few correct candidates go ungenerated, so the candidate generation module is perhaps alright.

[(('v', 'very'), 2),
 (('hapi', 'happy'), 2),
 (('witchu', 'with you'), 2),
 (('yessss', 'yes'), 1),
 (('bestie', 'best friend'), 1),
 (('niggra', 'nigger'), 1),
 (('wada', 'water'), 1),
 (('chu', 'you'), 1),
 (('nows', 'now is'), 1),
 (('nuh', 'know'), 1),
 (('ntn', 'nothing'), 1),
 (('mnl', 'my new love'), 1),
 (('za', 'that'), 1),
 (('skepta', 'sunglasses'), 1),
 (('whatdoiwear', 'what do i wear'), 1),
 (('brutha', 'brother'), 1),
 (('yah', 'you'), 1),
 (('nuff', 'enough'), 1),
 (('shizz', 'shit'), 1),
 (('nuffin', 'nothing'), 1),
 (('diss', 'this'), 1),
 (('vas', 'was'), 1),
 (('redsox', 'red sox'), 1),
 (('nem', 'they'), 1),
 (('fkn', 'fucking'), 1),
 (('sim', 'seems'), 1),
 (('hbu', 'how about you'), 1),
 (('goddamit', 'god damn it'), 1),
 (('satnite', 'saturday night'), 1),
 (('dese', 'these'), 1),
 (('summn', 'something'), 1),
 (('cums', 'comes'), 1),
 (('dnt', "doesn't"), 1),
 (('getcha', 'get you'), 1),
 (('trynna', 'trying to'), 1),
 (('ya', 'your'), 1),
 (('lottle', 'lot'),