In [1]:
import os
from lexnorm.data import normEval
from lexnorm.data import baseline
from lexnorm.definitions import DATA_PATH

In [2]:
train_raw, train_norm = normEval.loadNormData(os.path.join(DATA_PATH, 'raw/train.norm'))
test_raw, test_norm = normEval.loadNormData(os.path.join(DATA_PATH, 'raw/test.norm'))
dev_raw, dev_norm = normEval.loadNormData(os.path.join(DATA_PATH, 'raw/dev.norm'))

In [3]:
print(f"Total number of tweets: {len(train_raw) + len(test_raw) + len(dev_raw)}")
print(f"Total number of normed tweets: {len(train_norm) + len(test_norm) + len(dev_norm)}")
print("Unchanged from 2015 dataset.")

Total number of tweets: 4917
Total number of normed tweets: 4917
Unchanged from 2015 dataset.


In [4]:
print(f"Size of test set: {len(test_raw)}")
print("Unchanged from 2015 dataset. Keep split so can compare with both 2015 and 2021 entries.")
print(f"Size of train set: {len(train_raw)}")
print("As described in 2021 task paper.")

Size of test set: 1967
Unchanged from 2015 dataset. Keep split so can compare with both 2015 and 2021 entries.
Size of train set: 2360
As described in 2021 task paper.


In [5]:
full_raw = train_raw + dev_raw + test_raw
full_norm = train_norm + dev_norm + test_norm

In [6]:
for tweet_raw, tweet_norm in zip(full_raw, full_norm):
    if len(tweet_raw) != len(tweet_norm):
        print("Length mismatch!")
print("No length mismatch (as expected).")

No length mismatch (as expected).


In [381]:
for name, collection in [("TRAIN", zip(train_raw, train_norm)), ("DEV", zip(dev_raw, dev_norm)), ("TEST", zip(test_raw, test_norm)), ("ALL", zip(full_raw, full_norm))]:
    print(name)
    one_to_n_count = 0
    n_to_one_count = 0
    raw_count = 0
    norm_count = 0
    raw_normalised_count = 0
    for tweet_raw, tweet_norm in collection:
        for token_raw, token_norm in zip(tweet_raw, tweet_norm):
            raw_count += 1
            norm_count += len(token_norm.split(" "))
            if not token_norm:
                n_to_one_count += 1
            if len(token_norm.split(" ")) > 1:
                one_to_n_count += 1
            if token_norm != token_raw:
                raw_normalised_count += 1
    print(f"Number of raw tokens: {raw_count}")
    print(f"Number of normed tokens: {norm_count}")
    print(f"Number of 1 to n normalisation raw tokens: {one_to_n_count}")
    print(f"Percentage of 1 to n: {one_to_n_count * 100 / raw_count:.2f}")
    print(f"Number of n to 1 normalisation raw tokens: {n_to_one_count}")
    print(f"Percentage of n to 1: {n_to_one_count * 100 / raw_count:.2f}")
    print(f"Number of normalised raw tokens: {raw_normalised_count}")
    print(f"Percentage normalised: {raw_normalised_count * 100 / raw_count}")

TRAIN
Number of raw tokens: 35216
Number of normed tokens: 35598
Number of 1 to n normalisation raw tokens: 307
Percentage of 1 to n: 0.87
Number of n to 1 normalisation raw tokens: 13
Percentage of n to 1: 0.04
Number of normalised raw tokens: 2666
Percentage normalised: 7.570422535211268
DEV
Number of raw tokens: 9169
Number of normed tokens: 9282
Number of 1 to n normalisation raw tokens: 98
Percentage of 1 to n: 1.07
Number of n to 1 normalisation raw tokens: 1
Percentage of n to 1: 0.01
Number of normalised raw tokens: 633
Percentage normalised: 6.903697240702367
TEST
Number of raw tokens: 29421
Number of normed tokens: 29738
Number of 1 to n normalisation raw tokens: 262
Percentage of 1 to n: 0.89
Number of n to 1 normalisation raw tokens: 17
Percentage of n to 1: 0.06
Number of normalised raw tokens: 2324
Percentage normalised: 7.899119676421604
ALL
Number of raw tokens: 73806
Number of normed tokens: 74618
Number of 1 to n normalisation raw tokens: 667
Percentage of 1 to n: 0.9

In [8]:
print("For some reason, the stats in the 2021 task paper are on the train set only (correct in that case).")
print("Apart from percentage change - unclear how this is calculated anyway.")
print("Note the 1 to n and n to 1 counts are counting the number of raw tokens involved in the respective normalisations.")
print("So a 5 to 1 normalisation would produce a count of 4 (number of tokens merged into first token) for 1 to n, and a 1 to 5 normalisation would produce a count of 1 for n to 1.")
print("Note as no capitalisation correction for 2015 dataset, in 2021 dataset version EVERYTHING (RAW AND GOLD) IS LOWER CASE")
print("2015 task paper has different statistics, one reason for this being lack of capitalisation consideration in 2021")

For some reason, the stats in the 2021 task paper are on the train set only (correct in that case).
Apart from percentage change - unclear how this is calculated anyway.
Note the 1 to n and n to 1 counts are counting the number of raw tokens involved in the respective normalisations.
So a 5 to 1 normalisation would produce a count of 4 (number of tokens merged into first token) for 1 to n, and a 1 to 5 normalisation would produce a count of 1 for n to 1.
Note as no capitalisation correction for 2015 dataset, in 2021 dataset version EVERYTHING (RAW AND GOLD) IS LOWER CASE
2015 task paper has different statistics, one reason for this being lack of capitalisation consideration in 2021


In [9]:
import json
f_train = open(os.path.join(DATA_PATH, "raw/2015/train_data.json"))
f_test = open(os.path.join(DATA_PATH, "raw/2015/test_truth.json"))

In [10]:
fif_data = json.load(f_train)
fif_data += json.load(f_test)

In [11]:
from collections import Counter
raw_diff = 0
norm_diff = 0
norm = Counter()
for fif, twe_raw, twe_norm in zip(fif_data, full_raw, full_norm):
    fif_raw = [x.lower() for x in fif["input"]]
    fif_norm = [x.lower() for x in fif["output"]]
    if fif_raw != twe_raw:
        raw_diff += 1
    elif fif_norm != twe_norm:
        norm_diff += 1
        norm.update((x, y) for x, y in zip(fif_norm, twe_norm) if x != y)
print(f"{raw_diff} raw differences, {norm_diff} norm only differences")
print(f"Most common norm differences: {norm.most_common(10)}")
print("Differences in 2015, 2021 raw due to username anonymization")
print("Differences in 2015, 2021 gold due to leaving interjections alone e.g. lol, lmfao, ctfu and normalising gonna and wanna")
print("Hence make sure models do too to ensure good performance on 2021 set (maybe dict lookup, hard coding?)")
print("Could evaluate on both datasets to compare with submissions from both tasks")

29 raw differences, 1036 norm only differences
Most common norm differences: [(('laughing out loud', 'lol'), 465), (('oh my god', 'omg'), 100), (('laughing my ass off', 'lmao'), 96), (("i don't know", 'idk'), 63), (('gonna', 'going to'), 46), (('what the fuck', 'wtf'), 45), (('shaking my head', 'smh'), 40), (('wanna', 'want to'), 33), (('laughing my fucking ass off', 'lmfao'), 26), (("i don't care", 'idc'), 22)]
Differences in 2015, 2021 raw due to username anonymization
Differences in 2015, 2021 gold due to leaving interjections alone e.g. lol, lmfao, ctfu and normalising gonna and wanna
Hence make sure models do too to ensure good performance on 2021 set (maybe dict lookup, hard coding?)
Could evaluate on both datasets to compare with submissions from both tasks


In [12]:
normalised_pairs = Counter()
non_standard_tokens = Counter()

for tweet_raw, tweet_norm in zip(full_raw, full_norm):
    for token_raw, token_norm in zip(tweet_raw, tweet_norm):
        if token_raw != token_norm:
            normalised_pairs.update([(token_raw, token_norm)])
            non_standard_tokens.update([token_raw])

print(f"Most common normalisation pairs: {normalised_pairs.most_common(10)}")
print(f"Most common normalised raw words: {non_standard_tokens.most_common(10)}")
print("Remember this is including the test set - can't use all of this for the normalisation dictionary!")

Most common normalisation pairs: [(('u', 'you'), 562), (('im', "i'm"), 334), (('dont', "don't"), 149), (('nigga', 'nigger'), 117), (('niggas', 'niggers'), 93), (('n', 'and'), 89), (('pls', 'please'), 68), (('lil', 'little'), 62), (('ur', 'your'), 54), (('thats', "that's"), 54)]
Most common normalised raw words: [('u', 569), ('im', 336), ('dont', 149), ('nigga', 117), ('niggas', 94), ('n', 93), ('ur', 74), ('pls', 68), ('lil', 62), ('thats', 54)]
Remember this is including the test set - can't use all of this for the normalisation dictionary!


In [13]:
normEval.evaluate(test_raw, test_norm, baseline.mfr(train_raw, train_norm, test_raw))
print("As in 2021 paper. For some reason not using dev for training - could fix. Notice the difference in accuracy and ERR.")

Baseline acc.(LAI): 92.10
Accuracy:           97.23
ERR:                64.93
As in 2021 paper. For some reason not using dev for training - could fix. Notice the difference in accuracy and ERR.


In [14]:
american_70 = open(os.path.join(DATA_PATH, "interim/american-70.txt"))
words = set()
for line in american_70:
    words.add(line.strip().lower())

In [70]:
import math

raw_non_lexical = Counter()
norm_non_lexical = Counter()
changed_non_lexical = Counter()
unchanged_non_lexical = Counter()
changed_lexical = Counter()
unchanged_lexical = Counter()

for twe_raw, twe_norm in zip(full_raw, full_norm):
    for token in twe_raw:
        if token.isalnum() and token not in words:
            raw_non_lexical.update([token])
    for norm in twe_norm:
        for token in norm.split():
            if token.isalnum() and token not in words:
                norm_non_lexical.update([token])
    for raw, norm in zip(twe_raw, twe_norm):
        if raw.isalnum():
            if raw in words:
                if raw == norm:
                    unchanged_lexical.update([raw])
                else:
                    changed_lexical.update([raw])
            elif raw not in words:
                if raw == norm:
                    unchanged_non_lexical.update([raw])
                else:
                    changed_non_lexical.update([raw])
print(f"Percent raw tokens not in lexicon: {sum(raw_non_lexical.values())*100/raw_count}")
print(f"Percent normed tokens not in lexicon: {sum(norm_non_lexical.values())*100/raw_count}")
a = sum(unchanged_lexical.values())
b = sum(changed_lexical.values())
c = sum(unchanged_non_lexical.values())
d = sum(changed_non_lexical.values())
phi = (a * d - b * c) / math.sqrt((a+b)*(b+d)*(a+c)*(c+d))
print(f"Phi coefficient between in lexicon and normalisation: {phi:.2f}")
print("This means there is a moderate relationship between them.")
print(f"Most common un-normalised raw alphanumeric tokens in lexicon: {unchanged_lexical.most_common(20)}")
print("Not super helpful except note domain specific 'rt' left alone.")
print(f"Most common normalised raw alphanumeric tokens in lexicon: {changed_lexical.most_common(20)}")
print("Annotation guidelines for the n-word! May want to remove single letter words from lexicon (apart from v. common ones), and filter out double letter acronyms with an internet acronyms list. Note wanna normalised in 2021, inconsistent rt normalisation.")
print("This is important as lookup is used to determine if generated candidate is valid/used as feature for candidate selection.")
print(f"Most common un-normalised raw alphanumeric tokens not in lexicon: {unchanged_non_lexical.most_common(20)}")
print("Pretty much all interjections and some common names (one direction - time specific). May want to expand lexicon in these areas - have to generate as candidates to get correct!. Hard code what to leave alone?")
print(f"Most common normalised raw alphanumeric tokens not in lexicon: {changed_non_lexical.most_common(20)}")
print("A lot of missing apostrophes - think about for candidate generation.")
print("A good lexicon (high correlation between in lexicon and normalised) will be good for checking validity of generated candidates - not suggesting candidates that wouldn't be considered normalised, not rejecting ones that would be (obviously on individual word basis). Size offers tradeoff between former and latter.")
print("WANT: to either expand lexicon or hard code to reduce un-normalised non-lexical, so that non-lexical->normalised. to reduce lexicon to reduce normalised lexical, so that lexical->non-normalised. Of course there will always be OOV words not requiring normalisation, e.g. unknown/nobel named entities, and IV words requiring normalisation, e.g. where misspelling of other words etc.")
print(f"Most common raw tokens not in lexicon: {raw_non_lexical.most_common(10)}")
print(f"Most common norm tokens not in lexicon: {norm_non_lexical.most_common(10)}")

Percent raw tokens not in lexicon: 17.770912933907812
Percent normed tokens not in lexicon: 13.020621629677803
Phi coefficient between in lexicon and normalisation: 0.31
This means there is a moderate relationship between them.
Most common un-normalised raw alphanumeric tokens in lexicon: [('rt', 1503), ('i', 1102), ('the', 1036), ('to', 888), ('a', 814), ('and', 714), ('you', 577), ('in', 536), ('is', 529), ('for', 522), ('me', 457), ('my', 450), ('on', 440), ('of', 428), ('it', 357), ('with', 321), ('that', 313), ('this', 312), ('so', 290), ('be', 269)]
Not super helpful except note domain specific 'rt' left alone.
Most common normalised raw alphanumeric tokens in lexicon: [('u', 569), ('nigga', 117), ('niggas', 94), ('n', 93), ('ur', 74), ('rt', 49), ('gonna', 46), ('r', 44), ('bout', 37), ('wanna', 33), ('d', 31), ('yo', 30), ('cause', 29), ('b', 29), ('dat', 28), ('bc', 28), ('tho', 27), ('ya', 27), ('wit', 26), ('cant', 25)]
Annotation guidelines for the n-word! May want to remov

In [126]:
def if_normed(raw_list, norm_list):
    resp = []
    for raw_tweet, norm_tweet in zip(raw_list, norm_list):
        resp_tweet = []
        for raw_tok, norm_tok in zip(raw_tweet, norm_tweet):
            if raw_tok == norm_tok:
                resp_tweet.append((raw_tok, ""))
            else:
                resp_tweet.append((raw_tok, norm_tok))
        resp.append(resp_tweet)
    return resp

full_if_normed = if_normed(full_raw, full_norm)

In [382]:
def norm_condition(raw_list, norm_list, condition, pair=False, norm=False):
    """
    Return counter for every combination of normalised and satisfying condition given a list of tuples of tokens and normalisations if done
    """
    p_normed = Counter()
    p_unnormed = Counter()
    n_normed = Counter()
    n_unnormed = Counter()
    for tweet_raw, tweet_norm in zip(raw_list, norm_list):
        for raw, normed in zip(tweet_raw, tweet_norm):
            tok = normed if norm else raw
            to_update = (tok, normed) if pair else tok
            if raw != normed and condition(tok):
                p_normed.update([to_update])
            elif raw == normed and condition(tok):
                p_unnormed.update([to_update])
            elif raw != normed and not condition(tok):
                n_normed.update([to_update])
            else:
                n_unnormed.update([to_update])
    return p_unnormed, p_normed, n_unnormed, n_normed

def correlation_with_norm(p_unnormed, p_normed, n_unnormed, n_normed):
    """
    Calculate matthew's correlation coefficient between normalisation and the condition
    """
    a = sum(p_unnormed.values())
    b = sum(p_normed.values())
    c = sum(n_unnormed.values())
    d = sum(n_normed.values())
    phi = (a * d - b * c) / math.sqrt((a+b)*(b+d)*(a+c)*(c+d))
    return phi

# a, b, c, d = norm_condition([[tok for tok in tweet if tok[0].isalnum()] for tweet in full_if_normed], lambda x: x in words)
# correlation_with_norm(a, b, c, d)

In [384]:
_, b, _, _= norm_condition(full_raw, full_norm, lambda x: not x.isalnum(), True, False)
print(f"Only {sum(b.values())} non-alphanumeric raw tokens are normalised: {b.most_common()}. These all contain apostrophes, which are included as not invalidating tokens for normalisation. However, could've and should've should not have been normalised under 2015 annotation rule 9. This rule also makes sense as expanding could change the formality of the text. ca'nt and do'nt have the apostrophe in the wrong place. Perhaps the contraction module could handle moving apostrophes as well as inserting to get a IV word. Modules should only be able to add candidates, not remove them. Unclear what is happening in good'o but perhaps could hypothesise drop/split before/after the apostrophe? The alphanumeric filter conveniently also filters out domain specific entities.")

Only 6 non-alphanumeric raw tokens are normalised: [(("could've", 'could have'), 2), (("good'o", 'good'), 1), (("should've", 'should have'), 1), (("ca'nt", "can't"), 1), (("do'nt", "don't"), 1)]. These all contain apostrophes, which are included as not invalidating tokens for normalisation. However, could've and should've should not have been normalised under 2015 annotation rule 9. This rule also makes sense as expanding could change the formality of the text. ca'nt and do'nt have the apostrophe in the wrong place. Perhaps the contraction module could handle moving apostrophes as well as inserting to get a IV word. Modules should only be able to add candidates, not remove them. Unclear what is happening in good'o but perhaps could hypothesise drop/split before/after the apostrophe? The alphanumeric filter conveniently also filters out domain specific entities.


In [385]:
a, b, _, _ = norm_condition(full_raw, full_norm, lambda x: any(char.isspace() for char in x), True)
print(f"There are {sum((a+b).values())} raw tokens containing whitespace.")

There are 0 raw tokens containing whitespace.


In [387]:
a, b, _, _ = norm_condition(full_raw, full_norm, lambda x: x == "rt", True)
print(f"There are {sum((a+b).values())} rt tokens total, with {sum(a.values())} left alone. rt at the start of a tweet is a domain specific entity and thus should be disregarded as per rule 3. In the middle of a tweet, this is more debatable. There are {sum(1 if tweet[0] == 'rt' else 0 for tweet in full_raw)} rt tokens at the start of tweets, which are all left alone. Middle rts (130) are inconsistently annotated (46 normalised), but if they are normalised, this is to retweet. From observation looks like domain-specific rts are always followed by a @mention. 42/46 normalised middle rts are not followed by @, 61/84 unnormalised middle rts are. There are 6 rts at the end of tweets, and they are split down the middle in terms of annotation. So perhaps rule could be: if at start of tweet or followed by @mention, ignore.")

There are 1552 rt tokens total, with 1503 left alone. rt at the start of a tweet is a domain specific entity and thus should be disregarded as per rule 3. In the middle of a tweet, this is more debatable. There are 1416 rt tokens at the start of tweets, which are all left alone. Middle rts (130) are inconsistently annotated (46 normalised), but if they are normalised, this is to retweet. From observation looks like domain-specific rts are always followed by a @mention. 42/46 normalised middle rts are not followed by @, 61/84 unnormalised middle rts are. There are 6 rts at the end of tweets, and they are split down the middle in terms of annotation. So perhaps rule could be: if at start of tweet or followed by @mention, ignore.


In [283]:
# print([tweet[tweet.index('rt'):tweet.index('rt')+2] for tweet in full_raw if 'rt' in tweet and len(tweet[tweet.index('rt'):tweet.index('rt')+2]) == 2 and tweet[tweet.index('rt')+1][0] != '@'])

# count = 0
# for tweet in full_if_normed:
#     for i, tok in enumerate(tweet):
#         if tok[0] == 'rt' and 0 < i < len(tweet)-1 and not tok[1] and tweet[i+1][0][0] != '@':
#             print(tweet[i+1])
#             count += 1
# print(count)

# [[tweet[i:i+2] for i, tok in enumerate(tweet) if tok == 'rt' and len(tweet[i:i+2]) < 2] for tweet in full_raw if 'rt' in tweet]

# No starting rts normalised
# [[tok for i, tok in enumerate(tweet) if tok[0] == 'rt' and tok[1] and i == 0] for tweet in full_if_normed if len([tok for i, tok in enumerate(tweet) if tok[0] == 'rt' and tok[1] and i==0])]

# Middle rts inconsistently normalised
# [[tok for i, tok in enumerate(tweet) if tok[0] == 'rt' and 0 < i < len(tweet)-1] for tweet in full_if_normed if len([tok for i, tok in enumerate(tweet) if tok[0] == 'rt' and 0 < i < len(tweet)-1])])

In [285]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [309]:
spacy_full_raw = []
for tweet in full_raw:
    spacy_full_raw.append(nlp(" ".join(tweet)))

In [388]:
# for tweet in full_if_normed:
#     for tok, norm in tweet:
#         dok = nlp(tok)[0]
#         type = dok.pos_
#         if type == "PROPN":
#             print(tok, norm)

a, b, c, d = norm_condition(full_raw, full_norm, lambda x: nlp(x)[0].pos_ == "PROPN", True, False)

In [418]:
# print(f"{sum(b.values())}/{sum(d.values())+sum(b.values())}")
print([k[0] for k in a.keys() if k[0] not in words and k[0].isalnum()])
print(f"Correlation of PROPN and normalisation {correlation_with_norm(a, b, c, d):.2f}. Very low - probably due to bad POS performance thanks to lowercasing and other irregularities of the text - sort of a chicken and egg issue with downstream processing. A lot these could probably be found by expanding the lexicon instead.")

['ozil', 'bie', 'haha', 'avedon', 'pshh', 'hahaha', 'bria', 'exo', 'chanyeol', 'sehun', 'lol', 'grecia', 'hana', 'jeongguk', 'dremel', 'keelan', 'amo', 'investissement', 'heheheh', 'galeris', 'salmann', 'khann', 'croke', 'mtfs', 'pss', 'havard', 'nordtveit', 'yaya', 'shakira', 'calle', 'wtf', 'xo', 'gervais', 'peppa', 'kylie', 'sungjong', 'seongyeol', 'myungsoo', 'hai', 'kcu', 'oppa', 'kuru', 'manis', 'moscato', 'mq', 'doto', 'shourie', 'ji', 'badboy', 'ballerino', 'lmao', 'marwan', 'asmar', 'lonergan', 'kna', 'jga', 'enda', 'poch', 'kalau', 'hisap', 'inte', 'babbar', 'pacquiao', 'zedd', 'kamar', 'bae', 'alcon', 'chmp', 'simbrinz', 'kanye', 'zimuvuta', 'gwamba', 'fasta', 'pouncey', 'annabeth', 'kloss', 'enduro', 'fenwick', 'tbh', 'benim', 'kamran', 'apna', 'uma', 'toma', 'ng', 'jaya', 'haahaha', 'hahagaha', 'lantak', 'rasheeda', 'jinka', 'hu', 'parha', 'hyn', 'mathata', 'ke', 'annalynne', 'mccord', 'catz', 'gazzetta', 'psg', 'hahah', 'jfb', 'juseyo', 'bgt', 'matty', 'scaramouche', 'mao