In [1]:
import os
from lexnorm.data import normEval
from lexnorm.data import baseline
from lexnorm.definitions import DATA_PATH

In [2]:
train_raw, train_norm = normEval.loadNormData(os.path.join(DATA_PATH, 'raw/train.norm'))
test_raw, test_norm = normEval.loadNormData(os.path.join(DATA_PATH, 'raw/test.norm'))
dev_raw, dev_norm = normEval.loadNormData(os.path.join(DATA_PATH, 'raw/dev.norm'))

In [3]:
print(f"Total number of tweets: {len(train_raw) + len(test_raw) + len(dev_raw)}")
print(f"Total number of normed tweets: {len(train_norm) + len(test_norm) + len(dev_norm)}")
print("Unchanged from 2015 dataset.")

Total number of tweets: 4917
Total number of normed tweets: 4917
Unchanged from 2015 dataset.


In [4]:
print(f"Size of test set: {len(test_raw)}")
print("Unchanged from 2015 dataset. Keep split so can compare with both 2015 and 2021 entries.")
print(f"Size of train set: {len(train_raw)}")
print("As described in 2021 task paper.")

Size of test set: 1967
Unchanged from 2015 dataset. Keep split so can compare with both 2015 and 2021 entries.
Size of train set: 2360
As described in 2021 task paper.


In [5]:
full_raw = train_raw + dev_raw + test_raw
full_norm = train_norm + dev_norm + test_norm

In [6]:
for tweet_raw, tweet_norm in zip(full_raw, full_norm):
    if len(tweet_raw) != len(tweet_norm):
        print("Length mismatch!")
print("No length mismatch (as expected).")

No length mismatch (as expected).


In [7]:
for name, collection in [("TRAIN", zip(train_raw, train_norm)), ("DEV", zip(dev_raw, dev_norm)), ("TEST", zip(test_raw, test_norm)), ("ALL", zip(full_raw, full_norm))]:
    print(name)
    one_to_n_count = 0
    n_to_one_count = 0
    raw_count = 0
    norm_count = 0
    raw_normalised_count = 0
    for tweet_raw, tweet_norm in collection:
        for token_raw, token_norm in zip(tweet_raw, tweet_norm):
            raw_count += 1
            norm_count += len(token_norm.split(" "))
            if not token_norm:
                n_to_one_count += 1
            if len(token_norm.split(" ")) > 1:
                one_to_n_count += 1
            if token_norm != token_raw:
                raw_normalised_count += 1
    print(f"Number of raw tokens: {raw_count}")
    print(f"Number of normed tokens: {norm_count}")
    print(f"Number of 1 to n normalisation raw tokens: {one_to_n_count}")
    print(f"Percentage of 1 to n: {one_to_n_count * 100 / raw_count:.2f}")
    print(f"Number of n to 1 normalisation raw tokens: {n_to_one_count}")
    print(f"Percentage of n to 1: {n_to_one_count * 100 / raw_count:.2f}")
    print(f"Number of normalised raw tokens: {raw_normalised_count}")
    print(f"Percentage normalised: {raw_normalised_count * 100 / raw_count}")

TRAIN
Number of raw tokens: 35216
Number of normed tokens: 35598
Number of 1 to n normalisation raw tokens: 307
Percentage of 1 to n: 0.87
Number of n to 1 normalisation raw tokens: 13
Percentage of n to 1: 0.04
Number of normalised raw tokens: 2666
Percentage normalised: 7.570422535211268
DEV
Number of raw tokens: 9169
Number of normed tokens: 9282
Number of 1 to n normalisation raw tokens: 98
Percentage of 1 to n: 1.07
Number of n to 1 normalisation raw tokens: 1
Percentage of n to 1: 0.01
Number of normalised raw tokens: 633
Percentage normalised: 6.903697240702367
TEST
Number of raw tokens: 29421
Number of normed tokens: 29738
Number of 1 to n normalisation raw tokens: 262
Percentage of 1 to n: 0.89
Number of n to 1 normalisation raw tokens: 17
Percentage of n to 1: 0.06
Number of normalised raw tokens: 2324
Percentage normalised: 7.899119676421604
ALL
Number of raw tokens: 73806
Number of normed tokens: 74618
Number of 1 to n normalisation raw tokens: 667
Percentage of 1 to n: 0.9

In [8]:
print("For some reason, the stats in the 2021 task paper are on the train set only (correct in that case).")
print("Apart from percentage change - unclear how this is calculated anyway.")
print("Note the 1 to n and n to 1 counts are counting the number of raw tokens involved in the respective normalisations.")
print("So a 5 to 1 normalisation would produce a count of 4 (number of tokens merged into first token) for 1 to n, and a 1 to 5 normalisation would produce a count of 1 for n to 1.")
print("Note as no capitalisation correction for 2015 dataset, in 2021 dataset version EVERYTHING (RAW AND GOLD) IS LOWER CASE")
print("2015 task paper has different statistics, one reason for this being lack of capitalisation consideration in 2021")

For some reason, the stats in the 2021 task paper are on the train set only (correct in that case).
Apart from percentage change - unclear how this is calculated anyway.
Note the 1 to n and n to 1 counts are counting the number of raw tokens involved in the respective normalisations.
So a 5 to 1 normalisation would produce a count of 4 (number of tokens merged into first token) for 1 to n, and a 1 to 5 normalisation would produce a count of 1 for n to 1.
Note as no capitalisation correction for 2015 dataset, in 2021 dataset version EVERYTHING (RAW AND GOLD) IS LOWER CASE
2015 task paper has different statistics, one reason for this being lack of capitalisation consideration in 2021


In [9]:
import json
f_train = open(os.path.join(DATA_PATH, "raw/2015/train_data.json"))
f_test = open(os.path.join(DATA_PATH, "raw/2015/test_truth.json"))

In [10]:
fif_data = json.load(f_train)
fif_data += json.load(f_test)

In [27]:
from collections import Counter
raw_diff = 0
norm_diff = 0
norm = Counter()
for fif, twe_raw, twe_norm in zip(fif_data, full_raw, full_norm):
    fif_raw = [x.lower() for x in fif["input"]]
    fif_norm = [x.lower() for x in fif["output"]]
    if fif_raw != twe_raw:
        raw_diff += 1
    elif fif_norm != twe_norm:
        norm_diff += 1
        norm.update((x, y) for x, y in zip(fif_norm, twe_norm) if x != y)
print(f"{raw_diff} raw differences, {norm_diff} norm only differences")
print(f"Most common norm differences: {norm.most_common(10)}")
print("Differences in 2015, 2021 raw due to username anonymization")
print("Differences in 2015, 2021 gold due to leaving interjections alone e.g. lol, lmfao, ctfu and normalising gonna and wanna")
print("Hence make sure models do too to ensure good performance on 2021 set (maybe dict lookup, hard coding?)")
print("Could evaluate on both datasets to compare with submissions from both tasks")

29 raw differences, 1036 norm only differences
Most common norm differences: [(('laughing out loud', 'lol'), 465), (('oh my god', 'omg'), 100), (('laughing my ass off', 'lmao'), 96), (("i don't know", 'idk'), 63), (('gonna', 'going to'), 46), (('what the fuck', 'wtf'), 45), (('shaking my head', 'smh'), 40), (('wanna', 'want to'), 33), (('laughing my fucking ass off', 'lmfao'), 26), (("i don't care", 'idc'), 22)]
Differences in 2015, 2021 raw due to username anonymization
Differences in 2015, 2021 gold due to leaving interjections alone e.g. lol, lmfao, ctfu and normalising gonna and wanna
Hence make sure models do too to ensure good performance on 2021 set (maybe dict lookup, hard coding?)
Could evaluate on both datasets to compare with submissions from both tasks


In [64]:
normalised_pairs = Counter()
non_standard_tokens = Counter()

for tweet_raw, tweet_norm in zip(full_raw, full_norm):
    for token_raw, token_norm in zip(tweet_raw, tweet_norm):
        if token_raw != token_norm:
            normalised_pairs.update([(token_raw, token_norm)])
            non_standard_tokens.update([token_raw])

print(f"Most common normalisation pairs: {normalised_pairs.most_common(10)}")
print(f"Most common normalised raw words: {non_standard_tokens.most_common(10)}")
print("Remember this is including the test set - can't use all of this for the normalisation dictionary!")

Most common normalisation pairs: [(('u', 'you'), 562), (('im', "i'm"), 334), (('dont', "don't"), 149), (('nigga', 'nigger'), 117), (('niggas', 'niggers'), 93), (('n', 'and'), 89), (('pls', 'please'), 68), (('lil', 'little'), 62), (('ur', 'your'), 54), (('thats', "that's"), 54)]
Most common normalised raw words: [('u', 569), ('im', 336), ('dont', 149), ('nigga', 117), ('niggas', 94), ('n', 93), ('ur', 74), ('pls', 68), ('lil', 62), ('thats', 54)]
Remember this is including the test set - can't use all of this for the normalisation dictionary!


In [65]:
normEval.evaluate(test_raw, test_norm, baseline.mfr(train_raw, train_norm, test_raw))
print("As in 2021 paper. For some reason not using dev for training - could fix.")

Baseline acc.(LAI): 92.10
Accuracy:           97.23
ERR:                64.93
As in 2021 paper. For some reason not using dev for training - could fix.


In [66]:
american_70 = open(os.path.join(DATA_PATH, "interim/american-70.txt"))
words = set()
for line in american_70:
    words.add(line.strip().lower())

In [129]:
import math

raw_non_lexical = Counter()
norm_non_lexical = Counter()
changed_non_lexical = Counter()
unchanged_non_lexical = Counter()
changed_lexical = Counter()
unchanged_lexical = Counter()
lex_raw_count = 0

# TODO: percent normalised tokens in lexicon, percent raw tokens in lexicon
# TODO: percent non-alphanumeric raw tokens normalised (should be 0), percent alphanumeric raw tokens normalised. Extend to other annotation guidelines

for twe_raw, twe_norm in zip(full_raw, full_norm):
    for token in twe_raw:
        if token.isalnum() and token not in words:
            raw_non_lexical.update([token])
    for norm in twe_norm:
        for token in norm.split():
            if token.isalnum() and token not in words:
                norm_non_lexical.update([token])
    for raw, norm in zip(twe_raw, twe_norm):
        if raw.isalnum():
            if raw in words:
                if raw == norm:
                    unchanged_lexical.update([raw])
                else:
                    changed_lexical.update([raw])
            elif raw not in words:
                if raw == norm:
                    unchanged_non_lexical.update([raw])
                else:
                    changed_non_lexical.update([raw])
a = sum(unchanged_lexical.values())
b = sum(changed_lexical.values())
c = sum(unchanged_non_lexical.values())
d = sum(changed_non_lexical.values())
phi = (a * d - b * c) / math.sqrt((a+b)*(b+d)*(a+c)*(c+d))
print(f"Phi coefficient between in lexicon and normalisation: {phi:.2f}")
print("This means there is a moderate relationship between them.")
print(f"Most common un-normalised raw alphanumeric tokens in lexicon: {unchanged_lexical.most_common(10)}")
print("Not super helpful except note domain specific 'rt' left alone.")
print(f"Most common normalised raw alphanumeric tokens in lexicon: {changed_lexical.most_common(10)}")
print("Annotation guidelines for the n-word! May want to remove single letter, double letter words from lexicon (apart from v. common ones). Note wanna normalised.")
print("This is important as lookup is used to determine if generated candidate is valid/used as feature for candidate selection.")
print(f"Most common un-normalised raw alphanumeric tokens not in lexicon: {unchanged_non_lexical.most_common(10)}")
print("Pretty much all interjections and some common names (one direction - time specific). May want to expand lexicon in these areas. Hard code what to leave alone?")
print(f"Most common normalised raw alphanumeric tokens not in lexicon: {changed_non_lexical.most_common(10)}")
print("A lot of missing apostrophes - think about for candidate generation.")
print("A good lexicon (high correlation between in lexicon and normalised) will be good for checking validity of generated candidates - not suggesting candidates that wouldn't be considered normalised, not rejecting ones that would be (obviously on individual word basis). Size offers tradeoff between former and latter.")
print("WANT: to either expand lexicon or hard code to reduce un-normalised non-lexical, so that non-lexical->normalised. to reduce lexicon to reduce normalised lexical, so that lexical->non-normalised")

Phi coefficient between in lexicon and normalisation: 0.31
This means there is a moderate relationship between them.
Most common un-normalised raw alphanumeric tokens in lexicon: [('rt', 1503), ('i', 1102), ('the', 1036), ('to', 888), ('a', 814), ('and', 714), ('you', 577), ('in', 536), ('is', 529), ('for', 522)]
Not super helpful except note domain specific 'rt' left alone.
Most common normalised raw alphanumeric tokens in lexicon: [('u', 569), ('nigga', 117), ('niggas', 94), ('n', 93), ('ur', 74), ('rt', 49), ('gonna', 46), ('r', 44), ('bout', 37), ('wanna', 33)]
Annotation guidelines for the n-word! May want to remove single letter, double letter words from lexicon (apart from v. common ones). Note wanna normalised.
This is important as lookup is used to determine if generated candidate is valid/used as feature for candidate selection.
Most common un-normalised raw alphanumeric tokens not in lexicon: [('lol', 469), ('haha', 127), ('omg', 101), ('lmao', 96), ('2', 65), ('exo', 63), (