In [2]:
import os
from lexnorm.data import normEval
from lexnorm.data import baseline
from lexnorm.definitions import DATA_PATH

In [3]:
train_raw, train_norm = normEval.loadNormData(os.path.join(DATA_PATH, 'raw/train.norm'))
test_raw, test_norm = normEval.loadNormData(os.path.join(DATA_PATH, 'raw/test.norm'))
dev_raw, dev_norm = normEval.loadNormData(os.path.join(DATA_PATH, 'raw/dev.norm'))

In [4]:
print(f"Total number of tweets: {len(train_raw) + len(test_raw) + len(dev_raw)}")
print(f"Total number of normed tweets: {len(train_norm) + len(test_norm) + len(dev_norm)}")
print("Unchanged from 2015 dataset.")

Total number of tweets: 4917
Total number of normed tweets: 4917
Unchanged from 2015 dataset.


In [29]:
print(f"Size of test set: {len(test_raw)}")
print("Unchanged from 2015 dataset. Keep split so can compare with both 2015 and 2021 entries.")
print(f"Size of train set: {len(train_raw)}")
print("As described in 2021 task paper.")

Size of test set: 1967
Unchanged from 2015 dataset. Keep split so can compare with both 2015 and 2021 entries.
Size of train set: 2360
As described in 2021 task paper.


In [30]:
full_raw = train_raw + dev_raw
full_norm = train_norm + dev_norm

In [31]:
for tweet_raw, tweet_norm in zip(full_raw, full_norm):
    if len(tweet_raw) != len(tweet_norm):
        print("Length mismatch!")
print("No length mismatch (as expected).")

No length mismatch (as expected).


In [32]:
for name, collection in [("TRAIN", zip(train_raw, train_norm)), ("DEV", zip(dev_raw, dev_norm)), ("FULL", zip(full_raw, full_norm))]:
    print(name)
    one_to_n_count = 0
    n_to_one_count = 0
    raw_count = 0
    norm_count = 0
    raw_normalised_count = 0
    for tweet_raw, tweet_norm in collection:
        for token_raw, token_norm in zip(tweet_raw, tweet_norm):
            raw_count += 1
            norm_count += len(token_norm.split(" "))
            if not token_norm:
                n_to_one_count += 1
            if len(token_norm.split(" ")) > 1:
                one_to_n_count += 1
            if token_norm != token_raw:
                raw_normalised_count += 1
    print(f"Number of raw tokens: {raw_count}")
    print(f"Number of normed tokens: {norm_count}")
    print(f"Number of 1 to n normalisation raw tokens: {one_to_n_count}")
    print(f"Percentage of 1 to n: {one_to_n_count * 100 / raw_count:.2f}")
    print(f"Number of n to 1 normalisation raw tokens: {n_to_one_count}")
    print(f"Percentage of n to 1: {n_to_one_count * 100 / raw_count:.2f}")
    print(f"Number of normalised raw tokens: {raw_normalised_count}")
    print(f"Percentage normalised: {raw_normalised_count * 100 / raw_count}")

TRAIN
Number of raw tokens: 35216
Number of normed tokens: 35598
Number of 1 to n normalisation raw tokens: 307
Percentage of 1 to n: 0.87
Number of n to 1 normalisation raw tokens: 13
Percentage of n to 1: 0.04
Number of normalised raw tokens: 2666
Percentage normalised: 7.570422535211268
DEV
Number of raw tokens: 9169
Number of normed tokens: 9282
Number of 1 to n normalisation raw tokens: 98
Percentage of 1 to n: 1.07
Number of n to 1 normalisation raw tokens: 1
Percentage of n to 1: 0.01
Number of normalised raw tokens: 633
Percentage normalised: 6.903697240702367
FULL
Number of raw tokens: 44385
Number of normed tokens: 44880
Number of 1 to n normalisation raw tokens: 405
Percentage of 1 to n: 0.91
Number of n to 1 normalisation raw tokens: 14
Percentage of n to 1: 0.03
Number of normalised raw tokens: 3299
Percentage normalised: 7.432691224512785


In [33]:
print("For some reason, the stats in the 2021 task paper are on the train set only (correct in that case).")
print("Apart from percentage normalised - unclear how this is calculated anyway.")
print("Note the 1 to n and n to 1 counts are counting the number of raw tokens involved in the respective normalisations.")
print("So a 5 to 1 normalisation would produce a count of 4 (number of tokens merged into first token) for 1 to n, and a 1 to 5 normalisation would produce a count of 1 for n to 1.")
print("Note as no capitalisation correction for 2015 dataset, in 2021 dataset version EVERYTHING (RAW AND GOLD) IS LOWER CASE")
print("2015 task paper has different statistics, one reason for this being lack of capitalisation consideration in 2021")

For some reason, the stats in the 2021 task paper are on the train set only (correct in that case).
Apart from percentage normalised - unclear how this is calculated anyway.
Note the 1 to n and n to 1 counts are counting the number of raw tokens involved in the respective normalisations.
So a 5 to 1 normalisation would produce a count of 4 (number of tokens merged into first token) for 1 to n, and a 1 to 5 normalisation would produce a count of 1 for n to 1.
Note as no capitalisation correction for 2015 dataset, in 2021 dataset version EVERYTHING (RAW AND GOLD) IS LOWER CASE
2015 task paper has different statistics, one reason for this being lack of capitalisation consideration in 2021


In [34]:
import json
f_train = open(os.path.join(DATA_PATH, "raw/2015/train_data.json"))
# f_test = open(os.path.join(DATA_PATH, "raw/2015/test_truth.json"))

In [35]:
fif_data = json.load(f_train)
# fif_data += json.load(f_test)

In [36]:
from collections import Counter
raw_diff = 0
norm_diff = 0
norm = Counter()
for fif, twe_raw, twe_norm in zip(fif_data, full_raw, full_norm):
    fif_raw = [x.lower() for x in fif["input"]]
    fif_norm = [x.lower() for x in fif["output"]]
    if fif_raw != twe_raw:
        raw_diff += 1
    elif fif_norm != twe_norm:
        norm_diff += 1
        norm.update((x, y) for x, y in zip(fif_norm, twe_norm) if x != y)
print(f"{raw_diff} raw differences, {norm_diff} norm only differences")
print(f"Most common norm differences: {norm.most_common(10)}")
print("Differences in 2015, 2021 raw due to username anonymization")
print("Differences in 2015, 2021 gold due to leaving interjections alone e.g. lol, lmfao, ctfu and normalising gonna and wanna")
print("Hence make sure models do too to ensure good performance on 2021 set (maybe dict lookup, hard coding?)")
print("Could evaluate on both datasets to compare with submissions from both tasks")

20 raw differences, 606 norm only differences
Most common norm differences: [(('laughing out loud', 'lol'), 271), (('oh my god', 'omg'), 66), (('laughing my ass off', 'lmao'), 51), (("i don't know", 'idk'), 36), (('gonna', 'going to'), 29), (('what the fuck', 'wtf'), 26), (('shaking my head', 'smh'), 21), (('to be honest', 'tbh'), 16), (("i don't care", 'idc'), 15), (('laughing my fucking ass off', 'lmfao'), 13)]
Differences in 2015, 2021 raw due to username anonymization
Differences in 2015, 2021 gold due to leaving interjections alone e.g. lol, lmfao, ctfu and normalising gonna and wanna
Hence make sure models do too to ensure good performance on 2021 set (maybe dict lookup, hard coding?)
Could evaluate on both datasets to compare with submissions from both tasks


In [39]:
normalised_pairs = Counter()
non_standard_tokens = Counter()

for tweet_raw, tweet_norm in zip(full_raw, full_norm):
    for token_raw, token_norm in zip(tweet_raw, tweet_norm):
        if token_raw != token_norm:
            normalised_pairs.update([(token_raw, token_norm)])
            non_standard_tokens.update([token_raw])

print(f"Most common normalisation pairs: {normalised_pairs.most_common(10)}")
print(f"Most common normalised raw words: {non_standard_tokens.most_common(10)}")
print("Think about for candidate generation.")
# print("Remember this is including the test set - can't use all of this for the normalisation dictionary!")

Most common normalisation pairs: [(('u', 'you'), 328), (('im', "i'm"), 181), (('dont', "don't"), 92), (('nigga', 'nigger'), 57), (('niggas', 'niggers'), 52), (('n', 'and'), 47), (('pls', 'please'), 43), (('lil', 'little'), 35), (('ur', 'your'), 33), (('thats', "that's"), 33)]
Most common normalised raw words: [('u', 333), ('im', 182), ('dont', 92), ('nigga', 57), ('niggas', 52), ('n', 49), ('ur', 46), ('pls', 43), ('lil', 35), ('thats', 33)]
Think about for candidate generation.


In [41]:
normEval.evaluate(test_raw, test_norm, baseline.mfr(train_raw, train_norm, test_raw))
print("As in 2021 paper. For some reason not using dev for training - could fix. Notice the difference in accuracy and ERR.")

Baseline acc.(LAI): 92.10
Accuracy:           97.23
ERR:                64.93
As in 2021 paper. For some reason not using dev for training - could fix. Notice the difference in accuracy and ERR.


In [42]:
american_70 = open(os.path.join(DATA_PATH, "interim/american-70.txt"))
words = set()
for line in american_70:
    words.add(line.strip().lower())

In [48]:
import math

raw_non_lexical = Counter()
norm_non_lexical = Counter()
changed_non_lexical = Counter()
unchanged_non_lexical = Counter()
changed_lexical = Counter()
unchanged_lexical = Counter()

for twe_raw, twe_norm in zip(full_raw, full_norm):
    for token in twe_raw:
        if token.isalnum() and token not in words:
            raw_non_lexical.update([token])
    for norm in twe_norm:
        for token in norm.split():
            if token.isalnum() and token not in words:
                norm_non_lexical.update([token])
    for raw, norm in zip(twe_raw, twe_norm):
        if raw.isalnum():
            if raw in words:
                if raw == norm:
                    unchanged_lexical.update([raw])
                else:
                    changed_lexical.update([raw])
            elif raw not in words:
                if raw == norm:
                    unchanged_non_lexical.update([raw])
                else:
                    changed_non_lexical.update([raw])
print("SCOWL AMERICAN 70")
print(f"Percent raw tokens not in lexicon: {sum(raw_non_lexical.values())*100/raw_count:.2f}")
print(f"Percent normed tokens not in lexicon: {sum(norm_non_lexical.values())*100/raw_count:.2f}")
a = sum(unchanged_lexical.values())
b = sum(changed_lexical.values())
c = sum(unchanged_non_lexical.values())
d = sum(changed_non_lexical.values())
phi = (a * d - b * c) / math.sqrt((a+b)*(b+d)*(a+c)*(c+d))
print(f"Phi coefficient between in lexicon and normalisation: {phi:.2f}")
print("This means there is a moderate relationship between them.")
print(f"Most common un-normalised raw alphanumeric tokens in lexicon: {unchanged_lexical.most_common(20)}")
print("Not super helpful except note domain specific 'rt' left alone.")
print(f"Most common normalised raw alphanumeric tokens in lexicon: {changed_lexical.most_common(20)}")
print("Annotation guidelines for the n-word! May want to remove single letter words from lexicon (apart from v. common ones), and filter out double letter acronyms with an internet acronyms list. Note wanna normalised in 2021, inconsistent rt normalisation.")
print("This is important as lookup is used to determine if generated candidate is valid/used as feature for candidate selection.")
print(f"Most common un-normalised raw alphanumeric tokens not in lexicon: {unchanged_non_lexical.most_common(20)}")
print("Pretty much all interjections and some common names (one direction - time specific). May want to expand lexicon in these areas - have to generate as candidates to get correct!. Hard code what to leave alone?")
print(f"Most common normalised raw alphanumeric tokens not in lexicon: {changed_non_lexical.most_common(20)}")
print("A lot of missing apostrophes - think about for candidate generation.")
print("A good lexicon (high correlation between in lexicon and normalised) will be good for checking validity of generated candidates - not suggesting candidates that wouldn't be considered normalised, not rejecting ones that would be (obviously on individual word basis). Size offers tradeoff between former and latter.")
print("WANT: to either expand lexicon or hard code to reduce un-normalised non-lexical, so that non-lexical->normalised. to reduce lexicon to reduce normalised lexical, so that lexical->non-normalised. Of course there will always be OOV words not requiring normalisation, e.g. unknown/novel named entities, and IV words requiring normalisation, e.g. where misspelling of other words etc.")
print(f"Most common raw tokens not in lexicon: {raw_non_lexical.most_common(10)}")
# print(f"Most common norm tokens not in lexicon: {norm_non_lexical.most_common(10)}")

SCOWL AMERICAN 70
Percent raw tokens not in lexicon: 17.72
Percent normed tokens not in lexicon: 13.04
Phi coefficient between in lexicon and normalisation: 0.31
This means there is a moderate relationship between them.
Most common un-normalised raw alphanumeric tokens in lexicon: [('rt', 921), ('i', 648), ('the', 631), ('to', 534), ('a', 479), ('and', 411), ('you', 340), ('in', 326), ('for', 320), ('is', 318), ('me', 281), ('my', 278), ('on', 271), ('of', 249), ('it', 203), ('with', 186), ('that', 185), ('this', 180), ('so', 180), ('be', 159)]
Not super helpful except note domain specific 'rt' left alone.
Most common normalised raw alphanumeric tokens in lexicon: [('u', 333), ('nigga', 57), ('niggas', 52), ('n', 49), ('ur', 46), ('gonna', 29), ('rt', 29), ('r', 24), ('d', 22), ('bout', 21), ('yo', 19), ('b', 18), ('wit', 17), ('tho', 17), ('cause', 16), ('dat', 16), ('bc', 16), ('ya', 16), ('da', 14), ('wanna', 12)]
Annotation guidelines for the n-word! May want to remove single lette

In [50]:
# def if_normed(raw_list, norm_list):
#     resp = []
#     for raw_tweet, norm_tweet in zip(raw_list, norm_list):
#         resp_tweet = []
#         for raw_tok, norm_tok in zip(raw_tweet, norm_tweet):
#             if raw_tok == norm_tok:
#                 resp_tweet.append((raw_tok, ""))
#             else:
#                 resp_tweet.append((raw_tok, norm_tok))
#         resp.append(resp_tweet)
#     return resp
#
# full_if_normed = if_normed(full_raw, full_norm)

In [51]:
def norm_condition(raw_list, norm_list, condition, pair=False, norm=False):
    """
    Return counter for each quadrant of normalised and condition.
    pair option returns the normalisation pair if True
    norm option evaluates the condition on the normed rather than the raw token if True
    """
    p_normed = Counter()
    p_unnormed = Counter()
    n_normed = Counter()
    n_unnormed = Counter()
    for tweet_raw, tweet_norm in zip(raw_list, norm_list):
        for raw, normed in zip(tweet_raw, tweet_norm):
            tok = normed if norm else raw
            to_update = (tok, normed) if pair else tok
            if raw != normed and condition(tok):
                p_normed.update([to_update])
            elif raw == normed and condition(tok):
                p_unnormed.update([to_update])
            elif raw != normed and not condition(tok):
                n_normed.update([to_update])
            else:
                n_unnormed.update([to_update])
    return p_unnormed, p_normed, n_unnormed, n_normed

def correlation_with_norm(p_unnormed, p_normed, n_unnormed, n_normed):
    """
    Calculate phi correlation coefficient between normalisation and a condition (two binary variables)
    """
    a = sum(p_unnormed.values())
    b = sum(p_normed.values())
    c = sum(n_unnormed.values())
    d = sum(n_normed.values())
    phi = (a * d - b * c) / math.sqrt((a+b)*(b+d)*(a+c)*(c+d))
    return phi

# a, b, c, d = norm_condition([[tok for tok in tweet if tok[0].isalnum()] for tweet in full_if_normed], lambda x: x in words)
# correlation_with_norm(a, b, c, d)

In [54]:
_, b, _, _= norm_condition(full_raw, full_norm, lambda x: not x.isalnum(), True, False)
print(f"Only {sum(b.values())} non-alphanumeric raw tokens are normalised: {b.most_common()}. These all contain apostrophes, which are included as not invalidating tokens for normalisation. However, could've and should've should not have been normalised under 2015 annotation rule 9. This rule also makes sense as expanding could change the formality of the text. ca'nt has the apostrophe in the wrong place. Perhaps the contraction module could handle moving apostrophes as well as inserting to get a IV word. Modules should only be able to add candidates, not remove them. Unclear what is happening in good'o but perhaps could hypothesise drop/split before/after the apostrophe? The alphanumeric filter conveniently also filters out domain specific entities (hashtags and mentions).")

Only 4 non-alphanumeric raw tokens are normalised: [(("good'o", 'good'), 1), (("could've", 'could have'), 1), (("should've", 'should have'), 1), (("ca'nt", "can't"), 1)]. These all contain apostrophes, which are included as not invalidating tokens for normalisation. However, could've and should've should not have been normalised under 2015 annotation rule 9. This rule also makes sense as expanding could change the formality of the text. ca'nt has the apostrophe in the wrong place. Perhaps the contraction module could handle moving apostrophes as well as inserting to get a IV word. Modules should only be able to add candidates, not remove them. Unclear what is happening in good'o but perhaps could hypothesise drop/split before/after the apostrophe? The alphanumeric filter conveniently also filters out domain specific entities (hashtags and mentions).


In [55]:
a, b, _, _ = norm_condition(full_raw, full_norm, lambda x: any(char.isspace() for char in x), True)
print(f"There are {sum((a+b).values())} raw tokens containing whitespace.")

There are 0 raw tokens containing whitespace.


In [82]:
a, b, _, _ = norm_condition(full_raw, full_norm, lambda x: x == "rt", True)
print(f"There are {sum((a+b).values())} rt tokens total, with {sum(a.values())} left alone. rt at the start of a tweet is a domain specific entity and thus should be disregarded as per rule 3. In the middle of a tweet, this is more debatable. There are {sum(1 if tweet[0] == 'rt' else 0 for tweet in full_raw)} rt tokens at the start of tweets, which are all left alone. Middle rts (85) are inconsistently normalised (27 normalised), but if they are normalised, this is to retweet. From observation looks like domain-specific rts are always followed by a @mention. 24/27 normalised middle rts are not followed by @, 42/58 left alone middle rts are. There are 6 rts at the end of tweets, and they are inconsistently normalised. So perhaps rule could be: if at start of tweet or followed by @mention, ignore. Otherwise normalise to retweet.")

There are 950 rt tokens total, with 921 left alone. rt at the start of a tweet is a domain specific entity and thus should be disregarded as per rule 3. In the middle of a tweet, this is more debatable. There are 860 rt tokens at the start of tweets, which are all left alone. Middle rts (85) are inconsistently normalised (27 normalised), but if they are normalised, this is to retweet. From observation looks like domain-specific rts are always followed by a @mention. 24/27 normalised middle rts are not followed by @, 42/58 left alone middle rts are. There are 6 rts at the end of tweets, and they are inconsistently normalised. So perhaps rule could be: if at start of tweet or followed by @mention, ignore. Otherwise normalise to retweet.


In [83]:
# count = 0
# for tweet_raw, tweet_norm in zip(full_raw, full_norm):
#     for i, tok in enumerate(zip(tweet_raw, tweet_norm)):
#         tok_raw, tok_norm = tok
#         # if tok_raw == 'rt' and 0 < i < len(tweet_raw)-1 and tok_raw == tok_norm:
#         if tok_raw == 'rt' and i == len(tweet_raw)-1:
#             # print(tweet_raw[i+1])
#             print(tok_norm)
#             count += 1
# print(count)

In [84]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [85]:
spacy_full_raw = []
for tweet in full_raw:
    spacy_full_raw.append(nlp(" ".join(tweet)))

In [86]:
# for tweet in full_if_normed:
#     for tok, norm in tweet:
#         dok = nlp(tok)[0]
#         type = dok.pos_
#         if type == "PROPN":
#             print(tok, norm)

a, b, c, d = norm_condition(full_raw, full_norm, lambda x: nlp(x)[0].pos_ == "PROPN", True, False)

In [93]:
# print(f"{sum(b.values())}/{sum(d.values())+sum(b.values())}")
print(f"Examples of unnormalised words tagged as PROPN not in lexicon: {[k[0] for k in a.keys() if k[0] not in words and k[0].isalnum()][:10]}")
print(f"Correlation of PROPN and normalisation: {correlation_with_norm(a, b, c, d):.2f}. Very low - probably due to bad POS performance thanks to lower casing and other irregularities of the text - sort of a chicken and egg issue with downstream processing. A lot these could probably be found by expanding the lexicon instead - looking at POS tagging may be a red herring")

Examples of unnormalised words tagged as PROPN not in lexicon: ['ozil', 'bie', 'haha', 'avedon', 'pshh', 'hahaha', 'bria', 'exo', 'chanyeol', 'sehun']
Correlation of PROPN and normalisation -0.09. Very low - probably due to bad POS performance thanks to lower casing and other irregularities of the text - sort of a chicken and egg issue with downstream processing. A lot these could probably be found by expanding the lexicon instead - looking at POS tagging may be a red herring


In [94]:
a, b, c, d = norm_condition(full_raw, full_norm, lambda x: nlp(x)[0].ent_type_ in ['LOC', 'PERSON'], True, False)

In [126]:
print(f"Examples of unnormalised words with these entity types not in lexicon: {[k[0] for k in a.keys() if k[0] not in words and k[0].isalnum()][:10]}")
print(f"Correlation of normalisation and these entity types: {correlation_with_norm(a, b, c, d):.2f}. Zero correlation, probably due to the same reasons as above.")

Examples of unnormalised words with these entity types not in lexicon: ['grecia', 'directionas', 'homie', 'jimin', 'simbrinz', 'lantak', 'kakak', 'dubb', 'hala', 'wala']
Correlation of normalisation and these entity types: -0.00. Zero correlation, probably due to the same reasons as above.


In [194]:
def alphanum(raw_list, norm_list):
    raw_resp, norm_resp = [], []
    for raw_tweet, norm_tweet in zip(raw_list, norm_list):
        raw_resp_tweet, norm_resp_tweet = [], []
        for raw_tok, norm_tok in zip(raw_tweet, norm_tweet):
            if raw_tok.isalnum() and raw_tok != 'rt':
                raw_resp_tweet.append(raw_tok)
                norm_resp_tweet.append(norm_tok)
        raw_resp.append(raw_resp_tweet)
        norm_resp.append(norm_resp_tweet)
    return raw_resp, norm_resp

def lexicon_investigate(raw_list, norm_list, lex):
    """
    Gives statistics for a given lexicon
    """
    a, b, c, d = norm_condition(*alphanum(raw_list, norm_list), lambda x: x in lex, pair=True)
    print(f"Correlation of lexicon with normalisation: {correlation_with_norm(a, b, c, d):.2f}")
    print(f"Most common un-normalised raw alphanumeric tokens in lexicon: {a.most_common(20)}")
    print(f"Most common normalised raw alphanumeric tokens in lexicon: {b.most_common(20)}")
    print(f"Most common un-normalised raw alphanumeric tokens not in lexicon: {c.most_common(20)}")
    print(f"Most common normalised raw alphanumeric tokens not in lexicon: {d.most_common(20)}")

In [195]:
lexicon_investigate(full_raw, full_norm, words)

Correlation of lexicon with normalisation: 0.31
Most common un-normalised raw alphanumeric tokens in lexicon: [(('i', 'i'), 648), (('the', 'the'), 631), (('to', 'to'), 534), (('a', 'a'), 479), (('and', 'and'), 411), (('you', 'you'), 340), (('in', 'in'), 326), (('for', 'for'), 320), (('is', 'is'), 318), (('me', 'me'), 281), (('my', 'my'), 278), (('on', 'on'), 271), (('of', 'of'), 249), (('it', 'it'), 203), (('with', 'with'), 186), (('that', 'that'), 185), (('this', 'this'), 180), (('so', 'so'), 180), (('be', 'be'), 159), (('like', 'like'), 156)]
Most common normalised raw alphanumeric tokens in lexicon: [(('u', 'you'), 328), (('nigga', 'nigger'), 57), (('niggas', 'niggers'), 52), (('n', 'and'), 47), (('ur', 'your'), 33), (('gonna', 'going to'), 29), (('r', 'are'), 22), (('d', 'the'), 22), (('bout', 'about'), 21), (('b', 'be'), 17), (('wit', 'with'), 17), (('tho', 'though'), 17), (('cause', 'because'), 16), (('dat', 'that'), 16), (('bc', 'because'), 16), (('ya', 'you'), 15), (('da', 'the

In [215]:
lexicon_file = open(os.path.join(DATA_PATH, "interim/american-70.txt"))
lexicon = set()
for line in lexicon_file:
    word = line.strip().lower()
    if len(word) > 1 or word in ['a', 'i']:
        lexicon.add(word)
lexicon_investigate(full_raw, full_norm, lexicon)
print("STRONG CORRELATION!")
print("So filter lexicon by removing all single letter words but a and i")
print("Filter tokens by ignoring some rts, alphanumeric/apostrophe only")

Correlation of lexicon with normalisation: 0.40
Most common un-normalised raw alphanumeric tokens in lexicon: [(('i', 'i'), 648), (('the', 'the'), 631), (('to', 'to'), 534), (('a', 'a'), 479), (('and', 'and'), 411), (('you', 'you'), 340), (('in', 'in'), 326), (('for', 'for'), 320), (('is', 'is'), 318), (('me', 'me'), 281), (('my', 'my'), 278), (('on', 'on'), 271), (('of', 'of'), 249), (('it', 'it'), 203), (('with', 'with'), 186), (('that', 'that'), 185), (('this', 'this'), 180), (('so', 'so'), 180), (('be', 'be'), 159), (('like', 'like'), 156)]
Most common normalised raw alphanumeric tokens in lexicon: [(('nigga', 'nigger'), 57), (('niggas', 'niggers'), 52), (('ur', 'your'), 33), (('gonna', 'going to'), 29), (('bout', 'about'), 21), (('wit', 'with'), 17), (('tho', 'though'), 17), (('cause', 'because'), 16), (('dat', 'that'), 16), (('bc', 'because'), 16), (('ya', 'you'), 15), (('da', 'the'), 13), (('wanna', 'want to'), 12), (('cant', "can't"), 12), (('congrats', 'congratulations'), 12),