In [1]:
import requests, json
import string
import re
from itertools import chain

import numpy as np
import pandas as pd

from bs4 import BeautifulSoup

from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
from ekphrasis.dicts.noslang.slangdict import slangdict

from nltk.corpus import words
from nltk.corpus import stopwords
import spacy

# Load the small English spaCy pipeline once; get_unk_words and preprocess_kw reuse it.
nlp = spacy.load("en_core_web_sm")
# NLTK word list, used by get_unk_words as the reference vocabulary.
en_words = words.words()
# NOTE(review): no language is passed here - presumably this returns the stopwords
# of every language NLTK ships; confirm whether stopwords.words('english') was intended.
st_words = stopwords.words()

In [2]:
# Base data directory (relative to the notebook); the final cell writes the
# processed frames under data_path + 'progress/'.
data_path = '../files/'

In [3]:
# Load the train/test splits from the configured data directory instead of
# repeating the literal path in every call.
df = pd.read_csv(data_path + 'train.csv')
test_df = pd.read_csv(data_path + 'test.csv')

# Labelled dump of the same tweets: its row position is used as the tweet `id`,
# which lets the test split be joined to a `target` column.
df_leak = pd.read_csv(
    data_path + 'socialmedia-disaster-tweets-DFE.csv', encoding='ISO-8859-1'
)
df_leak['target'] = (df_leak['choose_one'] == 'Relevant').astype(np.int8)
# Keep the index at its native integer width: narrowing it to int16 would wrap
# (or raise, depending on the pandas version) as soon as the file has more than
# 32767 rows, silently breaking the join below.
df_leak['id'] = df_leak.index
df_leak = df_leak[['target', 'id']]
# Left join so every test row is kept even if it has no label in the dump.
test_df = test_df.merge(df_leak, on=['id'], how='left')

print(df.shape, test_df.shape)

(7613, 5) (3263, 5)


In [5]:
# Show the first training row to check the column layout.
df.head(1)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1


In [6]:
def get_unk_words(txts):
    """Report and return the lemmas found in `txts` that are not in the NLTK word list.

    Parameters
    ----------
    txts : iterable of str
        Texts to analyse (e.g. a pandas Series of tweets).

    Returns
    -------
    set of str
        Lemmas produced by the spaCy pipeline that do not appear in the
        lower-cased NLTK `en_words` vocabulary.

    Also prints the vocabulary size, the number of unknown lemmas and their share.
    """
    # Stream the texts through the pipeline one by one: joining everything into
    # a single string can exceed spaCy's maximum document length on large corpora.
    tokens = {tok.lemma_ for doc in nlp.pipe(txts) for tok in doc}

    word_dict = {word.lower() for word in en_words}
    not_word_tokens = tokens - word_dict
    print('Vocabs size', len(tokens))
    print('Unknown vocabs size', len(not_word_tokens))
    # Guard against empty input, which previously raised ZeroDivisionError.
    unknown_share = 100 * len(not_word_tokens) / len(tokens) if tokens else 0.0
    print(f'Tweets contain unknown words: {unknown_share:.2f}%')
    return not_word_tokens

In [7]:
# Preview the first ten (slang, meaning) entries of the slang dictionary.
pairs = list(slangdict.items())
for pair in pairs[:10]:
    print(pair)

('*4u', 'Kiss for you')
('*67', 'unknown')
('*eg*', 'evil grin')
('07734', 'hello')
('0day', 'software illegally obtained before it was released')
('0noe', 'Oh No')
('0vr', 'over')
('10q', 'thank you')
('10tacle', 'tentacle')
('10x', 'thanks')


In [8]:
# Build a lower-cased copy of the slang dictionary with cleaned-up meanings.
uncased_slang_dict = {}
for key, value in slangdict.items():
    # Drop the "it means" lead-in. The word boundaries keep the pattern from
    # eating the tail of a longer word (e.g. "summit means").
    value = re.sub(r'\bit means\b', '', value)
    # Restore censored words: three stars first, then two, so the longer mask
    # is not partially consumed by the shorter one.
    value = re.sub(r'\*\*\*', 'uck', value)
    value = re.sub(r'\*\*', 'it', value)
    # Anchored for the same reason as above: once "**" has become "it", an
    # unanchored "it refers to" would also match inside "...it refers to".
    value = re.sub(r'\bit refers to\b', '', value)
    # Collapse the whitespace left behind by the removed phrases.
    uncased_slang_dict[key.lower()] = ' '.join(value.lower().split())
print('Slang words number:', len(uncased_slang_dict))

Slang words number: 5429


In [10]:
## Source: https://www.kaggle.com/chardo/top-5-winning-automl-submission

# Ordered (regex, replacement) pairs for expanding English contractions; they are
# meant to be applied one after another with re.sub(pattern, replacement, text).
# The irregular forms come first and are matched case-insensitively, otherwise
# "Can't" / "Won't" / "Ain't" fall through to the generic n't rule and turn into
# "Ca not" / "Wo not" / "Ai not".
# All replacements that use a group reference are raw strings: '\g' in a normal
# string literal is an invalid escape sequence.
replacement_patterns = [
    (r"(?i)\bwon't", 'will not'),
    (r"(?i)\bcan't", 'cannot'),
    (r"i'm", 'i am'),
    (r"I'm", 'I am'),
    (r"(?i)\bain't", 'is not'),
    (r"(\w+)'ll", r'\g<1> will'),
    (r"(\w+)n't", r'\g<1> not'),
    (r"(\w+)'ve", r'\g<1> have'),
    (r"(\w+)'s", r'\g<1> is'),
    (r"(\w+)'re", r'\g<1> are'),
    (r"(\w+)'d", r'\g<1> would'),
]

In [11]:
# Hand-written abbreviation -> expansion table for use with replace_slang.
# Keys must be lower-case: replace_slang looks tokens up via tok.lower(), so a
# key containing upper-case letters can never match.
abbr = {
    "$" : " dollar ",
    "€" : " euro ",
    "4ao" : "for adults only",
    "a.m" : "before midday",
    "a3" : "anytime anywhere anyplace",
    "aamof" : "as a matter of fact",
    "acct" : "account",
    "adih" : "another day in hell",
    "afaic" : "as far as i am concerned",
    "afaict" : "as far as i can tell",
    "afaik" : "as far as i know",
    "afair" : "as far as i remember",
    "afk" : "away from keyboard",
    "app" : "application",
    "approx" : "approximately",
    "apps" : "applications",
    "asap" : "as soon as possible",
    "asl" : "age, sex, location",
    "atk" : "at the keyboard",
    "ave." : "avenue",
    "aymm" : "are you my mother",
    "ayor" : "at your own risk",
    "b&b" : "bed and breakfast",
    "b+b" : "bed and breakfast",
    "b.c" : "before christ",
    "b2b" : "business to business",
    "b2c" : "business to customer",
    "b4" : "before",
    "b4n" : "bye for now",
    "b@u" : "back at you",
    "bae" : "before anyone else",
    "bak" : "back at keyboard",
    "bbbg" : "bye bye be good",
    "bbc" : "british broadcasting corporation",
    "bbias" : "be back in a second",
    "bbl" : "be back later",
    "bbs" : "be back soon",
    "be4" : "before",
    "bfn" : "bye for now",
    "blvd" : "boulevard",
    "bout" : "about",
    "brb" : "be right back",
    "bros" : "brothers",
    "brt" : "be right there",
    "bsaaw" : "big smile and a wink",
    "btw" : "by the way",
    "bwl" : "bursting with laughter",
    "c/o" : "care of",
    "cet" : "central european time",
    "cf" : "compare",
    "cia" : "central intelligence agency",
    "csl" : "can not stop laughing",
    "cu" : "see you",
    "cul8r" : "see you later",
    "cv" : "curriculum vitae",
    "cwot" : "complete waste of time",
    "cya" : "see you",
    "cyt" : "see you tomorrow",
    "dae" : "does anyone else",
    "dbmib" : "do not bother me i am busy",
    "diy" : "do it yourself",
    "dm" : "direct message",
    "dwh" : "during work hours",
    "e123" : "easy as one two three",
    "eet" : "eastern european time",
    "eg" : "example",
    "embm" : "early morning business meeting",
    "encl" : "enclosed",
    "encl." : "enclosed",
    "etc" : "and so on",
    "faq" : "frequently asked questions",
    "fawc" : "for anyone who cares",
    "fb" : "facebook",
    "fc" : "fingers crossed",
    "fig" : "figure",
    "fimh" : "forever in my heart",
    "ft." : "feet",
    "ft" : "featuring",
    "ftl" : "for the loss",
    "ftw" : "for the win",
    "fwiw" : "for what it is worth",
    "fyi" : "for your information",
    "g9" : "genius",
    "gahoy" : "get a hold of yourself",
    "gal" : "get a life",
    "gcse" : "general certificate of secondary education",
    "gfn" : "gone for now",
    "gg" : "good game",
    "gl" : "good luck",
    "glhf" : "good luck have fun",
    "gmt" : "greenwich mean time",
    "gmta" : "great minds think alike",
    "gn" : "good night",
    "g.o.a.t" : "greatest of all time",
    "goat" : "greatest of all time",
    "goi" : "get over it",
    "gps" : "global positioning system",
    "gr8" : "great",
    "gratz" : "congratulations",
    "gyal" : "girl",
    "h&c" : "hot and cold",
    "hp" : "horsepower",
    "hr" : "hour",
    "hrh" : "his royal highness",
    "ht" : "height",
    "ibrb" : "i will be right back",
    "ic" : "i see",
    "icq" : "i seek you",
    "icymi" : "in case you missed it",
    "idc" : "i do not care",
    "idgadf" : "i do not give a damn fuck",
    "idgaf" : "i do not give a fuck",
    "idk" : "i do not know",
    "ie" : "that is",
    "i.e" : "that is",
    "ifyp" : "i feel your pain",
    "ig" : "instagram",  # was "IG", which a lower-cased lookup can never hit
    "iirc" : "if i remember correctly",
    "ilu" : "i love you",
    "ily" : "i love you",
    "imho" : "in my humble opinion",
    "imo" : "in my opinion",
    "imu" : "i miss you",
    "iow" : "in other words",
    "irl" : "in real life",
    "j4f" : "just for fun",
    "jic" : "just in case",
    "jk" : "just kidding",
    "jsyk" : "just so you know",
    "l8r" : "later",
    "lb" : "pound",
    "lbs" : "pounds",
    "ldr" : "long distance relationship",
    "lmao" : "laugh my ass off",
    "lmfao" : "laugh my fucking ass off",
    "lol" : "laughing out loud",
    "ltd" : "limited",
    "ltns" : "long time no see",
    "m8" : "mate",
    "mf" : "motherfucker",
    "mfs" : "motherfuckers",
    "mfw" : "my face when",
    "mofo" : "motherfucker",
    "mph" : "miles per hour",
    "mr" : "mister",
    "mrw" : "my reaction when",
    "ms" : "miss",
    "mte" : "my thoughts exactly",
    "nagi" : "not a good idea",
    "nbc" : "national broadcasting company",
    "nbd" : "not big deal",
    "nfs" : "not for sale",
    "ngl" : "not going to lie",
    "nhs" : "national health service",
    "nrn" : "no reply necessary",
    "nsfl" : "not safe for life",
    "nsfw" : "not safe for work",
    "nth" : "nice to have",
    "nvr" : "never",
    "nyc" : "new york city",
    "oc" : "original content",
    "og" : "original",
    "ohp" : "overhead projector",
    "oic" : "oh i see",
    "omdb" : "over my dead body",
    "omg" : "oh my god",
    "omw" : "on my way",
    "p.a" : "per annum",
    "p.m" : "after midday",
    "pm" : "prime minister",
    "poc" : "people of color",
    "pov" : "point of view",
    "pp" : "pages",
    "ppl" : "people",
    "prw" : "parents are watching",
    "ps" : "postscript",
    "pt" : "point",
    "ptb" : "please text back",
    "pto" : "please turn over",
    "qpsa" : "what happens", #"que pasa",
    "ratchet" : "rude",
    "rbtl" : "read between the lines",
    "rlrt" : "real life retweet",
    "rofl" : "rolling on the floor laughing",
    "roflol" : "rolling on the floor laughing out loud",
    "rotflmao" : "rolling on the floor laughing my ass off",
    "rt" : "retweet",
    "ruok" : "are you ok",
    "sfw" : "safe for work",
    "sk8" : "skate",
    "smh" : "shake my head",
    "sq" : "square",
    "srsly" : "seriously",
    "ssdd" : "same stuff different day",
    "tbh" : "to be honest",
    "tbs" : "tablespoonful",
    "tbsp" : "tablespoonful",
    "tfw" : "that feeling when",
    "thks" : "thank you",
    "tho" : "though",
    "thx" : "thank you",
    "tia" : "thanks in advance",
    "til" : "today i learned",
    "tl;dr" : "too long i did not read",
    "tldr" : "too long i did not read",
    "tmb" : "tweet me back",
    "tntl" : "trying not to laugh",
    "ttyl" : "talk to you later",
    "u" : "you",
    "u2" : "you too",
    "u4e" : "yours for ever",
    "utc" : "coordinated universal time",
    "w/" : "with",
    "w/o" : "without",
    "w8" : "wait",
    "wassup" : "what is up",
    "wb" : "welcome back",
    "wtf" : "what the fuck",
    "wtg" : "way to go",
    "wtpa" : "where the party at",
    "wuf" : "where are you from",
    "wuzup" : "what is up",
    "wywh" : "wish you were here",
    "yd" : "yard",
    "ygtr" : "you got that right",
    "ynk" : "you never know",
    "zzz" : "sleeping bored and tired",
    "yr": "year",
    "u.s":"usa",
}
def replace_slang(txt, slang):
    """Expand whitespace-separated tokens of `txt` that appear in `slang`.

    Tokens are looked up by their lower-cased form; tokens without an entry are
    kept unchanged. The result is re-joined with single spaces and stripped of
    leading/trailing whitespace.
    """
    tokens = re.sub(r'\s+', ' ', txt).split()
    expanded = [slang.get(tok.lower(), tok) for tok in tokens]
    return ' '.join(expanded).strip()

# Sanity check of replace_slang against the hand-written abbreviation table:
# print the sentence before and after expansion.
sent = 'I want to go aamof home'
print(sent, replace_slang(sent, abbr), sep='\n')

I want to go aamof home
I want to go as a matter of fact home


Define text preprocessor
extract emojis
replace numbers/date/money
extract hashtags
correct elongated/repeated characters

In [12]:
# Terms that the processor normalizes.
_normalized_terms = [
    'rest_emoticons', 'rtl_face', 'cashtag', 'url',
    'email', 'percent', 'money', 'phone', 'user',
    'time', 'date', 'number', 'eastern_emoticons',
]
# Terms that the processor annotates.
_annotated_terms = {"elongated", "repeated"}

text_processor = TextPreProcessor(
    normalize=_normalized_terms,
    annotate=_annotated_terms,
    # fix HTML tokens
    fix_html=True,
    segmenter="twitter",
    corrector="twitter",
    # perform word segmentation on hashtags
    unpack_hashtags=True,
    # unpack contractions (can't -> can not)
    unpack_contractions=True,
    # no spell correction for elongated words
    spell_correct_elong=False,
    # keep the original casing while tokenizing
    tokenizer=SocialTokenizer(lowercase=False).tokenize,
    dicts=[emoticons],
)

  self.tok = re.compile(r"({})".format("|".join(pipeline)))


Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...


  regexes = {k.lower(): re.compile(self.expressions[k]) for k, v in


In [13]:
def preprocess(txt):
    """Normalize one tweet into a lower-cased, punctuation-free string.

    Steps, in order: ASCII-fold the text, expand slang, expand contractions,
    run the ekphrasis `text_processor`, drop its <elongated>/<repeated> tags,
    expand slang again, replace punctuation with spaces, lower-case, collapse
    consecutive duplicated tokens and squeeze whitespace.

    Parameters
    ----------
    txt : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text.
    """
    # Map typographic apostrophes to ASCII first: otherwise the ASCII filter
    # below deletes them and contractions such as "can’t" collapse into "cant"
    # before the contraction patterns can expand them.
    res = txt.replace('\u2019', "'").replace('\u2018', "'")
    # remove the remaining non-ascii characters
    res = res.encode('ascii', 'ignore').decode()
    # replace every whitespace-separated token that has a slang dictionary entry
    res = replace_slang(res, uncased_slang_dict)
    # expand contractions, e.g. I'll --> I will
    for patt, rep in replacement_patterns:
        res = re.sub(patt, rep, res)
    # normalize/segment the text with the ekphrasis processor (hashtags, urls, ...)
    res = ' '.join(text_processor.pre_process_doc(res)).strip()
    # drop the annotation tags the processor adds
    for patt in [r"<elongated>", r"<repeated>"]:
        res = re.sub(patt, '', res)

    # another try to replace the slang after segmentation
    res = replace_slang(res, uncased_slang_dict)

    # replace punctuation with spaces
    res = re.sub(r'[%s]' % re.escape(string.punctuation), r' ', res)
    # lower case
    res = res.lower()
    # remove consecutive duplicated tokens
    res = re.sub(r'\b(\w+)(?:\W+\1\b)+', r'\1', res)
    # remove extra spaces
    res = re.sub(r'\s+', ' ', res)
    return res.strip()

In [14]:
# Add the cleaned text as a `ctext` column on both splits.
for frame in (df, test_df):
    frame['ctext'] = frame['text'].apply(preprocess)

In [15]:
# Print the first five test tweets next to their cleaned version,
# separated by a blank line.
sample = test_df[['text', 'ctext']].values[:5]
for txt, ctxt in sample:
    print(txt, ctxt, '', sep='\n')

Just happened a terrible car crash
just happened a terrible car crash

Heard about #earthquake is different cities, stay safe everyone.
heard about earthquake is different cities stay safe everyone

there is a forest fire at spot pond, geese are fleeing across the street, I cannot save them all
there is a forest fire at spot pond geese are fleeing across the street i cannot save them all

Apocalypse lighting. #Spokane #wildfires
apocalypse lighting spokane wildfires

Typhoon Soudelor kills 28 in China and Taiwan
typhoon soudelor kills number in china and taiwan



In [16]:
# Fill only the text metadata columns. A blanket fillna('unk') would also write
# the string into numeric columns (e.g. a `target` left empty by the left merge),
# silently turning them into mixed object columns.
_missing_text_fill = {'keyword': 'unk', 'location': 'unk'}
df = df.fillna(_missing_text_fill)
test_df = test_df.fillna(_missing_text_fill)

In [17]:
def _most_frequent(values):
    """Return the first entry of `values.value_counts()`, i.e. the most frequent value."""
    return values.value_counts().index[0]


# Collapse tweets that share the same cleaned text into one row: keep the first
# id and the most frequent location / keyword / target / raw text of the group.
df = (
    df.groupby(by=['ctext'])
    .agg({
        'id': 'first',
        'location': _most_frequent,
        'keyword': _most_frequent,
        'target': _most_frequent,
        'text': _most_frequent,
    })
    .reset_index()
)

In [18]:
# Shape of the training frame after grouping by cleaned text.
print(df.shape)

(6861, 6)


In [19]:
# Inspect the first two grouped rows.
df.head(2)

Unnamed: 0,ctext,id,location,keyword,target,text
0,0 npzp mhtw4fnet officials alabama home quaran...,7880,unk,quarantined,1,0nPzp mhtw4fnet\n\nOfficials: Alabama Home Qua...
1,1 0 news water main break disrupts trolley ser...,8648,San Diego,sinkhole,1,10News ? Water main break disrupts trolley ser...


In [20]:
def preprocess_kw(txt):
    """Clean a keyword: lower-case it, keep letters only and lemmatize each token.

    Every character that is not a letter (digits, '%', punctuation) is replaced
    by a space, whitespace is squeezed, and the spaCy lemmas of the remaining
    tokens are joined with single spaces.
    """
    letters_only = re.sub(r'[^a-zA-Z]', r' ', txt.lower())
    letters_only = re.sub(r'\s+', r' ', letters_only)
    lemmas = (token.lemma_ for token in nlp(letters_only))
    return re.sub(r'\s+', r' ', ' '.join(lemmas)).strip()

In [28]:
# Demo: digits are dropped even inside tokens ("99in" -> "in").
preprocess_kw(" the rain 778 99in in spain")

'the rain in in spain'

In [23]:
# Lemmatize every distinct keyword once and reuse the result for all rows.
# The lookup is built from BOTH splits: built from the training frame alone,
# a keyword that only occurs in the test split raised a KeyError below.
keyword = {
    key: preprocess_kw(key).lower()
    for key in set(df['keyword']).union(test_df['keyword'])
}

df['ckeyword'] = df['keyword'].map(keyword)
test_df['ckeyword'] = test_df['keyword'].map(keyword)

In [24]:
# Compare raw and cleaned keywords for the rows whose keyword is not the 'unk' filler.
df[df['keyword']!='unk'][['keyword', 'ckeyword']]

Unnamed: 0,keyword,ckeyword
0,quarantined,quarantine
1,sinkhole,sinkhole
2,outbreak,outbreak
3,suicide%20bomb,suicide bomb
4,suicide%20bomb,suicide bomb
...,...,...
6856,war%20zone,war zone
6857,electrocuted,electrocute
6858,flattened,flatten
6859,flattened,flatten


In [54]:
# Tweet ids whose label is manually overridden to 0.
# NOTE(review): the grouping step keeps only the first id of each duplicate
# group, so ids that were merged away would silently not match here - confirm.
ids = [
    328, 443, 513, 2619, 3640, 3900, 4342, 5781,
    6552, 6554, 6570, 6701, 6702, 6729, 6861, 7226,
]

relabel_mask = df['id'].isin(ids)
df.loc[relabel_mask, 'target'] = 0

In [57]:
# Persist both processed frames (without the index column) for later notebooks.
progress_outputs = [
    (df, data_path+'progress/df.csv'),
    (test_df, data_path+'progress/test_df.csv'),
]
for frame, csv_path in progress_outputs:
    frame.to_csv(csv_path, index=False)