In [2]:
import pandas as pd

import string
import re
from emot.emo_unicode import UNICODE_EMO, EMOTICONS # reference https://github.com/NeelShah18/emot

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

import requests
from bs4 import BeautifulSoup
from urlmarker import URL_REGEX # reference https://gist.github.com/gruber/8891611

import inflect

from spellchecker import SpellChecker

In [3]:
# Fetch the NLTK resources this notebook relies on: the stop-word corpus is read by
# removeStopWords() and WordNet is used by lemmatization().
# NOTE(review): 'punkt' is not referenced by any code visible in this notebook - confirm it is still needed.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /home/george/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/george/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/george/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [52]:
# Read the training set and key its rows by the 'id' column in a single expression.
train_df = pd.read_csv('../dataset/train.csv').set_index('id')
train_df

Unnamed: 0_level_0,keyword,location,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,,,Our Deeds are the Reason of this #earthquake M...,1
4,,,Forest fire near La Ronge Sask. Canada,1
5,,,All residents asked to 'shelter in place' are ...,1
6,,,"13,000 people receive #wildfires evacuation or...",1
7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...
10869,,,Two giant cranes holding a bridge collapse int...,1
10870,,,@aria_ahrary @TheTawniest The out of control w...,1
10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
10872,,,Police investigating after an e-bike collided ...,1


In [75]:
def _load_dictionary(path, separator):
    """
    Reads a 'key<separator>value' text file (one entry per line) into a dictionary.

    Parameters:
        path (string): path of the dictionary file.
        separator (string): string that separates the key from the value on each line.

    Returns:
        dictionary: maps every key to its value (line break removed).

    Raises:
        ValueError: if a non-blank line does not contain the separator.
    """
    mapping = {}
    with open(path) as f:
        for line in f:
            # A blank (e.g. trailing) line carries no entry; unpacking it would fail.
            if not line.strip():
                continue
            # Split on the first separator only, so the value may contain it as well.
            key, val = line.split(separator, 1)
            mapping[key] = val.replace('\n', '')
    return mapping


# abbreviation -> full text, tab separated
abbreviation_dict = _load_dictionary("../dictionaries/abbreviations.txt", '\t')

# contraction -> expanded form, colon separated
contraction_dict = _load_dictionary("../dictionaries/contractions.txt", ':')

# URL related functions

In [6]:
def removeURLs(tweet):
    """
    Substitutes every URL found in the tweet with the placeholder 'URL'.

    Parameters:
        tweet (string): tweet to be processed.

    Returns:
        string: given tweet with each URL replaced by the string 'URL'.
    """
    return re.sub(URL_REGEX, 'URL', tweet)

def listURLs(tweet):
    """
    Collects the URLs that appear in the given tweet.

    Parameters:
        tweet (string): tweet to be processed.

    Returns:
        list: every match of URL_REGEX in the tweet, in order of appearance.
    """
    url_pattern = re.compile(URL_REGEX)
    return url_pattern.findall(tweet)

def extractTextFromURLs(urls, timeout=10):
    """
    Returns text from the given list of URLs filtering out some HTML tags.

    Every URL is downloaded and parsed; each text node whose parent tag is not
    in the skip list is appended to the result, followed by a single space.
    A URL whose download fails is reported with print() and skipped.

    Parameters:
        urls (list): list of URLs to be processed.
        timeout (number): seconds to wait for each request before giving up,
            so that one unresponsive host cannot stall the whole run.

    Returns:
        string: text extracted from the given URLs ('' if nothing was extracted).
    """
    # Text nodes directly under these tags are not page content.
    undesired = ('[document]', 'noscript',
                 'header', 'html',
                 'meta', 'head',
                 'input', 'script',
                 'style', 'title')

    # Collect the pieces and join once instead of growing a string in the loop.
    pieces = []
    for url in urls:
        try:
            res = requests.get(url, timeout=timeout)
        except Exception as e:
            # Best effort: report the failure and move on to the next URL.
            print(e)
            continue

        soup = BeautifulSoup(res.content, 'html.parser')
        for t in soup.find_all(text=True):
            if t.parent.name not in undesired:
                pieces.append('{} '.format(t))

    return ''.join(pieces)

# Remove unwanted elements

In [89]:
def removeNonAscii(tweet):
    """
    Drops every character that is outside the ASCII range.

    Parameters:
        tweet (string): tweet to be processed.

    Returns:
        string: given tweet containing ASCII characters only.
    """
    # Code points 0..127 are exactly the characters the ASCII codec can encode.
    return ''.join(ch for ch in tweet if ord(ch) < 128)

def removeNonPrintable(tweet):
    """
    Keeps only the characters that belong to string.printable.

    Parameters:
        tweet (string): tweet to be processed.

    Returns:
        string: given tweet with non printable characters removed.
    """
    allowed = string.printable
    return ''.join(ch for ch in tweet if ch in allowed)

def removePunctuation(tweet):
    """
    Replaces every punctuation character (including #) with a single space.

    Parameters:
        tweet (string): tweet to be processed.

    Returns:
        string: given tweet with each punctuation character turned into a space.
    """
    # One table entry per punctuation character, all mapped to a blank.
    to_space = str.maketrans(dict.fromkeys(string.punctuation, ' '))
    return tweet.translate(to_space)

def removeNums(tweet):
    """
    Strips all digit characters out of the given string.

    Parameters:
        tweet (string): tweet to be processed.

    Returns:
        string: given tweet without any digit characters.
    """
    non_digits = filter(lambda ch: not ch.isdigit(), tweet)
    return ''.join(non_digits)

def removeUsernames(tweet):
    """
    Removes usernames from given tweet.

    A username is an '@' together with all non-whitespace characters that follow it.

    Parameters:
        tweet (string): tweet to be processed.

    Returns:
        string: given tweet with usernames removed.
    """
    # Raw string: in a normal string literal '\s' is an invalid escape sequence.
    return re.sub(r'@\S+', '', tweet)

def removeRepeatedChars(tweet):
    """
    Shrinks every run of the same consecutive character down to two occurrences.

    Parameters:
        tweet (string): tweet to be processed.

    Returns:
        string: given tweet where no character repeats more than twice in a row.
    """
    # A character followed by one or more copies of itself.
    repeated_run = re.compile(r'(.)\1+')
    return repeated_run.sub(r'\1\1', tweet)

# Format related functions

In [8]:
def toLowerCase(tweet):
    """
    Splits camelCase words with a space and lower-cases the whole tweet.

    Parameters:
        tweet (string): tweet to be processed.

    Returns:
        string: given tweet, camelCase separated, in lower case.
    """
    # A space goes in front of an upper-case letter that follows a lower-case one,
    # or that is followed by a lower-case one (unless it is the very first character).
    camel_boundary = r'((?<=[a-z])[A-Z]|(?<!\A)[A-Z](?=[a-z]))'
    return re.sub(camel_boundary, r' \1', tweet).lower()

# Meaning related functions

In [9]:
def replaceEmojis(tweet):
    """
    Replace emojis in the text with their corresponding meaning.

    The meaning comes from UNICODE_EMO; commas and colons are dropped from it and
    its words are joined with underscores.

    Parameters:
        tweet (string): tweet to be processed.

    Returns:
        string: given tweet with emojis replaced.
    """
    for emoji, description in UNICODE_EMO.items():
        cleaned = description.replace(",", "").replace(":", "")
        label = "_".join(cleaned.split())
        tweet = tweet.replace(emoji, label)
    return tweet

def replaceEmoticons(tweet):
    """
    Replace emoticons in the text with their corresponding meaning.

    Each key of EMOTICONS is used as a regular expression; its meaning, with commas
    dropped and words joined by underscores, is the replacement.

    Parameters:
        tweet (string): tweet to be processed.

    Returns:
        string: given tweet with emoticons replaced.
    """
    for pattern, meaning in EMOTICONS.items():
        label = "_".join(meaning.replace(",", "").split())
        tweet = re.sub('(' + pattern + ')', label, tweet)
    return tweet

def replaceNums(tweet):
    """
    Spells out every whitespace-delimited token that consists of digits only.

    Parameters:
        tweet (string): tweet to be processed.

    Returns:
        string: given tweet, tokens re-joined with single spaces, numbers in words.
    """
    engine = inflect.engine()
    spelled_out = [
        engine.number_to_words(token) if token.isdigit() else token
        for token in tweet.split()
    ]
    return ' '.join(spelled_out)

def correctSpelling(tweet_list, spell=None):
    """
    Corrects spelling in the given list of words.

    Parameters:
        tweet_list (list): list of string-words to be processed.
        spell (SpellChecker): optional, already configured spell checker to use.
            When omitted, one checker (with 'url' added to its dictionary) is
            created on first use and shared by all later calls.

    Returns:
        list: given tweet-list spelling-corrected.
    """
    if spell is None:
        # Building a SpellChecker is expensive, so it is done once and kept on the function.
        spell = getattr(correctSpelling, '_shared_checker', None)
        if spell is None:
            spell = SpellChecker()
            spell.word_frequency.load_words(['url']) # add url to the dictionary
            correctSpelling._shared_checker = spell

    # find those words that may be misspelled
    misspelled = spell.unknown(tweet_list)
    processed_tweet = []
    for word in tweet_list:
        if word in misspelled:
            # Replace the misspelled word with the most likely answer. pyspellchecker
            # documents that correction() may return None when it has no candidate;
            # in that case the word is kept so the result stays a list of strings.
            word = spell.correction(word) or word
        processed_tweet.append(word)
    return processed_tweet

def replaceAbbreviations(tweet_list, abbreviation_dict):
    """
    Expands every word that is a key of the abbreviation dictionary.

    Parameters:
        tweet_list (list): list of string-words to be processed.
        abbreviation_dict (dictionary): maps an abbreviation to its full text.

    Returns:
        list: given tweet-list with the abbreviations replaced.
    """
    expanded = []
    for token in tweet_list:
        if token not in abbreviation_dict:
            expanded.append(token)
            continue

        full_text = abbreviation_dict.get(token)
        parts = full_text.split()
        if len(parts) > 1:
            # a multi-word expansion contributes one list entry per word
            expanded.extend(parts)
        else:
            expanded.append(full_text)
    return expanded

def replaceContractions(tweet_list, contraction_dict):
    """
    Expands every word that is a key of the contraction dictionary.

    Parameters:
        tweet_list (list): list of string-words to be processed.
        contraction_dict (dictionary): maps a contraction to its expanded form.

    Returns:
        list: given tweet-list with the contractions replaced.
    """
    result = []
    for word in tweet_list:
        if word in contraction_dict:
            expansion = contraction_dict.get(word)
            words = expansion.split()
            # several words -> one entry each; otherwise the value is kept as stored
            replacement = words if len(words) > 1 else [expansion]
        else:
            replacement = [word]
        result.extend(replacement)
    return result

def removeStopWords(tweet_list, stop_words=None):
    """
    Removes stop-words from the given tweet.

    Parameters:
        tweet_list (list): list of string-words to be processed.
        stop_words (iterable): optional collection of words to drop; defaults to
            the NLTK English stop-word list.

    Returns:
        list: given tweet with stop-words removed.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # Evaluate the stop-word list once per call (not once per word) and use a
    # set so that every membership test is a hash lookup.
    stop_set = set(stop_words)
    return [word for word in tweet_list if word not in stop_set]

def stemming(tweet_list):
    """
    Stemming - reduces the word-forms by removing suffixes.

    Parameters:
        tweet_list (list): list of string-words to be processed.

    Returns:
        list: given tweet stemmed.
    """
    if not tweet_list:
        return []
    # One stemmer for the whole list instead of a new instance for every word.
    stemmer = PorterStemmer()
    return [stemmer.stem(word) for word in tweet_list]

def lemmatization(tweet_list):
    """
    Lemmatization - reduces the word-forms to linguistically valid lemmas.

    Parameters:
        tweet_list (list): list of string-words to be processed.

    Returns:
        list: given tweet lemmatized.
    """
    if not tweet_list:
        return []
    # One lemmatizer for the whole list instead of a new instance for every word.
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in tweet_list]

In [92]:
def preprocess_tweet(tweet, abbreviation_dict, contraction_dict):
    """
    Runs the complete cleaning pipeline on one raw tweet.

    The tweet is first cleaned as a string (URLs, usernames, emojis, emoticons,
    non ascii / non printable characters, repeated characters, lower-casing),
    then handled word by word (abbreviations, contractions, digits, punctuation,
    spelling, stop-words, lemmatization). stemming() is not part of the pipeline.

    Parameters:
        tweet (string): tweet to be processed.
        abbreviation_dict (dictionary): dictionary of abbreviations.
        contraction_dict (dictionary): dictionary of contractions.

    Returns:
        list: the processed words of the tweet.
    """
    # String-level steps, applied strictly in this order.
    string_steps = (
        removeURLs,
        removeUsernames,
        replaceEmojis,
        replaceEmoticons,
        removeNonAscii,
        removeNonPrintable,
        removeRepeatedChars,
        toLowerCase,
    )
    for step in string_steps:
        tweet = step(tweet)

    words = tweet.split()
    words = replaceAbbreviations(words, abbreviation_dict)
    words = replaceContractions(words, contraction_dict)

    # These two helpers work on strings, so the words are glued together and split again.
    words = removeNums(' '.join(words)).split()
    words = removePunctuation(' '.join(words)).split()

    # Word-level steps, again in fixed order.
    for step in (correctSpelling, removeStopWords, lemmatization):
        words = step(words)
    return words

In [93]:
# Lower-case substrings that mark the text of a fetched page as an error page.
# ('isn’t available' was previously misspelled 'is’t available' and therefore
# never matched a page saying "isn’t available".)
URL_ERROR_MARKERS = ('not found', 'unavailable', 'error', '404', 'not available',
                     'isn’t available', 'access is denied', 'page doesn’t exist')


def _classify_url_text(urltext):
    """
    Labels the text extracted from the URLs of one tweet.

    Parameters:
        urltext (string): text returned by extractTextFromURLs.

    Returns:
        string: 'NaN' when no text was extracted, 'Page not found' when the text
        contains one of URL_ERROR_MARKERS, otherwise 'Page found'.
    """
    if not urltext:
        return 'NaN'
    lowered = urltext.lower()  # lower-case once, not once per marker
    if any(marker in lowered for marker in URL_ERROR_MARKERS):
        return 'Page not found'
    return "Page found"


# Collect the results in plain lists and attach them as whole columns afterwards,
# instead of creating the new columns cell by cell with .at inside the loop.
processed_texts = []
url_statuses = []
for index, text in train_df['text'].items():
    processed_texts.append(' '.join(preprocess_tweet(text, abbreviation_dict, contraction_dict)))
    url_statuses.append(_classify_url_text(extractTextFromURLs(listURLs(text))))
    print("record #{} processing finished".format(index))

train_df['processed_text'] = processed_texts
train_df['processed_URLs'] = url_statuses

train_df.to_csv('../dataset/train_processed.csv')

record #1 processing finished
record #4 processing finished
record #5 processing finished
record #6 processing finished
record #7 processing finished
record #8 processing finished
record #10 processing finished
record #13 processing finished
record #14 processing finished
record #15 processing finished
record #16 processing finished
record #17 processing finished
record #18 processing finished
record #19 processing finished
record #20 processing finished
record #23 processing finished
record #24 processing finished
record #25 processing finished
record #26 processing finished
record #28 processing finished
record #31 processing finished
record #32 processing finished
record #33 processing finished
record #34 processing finished
record #36 processing finished
record #37 processing finished
record #38 processing finished
record #39 processing finished
record #40 processing finished
record #41 processing finished
record #44 processing finished
record #48 processing finished
record #49 pro

In [94]:
# Preview the first 40 processed records.
train_df.head(40)

Unnamed: 0_level_0,keyword,location,text,target,processed_text,processed_URLs
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,,,Our Deeds are the Reason of this #earthquake M...,1,deed reason earthquake may allah forgive Unite...,
4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la range ask canada,
5,,,All residents asked to 'shelter in place' are ...,1,resident asked shelter place notified officer ...,
6,,,"13,000 people receive #wildfires evacuation or...",1,people receive wildfire evacuation order calif...,
7,,,Just got sent this photo from Ruby #Alaska as ...,1,got sent photo ruby alaska smoke wildfire pour...,
8,,,#RockyFire Update => California Hwy. 20 closed...,1,rocky fire update california closed direction ...,
10,,,#flood #disaster Heavy rain causes flash flood...,1,flood disaster heavy rain cause flash flooding...,
13,,,I'm on top of the hill and I can see a fire in...,1,I top hill see fire wood,
14,,,There's an emergency evacuation happening now ...,1,emergency evacuation happening building across...,
15,,,I'm afraid that the tornado is coming to our a...,1,I afraid tornado coming area,
