In [112]:
import pandas as pd
import string
import re
import emoji

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

import requests
from bs4 import BeautifulSoup
from urlmarker import URL_REGEX # reference https://gist.github.com/gruber/8891611

import inflect

from spellchecker import SpellChecker

In [113]:
# Fetch the NLTK resources this notebook relies on: the stopword corpus
# (read by removeStopWords), the 'punkt' tokenizer models (presumably what
# nltk.word_tokenize needs) and WordNet (for WordNetLemmatizer).
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /home/george/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/george/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/george/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [114]:
# Load the training set and key its rows by the 'id' column.
train_df = pd.read_excel('train.xlsx').set_index('id')
train_df

Unnamed: 0_level_0,keyword,location,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,,,Our Deeds are the Reason of this #earthquake M...,1
4,,,Forest fire near La Ronge Sask. Canada,1
5,,,All residents asked to 'shelter in place' are ...,1
6,,,"13,000 people receive #wildfires evacuation or...",1
7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...
10869,,,Two giant cranes holding a bridge collapse int...,1
10870,,,@aria_ahrary @TheTawniest The out of control w...,1
10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
10872,,,Police investigating after an e-bike collided ...,1


In [208]:
def _loadMapping(path, separator):
    """Read a two-column text file into a dict.

    Each non-blank line of `path` is expected to look like
    ``key<separator>value``. Only the first occurrence of `separator`
    splits the line, so the value itself may contain the separator.
    Newline characters are removed from the value; nothing else is stripped.
    If a key occurs more than once, the last occurrence wins.

    Raises ValueError (from tuple unpacking) when a non-blank line does not
    contain `separator`.
    """
    mapping = {}
    with open(path) as f:
        for line in f:
            # Tolerate blank lines (e.g. an empty line at the end of the file)
            # instead of failing on the unpack below.
            if not line.strip():
                continue
            key, val = line.split(separator, 1)
            mapping[key] = val.replace('\n', '')
    return mapping


# Abbreviation -> full text, one tab-separated pair per line of lingo.txt.
abbreviation_dict = _loadMapping("lingo.txt", '\t')

# Contraction -> full text, one colon-separated pair per line of contractions.txt.
contraction_dict = _loadMapping("contractions.txt", ':')

# URL-related functions

In [116]:
def removeURLs(tweet):
    """Return the tweet with every URL_REGEX match swapped for the literal 'URL'."""
    return re.sub(URL_REGEX, 'URL', tweet)

def listURLs(tweet):
    """Collect the URL_REGEX matches found in the tweet, in order of appearance."""
    found_urls = re.findall(URL_REGEX, tweet)
    return found_urls

def extractTextFromURL(url, timeout=10):
    """Download a web page and return its text content as one string.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).
        Without a timeout ``requests.get`` may block indefinitely.

    Returns
    -------
    str
        Every text node of the parsed page whose parent tag is not in the
        excluded set, each followed by a single space.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or times out.

    Notes
    -----
    The HTTP status code is not checked, so the body of an error page is
    processed like any other page.
    """
    res = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(res.content, 'html.parser')
    # Parent tags whose text is markup/metadata rather than page content.
    undesired = {'[document]', 'noscript',
                 'header', 'html',
                 'meta', 'head',
                 'input', 'script',
                 'style'}
    # `string=True` selects all text nodes; it is the current name of the
    # older `text=True` argument.
    text_nodes = soup.find_all(string=True)
    # Join once instead of growing a string with repeated `+=`.
    return ''.join('{} '.format(t) for t in text_nodes
                   if t.parent.name not in undesired)

# Remove unwanted elements

In [117]:
def removeNonAscii(tweet):
    """Drop every character that cannot be encoded as ASCII."""
    ascii_bytes = tweet.encode('ascii', errors='ignore')
    return ascii_bytes.decode('ascii')

def removePunctuation(tweet):
    """Swap every punctuation character (including '#') for a single space."""
    to_space = {ord(symbol): ' ' for symbol in string.punctuation}
    return tweet.translate(to_space)

def removeNums(tweet):
    """Strip out every character for which str.isdigit() is true."""
    kept = filter(lambda ch: not ch.isdigit(), tweet)
    return ''.join(kept)

def removeUsernames(tweet):
    """Remove @-mentions from the tweet.

    An '@' together with the run of non-whitespace characters that follows
    it is deleted. Surrounding whitespace is left untouched, and an '@' that
    is directly followed by whitespace (or ends the string) is kept.
    """
    # Raw string: in a plain literal '\s' is an invalid escape sequence,
    # which recent Python versions warn about.
    return re.sub(r'@\S+', '', tweet)

# Format-related functions

In [118]:
def toLowerCase(tweet):
    """Split camelCase words with spaces, then lower-case the whole tweet.

    A space is inserted before an upper-case letter that either follows a
    lower-case letter, or is followed by a lower-case letter and is not the
    first character of the string.
    """
    camel_boundary = r'((?<=[a-z])[A-Z]|(?<!\A)[A-Z](?=[a-z]))'
    return re.sub(camel_boundary, r' \1', tweet).lower()

# Meaning-related functions

In [210]:
def replaceEmojis(tweet):
    """Replace emojis in the text with their corresponding textual names."""
    # Whatever delimiters emoji.demojize puts around a name are left in place.
    described = emoji.demojize(tweet)
    return described

def replaceNums(tweet):
    """Spell out all-digit tokens in words (e.g. '12' -> 'twelve').

    The tweet is tokenized with nltk.word_tokenize; tokens that are not made
    up purely of digits are kept unchanged, and the tokens are re-joined
    with single spaces.
    """
    engine = inflect.engine()
    tokens = nltk.word_tokenize(tweet)
    spelled = [engine.number_to_words(tok) if tok.isdigit() else tok
               for tok in tokens]
    return ' '.join(spelled)

_SPELL_CHECKER = None


def _getSpellChecker():
    """Return one shared SpellChecker, created on first use."""
    global _SPELL_CHECKER
    if _SPELL_CHECKER is None:
        checker = SpellChecker()
        # Register 'url' as a known word so that it is not "corrected"
        # into something else.
        checker.word_frequency.load_words(['url'])
        _SPELL_CHECKER = checker
    return _SPELL_CHECKER


def correctSpelling(tweet):
    """Correct the spelling of the words in the given string.

    The string is tokenized with nltk.word_tokenize; every token the spell
    checker reports as unknown is replaced by its most likely correction,
    and the tokens are re-joined with single spaces.

    Returns
    -------
    str
        The corrected text. A token for which the spell checker offers no
        correction is kept as it is.
    """
    # Building a SpellChecker loads its dictionary, so reuse a single instance
    # instead of creating one per tweet.
    spell = _getSpellChecker()
    words = nltk.word_tokenize(tweet)
    # find those words that may be misspelled
    misspelled = spell.unknown(words)
    processed_tweet = []
    for word in words:
        if word in misspelled:
            # correction() can return None when it finds no candidate;
            # fall back to the original token so the join below cannot fail.
            word = spell.correction(word) or word
        processed_tweet.append(word)
    return ' '.join(processed_tweet)

def replaceAbbreviations(tweet):
    """Expand whitespace-separated tokens that are keys of abbreviation_dict.

    Tokens without an entry are kept unchanged; the result is re-joined with
    single spaces.
    """
    expanded = [abbreviation_dict.get(token, token) for token in tweet.split()]
    return ' '.join(expanded)

def replaceContractions(tweet):
    """Expand whitespace-separated tokens that are keys of contraction_dict.

    Tokens without an entry are kept unchanged; the result is re-joined with
    single spaces.
    """
    expanded = [contraction_dict.get(token, token) for token in tweet.split()]
    return ' '.join(expanded)

def removeStopWords(tweet):
    """Tokenize the tweet and drop English stopwords.

    Returns
    -------
    list of str
        The tokens produced by nltk.word_tokenize, in order, without those
        found in NLTK's English stopword list. Note that the result is a
        list of tokens, not a string.
    """
    # Fetch the stopword list once and keep it in a set, rather than
    # re-reading it and scanning a list for every token.
    english_stopwords = set(stopwords.words('english'))
    return [word for word in nltk.word_tokenize(tweet)
            if word not in english_stopwords]

def lemmatization(tweet):
    """Reduce each token to a linguistically valid lemma via WordNet.

    `tweet` is iterated element by element (in preprocess_tweet it is the
    token list returned by removeStopWords); a list of lemmas is returned.
    """
    lemmatizer = WordNetLemmatizer()
    return list(map(lemmatizer.lemmatize, tweet))

In [222]:
def preprocess_tweet(tweet):
    """Run the full cleaning pipeline on a raw tweet and return a string.

    The string-to-string steps are applied in the order listed below; the
    result is then tokenized with stopwords removed, lemmatized, and joined
    back together with single spaces.
    """
    # Order matters: each step consumes the output of the previous one.
    string_steps = (
        removeURLs,
        replaceEmojis,
        removeUsernames,
        removeNonAscii,
        toLowerCase,
        replaceAbbreviations,
        replaceContractions,
        correctSpelling,
        removePunctuation,
        removeNums,
    )
    for step in string_steps:
        tweet = step(tweet)
    tokens = lemmatization(removeStopWords(tweet))
    return ' '.join(tokens)

In [223]:
# Raw text of the tweet with id 8491, before any preprocessing.
train_df.loc[8491,'text']

'i dont even remember slsp happening i just remember being like wtf and then the lights turned off and everyone screamed for the encore'

In [224]:
# Run the full preprocessing pipeline on the tweet with id 8491.
preprocess_tweet(train_df.loc[8491,'text'])

'even remember slip happening remember like fuck light turned everyone screamed encore'

In [225]:
# End-to-end check on text scraped from a web page. extractTextFromURL
# performs an HTTP request, so this cell needs network access.
preprocess_tweet(extractTextFromURL('https://google.com/'))

'goole search image play mail drive calendar translate web logger web history setting sign advanced search goole offered take advertising program goole url copyright privacy term'

In [226]:
# Check how replaceNums and removeNums compose: all-digit tokens are spelled
# out first, then any digit characters that remain are stripped.
removeNums(replaceNums('test 12 3 t4'))

'test twelve three t'