#Preprocessor

Data cleaning for processing

Based on the article: [tweet-topic-modeling-part-2-cleaning-and-preprocessing-tweets](https://pub.towardsai.net/tweet-topic-modeling-part-2-cleaning-and-preprocessing-tweets-e3a08a8b1770)

In [1]:
import pandas as pd
import re
import gensim
import unicodedata
import nltk

In [2]:
# String of punctuation symbols. The preprocess_* functions wrap it in
# '[' ... ']+' to build a regex character class whose matches are replaced
# by a single space. Note that '#' is not part of this string.
punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~•@'

##Functions

###Remove emojis

In [3]:
def remove_emojis(row):
    """Return ``row`` with every run of emoji characters deleted.

    Only code points inside the four Unicode ranges listed below count as
    emojis; anything else is left untouched.
    """
    emoji_ranges = "".join((
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # flags (iOS)
    ))
    return re.sub("[" + emoji_ranges + "]+", "", row, flags=re.UNICODE)

###Remove accentuation

In [4]:
def remove_accentuation(row):
    """Return ``row`` with accents and other combining marks removed.

    The text is first decomposed with NFKD normalization so that accented
    letters become a base letter followed by combining marks; the marks are
    then dropped.
    """
    decomposed = unicodedata.normalize('NFKD', row)
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return ''.join(base_chars)

###Remove non-ASCII characters

In [5]:
def remove_ascii(row):
    """Return ``row`` with every non-ASCII character dropped.

    Despite the name, ASCII characters are what is kept: the text is encoded
    to ASCII while ignoring anything that cannot be represented, then decoded
    back to ``str``.
    """
    ascii_bytes = row.encode(encoding='ascii', errors='ignore')
    return ascii_bytes.decode()

###Remove links

In [6]:
def remove_links(row):
    """Return ``row`` with web links and ``[link]`` placeholders removed.

    Removed, in this order:

    * anything starting with ``http`` up to the next whitespace,
    * ``bit.ly/...`` short links,
    * every literal ``[link]`` placeholder,
    * ``pic.twitter...`` media links.

    Whitespace around the removed pieces is left as it was.
    """
    row = re.sub(r'http\S+', '', row)  # remove http links
    row = re.sub(r'bit\.ly/\S+', '', row)  # remove bitly links (literal dot)
    # str.strip('[link]') treats its argument as a *set of characters* and
    # would eat leading/trailing 'l', 'i', 'n', 'k', '[' and ']' from ordinary
    # words, so the placeholder is removed as a literal substring instead.
    row = row.replace('[link]', '')
    row = re.sub(r'pic\.twitter\S+', '', row)  # remove twitter picture links
    return row

###Remove users

In [7]:
def remove_users(row):
    """Return ``row`` with retweet markers and ``@user`` mentions removed.

    A mention is ``@`` followed by at least one letter and at least one more
    letter, digit, hyphen or underscore (so it is at least two characters
    long and starts with a letter). ``RT <mention>`` prefixes are removed
    first, then any remaining mentions.
    """
    # Raw strings keep '\s' a regex escape rather than an invalid Python
    # string escape (a SyntaxWarning on recent Python versions).
    handle = r'@[A-Za-z]+[A-Za-z0-9_-]+'
    row = re.sub(r'RT\s' + handle, '', row)  # remove retweet markers
    row = re.sub(handle, '', row)  # remove remaining @user mentions
    return row

###Remove @ and # symbols

In [8]:
def remove_hashtags_simple(row):
    """Return ``row`` without any ``#`` or ``@`` characters.

    Only the symbols themselves are deleted; the tag or user name that
    follows them stays in the text.
    """
    return re.sub('[#@]', '', row)

###Remove hashtags

In [9]:
def remove_hashtags(row):
    """Return ``row`` with whole hashtags (symbol and text) removed.

    A hashtag is ``#`` followed by at least one letter and at least one more
    letter, digit, hyphen or underscore.
    """
    hashtag_pattern = re.compile('(#[A-Za-z]+[A-Za-z0-9-_]+)')
    return hashtag_pattern.sub('', row)

###Remove audio and video labels

In [10]:
def remove_av(row):
    """Return ``row`` with every ``VIDEO:`` and ``AUDIO:`` label removed.

    Matching is case-sensitive and applies anywhere in the text, not only
    at the start.
    """
    for label in ('VIDEO:', 'AUDIO:'):
        row = re.sub(label, '', row)
    return row

###Tokenize

In [11]:
def tokenize(row):
    """Return the lemmatized tokens of ``row``, skipping stopwords and short words.

    Tokens come from ``gensim.utils.simple_preprocess``; a token is kept only
    if it is longer than two characters and is not in gensim's ``STOPWORDS``.
    Each kept token is passed through ``lemmatize``.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize(word)
        for word in gensim.utils.simple_preprocess(row)
        if len(word) > 2 and word not in stopwords
    ]

###Lemmatize

In [12]:
def lemmatize(token):
    """Return the WordNet lemma of ``token``, treating it as a verb (``pos='v'``)."""
    lemmatizer = nltk.WordNetLemmatizer()
    return lemmatizer.lemmatize(token, pos='v')

##Use functions

###Preprocess row

In [13]:
def preprocess_row(row):
    """Clean a raw text row and return its lemmatized tokens joined by spaces.

    Steps, in order: remove emojis, accents, non-ASCII characters, user
    mentions, links, hashtags and AUDIO/VIDEO labels; lower-case; replace
    punctuation with spaces; collapse whitespace; remove digits; tokenize
    (which also drops stopwords and short words and lemmatizes each token).
    """
    # Fetch the WordNet corpus once per process instead of on every row.
    # nltk.download() returns a falsy value on failure, in which case the
    # flag stays False and the next call tries again.
    if not getattr(preprocess_row, '_wordnet_ready', False):
        preprocess_row._wordnet_ready = bool(nltk.download('wordnet'))

    row = remove_emojis(row)
    row = remove_accentuation(row)
    row = remove_ascii(row)
    row = remove_users(row)
    row = remove_links(row)
    row = remove_hashtags(row)
    row = remove_av(row)
    row = row.lower()  # lower case
    row = re.sub('[' + punctuation + ']+', ' ', row)  # strip punctuation
    row = re.sub(r'\s+', ' ', row)  # remove double spacing
    row = re.sub(r'([0-9]+)', '', row)  # remove numbers
    row_token_list = tokenize(row)  # apply lemmatization and tokenization
    return ' '.join(row_token_list)

###Preprocess row without tokenization

In [14]:
def preprocess_row_without_tokenize(row):
    """Clean a raw text row without tokenizing or removing stopwords.

    Steps, in order: remove emojis, accents, non-ASCII characters, user
    mentions, links, hashtags and AUDIO/VIDEO labels; lower-case; replace
    punctuation with spaces; remove digits; collapse whitespace runs into a
    single space. Leading/trailing spaces are not stripped.
    """
    row = remove_emojis(row)
    row = remove_accentuation(row)
    row = remove_ascii(row)  # from here on the row is pure ASCII
    row = remove_users(row)
    row = remove_links(row)
    row = remove_hashtags(row)
    row = remove_av(row)
    row = row.lower()  # lower case
    row = re.sub('[' + punctuation + ']+', ' ', row)  # strip punctuation
    row = re.sub(r'([0-9]+)', '', row)  # remove numbers
    # Collapse whitespace last: removing a number that sat between two spaces
    # would otherwise leave a double space behind.
    row = re.sub(r'\s+', ' ', row)
    return row

###Preprocess row with simple # and @ removal

In [15]:
def preprocess_row_simple_hashtags(row):
    """Clean a raw text row, keeping hashtag and mention text but not the symbols.

    Unlike the other preprocess functions, user mentions and hashtags are not
    removed as a whole: only the ``#`` and ``@`` characters are deleted. No
    tokenization or stopword removal is performed.

    Steps, in order: remove emojis, accents, non-ASCII characters and links;
    delete ``#``/``@``; remove AUDIO/VIDEO labels; lower-case; replace
    punctuation with spaces; remove digits; collapse whitespace runs into a
    single space. Leading/trailing spaces are not stripped.
    """
    row = remove_emojis(row)
    row = remove_accentuation(row)
    row = remove_ascii(row)  # from here on the row is pure ASCII
    row = remove_links(row)
    row = remove_hashtags_simple(row)
    row = remove_av(row)
    row = row.lower()  # lower case
    row = re.sub('[' + punctuation + ']+', ' ', row)  # strip punctuation
    row = re.sub(r'([0-9]+)', '', row)  # remove numbers
    # Collapse whitespace last: removing a number that sat between two spaces
    # would otherwise leave a double space behind.
    row = re.sub(r'\s+', ' ', row)
    return row