In [1]:
from numpy import random
from google.colab import files
from sklearn.feature_extraction.text import TfidfVectorizer
import io
import re
import unicodedata
import string
import numpy as np
import pandas as pd

In [2]:
# Setup for tokenization/lemma/stem/stop word removal
!pip install -q wordcloud
import wordcloud
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [28]:
from google.colab import files

# Upload desired datasets
uploaded = files.upload()

Saving rhino_twitter_data_1.csv to rhino_twitter_data_1.csv


In [4]:
# -----------------------------------------------------------------------------
# This cell contains functions to read and cleanup a dataset
# -----------------------------------------------------------------------------
def read_and_cleanup_dataset(filename):
  # Read dataset into dataframe
  tweets = pd.read_csv(filename)

  # Drop none English tweets, comment out if pre-processed
  tweets = tweets[tweets.lang == "en"]
  tweets.reset_index(drop=True, inplace=True)

  # Define patterns to be excluded, currently the following:
  # special characters, @someone, &sth, rt, new line, link, any extra space
  pattern = r'[!@#\$%\^&\*\(\)\[\]{};:\'",.<>/?\\|_~`-]+|@\w+|&\w+|rt|\n|rhino|https://\S+'

  # pattern for white space
  ws = r'\s+'

  # non-ASCII specific quotes
  quotes_to_remove = ['“', '”', '‘', '‛', '’']

  # Define emoji patterns to be excluded
  emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # Emojis in the first group
                           u"\U0001F300-\U0001F5FF"  # Emojis in the second group
                           u"\U0001F680-\U0001F6FF"  # Emojis in the third group
                           u"\U0001F700-\U0001F77F"  # Emojis in the fourth group
                           u"\U0001F780-\U0001F7FF"  # Emojis in the fifth group
                           u"\U0001F800-\U0001F8FF"  # Emojis in the sixth group
                           u"\U0001F900-\U0001F9FF"  # Emojis in the seventh group
                           u"\U0001FA00-\U0001FA6F"  # Emojis in the eighth group
                           u"\U0001FA70-\U0001FAFF"  # Emojis in the ninth group
                           u"\U0001F200-\U0001F251"  # Emojis in the tenth group
                           u"\U0001F004-\U0001F0CF"  # Additional emojis
                           u"\U0001F10D-\U0001F10F"  # Additional emojis
                           u"\U0001F30D-\U0001F567"  # Additional emojis
                           "]+", flags=re.UNICODE)

  # Perform lower casing and remove any patterns
  tweets['tweet'] = tweets['tweet'].apply(str.lower)
  tweets['tweet'] = tweets['tweet'].str.replace(pattern, '', regex=True)
  tweets['tweet'] = tweets['tweet'].apply(lambda x: emoji_pattern.sub(r'', x))
  tweets['tweet'] = tweets['tweet'].str.replace(ws, ' ', regex=True).str.strip()
  for quote in quotes_to_remove:
    tweets['tweet'] = tweets['tweet'].str.replace(quote, '')

  return tweets

In [27]:
# -----------------------------------------------------------------------------
# This cell contains functions to perform tokenization and remove stop words
# -----------------------------------------------------------------------------
# Constants
# POS (Parts Of Speech) for: nouns, adjectives, verbs and adverbs
DI_POS_TYPES = {'NN':'n', 'JJ':'a', 'VB':'v', 'RB':'r'}
POS_TYPES = list(DI_POS_TYPES.keys())

# Constraints on tokens
MIN_STR_LEN = 3
RE_VALID = '[a-zA-Z]'

# Remove accents function
def remove_accents(data):
    return ''.join(x for x in unicodedata.normalize('NFKD', data) if x in string.ascii_letters or x == " ")

def Remove_Stop_Words(li_tweets):
  stopwords = nltk.corpus.stopwords.words('english')
  stemmer = nltk.stem.PorterStemmer()
  lemmatizer = nltk.stem.WordNetLemmatizer()

  # Process all quotes
  li_tokens = []
  li_token_lists = []
  li_lem_strings = []

  for i,text in enumerate(li_tweets):
      # Tokenize by sentence, then by lowercase word
      tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]

      # Process all tokens per quote
      li_tokens_quote = []
      li_tokens_quote_lem = []
      for token in tokens:
          # Remove accents
          t = remove_accents(token)

          # Remove punctuation
          t = str(t).translate(string.punctuation)
          li_tokens_quote.append(t)

          # Add token that represents "no lemmatization match"
          li_tokens_quote_lem.append("-") # this token will be removed if a lemmatization match is found below

          # Process each token
          if t not in stopwords:
              if re.search(RE_VALID, t):
                  if len(t) >= MIN_STR_LEN:
                      # Note that the POS (Part Of Speech) is necessary as input to the lemmatizer
                      # (otherwise it assumes the word is a noun)
                      pos = nltk.pos_tag([t])[0][1][:2]
                      pos2 = 'n'  # set default to noun
                      if pos in DI_POS_TYPES:
                        pos2 = DI_POS_TYPES[pos]

                      stem = stemmer.stem(t)
                      lem = lemmatizer.lemmatize(t, pos=pos2)  # lemmatize with the correct POS

                      if pos in POS_TYPES:
                          li_tokens.append((t, stem, lem, pos))

                          # Remove the "-" token and append the lemmatization match
                          li_tokens_quote_lem = li_tokens_quote_lem[:-1]
                          li_tokens_quote_lem.append(lem)

      # Build list of token lists from lemmatized tokens
      li_token_lists.append(li_tokens_quote)

      # Build list of strings from lemmatized tokens
      str_li_tokens_quote_lem = ' '.join(li_tokens_quote_lem)
      cleaned_str_li_tokens_quote_lem = str_li_tokens_quote_lem.replace('-', '')
      li_lem_strings.append(cleaned_str_li_tokens_quote_lem)
  return li_lem_strings

In [29]:
fn = 'rhino_twitter_data_1.csv'
tweets = read_and_cleanup_dataset(io.BytesIO(uploaded[fn]))
li_tweets = tweets['tweet'].tolist()
li_lem_strings = Remove_Stop_Words(li_tweets)
tweets['tweet'] = li_lem_strings
tweets['tweet'] = tweets['tweet'].str.replace('\s+', ' ', regex=True)

In [30]:
for i in range(10):
  print(tweets['tweet'][i])

leeds rohan smith name man squad face warrington wolf thursday betfred super league opening game 
olpejeta happy valentine daytheres well way celebrate heawarming news love ceainly air 
jackiec nikki haley announces presidential campaign time new generation leadership
carolinadarren vote ron desantis trump guy want draw line sand rock ron desantis
bosnerdley chris christie big know dont come big dems order 
toutantgustave itsthedr he big time
bikersamerica lil cheburekiman russia never problem corrupt nato amp globalists always problem yes
write dont want forgetpredaconsbreakdown white wild rider water bisondead end big as vulturedrag strip cheetahoff road low torso crocodilemurdermaster whiteback gorilla ormasterdon mammoth
nikki haley announces presidential campaign time new generation leadershipcongratulations nikki cant vote youyoure vote go trump trump
titan john cena workout need body time world wrestling championsme blazeds smal
