In [None]:
# This notebook preprocesses Twitter data:
# it cleans the tweet text and saves it in a tokenized, tabular shape

import re
import nltk
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from gensim.parsing.preprocessing import remove_stopwords, strip_non_alphanum
from gensim.parsing.preprocessing import strip_numeric, strip_punctuation, strip_short

# import gensim
# print(gensim.parsing.preprocessing.STOPWORDS)

In [None]:
# Function to count total token words in a DataFrame or a column
def count_words(var):
    """Count the tokens held in a pandas Series or DataFrame.

    Parameters
    ----------
    var : pd.Series or pd.DataFrame
        * Series: every entry is text; its whitespace-separated words are
          counted. Missing entries (None/NaN) contribute zero words.
        * DataFrame: every non-null cell counts as one token.

    Returns
    -------
    int or None
        Total token count, or None when ``var`` is neither a Series nor a
        DataFrame.
    """
    if isinstance(var, pd.Series):
        # Iterate the values directly: Series.iteritems() was removed in
        # pandas 2.0. dropna() mirrors the DataFrame branch, which also
        # ignores missing values.
        return sum(len(text.split()) for text in var.dropna())
    elif isinstance(var, pd.DataFrame):
        # DataFrame.count() gives non-null cells per column.
        return var.count().sum()
    return None
    
# Translate a word's POS tag into the POS constant that lemmatize() accepts
def get_wordnet_pos(word):
    """Return the wordnet POS constant for a single word.

    The word is tagged on its own with ``nltk.pos_tag`` and the first letter
    of the resulting tag selects adjective, noun, verb or adverb. Any other
    tag falls back to noun.
    """
    _, pos_tag = nltk.pos_tag([word])[0]
    initial = pos_tag[0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag are treated as a noun.
    return wordnet.NOUN

# Custom function for text lemmatization and removing non-dictionary words
def lemmatize_custom(my_list, cwords):
    """Lemmatize each text and drop alphabetic tokens that are not dictionary words.

    Parameters
    ----------
    my_list : iterable of str
        Texts to process; each one is split on whitespace.
    cwords : iterable of str
        Extra words to accept in addition to the ``nltk.corpus.words`` list.

    Returns
    -------
    pd.DataFrame
        One row per input text, holding the lemmas that survived the filter.
        A lemma is kept when it is in the accepted vocabulary or when it is
        not purely alphabetic.
    """
    # Use the class imported at the top of the file instead of reaching it
    # through the `nltk.wordnet` attribute.
    lemmatizer = WordNetLemmatizer()
    vocabulary = set(nltk.corpus.words.words())
    vocabulary.update(cwords)

    # Each token is POS-tagged in isolation, so its lemma depends only on the
    # token itself; cache it so repeated words are tagged once.
    lemma_cache = {}

    def lemmatize_token(token):
        if token not in lemma_cache:
            lemma_cache[token] = lemmatizer.lemmatize(token, get_wordnet_pos(token))
        return lemma_cache[token]

    tweets_ll = []
    for item in my_list:
        lemmas = [lemmatize_token(token) for token in item.split()]
        tweets_ll.append([x for x in lemmas if x in vocabulary or not x.isalpha()])

    return pd.DataFrame(tweets_ll)

In [None]:
# Importing raw tweets data from csv; the first CSV column becomes the DataFrame index

tweets = pd.read_csv("tweets_raw.csv", index_col=0)
# Token total of the raw "Text" column, kept for the before/after report further down
words_earlier = count_words(tweets["Text"])
# Preview the first rows of the raw data
tweets.head()

In [None]:
# Cleaning passes, listed in the order they are applied.
# Every pass runs over all tweets before the next one starts.
cleaning_steps = [
    # Remove everything from an "@", "&" or "http" up to the next whitespace
    # (mentions, "&"-prefixed text and links)
    lambda text: re.sub(r"(?:\@|\&|http)\S+", "", text),
    # Strip numeric characters
    strip_numeric,
    # Strip non-alphanumeric characters
    strip_non_alphanum,
    # Strip punctuation characters
    strip_punctuation,
    # Short words removal, minsize 3
    lambda text: strip_short(text, minsize=3),
    # All text to lower case
    lambda text: text.lower(),
    # Remove the stopwords
    remove_stopwords,
]

tweets_text = tweets["Text"]
for step in cleaning_steps:
    tweets_text = [step(item) for item in tweets_text]

# Preview the cleaned text
pd.DataFrame(tweets_text, columns=["Text"]).head()

In [None]:
# Converting words in the text into tokens and putting into a dataframe
# Each row has useful words from a single tweet (like a transaction)

# Extra words that lemmatize_custom accepts on top of the nltk word list,
# so its dictionary filter does not drop them
custom_words = ["lockdown"]
tweets_tokenized = lemmatize_custom(tweets_text, custom_words)

In [None]:
# Compare token totals before and after preprocessing.
# For a DataFrame, count_words returns the number of non-null cells.
words_after = count_words(tweets_tokenized)
print("Words before preprocessing: {}".format(words_earlier))
print("Words after preprocessing: {}".format(words_after))
print("Words removed: {}".format(words_earlier-words_after))

# Preview the first five tokenized tweets
tweets_tokenized.head(5)

In [None]:
# Save the tokenized tweets as CSV: header row written, DataFrame index omitted
tweets_tokenized.to_csv("tweets_tokenized_py.csv", index = False, header = True)
# Disabled alternative export: shifts the index by one and writes it without a header row
# (presumably meant for R, judging by the file name; confirm before re-enabling)
# tweets_tokenized.index += 1
# tweets_tokenized.to_csv("tweets_tokenized_r.csv", index = True, header = False)