In [1]:
# This notebook preprocesses raw Twitter data:
# it cleans the tweet text and saves it in the shape needed for later analysis.

import re
import nltk
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from gensim.parsing.preprocessing import remove_stopwords, strip_non_alphanum
from gensim.parsing.preprocessing import strip_numeric, strip_punctuation, strip_short

# import gensim
# print(gensim.parsing.preprocessing.STOPWORDS)

In [2]:
# Function to count total token words in a DataFrame column
def count_words(text_column):
    """Count whitespace-separated tokens across every entry of a text column.

    Parameters
    ----------
    text_column : iterable of str
        Usually a pandas Series such as ``tweets["Text"]``; any iterable that
        yields the text values is accepted.

    Returns
    -------
    int
        Total number of tokens over all entries. Entries that are not strings
        (for example NaN standing in for a missing value) count as zero
        tokens instead of raising.
    """
    # Iterate over the values directly: Series.iteritems() was removed in
    # pandas 2.0, and the index label it yielded was never used anyway.
    return sum(len(text.split()) for text in text_column if isinstance(text, str))

# Map POS tag to first character lemmatize() accepts
def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant to use when lemmatizing ``word``.

    The word is tagged on its own with ``nltk.pos_tag`` and only the first
    letter of the resulting tag is inspected:
    ``J`` -> adjective, ``N`` -> noun, ``V`` -> verb, ``R`` -> adverb.
    Any other leading letter falls back to noun.
    """
    # pos_tag returns one (token, tag) pair per input token; we pass one token.
    _, tag = nltk.pos_tag([word])[0]
    initial = tag[0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag both map to noun.
    return wordnet.NOUN

In [3]:
# Load the raw tweets; the first CSV column is used as the row index.
tweets = pd.read_csv("tweets_raw.csv", index_col=0)

# Token count of the untouched text, kept as the "before" figure for the
# preprocessing summary printed later in the notebook.
words_earlier = count_words(tweets["Text"])

tweets.head()

Unnamed: 0,Text,Date,Retweets,Favorites,Mentions,HashTags
0,Is it landlord that has been struggling since ...,2020-08-09 23:55:14+00:00,13,47,,
1,Victorian new cases trending down as impact of...,2020-08-09 23:53:42+00:00,8,29,,#auspol
2,Hello Monday. As Melbourne starts week two in ...,2020-08-09 23:48:01+00:00,7,23,,
3,The ironic part is the same people patting you...,2020-08-09 23:47:48+00:00,5,76,,
4,"In several states, positive coronavirus tests ...",2020-08-09 23:46:39+00:00,13,16,,


In [4]:
# Step 0: remove every "@", "&" or "http" together with the run of
# non-whitespace characters that follows it (mentions, "&..." entities, links).
cleaned = [re.sub(r"(?:\@|\&|http)\S+", "", raw) for raw in tweets["Text"]]

# Remaining steps, each applied to the whole column before the next one runs:
#   1. drop digits
#   2. replace non-alphanumeric characters
#   3. replace punctuation characters
#   4. drop words shorter than 3 characters
#   5. lower-case everything
#   6. drop stopwords (done after lower-casing)
cleaning_steps = (
    strip_numeric,
    strip_non_alphanum,
    strip_punctuation,
    lambda text: strip_short(text, minsize=3),
    str.lower,
    remove_stopwords,
)
for step in cleaning_steps:
    cleaned = [step(text) for text in cleaned]

# One cleaned string per tweet, in the original order, under a single "text" column.
tweets_text = pd.DataFrame(cleaned, columns=["text"])

tweets_text.head()

Unnamed: 0,text
0,landlord struggling beginning lockdown source ...
1,victorian new cases trending impact stage lock...
2,hello monday melbourne starts week lockdown da...
3,ironic people patting tweet probably agreeing ...
4,states positive coronavirus tests created prob...


In [5]:
# Converting words in the text into tokens and putting into a dataframe
# Each row has useful words from a single tweet (like a transaction)

# Use the WordNetLemmatizer class imported at the top of the notebook rather
# than reaching it through the incidental `nltk.wordnet` attribute path.
lemma = WordNetLemmatizer()
# Reference vocabulary used to discard lemmas that are not dictionary words.
words = set(nltk.corpus.words.words())

tweets_ll = []
# Iterate over the cleaned strings directly: Series.iteritems() was removed in
# pandas 2.0 and the index label it yielded was not used.
for text in tweets_text["text"]:
    # Lemmatize each token using the POS guessed for that token in isolation.
    lemmas = [lemma.lemmatize(token, get_wordnet_pos(token)) for token in text.split()]
    # Keep dictionary words; tokens that are not purely alphabetic are kept
    # as-is because the dictionary check does not apply to them.
    tweets_ll.append([token for token in lemmas if token in words or not token.isalpha()])

# Total tokens that survived preprocessing, summed over all tweets.
words_after = sum(len(row) for row in tweets_ll)

print("Words before preprocessing: {}".format(words_earlier))
print("Words after preprocessing: {}".format(words_after))
print("Words removed: {}".format(words_earlier-words_after))

# One row per tweet, one column per token position; shorter tweets are padded
# with missing values on the right.
tweets_tokenized = pd.DataFrame(tweets_ll)
tweets_tokenized.head(5)

Words before preprocessing: 15334
Words after preprocessing: 6349
Words removed: 8985


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18,19,20,21,22,23,24,25,26,27
0,landlord,struggle,begin,lockdown,source,income,want,beg,people,well,...,,,,,,,,,,
1,new,case,trend,impact,stage,lockdown,kick,thanks,stay,strong,...,,,,,,,,,,
2,hello,start,week,lockdown,day,million,remote,learn,middle,grade,...,,,,,,,,,,
3,people,pat,tweet,probably,agree,people,protest,lockdown,end,mess,...,,,,,,,,,,
4,state,positive,test,create,problem,reopen,school,send,teacher,student,...,,,,,,,,,,


In [6]:
# Python-oriented export: header row with the column positions, no index column.
tweets_tokenized.to_csv("tweets_tokenized_py.csv", index = False, header = True)

# Second export (presumably for R, given the file name): no header row, and a
# leading row-id column that starts at 1.
# The index is shifted on a copy so that re-running this cell does not keep
# incrementing the index of `tweets_tokenized` itself.
tweets_tokenized_r = tweets_tokenized.copy()
tweets_tokenized_r.index = tweets_tokenized_r.index + 1
tweets_tokenized_r.to_csv("tweets_tokenized_r.csv", index = True, header = False)