In [1]:
import pandas as pd
import re
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import strip_non_alphanum
from gensim.parsing.preprocessing import strip_numeric
from gensim.parsing.preprocessing import strip_punctuation
from gensim.parsing.preprocessing import strip_short

In [2]:
# Function to count total token words in a DataFrame column
def count_words(text_column):
    tokens = 0
    for items in text_column.iteritems():
        n = len(items[1].split())
        tokens += n
    return tokens

In [3]:
tweets = pd.read_csv("tweets_df.csv", index_col=0)
words_earlier = count_words(tweets["Text"])
tweets.head()

Unnamed: 0,Text,Date,Retweets,Favorites,Mentions,HashTags
0,Is it landlord that has been struggling since ...,2020-08-09 23:55:14+00:00,13,46,,
1,Victorian new cases trending down as impact of...,2020-08-09 23:53:42+00:00,8,29,,#auspol
2,Hello Monday. As Melbourne starts week two in ...,2020-08-09 23:48:01+00:00,7,23,,
3,The ironic part is the same people patting you...,2020-08-09 23:47:48+00:00,5,76,,
4,"In several states, positive coronavirus tests ...",2020-08-09 23:46:39+00:00,13,16,,


In [4]:
# Removing the stopwords from the tweets and storing in a new dataframe
tweets_text = pd.DataFrame([remove_stopwords(item) for item in tweets["Text"]],
                              columns = ["text"])

# Removing links, hashtags and ampersand attached text from the tweets
tweets_text["text"] = [re.sub(r"(?:\@|\#|\&|http)\S+", "", item) for item in tweets_text["text"]]

# Removing non-alphabetic characters
tweets_text = pd.DataFrame([strip_non_alphanum(item) for item in tweets_text["text"]],
                              columns = ["text"])

# Removing numeric characters
tweets_text = pd.DataFrame([strip_numeric(item) for item in tweets_text["text"]],
                              columns = ["text"])

# Removing punctuation characters
tweets_text = pd.DataFrame([strip_punctuation(item) for item in tweets_text["text"]],
                              columns = ["text"])

# Short words removal, minsize 3
tweets_text = pd.DataFrame([strip_short(item, minsize=3) for item in tweets_text["text"]],
                              columns = ["text"])

# All text to lower case
tweets_text["text"] = [item.lower() for item in tweets_text["text"]]

'''
# Remove everything except text
tweets_text["text"] = [re.sub(r"[^a-zA-Z]+", ' ', item) for item in tweets_text["text"]]
#tweets_text["text"] = [re.sub(r"[^a-zA-Z0-9]+", ' ', item) for item in tweets_text["text"]]

# Remove single character words
tweets_text["text"] = [re.sub(r"\b[a-zA-Z]\b", "", item) for item in tweets_text["text"]]
'''

words_after = count_words(tweets_text["text"])

print("Words before preprocessing: {}".format(words_earlier))
print("Words after preprocessing: {}".format(words_after))
print("Words removed: {}".format(words_earlier-words_after))

tweets_text.head()

Words before preprocessing: 672
Words after preprocessing: 353
Words removed: 319


Unnamed: 0,text
0,landlord struggling beginning lockdown source ...
1,victorian new cases trending impact stage lock...
2,hello monday melbourne starts week lockdown da...
3,the ironic people patting tweet probably agree...
4,states positive coronavirus tests created prob...


In [5]:
tweets_ll = []

for items in tweets_text["text"].iteritems():
    word_list = items[1].split()
    tweets_ll.append(word_list)
    word_list = None
    
tweets_tokenized = pd.DataFrame(tweets_ll)
tweets_tokenized.index += 1
tweets_tokenized

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
1,landlord,struggling,beginning,lockdown,source,income,want,beg,you,people,...,,,,,,,,,,
2,victorian,new,cases,trending,impact,stage,lockdown,kicks,thanks,staying,...,,,,,,,,,,
3,hello,monday,melbourne,starts,week,lockdown,day,million,remote,learning,...,readers,entertained,you,zoom,call,,,,,
4,the,ironic,people,patting,tweet,probably,agreeing,people,protesting,lockdown,...,,,,,,,,,,
5,states,positive,coronavirus,tests,created,problems,reopened,schools,sending,teachers,...,shut,,,,,,,,,
6,bonding,with,our,pets,recent,survey,surveyed,said,pet,helped,...,cut,working,home,pet,,,,,,
7,happy,birthday,the,boy,longford,carved,stellar,career,travelling,world,...,school,pretty,day,forgetful,can,wait,celebrate,lockdown,,
8,bts,lockdown,antis,kpoppies,report,block,check,countries,fanbase,charting,...,available,enjoy,comeback,army,,,,,,
9,world,tiniest,rarest,wild,pigs,swine,fever,lockdown,,,...,,,,,,,,,,
10,peers,claimed,taxpayer,funded,attendance,allowances,working,home,lockdown,since,...,participation,soared,record,levels,,,,,,


In [6]:
tweets_tokenized.to_csv("tweets_tokenized.csv", index = True, header = False)