In [72]:
# This notebook preprocesses Twitter data:
# it cleans the tweet text and saves it in the shape needed downstream

import re
import nltk
import time
import numpy as np
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from gensim.parsing.preprocessing import remove_stopwords, strip_non_alphanum
from gensim.parsing.preprocessing import strip_numeric, strip_punctuation, strip_short

# import gensim
# print(gensim.parsing.preprocessing.STOPWORDS)

In [108]:
# Function to count total token words in a DataFrame or a column
def count_words(var):
    """Count the tokens held in a text column or in a token table.

    Parameters
    ----------
    var : pd.Series or pd.DataFrame
        * Series: every non-null entry is split on whitespace and the
          number of resulting tokens is summed.
        * DataFrame: every non-null cell counts as one token.

    Returns
    -------
    int or None
        Total token count. Any other input type yields None.
    """
    if isinstance(var, pd.Series):
        # Series.iteritems() was removed in pandas 2.0, so iterate over the
        # values directly. Missing entries are skipped rather than crashing
        # on `float.split`.
        return sum(len(str(text).split()) for text in var.dropna())
    if isinstance(var, pd.DataFrame):
        # DataFrame.count() gives the non-null cells per column.
        return int(var.count().sum())
    return None
    
# Translate a word's POS tag into the POS constant that lemmatize() accepts
def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant for a single word.

    The word is tagged on its own with ``nltk.pos_tag`` and only the first
    letter of the resulting tag (upper-cased) is inspected:
    J -> adjective, N -> noun, V -> verb, R -> adverb.
    Any other tag falls back to noun.
    """
    _token, pos_label = nltk.pos_tag([word])[0]
    initial = pos_label[0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag are treated as nouns.
    return wordnet.NOUN

# Custom function for text lemmatization that keeps only whitelisted words
def lemmatize_custom(my_list, cwords, behaviour_file="behaviour_words.txt"):
    """Lemmatize each text and keep only the whitelisted lemmas.

    Parameters
    ----------
    my_list : iterable of str
        Texts to process; each one is split on whitespace.
    cwords : list of str
        Extra words to keep in addition to the behaviour words. Previously
        these were added to the dictionary but then discarded by the final
        behaviour-word filter, so the argument had no effect.
    behaviour_file : str, optional
        Path to a text file of whitespace-separated behaviour words.
        Defaults to the path that used to be hard-coded.

    Returns
    -------
    pd.DataFrame
        One row per input text, one kept lemma per cell; shorter rows are
        padded with missing values by the DataFrame constructor.

    Notes
    -----
    The earlier NLTK dictionary check is gone: every whitelisted word had
    been added to that dictionary, so the check could never remove anything
    the whitelist kept. With an empty ``cwords`` the output is the same as
    before.
    """
    start = time.perf_counter()
    lemmatizer = WordNetLemmatizer()

    with open(behaviour_file, "r") as f:
        behaviour_words = [word for line in f for word in line.split()]

    # A set gives constant-time membership tests in the filter below.
    keep = set(behaviour_words)
    keep.update(cwords or [])

    # Tagging + lemmatizing depends only on the word itself, so each
    # distinct word is processed once and then reused.
    lemma_cache = {}

    def _lemma(word):
        if word not in lemma_cache:
            lemma_cache[word] = lemmatizer.lemmatize(word, get_wordnet_pos(word))
        return lemma_cache[word]

    tweets_ll = []
    for item in my_list:
        lemmas = (_lemma(word) for word in item.split())
        tweets_ll.append([lemma for lemma in lemmas if lemma in keep])

    end = time.perf_counter()
    print("Total time taken in lemmatization: {:.2f} seconds".format(end-start))
    return pd.DataFrame(tweets_ll)

### Importing data from our saved CSV

In [100]:
# Load the raw tweets from CSV; the first CSV column becomes the index
tweets = pd.read_csv("tweets_raw_berk.csv", index_col=0)

# Token count of the untouched text, kept for the before/after comparison
words_earlier = count_words(tweets["Text"])

print("Number of rows: {}".format(len(tweets)))
tweets.head()

Number of rows: 4750


Unnamed: 0,Text,Date,Retweets,Favorites,Mentions,HashTags
2835,Lockdown Day 7: Banks begin loan moratorium me...,2020-04-01 00:07:13+00:00,0,0,,
2834,It’s not a surprise that people had to get gra...,2020-04-01 00:33:33+00:00,0,1,,
495,bought a new phone bc I ran out of storage for...,2020-04-01 01:17:18+00:00,0,1,,
3483,"Looking forward to ending the lockdown, Britai...",2020-04-01 01:40:23+00:00,0,0,,
494,u know i bought stardew valley on my phone bc ...,2020-04-01 02:45:03+00:00,0,0,,


### Data cleaning and normalization

In [101]:
# Cleaning steps, applied to every tweet in exactly this order
cleaning_steps = (
    # Drop tokens starting with "@" or "&", and anything from "http" onwards
    lambda text: re.sub(r"(?:\@|\&|http)\S+", "", text),
    # Remove digits
    strip_numeric,
    # Remove non-alphanumeric characters
    strip_non_alphanum,
    # Remove punctuation characters
    strip_punctuation,
    # Remove short words, minsize 3
    lambda text: strip_short(text, minsize=3),
    # Lower-case everything before the stopword lookup
    lambda text: text.lower(),
    # Remove stopwords
    remove_stopwords,
)

tweets_text = []
for raw_text in tweets["Text"]:
    cleaned = raw_text
    for step in cleaning_steps:
        cleaned = step(cleaned)
    tweets_text.append(cleaned)

pd.DataFrame(tweets_text, columns=["Text"]).head()

Unnamed: 0,Text
0,lockdown day banks begin loan moratorium measu...
1,surprise people grants mobile pay points like ...
2,bought new phone ran storage photos bout month...
3,looking forward ending lockdown britain german...
4,know bought stardew valley phone wanted play a...


In [None]:
# Adding sentiment label of tweets
# Each lexicon file is read as one entry per line.
with open("lexicon/positive.txt", "r") as f:
    positive = [line.strip() for line in f]
with open("lexicon/negative.txt", "r") as f:
    negative = [line.strip() for line in f]

# Set copies give constant-time membership tests inside the loop below.
# The lists above are kept unchanged in case another cell relies on them.
positive_lookup = set(positive)
negative_lookup = set(negative)

tweets_senti = []
for item in tweets_text:
    p = 0
    n = 0
    for token in item.split():
        # A token found in both lexicons counts as positive only.
        if token in positive_lookup:
            p += 1
        elif token in negative_lookup:
            n += 1
    # Ties, including tweets with no lexicon words at all, are labelled
    # "positive"; there is no neutral label.
    tweets_senti.append("positive" if p >= n else "negative")

sentiment_labels = pd.Series(tweets_senti, name="sentiment")
sentiment_labels.head(5)

### Running our custom lemmatization function

In [109]:
# Tokenize the cleaned tweets into a DataFrame:
# one row per tweet (like a transaction), one retained word per cell

# Extra words handed to lemmatize_custom, e.g. ["lockdown"]; none for now
custom_words = []

tweets_tokenized = lemmatize_custom(my_list=tweets_text, cwords=custom_words)

Total time taken in lemmatization: 13.27 seconds


In [110]:
# Compare token counts before and after preprocessing
words_after = count_words(tweets_tokenized)
print("Words before preprocessing: {}".format(words_earlier))
print("Words after preprocessing: {}".format(words_after))
print("Words removed: {}".format(words_earlier-words_after))

# DataFrame.dropna returns a new frame, so the result must be assigned;
# the original call discarded it and rows with no words were never removed.
# The index is left as-is, so it still points at the original tweet positions.
tweets_tokenized = tweets_tokenized.dropna(how="all")
tweets_tokenized.head(5)

Words before preprocessing: 188916
Words after preprocessing: 10642
Words removed: 178274


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,pay,mobile,,,,,,,,,,
1,mobile,pay,,,,,,,,,,
2,bought,phone,phone,,,,,,,,,
3,smartphone,,,,,,,,,,,
4,bought,phone,want,,,,,,,,,


In [111]:
# Save the tokenized tweets for the Python workflow: header row, no index column
tweets_tokenized.to_csv(
    "tweets_tokenized_py.csv",
    header=True,
    index=False,
)
# Variant for the R workflow (1-based index column, no header), currently disabled:
# tweets_tokenized.index += 1
# tweets_tokenized.to_csv("tweets_tokenized_r.csv", index = True, header = False)