In [1]:
# This file is used for preprocessing twitter data
# Saving into relevant data shape and data cleaning

import re
import nltk
import time
import numpy as np
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from gensim.parsing.preprocessing import remove_stopwords, strip_non_alphanum
from gensim.parsing.preprocessing import strip_numeric, strip_punctuation, strip_short

# import gensim
# print(gensim.parsing.preprocessing.STOPWORDS)

### Custom function for data cleaning

In [2]:
def data_cleaning(tweets_struct):
    # Removing links and ampersand attached text from the tweets
    text_list = [re.sub(r"(?:\@|\&|http)\S+", "", item) for item in tweets_struct["Text"]]

    # Removing non-alphabetic and numeric characters
    text_list = [strip_numeric(item) for item in text_list]
    text_list = [strip_non_alphanum(item) for item in text_list]

    # Removing punctuation characters
    text_list = [strip_punctuation(item) for item in text_list]

    # Short words removal, minsize 3
    text_list = [strip_short(item, minsize=3) for item in text_list]

    # All text to lower case
    text_list = [item.lower() for item in text_list]

    # Removing the stopwords from the tweets
    text_list = [remove_stopwords(item) for item in text_list]

    # Remove everything except text
    # text_list["text"] = [re.sub(r"[^a-zA-Z]+", ' ', item) for item in text_list["text"]]
    # text_list["text"] = [re.sub(r"[^a-zA-Z0-9]+", ' ', item) for item in text_list["text"]]
    
    return text_list

In [35]:
# Function to count total token words in a DataFrame or a column
def count_words(var):
    if isinstance(var, pd.Series):
        tokens = 0
        for items in var.iteritems():
            tokens += len(items[1].split())
        return tokens
    elif isinstance(var, pd.DataFrame):
        return var.count().sum()
    
# Map POS tag to first character lemmatize() accepts
def get_wordnet_pos(word):
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

# Custom functionfor text lemmatization and removing non-dictionary words
def lemmatize_custom(my_list, cwords, use):
    start = time.time()
    lemma = nltk.wordnet.WordNetLemmatizer()
    words = set(nltk.corpus.words.words())
    
    # check for sentiment or behaviour
    if use=="sentiment":
        with open("sentiment_words.txt","r") as f:
            lines = [line.strip() for line in f]
    elif use=="behaviour":
        with open("behaviour_words.txt","r") as f:
            lines = [line.strip() for line in f]
    
    temp_words = []
    for i in lines:
        for ix in i.split():
            temp_words.append(ix)
            
    cwords = cwords + temp_words
    cwords = [lemma.lemmatize(x, get_wordnet_pos(x)) for x in cwords]
    # for i in cwords:
    #    words.add(i)
        
    tweets_ll = []
    for item in my_list:
        word_list = item.split()
        word_list = [lemma.lemmatize(x, get_wordnet_pos(x)) for x in word_list]
        # word_list = [x for x in word_list if x in words]
        word_list = [x for x in word_list if x in cwords]
        # word_list = [x for x in word_list if x in behaviour_words]
        tweets_ll.append(word_list)
        word_list = None
    end = time.time()
    print("Total time taken in lemmatization: {:.2f} seconds".format(end-start))
    return pd.DataFrame(tweets_ll)

## Preprocessing for Part 1 - Sentiment analysis

In [25]:
tweets_sentiment = pd.read_csv("tweets_sentiment.csv", index_col=0)
words_earlier = count_words(tweets_sentiment["Text"])
print("Number of rows: {}".format(tweets_sentiment.shape[0]))
tweets_sentiment.head()

Number of rows: 1001


Unnamed: 0,Text,Date,Retweets,Favorites,Mentions,HashTags
0,"@XiaomiIndia @Xiaomi we hve ordered Mi 40"" TV ...",2020-04-01 03:59:59+00:00,0,0,@XiaomiIndia @Xiaomi,
1,"@Xiaomi Mi A2 Packed on 2018, July Purchased O...",2020-04-01 06:27:55+00:00,0,0,@Xiaomi,
2,@flipkartsupport recently just before the coun...,2020-04-01 08:17:43+00:00,0,0,@flipkartsupport,
3,#covid19UK I find that a lot of teenagers seem...,2020-04-01 10:26:16+00:00,0,0,,#covid19UK
4,Lockdown making me spend so much money - Just ...,2020-04-01 13:50:51+00:00,0,2,,


In [26]:
tweets_text_sent = data_cleaning(tweets_sentiment)
pd.DataFrame(tweets_text_sent, columns=["Text"]).head()

Unnamed: 0,Text
0,hve ordered purchased emi bank started emi ind...
1,packed july purchased october months got packe...
2,recently country wide lockdown purchased realm...
3,coviduk lot teenagers think game badge honor l...
4,lockdown making spend money bought iphone


In [36]:
# custom_words are compulsory words not to be removed in lemmatization
custom_words = []

tweets_tokenized_sent = lemmatize_custom(tweets_text_sent, custom_words, use="sentiment")
print("Number of rows: {}".format(tweets_tokenized_sent.shape[0]))
tweets_tokenized_sent.head(5)

Total time taken in lemmatization: 3.11 seconds
Number of rows: 1001


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,order,purchase,,,,,,,,
1,purchase,,,,,,,,,
2,purchase,realme,,,,,,,,
3,think,honor,,,,,,,,
4,bought,iphone,,,,,,,,


In [48]:
words_after = count_words(tweets_tokenized_sent)
print("Words before preprocessing: {}".format(words_earlier))
print("Words after preprocessing: {}".format(words_after))
print("Words removed: {}".format(words_earlier-words_after))

# Removing duplicate word from rows
df1 = tweets_tokenized_sent.stack().reset_index().drop(columns='level_1').drop_duplicates()
df1['col'] = df1.groupby('level_0').cumcount()
tweets_tokenized_sent = (df1.pivot(index='level_0', columns='col', values=0)
          .rename_axis(index=None, columns=None))

tweets_tokenized_sent.dropna(axis = 0,how = 'all',inplace = True, thresh=2)

print("Number of rows: {}".format(tweets_tokenized_sent.shape[0]))
tweets_tokenized_sent.to_csv("tweets_tokenized_sent.csv", index = False, header = True)
tweets_tokenized_sent.head(5)

Words before preprocessing: 174550
Words after preprocessing: 2059
Words removed: 172491
Number of rows: 883


Unnamed: 0,0,1,2,3,4,5
0,order,purchase,,,,
2,purchase,realme,,,,
3,think,honor,,,,
4,bought,iphone,,,,
5,buy,redmi,purchase,,,


## Preprocessing for Part 2 - Behaviour analysis

In [41]:
tweets_behaviour = pd.read_csv("tweets_behaviour.csv", index_col=0)
words_earlier = count_words(tweets_behaviour["Text"])
print("Number of rows: {}".format(tweets_behaviour.shape[0]))
tweets_behaviour.head()

Number of rows: 5512


Unnamed: 0,Text,Date,Retweets,Favorites,Mentions,HashTags
0,"To make this stop, we need a complete lockdown...",2020-04-01 00:17:50+00:00,0,1,,
1,Lockdown http://wpsbrittanyp.blogspot.com/2020...,2020-04-01 02:31:54+00:00,0,0,,
2,Day 8 of Lockdown: Recommending ‘USS Indianapo...,2020-04-01 02:32:13+00:00,2,8,,
3,@netflix @hulu @PrimeVideo yes I'm still watch...,2020-04-01 02:33:29+00:00,0,0,@netflix @hulu @PrimeVideo,#lockdown
4,could've been at a @dodgers game with a michi ...,2020-04-01 02:57:57+00:00,0,1,@Dodgers,


In [42]:
tweets_text_beha = data_cleaning(tweets_behaviour)
pd.DataFrame(tweets_text_beha, columns=["Text"]).head()

Unnamed: 0,Text
0,stop need complete lockdown happen people time...
1,lockdown kia ora bloggers today going tell tim...
2,day lockdown recommending uss indianapolis men...
3,yes watching stop asking lockdown
4,game michi hand instead lockdown watching netf...


In [46]:
# custom_words are compulsory words not to be removed in lemmatization
custom_words = []

tweets_tokenized_beha = lemmatize_custom(tweets_text_beha, custom_words, use="behaviour")
print("Number of rows: {}".format(tweets_tokenized_beha.shape[0]))
tweets_tokenized_beha.head(5)

Total time taken in lemmatization: 13.37 seconds
Number of rows: 5512


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,shopping,amazon,,,,,,,,,,,
1,watch,play,xbox,,,,,,,,,,
2,netflix,watch,,,,,,,,,,,
3,watch,,,,,,,,,,,,
4,game,watch,netflix,,,,,,,,,,


In [49]:
words_after = count_words(tweets_tokenized_beha)
print("Words before preprocessing: {}".format(words_earlier))
print("Words after preprocessing: {}".format(words_after))
print("Words removed: {}".format(words_earlier-words_after))

# Removing duplicate word from rows
df1 = tweets_tokenized_beha.stack().reset_index().drop(columns='level_1').drop_duplicates()
df1['col'] = df1.groupby('level_0').cumcount()
tweets_tokenized_beha = (df1.pivot(index='level_0', columns='col', values=0)
          .rename_axis(index=None, columns=None))

tweets_tokenized_beha.dropna(axis = 0,how = 'all',inplace = True, thresh=2)

print("Number of rows: {}".format(tweets_tokenized_beha.shape[0]))
tweets_tokenized_beha.to_csv("tweets_tokenized_beha.csv", index = False, header = True)
tweets_tokenized_beha.head(5)

Words before preprocessing: 174550
Words after preprocessing: 12128
Words removed: 162422
Number of rows: 5084


Unnamed: 0,0,1,2,3,4,5,6,7
0,shopping,amazon,,,,,,
1,watch,play,xbox,,,,,
2,netflix,watch,,,,,,
4,game,watch,netflix,,,,,
5,watch,netflix,,,,,,
