In [2]:
# This notebook preprocesses the raw Twitter data:
# it cleans the tweet text and reshapes it into a table of tokens.

import re
import nltk
import time
import numpy as np
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from gensim.parsing.preprocessing import remove_stopwords, strip_non_alphanum
from gensim.parsing.preprocessing import strip_numeric, strip_punctuation, strip_short

# import gensim
# print(gensim.parsing.preprocessing.STOPWORDS)

In [82]:
# Function to count total token words in a DataFrame or a column
def count_words(var):
    """Count the word tokens held in a pandas Series or DataFrame.

    Parameters
    ----------
    var : pd.Series or pd.DataFrame
        * Series: every non-null value is split on whitespace and the
          resulting pieces are counted.
        * DataFrame: every non-null cell counts as one token.

    Returns
    -------
    int or None
        The total token count, or None when ``var`` is neither a Series
        nor a DataFrame.
    """
    if isinstance(var, pd.Series):
        # Series.iteritems() no longer exists in pandas 2.x, so iterate over
        # the values directly. Null entries are skipped because they have no
        # .split() and contribute no words.
        return sum(len(text.split()) for text in var.dropna())
    if isinstance(var, pd.DataFrame):
        # DataFrame.count() gives the non-null cells per column.
        return int(var.count().sum())
    return None
    
# Translate a word's POS tag into the constant that lemmatize() accepts
def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant for a single word.

    The word is tagged on its own with ``nltk.pos_tag``; the upper-cased
    first letter of that tag selects the constant. Any first letter other
    than J, N, V or R falls back to ``wordnet.NOUN``.
    """
    tagged = nltk.pos_tag([word])
    initial = tagged[0][1][0].upper()

    pos_by_initial = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    if initial in pos_by_initial:
        return pos_by_initial[initial]
    return wordnet.NOUN

# Custom function for reducing each tweet to its whitelisted words
def lemmatize_custom(my_list, cwords, behaviour_file="behaviour_words.txt"):
    """Keep only the whitelisted words of every text and tabulate them.

    Despite its name, this function performs no lemmatization: each text is
    split on whitespace and only the words found in the whitelist are kept.
    The whitelist is the union of ``cwords`` and every whitespace-separated
    word read from ``behaviour_file``.

    Parameters
    ----------
    my_list : iterable of str
        Texts to filter, one entry per tweet.
    cwords : iterable of str
        Extra words to keep. The caller's object is not modified.
    behaviour_file : str, optional
        Path of the text file holding the behaviour words. Defaults to
        "behaviour_words.txt", the path that used to be hard-coded.

    Returns
    -------
    pd.DataFrame
        One row per input text and one kept word per cell, in original
        order. Shorter rows are padded with missing values.

    Raises
    ------
    OSError
        If ``behaviour_file`` cannot be opened.
    """
    start = time.time()

    with open(behaviour_file, "r") as f:
        behaviour_words = [word for line in f for word in line.split()]

    # A set gives constant-time membership tests in the filter below.
    # The NLTK lemmatizer and English word corpus are not needed by this
    # filter, so they are no longer loaded on every call.
    allowed_words = set(cwords) | set(behaviour_words)

    tweets_ll = [
        [word for word in item.split() if word in allowed_words]
        for item in my_list
    ]

    end = time.time()
    print("Total time taken in lemmatization: {:.2f} seconds".format(end-start))
    return pd.DataFrame(tweets_ll)

### Importing data from our saved CSV

In [5]:
# Load the raw tweets saved earlier; the first CSV column becomes the index
tweets = pd.read_csv("tweets_raw.csv", index_col=0)

# Word count of the untouched text, reported again after preprocessing
words_earlier = count_words(tweets["Text"])

print("Number of rows: {}".format(len(tweets.index)))
tweets.head()

Number of rows: 1217


Unnamed: 0,Text,Date,Retweets,Favorites,Mentions,HashTags
629,"@XiaomiIndia @Xiaomi we hve ordered Mi 40"" TV ...",2020-04-01 03:59:59+00:00,0,0,@XiaomiIndia @Xiaomi,
532,@OnlineGpsc I had purchased my online class fr...,2020-04-01 05:48:42+00:00,0,0,@OnlineGpsc,
209,@SamsungIndia @Samsung @amazon Worst experienc...,2020-04-01 06:05:48+00:00,0,0,@SamsungIndia @Samsung @amazon @amazon,
628,"@Xiaomi Mi A2 Packed on 2018, July Purchased O...",2020-04-01 06:27:55+00:00,0,0,@Xiaomi,
688,@flipkartsupport recently just before the coun...,2020-04-01 08:17:43+00:00,0,0,@flipkartsupport,


### Data cleaning and normalization

In [91]:
# Cleaning passes, applied to every tweet in exactly this order
cleaning_steps = (
    # drop @mentions, &-prefixed tokens (e.g. &amp;) and anything starting with http
    lambda text: re.sub(r"(?:\@|\&|http)\S+", "", text),
    # drop digits
    strip_numeric,
    # replace non-alphanumeric characters
    strip_non_alphanum,
    # replace punctuation characters
    strip_punctuation,
    # drop words shorter than 3 characters
    lambda text: strip_short(text, minsize=3),
    # lower-case everything before the stopword pass
    str.lower,
    # drop stopwords
    remove_stopwords,
)

# One full pass over all tweets per step
tweets_text = list(tweets["Text"])
for step in cleaning_steps:
    tweets_text = [step(text) for text in tweets_text]

pd.DataFrame(tweets_text, columns=["Text"]).head()

Unnamed: 0,Text
0,hve ordered purchased emi bank started emi ind...
1,purchased online class phone samsung phone dam...
2,worst experience lockdown bought device receiv...
3,packed july purchased october months got packe...
4,recently country wide lockdown purchased realm...


In [None]:
# Adding sentiment label of tweets
# Each lexicon file is read as one entry per line.
with open("lexicon/positive.txt", "r") as f:
    positive = [line.strip() for line in f]
with open("lexicon/negative.txt", "r") as f:
    negative = [line.strip() for line in f]

# Sets make the per-word lookups below constant-time; the lists above are
# kept unchanged for any later cell that uses them.
positive_lookup = set(positive)
negative_lookup = set(negative)

tweets_senti = []
for item in tweets_text:
    word_list = item.split()
    # A word present in both lexicons is counted as positive only.
    p = sum(1 for word in word_list if word in positive_lookup)
    n = sum(
        1 for word in word_list
        if word not in positive_lookup and word in negative_lookup
    )
    # NOTE(review): ties, including tweets with no lexicon word at all, are
    # labelled "positive". Confirm that a binary label is intended.
    tweets_senti.append("positive" if p >= n else "negative")

sentiment_labels = pd.Series(tweets_senti, name = "sentiment")
sentiment_labels.head(5)

### Running our custom lemmatization function

In [97]:
# Tokenise every cleaned tweet and collect the kept words in a DataFrame:
# one row per tweet (comparable to a transaction), one word per cell.

# Extra words to keep besides those lemmatize_custom reads from its word file,
# e.g. ["lockdown"]; left empty here.
custom_words = list()
tweets_tokenized = lemmatize_custom(my_list=tweets_text, cwords=custom_words)

Total time taken in lemmatization: 0.16 seconds


In [99]:
# Report how many words survived the preprocessing
words_after = count_words(tweets_tokenized)
print("Words before preprocessing: {}".format(words_earlier))
print("Words after preprocessing: {}".format(words_after))
print("Words removed: {}".format(words_earlier-words_after))

# Keep only tweets with at least two remaining words. `thresh` alone expresses
# this; pandas 2.x raises a TypeError when `how` and `thresh` are both given.
# Reassigning instead of using inplace=True leaves the input frame untouched.
tweets_tokenized = tweets_tokenized.dropna(axis=0, thresh=2)

# Long format with one (tweet, word) pair per row. The explicit dropna() keeps
# missing cells out regardless of how stack() treats them. drop_duplicates()
# then removes words repeated within the same tweet.
# NOTE(review): de-duplicating after the thresh filter can leave a tweet with
# a single distinct word. Confirm whether such rows should be kept.
df1 = (tweets_tokenized.stack()
          .dropna()
          .reset_index()
          .drop(columns='level_1')
          .drop_duplicates())

# Renumber the words of each tweet 0..k-1 and pivot back to one row per tweet
df1['col'] = df1.groupby('level_0').cumcount()
tweets_tokenized = (df1.pivot(index='level_0', columns='col', values=0)
          .rename_axis(index=None, columns=None))

print("Number of rows: {}".format(tweets_tokenized.shape[0]))
tweets_tokenized.head(5)

Words before preprocessing: 46849
Words after preprocessing: 2482
Words removed: 44367
Number of rows: 1013


Unnamed: 0,0,1,2,3,4,5
1,purchased,samsung,,,,
4,purchased,realme,,,,
5,samsung,purchased,,,,
6,think,honor,,,,
7,want,buy,,,,


In [100]:
# Write the token table to disk with a header row and without the index
tweets_tokenized.to_csv("tweets_tokenized.csv", header=True, index=False)

# Alternative export, currently disabled: 1-based index, no header row
# tweets_tokenized.index += 1
# tweets_tokenized.to_csv("tweets_tokenized_r.csv", index = True, header = False)