In [1]:
import re
import pandas as pd  
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import string
import nltk
#nltk.download('all')
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from collections import Counter
from nltk.stem import WordNetLemmatizer
from functools import partial
import warnings 
warnings.filterwarnings("ignore", category=DeprecationWarning)
from sklearn.metrics import confusion_matrix , classification_report
%matplotlib inline

from wordcloud import WordCloud

#sentiment calculation
import emoji
from nltk.corpus import sentiwordnet as swn

#Weighted Additive
import math

# mysql
import mysql.connector

In [2]:
def remove_pattern(input_txt, pattern):
    r = re.findall(pattern, input_txt)
    for i in r:
        input_txt = re.sub(i, '', input_txt)
        
    return input_txt

In [3]:
# Replaces url
def replaceURL(text):
    text = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','url',text)
#    text = re.sub(r'#([^\s]+)', r'\1', text)
    return text

In [4]:
# Replace slangs

# Creates a dictionary with slangs and their equivalents and replaces them
with open('slang.txt') as file:
    slang_map = dict(map(str.strip, line.partition('\t')[::2])
    for line in file if line.strip())

slang_words = sorted(slang_map, key=len, reverse=True) # longest first for regex
regex = re.compile(r"\b({})\b".format("|".join(map(re.escape, slang_words))))
replaceSlang = partial(regex.sub, lambda m: slang_map[m.group(1)])

In [5]:
# separate the hash tags

# function to collect hashtags
def hashtag_extract(x):
    words = x.split()
    hashtags = []
    # Loop over the words in the tweet
    for i in words:
        ht = re.findall(r"#(\w+)", i)
        hashtags.append(ht)
    hashtags = sum(hashtags,[])
    return hashtags

In [6]:
#Remove hashtag from tidy tweet
def remove_hashtags(sentence):
    return ' '.join(re.sub("#(\w+)"," ",sentence).split())

In [7]:
#Hashtag decompositon(CamelCase)
def camel_case_split(identifier):
    matches = re.finditer('.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', identifier)
    return ' '.join([m.group(0) for m in matches])

In [8]:
def extract_emojis(s):
    return ' '.join(c for c in s if c in emoji.UNICODE_EMOJI['en'])

In [9]:
def deEmojify(text):
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)
    return regrex_pattern.sub(r'',text)

In [10]:
emojiSentiment = pd.read_csv('Emoji_Sentiment_Data.csv')
emojiSentiment['emoji value'] = emojiSentiment['emoji value'].apply(lambda x : chr(int(x,16)))

In [11]:
#Assign sentiment value to the emoji
all_emojis = list(emojiSentiment['emoji value'])

def getIndexOfEmoji(emoji):
    for i in range(len(all_emojis)):
        if(emoji == all_emojis[i]):
            return i
    return 1
  #return -1

def getEmojiSentimentScore(emoji):
    index = getIndexOfEmoji(emoji)
    return emojiSentiment.loc[index, 'sentiment score']

def getSentimentScore(emojis):
    if(len(emojis) == 0):
        return ""
    emojis = emojis.split(" ")
    sum = 0
    count = 0
    for emoji in emojis:
        sum += getEmojiSentimentScore(emoji)
        count = count + 1
    return sum/count

In [12]:
# Removing RT
def removeRTInFrontOfWord(text):
    text = re.sub(r'\brt\b', '', text).strip()
    return text

In [13]:
def runFun(ls, func):
    newls = []
    for i in ls:
        newls.append(func(i))
    return newls

In [14]:
# Elongated
def replaceElongated(word):
    repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
    repl = r'\1\2\3'
    if wordnet.synsets(word):
        return word
    repl_word = repeat_regexp.sub(repl, word)
    if repl_word != word:      
        return replaceElongated(repl_word)
    else:       
        return repl_word

In [16]:
# removing stop words

stoplist = stopwords.words('english')
my_stopwords = "multiexclamation multiquestion multistop url atuser st rd nd th am pm" # my extra stopwords
stoplist = stoplist + my_stopwords.split()

def removeStopWords(ls):
    return list(filter(lambda w: w not in stoplist, ls))

In [17]:
wnl = nltk.WordNetLemmatizer()

# calucalting sentiScore- tidy tweet
def calculateSentScore(taggedsent):
    score_list=[]
    for idx2,t in enumerate(taggedsent):
        newtag=''
        lemmatized=wnl.lemmatize(t[0])
        if t[1].startswith('NN'):
            newtag='n'
        elif t[1].startswith('JJ'):
            newtag='a'
        elif t[1].startswith('V'):
            newtag='v'
        elif t[1].startswith('R'):
            newtag='r'
        else:
            newtag=''       
        if(newtag!=''):    
            synsets = list(swn.senti_synsets(lemmatized, newtag))
            #Getting average of all possible sentiment        
            score=0
            if(len(synsets)>0):
                for syn in synsets:
                    score+=syn.pos_score()-syn.neg_score()
                score_list.append(score/len(synsets))
    if(len(score_list) != 0):
        return (sum([word_score for word_score in score_list])/len(score_list))

In [18]:
# calucalting sentiScore-hashtags
def calculateHashtagsSentiScore(hashtags):
    taggedlist=[]
    for hashtag in hashtags: 
        hashtagWords =  hashtag.split()
        taggedlist.append(nltk.pos_tag(hashtagWords))
    score_list=[]
    for idx,taggedsent in enumerate(taggedlist):
        score_list.append([])
        for idx2,t in enumerate(taggedsent):
            newtag=''
            lemmatized=wnl.lemmatize(t[0])
            if t[1].startswith('NN'):
                newtag='n'
            elif t[1].startswith('JJ'):
                newtag='a'
            elif t[1].startswith('V'):
                newtag='v'
            elif t[1].startswith('R'):
                newtag='r'
            else:
                newtag=''       
            if(newtag!=''):    
                synsets = list(swn.senti_synsets(lemmatized, newtag))
                #Getting average of all possible sentiments        
                score=0
                if(len(synsets)>0):
                    for syn in synsets:
                        score+=syn.pos_score()-syn.neg_score()
                    score_list[idx].append(score/len(synsets))
    sentence_sentiment=[]
    for score_sent in score_list:
        if(len(score_sent) != 0):
            sentence_sentiment.append(sum([word_score for word_score in score_sent])/len(score_sent))
    
    if(len(sentence_sentiment) > 0):
        return sum(sentence_sentiment)/len(sentence_sentiment)
    else:
        return ''

In [19]:
def sentiment(final_score):
    if final_score is None:
        return np.nan
    elif final_score > 0.0:
        return "positive"
    elif final_score < 0.0:
        return "negative"
    else:
        return "neutral"

In [20]:
def lexicon_sentiment(data):
    tweet = data[0]
    hashtags = data[1]
    emojis = data[2]
    tokenized_tweet = tweet.split()
    
    # POS Tagging
    tokenized_tweet_POS = nltk.pos_tag(tokenized_tweet)
    
    # get tweet score
    tweet_score = calculateSentScore(tokenized_tweet_POS)
    
    # get hashtag score
    hashtag_score = calculateHashtagsSentiScore(hashtags)
    
    # get emoji score
    emoji_score = getSentimentScore(emojis)
    
    # get score
    tweet_len = len(tokenized_tweet)
    hashtag_len = len(hashtags)
    
    if(emojis != ''):
        emoji_len = len(emojis.split(" "))
    else:
        emoji_len = 0
        
    total_len = tweet_len + hashtag_len + emoji_len
    
    if(total_len == 0):
        return

    tweet_weight = tweet_len/total_len
    hashtag_weight = hashtag_len/total_len
    emoji_weight = emoji_len/total_len

    if(type(tweet_score) != float):
        tweet_score = 0
    if(emoji_score == ''):
        emoji_score = 0
    if(hashtag_score == ''):
        hashtag_score = 0

    return tweet_weight*tweet_score + hashtag_weight*hashtag_score + emoji_weight*emoji_score

def preprocess(tweet):
     # remove twitter handles (@user)
    tidy_tweet = remove_pattern(tweet, "@[\w]*")
    
    # replaces url
    tidy_tweet = replaceURL(tidy_tweet)
    
    # removing short words
    tidy_tweet = ' '.join([w for w in tidy_tweet.split() if len(w)>3])
    
    # replace slang words
    
    # separate the hash tags
    hashtags = hashtag_extract(tidy_tweet)
    
    # remove hashtag from tidy tweet
    tidy_tweet = remove_hashtags(tidy_tweet)                      
    
    # hashtag decompositon(CamelCase)
    hashtags = [camel_case_split(hashtag) for hashtag in hashtags]
    
    # seperate emojis
    emojis = extract_emojis(tidy_tweet)
    
    # remove emoj from tidy tweet
    tidy_tweet = deEmojify(tidy_tweet)
    
    # lowercase
    tidy_tweet = tidy_tweet.lower()
    hashtags = [hashtag.lower() for hashtag in hashtags]
    
    # removing RT
    tidy_tweet = removeRTInFrontOfWord(tidy_tweet)
    
    # remove special characters, numbers, punctuations
    tidy_tweet = tidy_tweet.replace("[^a-zA-Z#]", " ")
    
    # tokenization
    tokenized_tweet = tidy_tweet.split()
    
    # elongated
    tokenized_tweet = runFun(tokenized_tweet, replaceElongated)
    
    # spell correction
    #tokenized_tweet = runFun(tokenized_tweet, spellCorrection)
    
    # removing stop words
    tokenized_tweet = removeStopWords(tokenized_tweet)
    
    # lemmatizer
    lemmatizer = WordNetLemmatizer()
    tokenized_tweet = [lemmatizer.lemmatize(i) for i in tokenized_tweet]
    
    # POS Tagging
    #tokenized_tweet_POS = nltk.pos_tag(tokenized_tweet)
    
    # stitch tokens back together
    tidy_tweet = ' '.join(tokenized_tweet)
    
    return (tidy_tweet, hashtags, emojis)

In [21]:
# calculate tweet sentiment and store it in database
mydb = mysql.connector.connect(host="localhost", user="root", password="Chathu97@", database="data_schema")
mycursor = mydb.cursor()

sql = "INSERT INTO final_table(tweet, preprocssed_tweet, sentiment_intnsity) VALUES (%s, %s, %s)"

def tweetToDB(tweet):
    preprocessed_tweet_obj = preprocess(tweet)
    sentiment_intensity =  lexicon_sentiment(preprocessed_tweet_obj)
    if(sentiment_intensity is not None):
        sentiment_intensity = float(sentiment_intensity)                 
    val = (tweet, preprocessed_tweet_obj[0], sentiment_intensity)
    mycursor.execute(sql, val)
    mydb.commit()
    print(mycursor.rowcount, "record inserted.")

In [22]:
combi = pd.read_csv('train22_data.csv', usecols=['tweet'])
tweets = list(combi['tweet'])
tweets

['So far 24 people infected with the Delta COVID variant in Sri Lanka - Director General of Health Services /‚Ä¶ https://t.co/qKZBdVWf3p',
 'The highest number of vaccines per day of 232,526 admininistred on July 12.  So far over 4 million first doses of t‚Ä¶ https://t.co/3hpBOvdJjJ',
 'Police Media says investigations are continuing and speed-up in connection with sexually abusing the minor girl. So‚Ä¶ https://t.co/N4cNsUxQct',
 'Sri Lanka üá±üá∞ introduces Digital Tourism and planning to launch promotions to attract digital tourists to the countr‚Ä¶ https://t.co/laxAo3XkCL',
 'Tragedyüò¢\nMother kills her newborn baby and burnt the body immediately after delivery in Kantale area- Police.‚Ä¶ https://t.co/NxlUGaDhHq',
 'The United States continues to support small businesses in #SriLanka as they work to overcome the economic challeng‚Ä¶ https://t.co/AP6Fn6ADwi',
 'Drug dealer "Kudu Noni" arrested for transporting Heroin islandwide through a Courier service after packed heroin i‚Ä¶ h

In [23]:
for tweet in tweets:
    tweetToDB(tweet)
print("completed")

1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record ins

1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record ins

1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record ins

1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record inserted.
1 record ins