### VADER Mulitlingual
#### https://github.com/brunneis/vader-multi

#### Install VADER Mulitlingual

In [None]:
!pip install vaderSentiment

### Import Libraries

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

import pandas as pd
import time
from tqdm.notebook import tqdm
tqdm.pandas()

### Load Dataset

In [2]:
data_file = '../datasets/OpeNERes/opener_es.xlsx'
dfTrain = pd.read_excel(data_file, sheet_name='Sheet1')

In [3]:
print(len(dfTrain))

107863


In [4]:
dfTrain.head()

Unnamed: 0,ID,Title,Review
0,0,"Excelente servicio, buenas instalaciones, limp...","Mi estancia fue tan sólo de 2 días, el precio ..."
1,1,Lo pasamos genial,Pasamos unos días muy agradables en el memorie...
2,2,Muy congestionado,Demasiada gente en un vagón aunque es un medio...
3,3,Fantastica,zona restaurada. Hay una cervecería artesanal ...
4,4,Por algo es una de las maravillas del mundo,Visite la zona arqueológica en compañía de mi ...


In [5]:
# Drop rows where "Review" column is empty
dfTrain = dfTrain.dropna(subset=["Review"])

In [11]:
import re
def sentence_case(text):
    # Split into sentences. Therefore, find all text that ends
    # with punctuation followed by white space or end of string.
    sentences = re.findall(r'(?:\d+\.\d+|\b[A-Z](?:\.[A-Z])*\b\.?|[^.!?])+[.!?](?:\s|\Z)', text)

    # Capitalize the first letter of each sentence
    sentences = [x[0].upper() + x[1:] for x in sentences]
    #print(sentences)
    # Combine sentences
    return ''.join(sentences)

def fix_punctuation(text):
  try:
    text = text.lower()  #lower case
    text = text.replace('\n', '')
    text = text.strip()
    #add space after punctuation
    text1 = re.sub(r'(\d+\.\d+|\b[A-Z](?:\.[A-Z])*\b\.?)|([.,;:!?)])\s*', lambda x: x.group(1) or f'{x.group(2)} ', text)
    return text1
  except:
    return text

### Remove Emojis from text

In [12]:
def remove_words_and_emojis(text):
    # Sample list of words/emojis to remove
    words_to_remove = ['$:', '%)', '%-)', '&-:', '&:', "( '}{' )", '(%', "('-:", "(':", '((-:', '(*', '(-%', '(-*', '(-:', '(-:0', '(-:<', '(-:o', '(-:O', '(-:{', '(-:|>*', '(-;', '(-;|', '(8', '(:', '(:0', '(:<', '(:o', '(:O', '(;', '(;<', '(=', '(?:', '(^:', '(^;', '(^;0', '(^;o', '(o:', ")':", ")-':", ')-:', ')-:<', ')-:{', '):', '):<', '):{', ');<', '*)', '*-)', '*-:', '*-;', '*:', '*<|:-)', '*\\0/*', '*^:', ',-:', "---'-;-{@", '--<--<@', '.-:', '..###-:', '..###:', '/-:', '/:', '/:<', '/=', '/^:', '/o:', '0-8', '0-|', '0:)', '0:-)', '0:-3', '0:03', '0;^)', '0_o', '10q', '1337', '143', '1432', '14aa41', '182', '187', '2g2b4g', '2g2bt', '2qt', '3:(', '3:)', '3:-(', '3:-)', '4col', '4q', '5fs', '8)', '8-d', '8-o', '86', '8d', ':###..', ':$', ':&', ":'(", ":')", ":'-(", ":'-)", ':(', ':)', ':*', ':-###..', ':-&', ':-(', ':-)', ':-))', ':-*', ':-,', ':-.', ':-/', ':-<', ':-d', ':-D', ':-o', ':-p', ':-[', ':-\\', ':-c', ':-p', ':-|', ':-||', ':-Þ', ':/', ':3', ':<', ':>', ':?)', ':?c', ':@', ':d', ':D', ':l', ':o', ':p', ':s', ':[', ':\\', ':]', ':^)', ':^*', ':^/', ':^\\', ':^|', ':c', ':c)', ':o)', ':o/', ':o\\', ':o|', ':P', ':{', ':|', ':}', ':Þ', ';)', ';-)', ';-*', ';-]', ';d', ';D', ';]', ';^)', '</3', '<3', '<:', '<:-|', '=)', '=-3', '=-d', '=-D', '=/', '=3', '=d', '=D', '=l', '=\\', '=]', '=p', '=|', '>-:', '>.<', '>:', '>:(', '>:)', '>:-(', '>:-)', '>:/', '>:o', '>:p', '>:[', '>:\\', '>;(', '>;)', '>_>^', '@:', '@>-->--', "@}-;-'---", 'aas', 'aayf', 'afu', 'alol', 'ambw', 'aml', 'atab', 'awol', 'ayc', 'ayor', 'aug-00', 'bfd', 'bfe', 'bff', 'bffn', 'bl', 'bsod', 'btd', 'btdt', 'bz', 'b^d', 'cwot', "d-':", 'd8', 'd:', 'd:<', 'd;', 'd=', 'doa', 'dx', 'ez', 'fav', 'fcol', 'ff', 'ffs', 'fkm', 'foaf', 'ftw', 'fu', 'fubar', 'fwb', 'fyi', 'fysa', 'g1', 'gg', 'gga', 'gigo', 'gj', 'gl', 'gla', 'gn', 'gr8', 'grrr', 'gt', 'h&k', 'hagd', 'hagn', 'hago', 'hak', 'hand', 'heart', 'hearts', 'hho1/2k', 'hhoj', 'hhok', 'hugz', 'hi5', 'idk', 'ijs', 'ilu', 'iluaaf', 'ily', 'ily2', 'iou', 'iyq', 'j/j', 'j/k', 'j/p', 'j/t', 'j/w', 'j4f', 'j4g', 'jho', 'jhomf', 'jj', 'jk', 'jp', 'jt', 'jw', 'jealz', 'k4y', 'kfy', 'kia', 'kk', 'kmuf', 'l', 'l&r', 'laoj', 'lmao', 'lmbao', 'lmfao', 'lmso', 'lol', 'lolz', 'lts', 'ly', 'ly4e', 'lya', 'lyb', 'lyl', 'lylab', 'lylas', 'lylb', 'm8', 'mia', 'mml', 'mofo', 'muah', 'mubar', 'musm', 'mwah', 'n1', 'nbd', 'nbif', 'nfc', 'nfw', 'nh', 'nimby', 'nimjd', 'nimq', 'nimy', 'nitl', 'nme', 'noyb', 'np', 'ntmu', 'o-8', 'o-:', 'o-|', 'o.o', 'O.o', 'o.O', 'o:', 'o:)', 'o:-)', 'o:-3', 'o:3', 'o:<', 'o;^)', 'ok', 'o_o', 'O_o', 'o_O', 'pita', 'pls', 'plz', 'pmbi', 'pmfji', 'pmji', 'po', 'ptl', 'pu', 'qq', 'qt', 'r&r', 'rofl', 'roflmao', 'rotfl', 'rotflmao', 'rotflmfao', 'rotflol', 'rotgl', 'rotglmao', 's:', 'sapfu', 'sete', 'sfete', 'sgtm', 'slap', 'slaw', 'smh', 'snafu', 'sob', 'swak', 'tgif', 'thks', 'thx', 'tia', 'tmi', 'tnx', 'true', 'tx', 'txs', 'ty', 'tyvm', 'urw', 'vbg', 'vbs', 'vip', 'vwd', 'vwp', 'wag', 'wd', 'wilco', 'wp', 'wtf', 'wtg', 'wth', 'x-d', 'x-p', 'xd', 'xlnt', 'xoxo', 'xoxozzz', 'xp', 'xqzt', 'xtc', 'yolo', 'yoyo', 'yvw', 'yw', 'ywia', 'zzz', '[-;', '[:', '[;', '[=', '\\-:', '\\:', '\\:<', '\\=', '\\^:', '\\o/', '\\o:', ']-:', ']:', ']:<', '^<_<', '^urs', '{:', '|-0', '|-:', '|-:>', '|-o', '|:', '|;-)', '|=', '|^:', '|o:', '||-:', '}:', '}:(', '}:)', '}:-(', '}:-)', 'x-d', 'x-p', 'xd', 'xp', 'yay']
    words = text.split()
    neat_words = []
    for word in words:
        if word not in words_to_remove:
            neat_words.append(word)
    return ' '.join(neat_words)

In [14]:
import re

def remove_emojis(text):
    # Define regex pattern to match emojis
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # chinese char
                               u"\U00002702-\U000027B0"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # dingbats
                               u"\u3030"
                               "]+", flags=re.UNICODE)
    # Remove emojis from the text
    cleaned_text = emoji_pattern.sub(r'', text)
    return cleaned_text

Cleaned Sentence:  -- utf-8 encoded emojis such as  and  and 


In [16]:
dfTrain['reviewCorr'] = dfTrain['Review'].progress_apply(remove_words_and_emojis)
dfTrain['reviewCorr'] = dfTrain['reviewCorr'].progress_apply(remove_emojis)
dfTrain['reviewCorr'] = dfTrain['reviewCorr'].progress_apply(fix_punctuation)

  0%|          | 0/107863 [00:00<?, ?it/s]

  0%|          | 0/107863 [00:00<?, ?it/s]

  0%|          | 0/107863 [00:00<?, ?it/s]

In [17]:
dfTrain[0:3]

Unnamed: 0,ID,Title,Review,Title_Review,reviewCorr
0,0,"Excelente servicio, buenas instalaciones, limp...","Mi estancia fue tan sólo de 2 días, el precio ...","Excelente servicio, buenas instalaciones, limp...","excelente servicio, buenas instalaciones, limp..."
1,1,Lo pasamos genial,Pasamos unos días muy agradables en el memorie...,Lo pasamos genial Pasamos unos días muy agrada...,lo pasamos genial pasamos unos días muy agrada...
2,2,Muy congestionado,Demasiada gente en un vagón aunque es un medio...,Muy congestionado Demasiada gente en un vagón ...,muy congestionado demasiada gente en un vagón ...


In [23]:
def vader_polarity_label(compound):
    polarity_label = ''
    if(compound >= 0.05):
        polarity_label = "Positive"
    elif((compound > -0.05) and (compound < 0.05)):
        polarity_label = "Neutral"
    elif(compound <= -0.05):
        polarity_label = "Negative"
    return polarity_label

In [24]:
def gold_polarity_label(score):
    polarity_label = ''
    if(score >= 3.0 and score <= 5.0):
        polarity_label = "Positive"
    elif((score >= 0.0) and (score <= 2.0)):
        polarity_label = "Negative"
    elif((score >= 2.0) and (score <= 3.0)):
        polarity_label = "Neutral"
    return polarity_label

In [25]:
def normalize_vader_polarity(compound):
    polarity_label = ''
    if(compound > 0.6 and compound <= 1.0):
        polarity_label = 5
    elif((compound > 0.2) and (compound <= 0.6)):
        polarity_label = 4
    elif((compound > -0.2) and (compound <= 0.2)):
        polarity_label = 3
    elif((compound > -0.6) and (compound <= -0.2)):
        polarity_label = 2
    elif((compound > -1.0) and (compound <= -0.6)):
        polarity_label = 1
    return polarity_label

In [None]:
dfTrain['gold_polarity_label'] = dfTrain['Polarity'].progress_apply(gold_polarity_label)

In [27]:
t0 = time.time()
# Running in Title
analyzer = SentimentIntensityAnalyzer()
for i, row in tqdm(dfTrain.iterrows()):
    #print(i, row.reviewCorr)
    try:
        vs = analyzer.polarity_scores(row.reviewCorr)
        dfTrain.at[i,'neg'] = vs['neg']
        dfTrain.at[i,'neu'] = vs['neu']
        dfTrain.at[i,'pos'] = vs['pos']
        dfTrain.at[i,'compound'] = vs['compound']
        dfTrain.at[i,'polarity_label'] = vader_polarity_label(vs['compound'])
        dfTrain.at[i,'Polarity'] = normalize_vader_polarity(vs['compound'])
    except Exception as e:
        #dfTrain1.at[i,'Score'] = -100
        print("vaderSentiment Error: " + str(e))
    #break
t1 = time.time()
total = t1-t0
print('Total Time (Seconds): ' + str(total))

0it [00:00, ?it/s]

  dfTrain.at[i,'polarity_label'] = vader_polarity_label(vs['compound'])


vaderSentiment Error: list index out of range
vaderSentiment Error: list index out of range
vaderSentiment Error: list index out of range
vaderSentiment Error: list index out of range
vaderSentiment Error: HTTP Error 400: Bad Request
vaderSentiment Error: HTTP Error 400: Bad Request
vaderSentiment Error: HTTP Error 400: Bad Request
vaderSentiment Error: HTTP Error 400: Bad Request
vaderSentiment Error: HTTP Error 400: Bad Request
vaderSentiment Error: HTTP Error 400: Bad Request
vaderSentiment Error: HTTP Error 400: Bad Request
vaderSentiment Error: HTTP Error 400: Bad Request
vaderSentiment Error: HTTP Error 400: Bad Request
vaderSentiment Error: HTTP Error 400: Bad Request
vaderSentiment Error: HTTP Error 400: Bad Request
vaderSentiment Error: HTTP Error 400: Bad Request
Total Time (Seconds): 30223.190549850464


In [28]:
dfTrain.to_excel("/sentiment-analysis-results/Tweets_Spanish_VADER.xlsx")

In [2]:
print("Sentiment Analysis Output file save successfully..!")

Sentiment Analysis Output file save successfully..!
