## In this version I extend the polarity approach by also hooking into SentiWordNet

### Loading the file with the sentences

In [2]:
import sys
import pandas as pd
import numpy as np

from utils import *
from parsed_sentence import *

def load_classes(input_file_path):
    """Return the lines of a UTF-8 text file, each stripped of surrounding whitespace.

    The file is read line by line; blank lines are kept as empty strings so the
    result stays aligned with the file's line numbering.
    """
    with open(input_file_path, encoding="utf-8") as handle:
        return [raw_line.strip() for raw_line in handle]

def load_file(input_file_path):
    """Collect every sentence produced by a ParsedSentenceReader over the given file.

    Sentences are requested one at a time with read_next_sentence(); reading stops
    at the first falsy value the reader returns.
    """
    reader = ParsedSentenceReader(input_file_path)
    sentences = []
    sentence = reader.read_next_sentence()
    while sentence:
        sentences.append(sentence)
        sentence = reader.read_next_sentence()
    return sentences


    
#====== MAIN =============================================

# The command-line guard that used to sit here (`len(sys.argv) == 3`, then exit()) was
# removed: the paths below are hard-coded and sys.argv is never read, so the check only
# passed or failed depending on how the kernel happened to be launched.

# Switch between the TRAIN and DEV splits by (un)commenting the matching pair of paths.
#TRAIN
#input_file_path1= 'C:/Users/feder/Desktop/Master/Project_NLP/resource/semeval2013/train_categories.txt'
#input_file_path2= 'C:/Users/feder/Desktop/Master/Project_NLP/resource/semeval2013/train_tweets_parsed.txt'

#DEV
input_file_path1= 'C:/Users/feder/Desktop/Master/Project_NLP/resource/semeval2013/dev_categories.txt'
input_file_path2= 'C:/Users/feder/Desktop/Master/Project_NLP/resource/semeval2013/dev_tweets_parsed.txt'

# One class label per line in the first file, one parsed sentence per entry in the second.
class_array = load_classes(input_file_path1)
parsed_sent_array = load_file(input_file_path2)

In [3]:
# Sanity check: there should be exactly one class label per parsed sentence.
len(parsed_sent_array) == len(class_array)

True

### USING POLARITY LEXICON TO ASSIGN TO EACH WORD A POLARITY SCORE

http://sag.art.uniroma2.it/demo-software/distributional-polarity-lexicon/
- positive, negative and neutral

In [4]:
# Read the polarity lexicon: a headerless, tab-separated file whose first column is the
# token and whose second column is a comma-separated string of three scores.
LEXICON_PATH = "DPL-EN_lrec2016.txt"
polwords = pd.read_csv(LEXICON_PATH, sep='\t', header=None)
polwords.head()

Unnamed: 0,0,1
0,</s>,"0.17247152,0.6118044,0.21572405"
1,!,"0.4407992,0.22381033,0.33539042"
2,i,"0.15485209,0.7418242,0.10332363"
3,the,"0.30322838,0.27008215,0.42668942"
4,",","0.5385329,0.29464877,0.16681834"


In [5]:
# Turn the lexicon table into a dict: token -> list of its three polarity scores.
# Column 0 holds the token and column 1 the comma-separated scores (see the preview above).
# The two columns are walked directly instead of doing a per-row .iloc lookup, which
# gives the same mapping without the per-access indexing overhead.
polaritywords = {
    token: [float(score) for score in scores.split(",")]
    for token, scores in zip(polwords.iloc[:, 0], polwords.iloc[:, 1])
}

In [11]:
### Each word maps to three polarity scores.
### NOTE(review): the lexicon description above lists them as positive, negative and neutral,
### and the feature-generation cell treats index 0 as positive and index 1 as negative;
### confirm the order against the lexicon documentation.

polaritywords['remembered']

[0.22627498, 0.708745, 0.06498006]

### Extracting all the smile types that occur in the tweets of the training set

In [12]:
# Read the raw training tweets, one per line, dropping surrounding whitespace.
with open("train_tweets.txt", encoding='utf-8') as tweet_file:
    tweets_train = [raw_tweet.strip() for raw_tweet in tweet_file]
len(tweets_train)

8654

In [13]:
# Lower-case every tweet. DataFrame.apply calls the lambda once per column, so `x` is the
# single column of tweets and each `word` is in fact a whole tweet string, not one word.
df_tweets_train = pd.DataFrame(tweets_train).apply(lambda x: [word.lower() for word in x])
df_tweets_train.head(2)

Unnamed: 0,0
0,uh drunk and tired and just remembered i 'm go...
1,@theboltonnews pls rt - bolton society for bli...


In [14]:
import re

# Collect the distinct `smile_<type>` placeholders found in the training tweets.
# Only the first placeholder of each tweet is considered (re.search stops at the first hit).
smile_pattern = re.compile(r'smile_\w+')
smiles = set()
for row_values in df_tweets_train.values[:len(tweets_train)]:
    first_hit = smile_pattern.search(row_values[0])
    if first_hit is not None:
        smiles.add(first_hit.group())
smiles

{'smile_cry',
 'smile_disgust',
 'smile_embarassed',
 'smile_happy',
 'smile_kiss',
 'smile_laugh',
 'smile_love',
 'smile_sad',
 'smile_skeptical',
 'smile_surprise',
 'smile_tongue',
 'smile_wink'}

### Assigning a polarity to each of these smiles, with some manual adjustments
- **FOR SMILE_CRY, SMILE_HAPPY, SMILE_SAD, SMILE_DISGUST AND SMILE_LOVE, I CHOOSE TO PUT 100% OF POLARITY IN ONE OF THE 3 SCORES, GIVEN THE FACT THEY WOULD BE HIGHLY CONNECTED TO THE SENTIMENT OF THE SENTENCE**

In [39]:
# Give every smile placeholder an entry in the polarity lexicon, derived from the entry of
# the word it is named after (e.g. 'smile_happy' -> 'happy').
# `smiles` is the set built by the extraction cell above; the extraction is not repeated here.
for smile in smiles:
    type_smile = smile[len('smile_'):]
    base_polarity = polaritywords.get(type_smile)

    if base_polarity is None:
        # Without a lexicon entry there is nothing to derive the polarity from; calling
        # np.argmax on None would silently select index 0, so report and skip instead.
        print(smile, '\t', 'no lexicon entry for', repr(type_smile), '- skipped')
        continue

    if type_smile in ['happy', 'cry', 'sad', 'disgust', 'love']:
        # Strongly sentiment-bearing smiles: put the whole mass on the dominant score.
        one_hot = [0, 0, 0]
        one_hot[np.argmax(base_polarity)] = 1
        polaritywords[smile] = one_hot
    else:
        # Copy the scores so the smile entry does not share a list object with the word entry.
        polaritywords[smile] = list(base_polarity)
    print(smile, '\t', polaritywords[smile])

smile_happy 		 [1, 0, 0]
smile_cry 		 [0, 1, 0]
smile_sad 		 [0, 1, 0]
smile_disgust 		 [0, 0, 1]
smile_love 		 [1, 0, 0]
smile_skeptical [0.2750377, 0.2475464, 0.47741592]
smile_surprise 	 [0.3524347, 0.1533607, 0.49420467]
smile_wink 	 [0.54961383, 0.20071952, 0.24966662]
smile_tongue 	 [0.18763916, 0.3646318, 0.44772902]
smile_laugh 	 [0.55425787, 0.11650112, 0.32924098]
smile_embarassed [0.11972178, 0.6342528, 0.2460254]
smile_kiss 	 [0.262494, 0.31871325, 0.4187927]


## Connecting to SentiWordNet to get the synonyms of each word and their polarity

In [69]:
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
# Compare a lookup restricted to one part of speech ('happy' as adjective) with an
# unrestricted lookup ('slow', no tag given).
list(swn.senti_synsets('happy' , 'a')), list(swn.senti_synsets('slow'))

([SentiSynset('happy.a.01'),
  SentiSynset('felicitous.s.02'),
  SentiSynset('glad.s.02'),
  SentiSynset('happy.s.04')],
 [SentiSynset('decelerate.v.01'),
  SentiSynset('slow.v.02'),
  SentiSynset('slow.v.03'),
  SentiSynset('slow.a.01'),
  SentiSynset('slow.a.02'),
  SentiSynset('dense.s.04'),
  SentiSynset('slow.a.04'),
  SentiSynset('boring.s.01'),
  SentiSynset('dull.s.08'),
  SentiSynset('slowly.r.01'),
  SentiSynset('behind.r.03')])

In [70]:
def penn_to_wn(tag):
    """
    Map a Penn Treebank POS tag onto the matching WordNet POS constant.

    Tags starting with J, N, R and V map to wn.ADJ, wn.NOUN, wn.ADV and wn.VERB
    respectively; every other tag falls back to 'n'.
    """
    prefix_to_wn_attr = (
        ('J', 'ADJ'),    # adjective
        ('N', 'NOUN'),   # noun
        ('R', 'ADV'),    # adverb
        ('V', 'VERB'),   # verb
    )
    for prefix, wn_attr in prefix_to_wn_attr:
        if tag.startswith(prefix):
            return getattr(wn, wn_attr)
    return 'n'

In [71]:
def polarity_sentiwordnet(word,tag, lb= 0, sinonimi= 'half'):
    """Estimate the polarity of ``word`` by averaging SentiWordNet scores of its synsets.

    SentiWordNet scores synsets rather than single words, so the polarity of a word is
    approximated by the mean score of some of the synsets returned for it.

    Parameters
    ----------
    word : str
        Word (lemma) to look up.
    tag : str
        WordNet part of speech: 'a', 'n', 'r' or 'v'.
    lb : float, optional
        Not used in the computation; kept so existing positional calls keep working.
    sinonimi : {'average', 'half'} or int, optional
        Which synsets to average:
        * 'average' - all synsets;
        * 'half'    - the first half of the synsets (at least one);
        * an int n  - the first n synsets.

    Returns
    -------
    dict
        ``{'pos': ..., 'neg': ..., 'neutral': ...}`` with each mean rounded to two
        decimals, or ``{}`` when no synset is available for the word/tag pair.

    Raises
    ------
    ValueError
        If ``sinonimi`` is neither 'average', 'half' nor usable as a slice bound.
    """
    # Query SentiWordNet once; the result is reused by every strategy below.
    all_synsets = list(swn.senti_synsets(word, tag))
    if not all_synsets:
        return {}

    if sinonimi == 'average':
        synsets = all_synsets
    elif sinonimi == 'half':
        # round() rounds halves to the nearest even number, so a word with a single
        # synset would get round(0.5) == 0 synsets and be reported as unknown;
        # always keep at least the first synset.
        half = max(1, round(len(all_synsets) / 2))
        synsets = all_synsets[:half]
    else:
        # Slicing already copes with words that have fewer synsets than requested,
        # so the last synset is no longer dropped in that case.
        try:
            synsets = all_synsets[:sinonimi]
        except TypeError:
            raise ValueError(
                "sinonimi must be 'average', 'half' or an integer, got {!r}".format(sinonimi)
            )

    num_synsets = len(synsets)
    if num_synsets == 0:
        return {}

    diz = {'pos': 0, 'neg': 0, 'neutral': 0}
    for syn in synsets:
        diz['pos'] += syn.pos_score()
        diz['neg'] += syn.neg_score()
        diz['neutral'] += syn.obj_score()

    # Mean over the selected synsets, rounded to two decimals.
    return {key: round(total / num_synsets, 2) for key, total in diz.items()}

#### Trials

In [72]:
# Quick check on a single noun. The third positional argument binds to `lb`, which the
# function does not use when computing the scores; `sinonimi` keeps its default 'half'.
polarity_sentiwordnet('love', 'n', 0.5)

{'pos': 0.38, 'neg': 0.0, 'neutral': 0.62}

In [73]:
# List the SentiWordNet synsets returned for 'knife' as a noun.
list(swn.senti_synsets('knife', 'n'))

[SentiSynset('knife.n.01'),
 SentiSynset('knife.n.02'),
 SentiSynset('tongue.n.03')]

In [74]:
# Print, for the first five sentences, each token's lemma / Penn tag / WordNet tag and,
# when SentiWordNet knows the lemma, its averaged polarity.
lb = 0
for parsed_sentence, _class in zip(parsed_sent_array[:5], class_array[:5]):
    print(_class)
    print('\n')

    for index in parsed_sentence.get_indexes():
        lemma = parsed_sentence.get_lemma(index).lower()
        pos = parsed_sentence.get_postag(index)
        pos_wordnet = penn_to_wn(pos)
        print("{} {} {}".format(lemma, pos, pos_wordnet))

        # Look the lemma up once and reuse the result for both the test and the print.
        sentiword_pol = polarity_sentiwordnet(lemma, pos_wordnet, lb)
        if len(sentiword_pol) == 3:  # three entries means SentiWordNet returned polarities
            print(sentiword_pol)
            print('\n')

negative


uh RB r
drunk JJ a
{'pos': 0.0, 'neg': 0.0, 'neutral': 1.0}


and CC n
tired JJ a
{'pos': 0.25, 'neg': 0.0, 'neutral': 0.75}


and CC n
just RB r
{'pos': 0.04, 'neg': 0.0, 'neutral': 0.96}


remember VBD v
{'pos': 0.16, 'neg': 0.0, 'neutral': 0.84}


i PRP n
{'pos': 0.0, 'neg': 0.0, 'neutral': 1.0}


be VBP v
{'pos': 0.06, 'neg': 0.04, 'neutral': 0.9}


go VBG v
{'pos': 0.0, 'neg': 0.0, 'neutral': 1.0}


to TO n
ireland NNP n
{'pos': 0.0, 'neg': 0.0, 'neutral': 1.0}


in IN n
{'pos': 0.0, 'neg': 0.0, 'neutral': 1.0}


the DT n
morning NN n
{'pos': 0.0, 'neg': 0.0, 'neutral': 1.0}


and CC n
need VB v
{'pos': 0.19, 'neg': 0.19, 'neutral': 0.62}


to TO n
pack VB v
{'pos': 0.0, 'neg': 0.0, 'neutral': 1.0}


. . n
ireland NNP n
{'pos': 0.0, 'neg': 0.0, 'neutral': 1.0}


smile_sad NNP n
neutral


@theboltonnews NNS n
pl NNS n
rt SYM n
- : n
bolton NNP n
society NNP n
{'pos': 0.0, 'neg': 0.0, 'neutral': 1.0}


for IN n
blind NNP n
{'pos': 0.0, 'neg': 0.0, 'neutral': 1.0}


people

## GENERATION OF POLARITY BASED BAG OF WORD MODEL FOR EACH TWEET 

1. First I try to look into the lexicon to get the polarity
2. If the word is not in the lexicon, I take its polarity from SentiWordNet: if the highest polarity among its synonyms is higher than a threshold (0.5), the average polarity of the synonyms is picked and assigned to the word
3. Other BOW representations are built:
    * only looking at certain POS structures
    * Bigrams-based
    * Trigrams-based

In [None]:
# Write one line per tweet to the .klp output: the class label followed by five
# |BV:name| ... |EV| feature vectors (polarity, noun/verb/adjective lemmas, bigrams,
# trigrams, smile polarity).

treshold= 0.5
stop_words_len= 2
sentiwordnet_strategy= 'half'


def _signed_lexicon_score(polarities, threshold):
    """Return the dominant lexicon score as a signed value, or None if it is not decisive.

    Index 0 of ``polarities`` is treated as positive (+score) and index 1 as negative
    (-score). None is returned when the maximum sits at any other index or does not
    exceed ``threshold``.
    """
    scores = np.array(list(map(float, polarities)))
    best = np.argmax(scores)
    strongest = scores[best]
    if strongest > threshold:
        if best == 0:
            return strongest
        if best == 1:
            return -strongest
    return None


def _ngram_vector(name, lemmas, n):
    """Serialise every run of ``n`` consecutive lemmas as a 1.0-valued feature of vector ``name``."""
    features = "".join(
        "_" + "_".join(lemmas[start + offset] for offset in range(n)) + ":1.0 "
        for start in range(len(lemmas) - n + 1)
    )
    return "|BV:" + name + "| " + features + "|EV|"


# The context manager closes the file even if a sentence raises half-way through.
with open('C:/Users/feder/Desktop/dev_v4.klp', 'w+', encoding="utf-8") as global_out_file:
    for i in range(0, len(parsed_sent_array)):
        parsed_sentence = parsed_sent_array[i]
        _class = class_array[i]

        print(_class)
        print(get_lemmas(parsed_sentence))
        print('\n')

        lemmas = lower(get_lemmas(parsed_sentence))

        # --- Polarity vector: one signed score per lemma longer than stop_words_len ---
        repr_polar = "|BV:polb| "
        for index in parsed_sentence.get_indexes():
            lemma = parsed_sentence.get_lemma(index)
            pos = parsed_sentence.get_postag(index)
            print("{} {} {}".format(index, lemma, pos))
            print('\n')

            lemma = lemma.lower()
            if len(lemma) <= stop_words_len:
                # Words of one or two characters are left out of the polarity vector.
                continue

            score = None  # None means neutral and is written as 0.0
            if lemma in polaritywords:
                print('It is in polarity lexicon')
                score = _signed_lexicon_score(polaritywords.get(lemma), treshold)
                if score is not None:
                    print('token {}, max polarity {}, {}'.format(
                        lemma, float(score.round(2)), 'POSITIVE' if score > 0 else 'NEGATIVE'))
            else:
                # Word missing from the lexicon: fall back to SentiWordNet.
                pos_wordnet = penn_to_wn(pos)
                sentiword_pols = polarity_sentiwordnet(lemma, pos_wordnet, treshold, sentiwordnet_strategy)
                if len(sentiword_pols) == 3:  # three entries means polarities were found
                    max_polarity = max(sentiword_pols.values())
                    print('The word is in SentiWordnet', max_polarity)
                    if max_polarity > treshold:
                        # A dominant 'neutral' score leaves the word at 0.0.
                        if sentiword_pols['neg'] == max_polarity:
                            score = -max_polarity
                        elif sentiword_pols['pos'] == max_polarity:
                            score = max_polarity

            repr_polar += "_" + lemma + ":{} ".format(0.0 if score is None else score)
        repr_polar += "|EV|"

        # --- Smile polarity vector: only smiles with a decisive positive/negative score ---
        repr_pol_smiles = "|BV:pol_smiles| "
        for lemma in lemmas:
            if lemma in smiles:
                smile_score = _signed_lexicon_score(polaritywords.get(lemma), treshold)
                if smile_score is not None:
                    repr_pol_smiles += "_" + lemma + ":{} ".format(smile_score)
        repr_pol_smiles += "|EV|"

        # --- Bag of words over the lemmas of nouns, verbs and adjectives only ---
        nvj_lemmas = lower(remove_duplicates(get_lemmas(parsed_sentence, "[n,v,j]")))
        repr_bow_nvj = "|BV:bow_nvj| "
        for token in nvj_lemmas:
            repr_bow_nvj += "_" + token + ":1.0 "
        repr_bow_nvj += "|EV|"

        # --- Bigram and trigram vectors over the lower-cased lemmas ---
        repr_bigrams = _ngram_vector("bigrams", lemmas, 2)
        repr_trigrams = _ngram_vector("trigrams", lemmas, 3)

        example = _class + " " + repr_polar + " " + repr_bow_nvj+ " " + repr_bigrams+ " " + repr_trigrams+ " " + repr_pol_smiles

        global_out_file.write(example)
        global_out_file.write('\n')
        global_out_file.flush()