In [147]:
import xml.etree.ElementTree as ET  
import numpy as np
import time

In [148]:
from gensim.models.keyedvectors import KeyedVectors
wordvectors_file_vec = '../../SBW-vectors-300-min5.txt'
cantidad = 10000000
start = time.time()
print ("Time to process word vectors with W2V with "+str(cantidad)+"words:")

wordvectors = KeyedVectors.load_word2vec_format(wordvectors_file_vec, limit=cantidad)

end = time.time()
print (end - start)

Time to process word vectors with W2V with 10000000words:
276.2462651729584


In [149]:
fastText_file_vec = '../../fasttext-sbwc.3.6.e20.vec'
cantidad = 10000000
start = time.time()
print ("Time to process word vectors with fastText with "+str(cantidad)+" words:")

fastTextVectors = KeyedVectors.load_word2vec_format(fastText_file_vec, limit=cantidad)

end = time.time()
print (end - start)

Time to process word vectors with fastText with 10000000 words:
243.37566304206848


In [162]:
glove_file_vec = '../../glove-sbwc.i25.vec'
cantidad = 10000000
start = time.time()
print ("Time to process word vectors with glove with "+str(cantidad)+" words:")

glovevectors = KeyedVectors.load_word2vec_format(glove_file_vec, limit=cantidad)

end = time.time()
print (end - start)

Time to process word vectors with glove with 10000000 words:
240.00987815856934


In [151]:
#Testing word2vec
try:
    print(glovevectors.get_vector('hiperbole'))
except:
    print('palabra no esta en dict')

palabra no esta en dict


In [152]:
import nltk
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

nltk.download('stopwords')
stopWords = set(stopwords.words('spanish'))
spanishStemmer = SnowballStemmer("spanish")

#print("y" in stopWords)
#stemmer.stem("cordenada")

[nltk_data] Downloading package stopwords to /Users/Ruizo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [153]:
Tokenizer = RegexpTokenizer(r'\w+')
lines= 'hola cómoo , estáas'
words = Tokenizer.tokenize(lines)

wordsStemmed = []
for word in words:
    word=spanishStemmer.stem(word)
    wordsStemmed.append(word)
print(words)
print(wordsStemmed)

['hola', 'cómoo', 'estáas']
['hol', 'como', 'esta']


In [155]:
tree = ET.parse('general-tweets-train-tagged.xml')  
root = tree.getroot()
tweets=[]
polarities=['NONE', 'NEU', 'P', 'N+', 'P+', 'N']
polarityTypes=['AGREEMENT', 'DISAGREEMENT']
tweetCount=0

start = time.time()
print ("Time to process data:")

for tweet in root:
    tweetCount+=1
    wordsStemmed = []
    polarityNumber=0
    tweetText=tweet.find('content').text
    
    if tweetText:
        words = Tokenizer.tokenize(tweetText)

        for word in words:
            word=spanishStemmer.stem(word)
            if word not in stopWords:
                wordsStemmed.append(word)

        polarity=tweet.find('sentiments').find('polarity').find('value').text
        position=polarities.index(polarity)
        polarityNumber+=position

        polarityType=tweet.find('sentiments').find('polarity').find('type').text
        position=polarityTypes.index(polarityType)
        polarityNumber+=6*position

        newEntry=[wordsStemmed,polarityNumber]

        tweets.append(newEntry)

end = time.time()
print (end - start)
print ("Output as [[[ListOfWords],[polarityNumber]],[[ListOfWords],[polarityNumber]],...]")
print('Cantidad de tweets \n'+ str(tweetCount))

Time to process data:
6.259454965591431
Output as [[[ListOfWords],[polarityNumber]],[[ListOfWords],[polarityNumber]],...]
Cantidad de tweets 
7219


In [156]:
#Word2Vect
vectorizedTweetsInfoW2V=[]
vectorizedPolarityInfo=[]

#FastText
vectorizedTweetsInfoFT=[]

#Glove
vectorizedTweetsInfoG=[]

start = time.time()
print("Time to vectorize tweets:")

for tweet in tweets:
    vectorizedTweetsW2V=[]
    vectorizedTweetsFT=[]
    vectorizedTweetsG=[]
    for word in tweet[0]:
        try:
            vectorW2V = wordvectors.get_vector(word)
            vectorizedTweetsW2V.append(vectorW2V)
            
            vectorFT = fastTextVectors.get_vector(word)
            vectorizedTweetsFT.append(vectorFT)
            
            vectorG = glovevectors.get_vector(word)
            vectorizedTweetsG.append(vectorG)
        except:
            pass    
    vectorMeanW2V = np.mean(np.array(vectorizedTweetsW2V),axis=0)
    vectorMeanFT = np.mean(np.array(vectorizedTweetsFT),axis=0)
    vectorMeanG = np.mean(np.array(vectorizedTweetsG),axis=0)
    try:
        len(vectorMeanW2V)
        vectorizedTweetsInfoW2V.append(vectorMeanW2V.tolist())        
        vectorizedTweetsInfoFT.append(vectorMeanFT.tolist())        
        vectorizedTweetsInfoG.append(vectorMeanG.tolist())
        
        vectorizedPolarityInfo.append(tweet[1])
    except:
        pass

end = time.time()
print(end-start)

Time to vectorize tweets:


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


1.8740031719207764


In [157]:
print(len(vectorizedTweetsInfo))
#for val in vectorizedPolarityInfo:
#    print (val)
#print(len(vectorizedPolarityInfo))

7065


In [158]:
from sklearn import svm
from sklearn.model_selection import cross_val_score

#Entrenar
clf = svm.SVC(kernel='linear', C=1)

In [159]:
#W2V
start = time.time()
print ("Time to train model with W2V:")
    
scoresW2V = cross_val_score(clf, vectorizedTweetsInfoW2V, vectorizedPolarityInfo, cv=10)
print(scoresW2V, sum(scoresW2V)/len(scoresW2V))

end = time.time()
print(end-start)

Time to train model with W2V:




[ 0.3033241   0.30833333  0.32684284  0.36211699  0.35244755  0.33099579
  0.34831461  0.35302391  0.32489451  0.34739803] 0.335769166929
210.39200901985168


In [160]:
#FT
start = time.time()
print ("Time to train model with FastText:")
    
scoresFT = cross_val_score(clf, vectorizedTweetsInfoFT, vectorizedPolarityInfo, cv=10)
print(scoresFT, sum(scoresFT)/len(scoresFT))

end = time.time()
print(end-start)

Time to train model with FastText:




[ 0.31855956  0.31805556  0.3449235   0.34679666  0.35244755  0.38569425
  0.38483146  0.37271449  0.36990155  0.33192686] 0.352585143469
198.28459882736206


In [161]:
#G
start = time.time()
print ("Time to train model with Glove:")
    
scoresG = cross_val_score(clf, vectorizedTweetsInfoG, vectorizedPolarityInfo, cv=10)
print(scoresG, sum(scoresG)/len(scoresG))

end = time.time()
print(end-start)

Time to train model with Glove:




[ 0.33379501  0.30694444  0.32127955  0.3551532   0.37762238  0.35063114
  0.35533708  0.34317862  0.33895921  0.3347398 ] 0.341764044602
222.26583313941956
