In [44]:
import pandas as pd  
pd.options.mode.chained_assignment = None
import numpy as np  
from copy import deepcopy
from string import punctuation
from random import shuffle
from sklearn.preprocessing import scale

import gensim
from gensim.models.word2vec import Word2Vec # the word2vec model gensim class
LabeledSentence = gensim.models.doc2vec.LabeledSentence # we'll talk about this down below

from tqdm import tqdm
tqdm.pandas(desc="progress-bar")

from nltk.tokenize import TweetTokenizer # a tweet tokenizer from nltk.
tokenizer = TweetTokenizer()

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [42]:
# Tokenize a single tweet: "asd dfg sad" -> ['asd', 'dfg', 'sad']
def tokenize(tweet):
    """Lower-case a tweet, tokenize it and drop mentions, hashtags and links.

    Parameters
    ----------
    tweet : object
        Raw tweet; it is converted with ``str()`` before being tokenized.

    Returns
    -------
    list of str, or the string ``'NC'``
        The tokens that do not start with ``'@'``, ``'#'`` or ``'http'``.
        The sentinel ``'NC'`` is returned when the tweet cannot be processed.
    """
    try:
        tweet = str(tweet).lower()
        # str.startswith accepts a tuple of prefixes, so a single eager pass
        # replaces the three chained lazy filters. Being eager also keeps any
        # error raised while filtering inside this try block.
        return [t for t in tokenizer.tokenize(tweet)
                if not t.startswith(('@', '#', 'http'))]
    except Exception:
        # Narrower than a bare ``except`` so KeyboardInterrupt / SystemExit
        # still propagate.
        return 'NC'
    
def tokenizing_and_cleaning(data):
    """Tokenize the text column of ``data``, producing one token list per row.

    Parameters
    ----------
    data : array-like
        2-D array (indexed as ``data[:, 1]``) whose column 1 holds the raw
        tweet text.

    Returns
    -------
    numpy.ndarray
        1-D object array with one entry per row of ``data``; each entry is the
        list of tokens for that row. Rows for which ``tokenize`` returns its
        ``'NC'`` failure sentinel get an empty list, so the result always
        stays aligned with ``data``.
    """
    texts = data[:, 1]
    # Fill a pre-allocated object array element by element: np.array() on a
    # list of lists builds a 2-D array when every list has the same length and
    # rejects ragged input on recent NumPy versions.
    tokens = np.empty(len(texts), dtype=object)
    for i, tweet in enumerate(tqdm(texts)):
        result = tokenize(tweet)
        # The failure sentinel is a plain str; list('NC') would wrongly
        # produce the tokens ['N', 'C'].
        tokens[i] = [] if isinstance(result, str) else list(result)
    return tokens

# Tag each token list with a label that marks it as Training or Test
def labelizeTweets(tweets, label_type):
    """Wrap every token list in a ``LabeledSentence`` tagged ``<label_type>_<index>``.

    Parameters
    ----------
    tweets : iterable
        Token lists, one per tweet.
    label_type : str
        Prefix for the tag (the notebook uses ``'TRAIN'`` and ``'TEST'``).

    Returns
    -------
    list
        One ``LabeledSentence`` per input item, in input order, each carrying
        a single tag built from ``label_type`` and the item's position.
    """
    return [
        LabeledSentence(words, ['%s_%s' % (label_type, idx)])
        for idx, words in tqdm(enumerate(tweets))
    ]

# Build the vector for one tweet
def buildWordVector(tokens, size):
    """Average the tf-idf-weighted word vectors of ``tokens``.

    Reads the notebook globals ``tweet_w2v`` (word -> vector lookup) and
    ``tfidf`` (word -> weight lookup).

    Parameters
    ----------
    tokens : iterable of str
        Words of a single tweet.
    size : int
        Dimensionality of the word vectors.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, size)``: the sum of ``vector * weight`` over the
        words found in both lookups, divided by the number of such words.
        All zeros when no word is found.
    """
    total = np.zeros((1, size))
    n_found = 0.
    for word in tokens:
        try:
            weighted = tweet_w2v[word].reshape((1, size)) * tfidf[word]
        except KeyError:
            # Token missing from the corpus vocabulary (common for unseen
            # test tweets): it contributes nothing.
            continue
        total += weighted
        n_found += 1.
    if n_found != 0:
        total /= n_found
    return total

In [9]:
# Load the tagged tweets CSV (decoded as UTF-8) into a DataFrame.
data = pd.read_csv("datasetcsv/general-tweets-test-tagged.csv", encoding='utf-8')

In [93]:
# Column 0 is used as the tweet text and column 1 as its sentiment tag.
data_v = np.array(data)
text = data_v[:, 0]
sentiment = data_v[:, 1]
# Collapse the tags into numeric codes: 1 = P / P+, 0 = N / N+, 2 = NONE / NEU.
# `sentiment` is a slice of `data_v`, so these assignments write into `data_v`.
for tag, code in (('P', 1), ('P+', 1), ('N', 0), ('N+', 0), ('NONE', 2), ('NEU', 2)):
    sentiment[sentiment == tag] = code

In [94]:
import collections, numpy

# Count how many entries of `sentiment` carry each value.
collections.Counter(sentiment)

Counter({0: 15844, 1: 22233, 2: 22721})

In [95]:
# Stack the two 1-D arrays as columns: column 0 = sentiment, column 1 = text.
tweets = np.c_[ sentiment, text]

In [98]:
# Drop the rows whose label is 2 (the code given to NONE / NEU), keeping 0 and 1.
tweets = tweets[tweets[:, 0] != 2]

### Preprocessing

In [100]:
# Optional subsample (disabled): keep only the first and last n rows.
#n = 10000
#tweets = np.concatenate((tweets[:n], tweets[-n:]))
# Cap the dataset at 1,000,000 rows, then tokenize the text column.
tweets = tweets[:1000000]
tokens = tokenizing_and_cleaning(tweets)

100%|██████████| 38077/38077 [00:17<00:00, 2129.39it/s]


In [101]:
# Append the token lists as column 2. Only the first two columns (label, text)
# are taken from `tweets`, so re-running this cell replaces the tokens column
# instead of stacking one more onto the array each time.
tweets = np.c_[tweets[:, :2], tokens]

In [132]:
# Inspect the assembled rows: [label, raw text, token list].
tweets

array([[0,
        "Portada 'Público', viernes. Fabra al banquillo por 'orden' del Supremo; Wikileaks 'retrata' a 160 empresas espías. http://t.co/YtpRU0fd",
        ['portada', "'", 'público', "'", ',', 'viernes', '.', 'fabra', 'al', 'banquillo', 'por', "'", 'orden', "'", 'del', 'supremo', ';', 'wikileaks', "'", 'retrata', "'", 'a', '160', 'empresas', 'espías', '.']],
       [1,
        'Gonzalo Altozano tras la presentación de su libro 101 españoles y Dios. Divertido, emocionante y brillante. http://t.co/4BdljMhB',
        ['gonzalo', 'altozano', 'tras', 'la', 'presentación', 'de', 'su', 'libro', '101', 'españoles', 'y', 'dios', '.', 'divertido', ',', 'emocionante', 'y', 'brillante', '.']],
       [0,
        'Mañana en Gaceta: TVE, la que pagamos tú y yo, culpa a una becaria de su falsa información sobre el cierre de @gaceta',
        ['mañana', 'en', 'gaceta', ':', 'tve', ',', 'la', 'que', 'pagamos', 'tú', 'y', 'yo', ',', 'culpa', 'a', 'una', 'becaria', 'de', 'su', 'falsa', 'inform

In [102]:
# Hold out 5% of the tweets for testing: features are the token lists
# (column 2), targets the labels (column 0). A fixed random_state makes the
# split, and everything trained on it, reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    tweets[:, 2], tweets[:, 0], test_size=0.05, random_state=42)
# Wrap the token lists as labelled sentences tagged TRAIN_<i> / TEST_<i>.
x_train = labelizeTweets(x_train, 'TRAIN')
x_test = labelizeTweets(x_test, 'TEST')

36173it [00:00, 42052.78it/s]
1904it [00:00, 27911.31it/s]


In [103]:
# Train 200-dimensional word vectors on the training tweets (min_count=10).
# The corpus list is materialized once and shared by build_vocab() and train()
# instead of being rebuilt for each call.
train_sentences = [x.words for x in tqdm(x_train)]
tweet_w2v = Word2Vec(size=200, min_count=10)
tweet_w2v.build_vocab(train_sentences)
tweet_w2v.train(train_sentences, total_examples=tweet_w2v.corpus_count, epochs=tweet_w2v.iter)

100%|██████████| 36173/36173 [00:00<00:00, 171374.86it/s]
100%|██████████| 36173/36173 [00:00<00:00, 536763.72it/s]


2078036

In [104]:
# Persist the trained Word2Vec model to the working directory.
tweet_w2v.save("Spanish_version_w2v")

In [105]:
# Reload the saved Word2Vec model from disk.
tweet_w2v = Word2Vec.load("Spanish_version_w2v")

In [106]:
print ('building tf-idf matrix ...')
# The tweets are already tokenized, so the analyzer passes each token list
# through unchanged; min_df=10 is the document-frequency cut-off.
vectorizer = TfidfVectorizer(analyzer=lambda doc: doc, min_df=10)
matrix = vectorizer.fit_transform([sentence.words for sentence in x_train])
# Map every vocabulary term to its idf weight.
tfidf = {term: idf for term, idf in zip(vectorizer.get_feature_names(), vectorizer.idf_)}
print ('vocab size :', len(tfidf))

building tf-idf matrix ...
vocab size : 5174


In [107]:
print("Train")
# One (1, 200) vector per training tweet, stacked into a single matrix and
# then standardized with scale().
train_vecs_w2v = scale(
    np.concatenate([buildWordVector(sentence.words, 200) for sentence in x_train])
)

Train


In [108]:
print("Test")
# One (1, 200) vector per test tweet, stacked into a single matrix and then
# standardized with scale().
# NOTE(review): scale() is applied to the test matrix on its own, so it uses
# the test set's statistics rather than those of the training matrix.
test_vecs_w2v = scale(
    np.concatenate([buildWordVector(sentence.words, 200) for sentence in x_test])
)

Test


In [109]:
from keras.models import Sequential, load_model
from keras.layers import Dense
from keras.utils.np_utils import to_categorical
# One-hot encoding of the training labels.
# NOTE(review): `y_binary` is not referenced by the visible cells that build
# and fit the model (they pass `y_train`) — confirm whether it is still needed.
y_binary = to_categorical(y_train)

In [113]:
# Feed-forward binary classifier on the 200-dimensional tweet vectors:
# two hidden ReLU layers of 500 units and a single sigmoid output.
model = Sequential()
model.add(Dense(500, activation='relu', input_dim=200))
model.add(Dense(500, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()
# `y_train` is sliced from an object-dtype array (labels stored next to text),
# so it is cast to a numeric array before being handed to Keras.
model.fit(train_vecs_w2v, np.asarray(y_train, dtype='float32'), epochs=9, batch_size=32)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_33 (Dense)             (None, 500)               100500    
_________________________________________________________________
dense_34 (Dense)             (None, 500)               250500    
_________________________________________________________________
dense_35 (Dense)             (None, 1)                 501       
Total params: 351,501
Trainable params: 351,501
Non-trainable params: 0
_________________________________________________________________
Epoch 1/9
Epoch 2/9
Epoch 3/9
Epoch 4/9
Epoch 5/9
Epoch 6/9
Epoch 7/9
Epoch 8/9
Epoch 9/9


<keras.callbacks.History at 0x11582da58>

In [114]:
# Persist the trained Keras model to an HDF5 file in the working directory.
model.save("my_model_spanish.h5")

In [115]:
# Reload the saved Keras model from disk.
model = load_model("my_model_spanish.h5") # Dense(500)

In [128]:
def predict_sentiment_one_tweet(tweet):
    """Print the model's output and the predicted sentiment for one tweet.

    Reads the notebook globals ``model``, ``tokenize`` and ``buildWordVector``.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    None
        The raw ``model.predict`` output is printed, followed by
        ``"Positivo"`` when it exceeds 0.5 and ``"Negativo"`` otherwise.
    """
    tokens = list(tokenize(tweet))
    # Build the feature vector and run the network once; the class is derived
    # from that same output instead of a second forward pass.
    # NOTE(review): the model is fit on features passed through scale(), while
    # this vector is used unscaled — confirm this is intended.
    features = buildWordVector(tokens, 200)
    probability = model.predict(features)
    print(probability)
    # Threshold the sigmoid output at 0.5 rather than calling
    # Sequential.predict_classes, which recent Keras releases no longer have.
    if probability[0][0] > 0.5:
        print("Positivo")
    else:
        print("Negativo")

In [145]:
# Try the classifier on a sample tweet with negative wording.
tweet = "Qué triste e impresentable que la televisión pública, TVN, no trasmitiera el debate del Frente Amplio. Un horror democrático."
predict_sentiment_one_tweet(tweet)

1it [00:00, 1659.14it/s]

[[ 0.44130301]]
Negativo





In [146]:
# Try the classifier on a sample tweet with positive wording.
tweet = "Así da gusto, este es el nivel de debate que merecemos"
predict_sentiment_one_tweet(tweet)

1it [00:00, 1129.93it/s]

[[ 0.60440314]]
Positivo



