In [None]:
import pandas as pd  
pd.options.mode.chained_assignment = None
import numpy as np  
from copy import deepcopy
from string import punctuation
from random import shuffle
from sklearn.preprocessing import scale

import gensim
from gensim.models.word2vec import Word2Vec # the word2vec model gensim class
LabeledSentence = gensim.models.doc2vec.LabeledSentence # we'll talk about this down below

from tqdm import tqdm
tqdm.pandas(desc="progress-bar")

from nltk.tokenize import TweetTokenizer # a tweet tokenizer from nltk.
tokenizer = TweetTokenizer()

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

On a more general level, word2vec embeds non-trivial semantic and syntactic relationships between words. This results in preserving a rich context.

In [None]:
# Tokenize a single tweet: "asd dfg sad" -> ['asd', 'dfg', 'sad']
def tokenize(tweet):
    """Lower-case and tokenize a tweet, dropping mentions, hashtags and links.

    Parameters
    ----------
    tweet : object
        Raw tweet; it is coerced with ``str()`` before being processed.

    Returns
    -------
    list of str, or the string 'NC'
        The tokens that do not start with '@', '#' or 'http'. The sentinel
        string 'NC' is returned when tokenization raises.
    """
    try:
        tokens = tokenizer.tokenize(str(tweet).lower())
    except Exception:
        # Keep the historical 'NC' sentinel, but no longer swallow
        # KeyboardInterrupt / SystemExit as the bare ``except:`` did.
        return 'NC'
    # str.startswith accepts a tuple, so a single pass replaces three chained
    # filters. A list (not a one-shot iterator) is returned so the result can
    # be iterated more than once; list(tokenize(...)) still works unchanged.
    return [t for t in tokens if not t.startswith(('@', '#', 'http'))]
    
def tokenizing_and_cleaning(data):
    """Tokenize the tweet text stored in column 1 of ``data``.

    Parameters
    ----------
    data : 2-D array
        Indexed as ``data[:, 1]`` to obtain the tweet texts.

    Returns
    -------
    numpy.ndarray
        1-D object array holding one list of tokens per row of ``data``, in
        the same order, so it can be column-stacked next to ``data``.
    """
    token_lists = []
    for tweet in tqdm(data[:, 1]):
        result = tokenize(tweet)
        # tokenize() signals failure with the string 'NC'. Store an empty
        # token list for that row instead of list('NC') == ['N', 'C'], and
        # keep the row so the output stays aligned with ``data``.
        token_lists.append([] if isinstance(result, str) else list(result))
    # Fill an object array element by element: np.array(list_of_lists) raises
    # on ragged input in recent NumPy and silently becomes a 2-D array when
    # every tweet happens to have the same number of tokens.
    tokens = np.empty(len(token_lists), dtype=object)
    for i, toks in enumerate(token_lists):
        tokens[i] = toks
    return tokens

# Tag each token list as belonging to the training or the test split
def labelizeTweets(tweets, label_type):
    """Wrap every token list in a LabeledSentence tagged '<label_type>_<index>'.

    ``tweets`` is iterated in order; the index in the tag is the position of
    the token list within ``tweets``. Returns the list of LabeledSentence
    objects.
    """
    return [
        LabeledSentence(words, ['%s_%s' % (label_type, idx)])
        for idx, words in tqdm(enumerate(tweets))
    ]

# Build the weighted average word vector of one tweet
def buildWordVector(tokens, size, w2v=None, weights=None):
    """Average the weighted word vectors of ``tokens``.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of a single tweet.
    size : int
        Dimensionality of the word vectors.
    w2v : mapping, optional
        Maps a word to its vector (anything supporting ``w2v[word]`` and
        raising KeyError for unknown words). Defaults to the notebook-level
        ``tweet_w2v``.
    weights : mapping, optional
        Maps a word to its scalar weight. Defaults to the notebook-level
        ``tfidf`` dictionary.

    Returns
    -------
    numpy.ndarray
        Array of shape (1, size): the sum of ``vector * weight`` over the
        tokens found in both mappings, divided by the number of such tokens.
        All zeros when no token is found.
    """
    # Fall back to the notebook globals so existing two-argument calls behave
    # exactly as before; explicit arguments remove the hidden-state dependency.
    if w2v is None:
        w2v = tweet_w2v
    if weights is None:
        weights = tfidf

    vec = np.zeros((1, size))
    count = 0
    for word in tokens:
        try:
            weighted = w2v[word].reshape((1, size)) * weights[word]
        except KeyError:
            # Token missing from the embedding or the weight table
            # (e.g. unseen at training time): skip it.
            continue
        vec += weighted
        count += 1
    if count:
        vec /= count
    return vec

## Data

In [None]:
# Load the raw tweet CSV.
# NOTE(review): header=None assumes the file has no header row (the public
# Sentiment140 training file does not) — confirm. Without it pandas consumes
# the first tweet as column names. Columns are addressed by position below.
data = pd.read_csv("training.1600000.processed.noemoticon.csv", encoding='latin', header=None)

In [None]:
# Pull the label (column 0) and the tweet text (column 5) out of the frame.
data_v = np.array(data)
sentiment, text = data_v[:, 0], data_v[:, 5]
# Recode label 4 as 1. `sentiment` is a slice of data_v, so this writes
# through to data_v as well.
sentiment[sentiment == 4] = 1
# Two-column array: [sentiment, text]
tweets = np.c_[sentiment, text]

### Pre processing

In [None]:
# Number of rows kept from EACH end of the dataset (1,000,000 rows in total).
# NOTE(review): this assumes the CSV is ordered by polarity (negatives first,
# positives last), as the public Sentiment140 file is — confirm. Under that
# ordering the previous `tweets[:1000000]` slice is heavily skewed towards one
# class; taking equal halves from both ends keeps the sample balanced.
N_PER_END = 500000
tweets = np.concatenate((tweets[:N_PER_END], tweets[-N_PER_END:]))
tokens = tokenizing_and_cleaning(tweets)

Agregamos los tokens a nuestro array quedando de la forma [sentiment, tweet, token]

In [None]:
# Rebuild the array as [sentiment, tweet, tokens]. Slicing the first two
# columns makes the cell idempotent: re-running it no longer appends an extra
# token column each time.
tweets = np.c_[tweets[:, :2], tokens]

Separamos la data en training y test set y los etiquetamos

In [None]:
# 80/20 split of token lists (column 2) and labels (column 0). A fixed
# random_state makes the split, and everything fitted on it downstream,
# reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    tweets[:, 2], tweets[:, 0], test_size=0.2, random_state=42
)
# Wrap every token list in a LabeledSentence tagged TRAIN_<i> / TEST_<i>.
x_train = labelizeTweets(x_train, 'TRAIN')
x_test = labelizeTweets(x_test, 'TEST')

Creamos el vocabulario de palabras, pasando de tokens a vectores por cada palabra

In [None]:
# Materialise the training sentences once; the same list feeds both the
# vocabulary scan and the training pass instead of being rebuilt for each.
train_sentences = [x.words for x in tqdm(x_train)]

# 200-dimensional embeddings; words seen fewer than 10 times are ignored.
tweet_w2v = Word2Vec(size=200, min_count=10)
tweet_w2v.build_vocab(train_sentences)
tweet_w2v.train(train_sentences, total_examples=tweet_w2v.corpus_count, epochs=tweet_w2v.iter)

In [None]:
# tweet_w2v.save("English_version_w2v")

In [None]:
# Load a previously saved Word2Vec model from disk. This rebinds `tweet_w2v`,
# replacing any model trained earlier in this notebook.
tweet_w2v = Word2Vec.load("English_version_w2v_mlp2")

Un poco de embellecimiento de los vectores para hacerlos legibles para la red

In [None]:
print ('building tf-idf matrix ...')
# The documents are already token lists, so the analyzer is the identity.
vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=10)
matrix = vectorizer.fit_transform([x.words for x in x_train])
# get_feature_names() was removed in recent scikit-learn releases in favour of
# get_feature_names_out(); use whichever this installation provides.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
# word -> IDF weight (vectorizer.idf_), used to weight the word vectors.
tfidf = dict(zip(feature_names, vectorizer.idf_))
print ('vocab size :', len(tfidf))

In [None]:
print("Train")
# One (1, 200) weighted-average vector per training tweet, stacked row-wise
# and then passed through sklearn's scale().
train_vecs_w2v = scale(np.concatenate(
    [buildWordVector(doc.words, 200) for doc in x_train]
))

In [None]:
print("Test")
# One (1, 200) weighted-average vector per test tweet, stacked row-wise and
# then passed through sklearn's scale() (computed on the test rows only).
test_vecs_w2v = scale(np.concatenate(
    [buildWordVector(doc.words, 200) for doc in x_test]
))

Creamos el modelo y lo entrenamos

### Modelo MLP 

In [17]:
from keras.models import Sequential, load_model
from keras.layers import Dense

Using TensorFlow backend.


In [18]:
# Feed-forward classifier: 200 inputs -> Dense(500, relu) -> Dense(500, relu)
# -> Dense(1, sigmoid).
model = Sequential([
    Dense(500, activation='relu', input_dim=200),
    Dense(500, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)
model.summary()
#model.fit(train_vecs_w2v, y_train, epochs=9, batch_size=32, verbose=2)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 500)               100500    
_________________________________________________________________
dense_2 (Dense)              (None, 500)               250500    
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 501       
Total params: 351,501
Trainable params: 351,501
Non-trainable params: 0
_________________________________________________________________


Hora de probar el modelo

In [19]:
# Load a saved Keras model from disk. This rebinds `model`, replacing the
# network built in the cell above.
model = load_model("my_model_mlp2.h5") # Dense(500)

**MLP1** = loss: 0.3248 - acc: 0.8673 | **MLP2** = loss: 1.0812 - acc: 0.8635 

In [20]:
def predict_sentiment_one_tweet(tweet):
    """Print the predicted sentiment of a single raw tweet.

    The tweet is tokenized with ``tokenize``, turned into a (1, 200) feature
    vector with ``buildWordVector`` and scored with the notebook-level
    ``model``. Prints "Positivo" when the predicted probability is above 0.5
    and "Negativo" otherwise. Returns None.
    """
    tokens = list(tokenize(tweet))
    # buildWordVector only needs the token list, so the tokens are passed
    # directly rather than wrapped via labelizeTweets (which also printed a
    # progress bar for this single item).
    features = buildWordVector(tokens, 200)
    # NOTE(review): the training features go through scale() but this vector
    # does not — confirm whether the same standardisation should apply here.
    # Sequential.predict_classes is not available in newer Keras releases;
    # thresholding the sigmoid output at 0.5 gives the same class decision.
    is_positive = model.predict(features)[0][0] > 0.5
    if is_positive:
        print("Positivo")
    else:
        print("Negativo")

In [25]:
# Run the classifier on a sample tweet; the label is printed by the helper.
tweet = "How sad and unpresentable that the public television, TVN, did not broadcast the Broad Front debate. A democratic horror."
predict_sentiment_one_tweet(tweet)

1it [00:00, 2874.78it/s]

Negativo





In [26]:
# Run the classifier on a second sample tweet.
tweet = "So glad, this is the level of debate we deserve"
predict_sentiment_one_tweet(tweet)

1it [00:00, 1170.94it/s]

Positivo





0 : negativo
1 : positivo

In [42]:
# Run the classifier on a short sample tweet.
tweet = "kast knows nothing"
predict_sentiment_one_tweet(tweet)

1it [00:00, 3666.35it/s]

Negativo





Nota: cuando haga otra prueba, necesito recuperar el diccionario de palabras y el modelo.

### Modelo LSTM (a la noshe)

Cambiar nombre diccionario y modelo

In [None]:
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Embedding

In [None]:
# Rows and columns of the 2-D training feature matrix.
dim1, dim2 = train_vecs_w2v.shape[0], train_vecs_w2v.shape[1]

In [None]:
# Recurrent classifier: LSTM(150) over inputs of shape (200, 1), followed by
# a single sigmoid unit.
model = Sequential([
    LSTM(150, input_shape=(200,1)),
    Dense(1, activation='sigmoid'),
])
model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)
model.summary()
#model.fit(train_vecs_w2v.reshape(dim1, dim2, 1), y_train, epochs=9, batch_size=32, )

### Translate

In [None]:
from googletrans import Translator

In [None]:
# Smoke test: translate the text field (column 5) of row 2 from English to
# Spanish and display the result.
translator=Translator()
translator.translate(data_v[2][5], src='en', dest='es').text

In [None]:
# Rebuild the raw array from `data` and re-extract the label (column 0) and
# text (column 5) columns.
data_v = np.array(data)
sentiment, text = data_v[:, 0], data_v[:, 5]

In [None]:
# Translate every entry of `text` from English to Spanish, one translate()
# call per tweet, collecting the translated strings in order.
translator=Translator()
spanish_text = []
# Appending inside an explicit loop keeps the translations gathered so far in
# `spanish_text` if a call raises part-way through.
for i in tqdm(text):
    spanish_text.append(translator.translate(i, src='en', dest='es').text)