<a href="https://colab.research.google.com/github/dauberson/hello-world/blob/master/Briefing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Para realiza o briefing eu optei usar o Word2Vec, é uma técnica de processamento de linguagem natural(NLP) que tem a ideia de transformar uma palavra em um vetor númerico que consiga o representar semanticamente, cada palavra tem uma unica representação. Para fazer essa representação podemos usar o encoding, porem esse metodo não leva em consideração a similaridade entre as palavras. Mas temos uma solução para isso, embedding; Esse metodo consegue considerar a similaridade entre as palavras. Word2Vec usa o metodo de embedding e a similaridade entre as palavras vem atraves de palavras "vizinhas". Então para prosseguir com esse metodo, o nosso banco de dados foi preenchido com manchetes. 
Dado as definições acimas, usamos o Skip Gram para definir esses "vizinhos", para essa tecnica foi necessario definir um raio de vizinhança, conhecido como windows size, que no desafio foi definido como 10. Não foi necessario usar uma função para remover stop words, o banco de dados ja vem com essa classe de palavras removidas

In [0]:
import pandas as pd 
data = pd.read_csv("dados.csv") 
data = data['manchetes'].values #data

In [0]:
palavras = []
for texto in data:
  for palavra in texto.split(' '):
      palavras.append(palavra)

palavras = set(palavras) #separando cada palavra do dataset

Skip Gram em ação! windows size = 10


In [0]:
word2int = {}

for i,palavra in enumerate(palavras):
    word2int[palavra] = i

sentencas = []
for sentenca in data:
    sentencas.append(sentenca.split())
    
WINDOW_SIZE = 10

data2 = []
for sentenca in sentencas:
    for idx, palavra in enumerate(sentenca):
        for vizinho in sentenca[max(idx - WINDOW_SIZE, 0) : min(idx + WINDOW_SIZE, len(sentenca)) + 1] : 
            if vizinho != palavra:
                data2.append([palavra, vizinho]) #construindo a vizinha de cada palavra, num raio de 10, windows_size.

In [0]:
import pandas as pd

df = pd.DataFrame(data2, columns = ['palavra', 'vizinho'])

In [13]:
df.head(10) #previa das palavras e seus vizinhos

Unnamed: 0,palavra,vizinho
0,diretor,petrobras
1,diretor,nega
2,diretor,organização
3,diretor,criminosa
4,diretor,estatal
5,diretor,notícias
6,diretor,brasil
7,petrobras,diretor
8,petrobras,nega
9,petrobras,organização


In [0]:
import tensorflow as tf
import numpy as np

ONE_HOT_DIM = len(palavras)

# function to convert numbers to one hot vectors
def to_one_hot_encoding(data_point_index):
    one_hot_encoding = np.zeros(ONE_HOT_DIM)
    one_hot_encoding[data_point_index] = 1
    return one_hot_encoding

X = [] # input word
Y = [] # target word

for x, y in zip(df['palavra'], df['vizinho']):
    X.append(to_one_hot_encoding(word2int[ x ]))
    Y.append(to_one_hot_encoding(word2int[ y ]))

X_train = np.asarray(X)
Y_train = np.asarray(Y)

x = tf.placeholder(tf.float32, shape=(None, ONE_HOT_DIM))
y_label = tf.placeholder(tf.float32, shape=(None, ONE_HOT_DIM))

EMBEDDING_DIM = 16

# hidden layer: which represents word vector eventually
W1 = tf.Variable(tf.random_normal([ONE_HOT_DIM, EMBEDDING_DIM]))
b1 = tf.Variable(tf.random_normal([1])) #bias
hidden_layer = tf.add(tf.matmul(x,W1), b1)

# output layer
W2 = tf.Variable(tf.random_normal([EMBEDDING_DIM, ONE_HOT_DIM]))
b2 = tf.Variable(tf.random_normal([1]))
prediction = tf.nn.softmax(tf.add( tf.matmul(hidden_layer, W2), b2))

# loss function: cross entropy
loss = tf.reduce_mean(-tf.reduce_sum(y_label * tf.log(prediction), axis=[1]))

# training operation
train_op = tf.train.GradientDescentOptimizer(0.05).minimize(loss)

In [15]:
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init) 

iteration = 3000
for i in range(iteration):
    # input is X_train which is one hot encoded word
    # label is Y_train which is one hot encoded neighbor word
    sess.run(train_op, feed_dict={x: X_train, y_label: Y_train})
    if i % 300 == 0:
        print('iteration '+str(i)+' loss is : ', sess.run(loss, feed_dict={x: X_train, y_label: Y_train}))

iteration 0 loss is :  14.70251
iteration 300 loss is :  13.804077
iteration 600 loss is :  13.267253
iteration 900 loss is :  12.858142
iteration 1200 loss is :  12.527161
iteration 1500 loss is :  12.248286
iteration 1800 loss is :  12.006498
iteration 2100 loss is :  11.792816
iteration 2400 loss is :  11.601514
iteration 2700 loss is :  11.428628


In [16]:
# Now the hidden layer (W1 + b1) is actually the word look up table
vectors = sess.run(W1 + b1)
linhas = len(vectors)
colunas = len(vectors[0])
vectors

        

array([[ 1.255359  , -0.52080166, -0.50276345, ..., -1.1171277 ,
         0.65750355, -0.4551038 ],
       [-0.06646907, -0.60671633,  0.6570565 , ...,  1.3313273 ,
        -1.361109  , -0.7637092 ],
       [-0.11208901, -0.01965748, -0.01901527, ...,  0.31761658,
        -0.31216806,  0.46359956],
       ...,
       [-0.49521172,  0.49478543, -0.60808086, ..., -1.0510644 ,
         0.12885165,  0.54251754],
       [-1.1309541 ,  0.26152104, -0.08922659, ...,  1.2928083 ,
         1.1811688 , -0.49136633],
       [-1.3589094 , -0.80418366, -1.1735821 , ...,  1.0300416 ,
        -1.1887814 , -1.264911  ]], dtype=float32)

Construção dos vetores associados a cada palavra

In [17]:
w2v_df = pd.DataFrame(vectors, columns = ['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10', 'x11', 'x12', 'x13', 'x14', 'x15', 'x16'])
w2v_df['palavra'] = palavras
w2v_df = w2v_df[['palavra', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10', 'x11', 'x12', 'x13', 'x14', 'x15', 'x16']]
w2v_df


Unnamed: 0,palavra,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16
0,,1.255359,-0.520802,-0.502763,0.981672,-0.341696,-0.364461,-0.536677,1.344214,-0.004991,0.461131,2.292317,0.870123,1.296588,-1.117128,0.657504,-0.455104
1,reter,-0.066469,-0.606716,0.657057,1.960007,-2.982978,0.523689,0.150464,1.163507,-2.365527,0.605053,1.009219,-1.964705,0.086884,1.331327,-1.361109,-0.763709
2,infomoney,-0.112089,-0.019657,-0.019015,-0.470011,0.099113,-0.367444,0.047196,-0.683762,-0.262133,0.319313,-0.039996,-0.298571,0.094526,0.317617,-0.312168,0.463600
3,cair,-0.276426,0.004193,-1.970239,0.632414,1.483052,-0.317908,-0.797483,-0.164617,-0.419488,-0.272668,-0.280760,-1.772191,0.971081,0.316289,0.410714,-1.092828
4,pedir,-0.642872,1.114552,0.298639,0.830782,-1.192118,0.899174,0.342365,-0.214228,1.042447,0.396128,1.225741,-0.756759,-0.969425,-1.216452,0.714174,-0.135105
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1645,negra,0.587953,-1.432679,-2.887429,0.356008,-1.186241,-0.938067,-1.311168,-0.640874,0.838862,-0.254578,-0.155816,-0.753237,1.346732,0.426177,0.302166,-0.673310
1646,supervisor,-0.494749,-0.726946,1.601808,0.693650,-0.501509,0.358264,0.963258,-0.906863,0.321863,-1.755081,0.725751,-1.163746,0.417186,-0.497902,-0.984042,0.393289
1647,público,-0.495212,0.494785,-0.608081,0.365968,0.210625,-1.317109,2.777832,-0.687007,0.332343,0.000435,0.622183,0.478485,-0.123267,-1.051064,0.128852,0.542518
1648,mídia,-1.130954,0.261521,-0.089227,0.380828,0.122371,0.573752,-0.780007,0.077798,-1.779162,-1.129784,1.092693,2.512996,-0.797869,1.292808,1.181169,-0.491366


Nessa ultima etapa, sera exportado as palavras e seus respectivos vetores, divididas em dois arquivos. Para construir a **view**, optei em usar o Embedding Projector do TensorFlow, http://projector.tensorflow.org. Basta clicar no botão load e selecionar os dois arquivos que foram exportados.

In [0]:
import io

out_v = io.open('vecs.tsv', 'w', encoding='utf-8')
out_m = io.open('meta.tsv', 'w', encoding='utf-8')

for j in palavras:
  out_m.write(j + "\n")
for ix1 in range(linhas):
  for ix2 in range(colunas):
    out_v.write(str((vectors[ix1,ix2])) + "\t")
  out_v.write(str('\n'))
out_v.close()
out_m.close()
try:
  from google.colab import files
except ImportError:
   pass
else:
  files.download('vecs.tsv')
  files.download('meta.tsv')