# Link notebook to Google Drive

In [None]:
# Mount Google Drive into the Colab filesystem at /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Change the working directory so the relative `dataset_files/...` paths used below resolve inside the Drive folder.
%cd /content/gdrive/My Drive/Colab Notebooks

/content/gdrive/My Drive/Colab Notebooks


# Load preprocessed tweets from Google Drive

In [None]:
import pickle


def load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    NOTE: ``pickle.load`` can execute arbitrary code while deserializing, so
    only point this at files produced by a trusted pipeline.
    """
    with open(path, 'rb') as filehandle:
        return pickle.load(filehandle)


# Load the preprocessed train/test tweets from their binary pickle files.
train_tweets = load_pickle('dataset_files/PreprocessedTrainTweets_pr2')
test_tweets = load_pickle('dataset_files/PreprocessedTestTweets_pr2')

In [None]:
# Peek at the first tweet of each split to confirm both loaded as token lists.
train_tweets[0], test_tweets[0]

(['easyjet',
  'quiere',
  'duplicar',
  'el',
  'número',
  'de',
  'mujeres',
  'piloto',
  'verás',
  'tú',
  'para',
  'aparcar',
  'el',
  'avión',
  'enlace'],
 ['me',
  'estoy',
  'comiendo',
  'la',
  'picada',
  'árabe',
  'más',
  'rica',
  'de',
  'mi',
  'vida'])

In [None]:
from gensim.models.fasttext import FastText

In [62]:
# Train a FastText model on the training tweets only: build the vocabulary
# first, then run 50 training epochs over the same corpus.
# NOTE(review): `size` is the gensim 3.x keyword name (gensim 4 renamed it to
# `vector_size`) — confirm the installed gensim version before upgrading.
model = FastText(size=100, window=2, min_count=3, sorted_vocab=1)  # instantiate
model.build_vocab(sentences=train_tweets)
model.train(sentences=train_tweets, total_examples=len(train_tweets), epochs=50)

In [41]:
# Inspect the learned vocabulary mapping.
# NOTE(review): this displays the entire mapping; consider showing only its size or a small slice.
model.wv.vocab

{'quiere': <gensim.models.keyedvectors.Vocab at 0x7fa0fa844dd8>,
 'duplicar': <gensim.models.keyedvectors.Vocab at 0x7fa0fa8449b0>,
 'el': <gensim.models.keyedvectors.Vocab at 0x7fa0fa844630>,
 'número': <gensim.models.keyedvectors.Vocab at 0x7fa0fa844fd0>,
 'de': <gensim.models.keyedvectors.Vocab at 0x7fa1029c2048>,
 'mujeres': <gensim.models.keyedvectors.Vocab at 0x7fa112ac0208>,
 'verás': <gensim.models.keyedvectors.Vocab at 0x7fa102a5c940>,
 'tú': <gensim.models.keyedvectors.Vocab at 0x7fa102a5c048>,
 'para': <gensim.models.keyedvectors.Vocab at 0x7fa102a5ce80>,
 'avión': <gensim.models.keyedvectors.Vocab at 0x7fa102a5c320>,
 'enlace': <gensim.models.keyedvectors.Vocab at 0x7fa102a5c780>,
 'gobierno': <gensim.models.keyedvectors.Vocab at 0x7fa102a5cba8>,
 'debe': <gensim.models.keyedvectors.Vocab at 0x7fa102a5ce48>,
 'crear': <gensim.models.keyedvectors.Vocab at 0x7fa102a5ccc0>,
 'un': <gensim.models.keyedvectors.Vocab at 0x7fa102a5c6a0>,
 'control': <gensim.models.keyedvectors.Voc

In [None]:
# Materialize the vocabulary words, in the mapping's iteration order, as a plain list.
vocab_list = list(model.wv.vocab)


In [38]:
# Spot-check a single vocabulary entry.
vocab_list[2]

'el'

In [None]:
# Defining values for parameters
embedding_size = 300
window_size = 5
min_word = 10
down_sampling = 1e-2
 
%time
fast_Text_model = FastText(train_tweets,
                           size=embedding_size,
                           window=window_size,
                           min_count=min_word,
                           sample=down_sampling,
                           workers = 4,
                           sg=1,
                           iter=50)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 7.87 µs


In [73]:
# Sanity check: the five nearest neighbours of 'todo' in the embedding space.
# NOTE(review): this queries `model`, not the `fast_Text_model` trained in the previous cell — confirm which one was intended.
words = model.wv.most_similar(positive=['todo'],topn=5)
words

  if np.issubdtype(vec.dtype, np.int):


[('modo', 0.688302755355835),
 ('top', 0.6672989130020142),
 ('miedo', 0.6213207244873047),
 ('do', 0.617906928062439),
 ('pedo', 0.6161102056503296)]

## Model construction

In [None]:
def glove_model(d = 300, path = None):
    """Load pre-trained GloVe vectors from a space-separated text file.

    Each line is expected to hold a word followed by its vector components,
    separated by single spaces. Lines without any vector component (e.g.
    blank lines) are skipped so that no empty vector ends up in the result.

    Args:
        d: Embedding dimensionality; only used to build the default file
            name (``glove.6B.{d}d.txt``).
        path: Optional explicit path to the vectors file. When omitted, the
            default location under ``Embeddings/Pretrained_Embeddings/GloVe_1``
            is used.

    Returns:
        A ``(words_list, embeddings_dict)`` tuple: the words in file order,
        and a dict mapping each word to a 1-D numpy float array.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a vector component cannot be parsed as a float.
    """
    import numpy as np
    print("Loading GloVe model")

    if path is None:
        path = 'Embeddings/Pretrained_Embeddings/GloVe_1/glove.6B.{}d.txt'.format(d)

    words_list = []
    embeddings_dict = {}
    with open(path, 'r', encoding='utf8') as reader:
        for line in reader:
            w, *v = line.rstrip(' \n').split(' ')
            if not v:
                # No vector components on this line: storing an empty array
                # would later break any arithmetic against real vectors.
                continue
            words_list.append(w)
            embeddings_dict[w] = np.array([float(component) for component in v])

    print("OK => ",len(embeddings_dict)," palabras")
    return (words_list, embeddings_dict)

In [None]:
# Load the 300-dimensional GloVe vectors.
# NOTE(review): this rebinds `model` (previously the FastText model) to the embeddings dict returned by glove_model; the encoding cells below rely on the dict form.
w_list, model = glove_model(d = 300)

Loading GloVe model
OK =>  400001  palabras


In [None]:
# Sanity check: dimensionality of a single embedding vector.
model['hola'].shape

(300,)

# Encoding

In [None]:
def tweet_encoding_bow(preprocessed_tweet, d = 300, embeddings = None):
  """Encode a tokenized tweet as the sum of its word vectors (bag of words).

  Tokens missing from the embeddings are ignored.

  Args:
    preprocessed_tweet: Iterable of tokens.
    d: Length of the embedding vectors; must match the vectors in
      ``embeddings`` or the addition will fail to broadcast.
    embeddings: Optional mapping from token to 1-D vector. When omitted, the
      notebook-level ``model`` mapping is used, which keeps existing calls
      working while allowing the function to be used without global state.

  Returns:
    A numpy array of shape ``(1, d)``; all zeros when no token is known.
  """
  import numpy as np
  if embeddings is None:
    # Backward-compatible default: fall back to the global embeddings mapping.
    embeddings = model

  encoded_tweet = np.zeros((d,))
  for w in preprocessed_tweet:
    if w in embeddings:
      encoded_tweet = encoded_tweet + embeddings[w]

  return encoded_tweet.reshape(1,-1)

In [None]:
# Read the vector length off a known word so `d` always matches the loaded embeddings.
embeddings_dim = model['hola'].shape[0]

# Encode every tweet of each split as a bag-of-words vector.
encoded_train_tweets = [tweet_encoding_bow(tweet, d = embeddings_dim) for tweet in train_tweets]
encoded_test_tweets = [tweet_encoding_bow(tweet, d = embeddings_dim) for tweet in test_tweets]

# Show the shape of the first encoded tweet of each split.
encoded_train_tweets[0].shape, encoded_test_tweets[0].shape

((1, 100), (1, 100))

# Save encoded tweets

In [None]:
# Persist both encoded splits to disk as pickle files.
import pickle

# The embedding dimensionality is baked into the file names.
embeddings_dim = model['hola'].shape[0]
encoded_train_tweets_path = 'dataset_files/EncodedTrainTweets_GloVe{}d_pr2'.format(embeddings_dim)
encoded_test_tweets_path = 'dataset_files/EncodedTestTweets_GloVe{}d_pr2'.format(embeddings_dim)

# Write the train split first, then the test split, each as binary pickle data.
for target_path, payload in (
    (encoded_train_tweets_path, encoded_train_tweets),
    (encoded_test_tweets_path, encoded_test_tweets),
):
    with open(target_path, 'wb') as filehandle:
        pickle.dump(payload, filehandle)