# Link notebook to Google Drive

In [1]:
# Mount Google Drive at /content/gdrive so the notebook can reach files
# stored there (the following cell changes into a Drive folder).
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [5]:
%cd /content/gdrive/My Drive/Colab Notebooks

/content/gdrive/My Drive/Colab Notebooks


# Load W2V model into memory

To download the embeddings files, see the notebook `GettingPretrainedModels.ipynb`.

In [6]:
def getW2VModel(d, path=None):
  """Load word embeddings from a whitespace-separated text file.

  Each data line is expected to look like ``word v1 v2 ... vn``.

  Args:
    d: key selecting one of the built-in embedding files (300 or 100).
      Only consulted when ``path`` is None; an unknown key raises KeyError.
    path: optional explicit path to an embeddings text file. When given it
      overrides the built-in path selected by ``d``.

  Returns:
    A tuple ``(words_list, embeddings_dict)``: the words in file order and a
    dict mapping each word to a 1-D numpy float array.

  Lines that cannot be used are skipped and counted instead of aborting the
  load: undecodable UTF-8, blank lines, lines with no vector components and
  lines whose components are not numbers. A first line made of exactly two
  integers is treated as a word2vec-style "<vocab_size> <dimensions>" header
  and is not stored as a word.
  """
  import numpy as np
  print("retrieving embeddings from .txt file")

  paths_dict = {300: 'Embeddings/Pretrained_Embeddings/Word2Vec_1/SBW-vectors-300-min5.txt',
                100: 'Embeddings/Pretrained_Embeddings/Word2Vec_2/model.txt'}

  if path is None:
    path = paths_dict[d]

  words_list = []
  embeddings_dict = {}
  skipped_lines = 0

  # Read bytes and decode line by line so that an invalid UTF-8 sequence
  # costs exactly the line that contains it.
  with open(path, 'rb') as reader:
    for line_number, raw_line in enumerate(reader, start=1):
      try:
        line = raw_line.decode('utf8')
      except UnicodeDecodeError:
        skipped_lines += 1
        continue

      w, *v = line.rstrip(' \r\n').split(' ')

      # Header line of the word2vec text format: "<count> <dim>".
      if line_number == 1 and len(v) == 1 and w.isdigit() and v[0].isdigit():
        continue

      # Blank line or a word without any vector component.
      if not v:
        skipped_lines += 1
        continue

      try:
        vector = np.array(list(map(float, v)))
      except ValueError:
        # Non-numeric component (e.g. a token that itself contains a space).
        skipped_lines += 1
        continue

      words_list.append(w)
      embeddings_dict[w] = vector

  if skipped_lines:
    print("skipped {} malformed or undecodable line(s)".format(skipped_lines))

  return (words_list, embeddings_dict)

# Import preprocessed dataset

### Train set

In [7]:
# 5 FOLDS SPLITTING
# Load the preprocessed training set from the 5-folds pickle file.
# NOTE(review): the next cell assigns train_df again from the 7-folds file,
# so on a top-to-bottom run this value is overwritten; run only one of them.
# The second positional argument of read_pickle is `compression`.
import pandas as pd
train_df = pd.read_pickle('./dataset_files/preprocessed_train_dataset_5_folds.data', None)

In [8]:
# 7 FOLDS SPLITTING
# Load the preprocessed training set from the 7-folds pickle file. This
# replaces any train_df loaded by the 5-folds cell above; the encoding cells
# further down call encodeDataFolds with n_folds=7.
# The second positional argument of read_pickle is `compression`.
import pandas as pd
train_df = pd.read_pickle('./dataset_files/preprocessed_train_dataset_7_folds.data', None)

In [None]:
# Preview the first rows of the training set.
train_df.head()

### Test set

In [9]:
# Load the preprocessed test set (second positional argument is `compression`).
test_df = pd.read_pickle('./dataset_files/preprocessed_test_dataset.data', None)

In [None]:
# Preview the first rows of the test set.
test_df.head()

# Encoding

## Encoding Functions

In [20]:
import numpy as np

# Default number of token positions kept per tweet in the 'EMB-SEQ' encoding.
MAX_WORDS = 55
# Default length of a word vector; used when no explicit size is passed.
EMBEDDINGS_SIZE = 300


def transformTweets(preprocessed_tweets, encoding_format, embeddings_model,
                    embeddings_size=None, max_words=None):
  """Encode a collection of tokenised tweets as a numpy array.

  Args:
    preprocessed_tweets: sized iterable of tweets; each tweet is iterated
      token by token (and sliced for 'EMB-SEQ').
    encoding_format: 'SINGLE-VEC' (one summed vector per tweet) or
      'EMB-SEQ' (one padded/truncated sequence of vectors per tweet).
    embeddings_model: mapping from word to its 1-D vector.
    embeddings_size: length of the word vectors; defaults to EMBEDDINGS_SIZE.
      Pass it explicitly when the model is not 300-dimensional.
    max_words: sequence length for 'EMB-SEQ'; defaults to MAX_WORDS.

  Returns:
    Array of shape (n_tweets, embeddings_size) for 'SINGLE-VEC' or
    (n_tweets, max_words, embeddings_size) for 'EMB-SEQ'.

  Raises:
    ValueError: if encoding_format is not one of the two supported values.
      (Previously a message was printed and None was returned, which let
      callers go on to save None to disk.)
  """
  if encoding_format not in ('SINGLE-VEC', 'EMB-SEQ'):
    raise ValueError(
        "Invalid format {!r}. (Expected 'SINGLE-VEC' or 'EMB-SEQ')".format(encoding_format))

  # Resolve defaults at call time so re-assigning the module constants in a
  # later cell still takes effect.
  if embeddings_size is None:
    embeddings_size = EMBEDDINGS_SIZE
  if max_words is None:
    max_words = MAX_WORDS

  n_tweets = len(preprocessed_tweets)

  if encoding_format == 'SINGLE-VEC':
    encoded_tweets = np.zeros((n_tweets, embeddings_size))
    for i, tweet in enumerate(preprocessed_tweets):
      # The helper returns shape (1, size); take row 0 explicitly.
      encoded_tweets[i] = toSingleVectorEmbedding(
          tweet, embeddings_model, embeddings_size)[0]
  else:
    encoded_tweets = np.zeros((n_tweets, max_words, embeddings_size))
    for i, tweet in enumerate(preprocessed_tweets):
      # The helper returns shape (1, max_words, size); take element 0.
      encoded_tweets[i] = toEmbedingsSequence(
          tweet, embeddings_model, embeddings_size, max_words)[0]

  print("Tweets were succesfully transformed.")
  return encoded_tweets


def toSingleVectorEmbedding(preprocessed_tweet, embeddings_model, embeddings_size=None):
  """Return the element-wise sum of a tweet's word vectors, shape (1, size).

  Words missing from embeddings_model contribute a zero vector, so an empty
  tweet (or one made only of unknown words) yields all zeros.
  """
  if embeddings_size is None:
    embeddings_size = EMBEDDINGS_SIZE

  encoded_tweet = np.zeros((embeddings_size,))
  for w in preprocessed_tweet:
    encoded_tweet += getWordVector(w, embeddings_model, embeddings_size)
  return encoded_tweet.reshape(1, -1)


def toEmbedingsSequence(preprocessed_tweet, embeddings_model, embeddings_size=None,
                        max_words=None):
  """Return a tweet as a sequence of word vectors, shape (1, max_words, size).

  Only the first max_words tokens are used; shorter tweets are right-padded
  with zero vectors, and unknown words are encoded as zero vectors.
  """
  if embeddings_size is None:
    embeddings_size = EMBEDDINGS_SIZE
  if max_words is None:
    max_words = MAX_WORDS

  embeddings_sequence = np.zeros((1, max_words, embeddings_size))
  for i, w in enumerate(preprocessed_tweet[:max_words]):
    embeddings_sequence[0, i] = getWordVector(w, embeddings_model, embeddings_size)
  return embeddings_sequence


def getWordVector(w, embeddings_model, embeddings_size=None):
  """Look up w in embeddings_model; return a zero vector if it is absent.

  The stored vector is returned as-is (not copied). The zero vector has
  length embeddings_size, which defaults to EMBEDDINGS_SIZE.
  """
  if w in embeddings_model:
    return embeddings_model[w]

  if embeddings_size is None:
    embeddings_size = EMBEDDINGS_SIZE
  return np.zeros((embeddings_size,))

## encodeDataFolds()

In [22]:
def encodeDataFolds(train_df, n_folds, prep_format, encoding_format, embeddings_model, embeddings_label):
  """Encode the tweets of each fold and save every fold to its own file.

  For fold ids 0 .. n_folds-1, the rows of train_df whose `kfold` column
  equals the fold id are selected, their `preprocessed_text_<prep_format>`
  column is passed to transformTweets, and the result is handed to
  saveEncodedTweets under the name
  `<n_folds>FOLDS/<embeddings_label>_F<fold>_P<prep_format>.data`.
  Nothing is returned.
  """
  text_column = f'preprocessed_text_{prep_format}'

  for fold_id in range(n_folds):
    rows_in_fold = train_df.kfold == fold_id
    tweets_in_fold = train_df.loc[rows_in_fold, text_column]

    encoded = transformTweets(tweets_in_fold, encoding_format, embeddings_model)

    output_name = f'{n_folds}FOLDS/{embeddings_label}_F{fold_id}_P{prep_format}.data'
    saveEncodedTweets(encoded, encoding_format, file_name=output_name)

## saveEncodedTweets()

In [14]:
# save encoded tweets into disk
import pickle
def saveEncodedTweets(encoded_tweets, encoding_format, file_name):
  """Pickle encoded_tweets to ./dataset_files/Encoded/<encoding_format>/<file_name>.

  Args:
    encoded_tweets: object to serialise with pickle.
    encoding_format: name of the sub-directory under ./dataset_files/Encoded.
    file_name: file name relative to that sub-directory; it may itself
      contain directories (e.g. '7FOLDS/...').

  Missing parent directories are created first, so saving into a fold
  directory that does not exist yet no longer fails with FileNotFoundError.
  """
  import os

  path = './dataset_files/Encoded/{}/{}'.format(encoding_format, file_name)
  print("Saving data to:")
  print(path)

  os.makedirs(os.path.dirname(path), exist_ok=True)

  with open(path, 'wb') as filehandle:
    # store the encoded documents as binary data
    pickle.dump(encoded_tweets, filehandle)

  print("\nCOMPLETE\n")

# W2V 300D

In [16]:
# Load the embeddings selected by d=300: `words` is the list of words in file
# order, `w2v_model` the dict mapping each word to its vector.
words, w2v_model = getW2VModel(d=300)

retrieving embeddings from .txt file


In [18]:
# Sanity check: shape of the vector stored for one word.
w2v_model['hola'].shape

(300,)

## Single vector encodings

### Train-set

In [None]:
# Encode and save each of the 7 training folds in 'SINGLE-VEC' format,
# using the `preprocessed_text_2` column and the loaded W2V model.
# NOTE(review): n_folds=7 assumes train_df was loaded from the 7-folds file.
encodeDataFolds(train_df, 
                n_folds=7, 
                prep_format=2, 
                encoding_format='SINGLE-VEC', 
                embeddings_model=w2v_model,
                embeddings_label='W2V300')

### Test-set

In [24]:
# Which preprocessing variant (column suffix) of the test set to encode.
prep_format = 2

test_tweets = test_df['preprocessed_text_{}'.format(prep_format)]

# One vector per test tweet.
encoded_test_tweets = transformTweets(test_tweets, 
                                 encoding_format='SINGLE-VEC',
                                 embeddings_model=w2v_model)

# Persist under ./dataset_files/Encoded/SINGLE-VEC/.
saveEncodedTweets(encoded_test_tweets,
                  encoding_format='SINGLE-VEC', 
                  file_name=f'W2V300_TEST_P{prep_format}.data')

Tweets were succesfully transformed.
Saving data to:
./dataset_files/Encoded/SINGLE-VEC/W2V300_TEST_P2.data

COMPLETE



In [25]:
# Sanity check: shape of one 'SINGLE-VEC' encoded tweet.
encoded_test_tweets[0].shape

(300,)

## Embeddings-sequence encodings

### Train-set

In [None]:
# Encode and save each of the 7 training folds in 'EMB-SEQ' format,
# using the `preprocessed_text_2` column and the loaded W2V model.
# NOTE(review): n_folds=7 assumes train_df was loaded from the 7-folds file.
encodeDataFolds(train_df, 
                n_folds=7, 
                prep_format=2, 
                encoding_format='EMB-SEQ', 
                embeddings_model=w2v_model,
                embeddings_label='W2V300')

### Test-set

In [27]:
# Which preprocessing variant (column suffix) of the test set to encode.
prep_format = 2

test_tweets = test_df['preprocessed_text_{}'.format(prep_format)]

# One sequence of word vectors per test tweet.
# Note: this reuses the name encoded_test_tweets from the SINGLE-VEC section.
encoded_test_tweets = transformTweets(test_tweets, 
                                 encoding_format='EMB-SEQ',
                                 embeddings_model=w2v_model)

# Persist under ./dataset_files/Encoded/EMB-SEQ/.
saveEncodedTweets(encoded_test_tweets,
                  encoding_format='EMB-SEQ', 
                  file_name=f'W2V300_TEST_P{prep_format}.data')

Tweets were succesfully transformed.
Saving data to:
./dataset_files/Encoded/EMB-SEQ/W2V300_TEST_P2.data

COMPLETE



In [28]:
# Sanity check: shape of one 'EMB-SEQ' encoded tweet.
encoded_test_tweets[0].shape

(55, 300)