# Link notebook to Google Drive

In [1]:
# Mount Google Drive at /content/gdrive so the notebook can read and write
# the files stored there.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
# Work from the "Colab Notebooks" folder: every path used later in this
# notebook (embeddings, dataset files) is relative to this directory.
%cd /content/gdrive/My Drive/Colab Notebooks

/content/gdrive/My Drive/Colab Notebooks


## Model construction

In [3]:
def getGloVeModel(d):
    """Load the pretrained GloVe vectors of dimension ``d`` from disk.

    The vectors are read from
    ``Embeddings/Pretrained_Embeddings/GloVe_1/glove.6B.<d>d.txt`` (relative
    to the current working directory). Each line of that file is expected to
    hold a token followed by its space-separated vector components.

    Parameters
    ----------
    d : value interpolated into the file name (e.g. ``300``).

    Returns
    -------
    tuple
        ``(words_list, embeddings_dict)`` where ``words_list`` holds the
        tokens in file order and ``embeddings_dict`` maps each token to a
        1-D NumPy array of floats.
    """
    import numpy as np

    print("Loading GloVe model")
    glove_file = 'Embeddings/Pretrained_Embeddings/GloVe_1/glove.6B.{}d.txt'.format(d)

    vocabulary = []
    vectors_by_word = {}
    with open(glove_file, 'r', encoding='utf8') as handle:
        for raw_line in handle:
            # First field is the token, the remaining fields are its vector.
            fields = raw_line.rstrip(' \n').split(' ')
            token = fields[0]
            vocabulary.append(token)
            vectors_by_word[token] = np.array([float(x) for x in fields[1:]])

    print("OK => ", len(vectors_by_word), " palabras")
    return (vocabulary, vectors_by_word)

# Import preprocessed dataset

### Train set

In [4]:
# Training set split into 5 folds.
# NOTE: the 7-fold cell that follows rebinds `train_df`; run only the cell for
# the split you actually want to encode.
import pandas as pd

train_df = pd.read_pickle(
    './dataset_files/preprocessed_train_dataset_5_folds.data',
    compression=None,
)

In [5]:
# Training set split into 7 folds (this is the split encoded further down,
# where encodeDataFolds is called with n_folds=7).
import pandas as pd

train_df = pd.read_pickle(
    './dataset_files/preprocessed_train_dataset_7_folds.data',
    compression=None,
)

### Test set

In [6]:
# Held-out test set, stored as an uncompressed pickle.
test_df = pd.read_pickle(
    './dataset_files/preprocessed_test_dataset.data',
    compression=None,
)

In [7]:
# Preview the first training rows; the `kfold` column holds each tweet's fold index.
train_df.head()

Unnamed: 0,id,text,preprocessed_text_1,preprocessed_text_2,HS,TR,AG,HTA,kfold
0,22083,"@pnique Inmigrante motorizado,eres una rata de...","[nombre, inmigrante, motorizadoeres, una, rata...","[nombre, inmigrante, motorizadoeres, una, rata...",1,1,1,4,0
1,20095,la sacaron justo a l dia siguiente del documen...,"[la, sacaron, justo, a, l, dia, siguiente, del...","[la, sacaron, justo, a, l, dia, siguiente, del...",0,0,0,0,0
2,22568,@FioreSalo Callate vs puta,"[nombre, callate, vs, puta]","[nombre, callate, vs, puta]",1,1,1,4,0
3,22107,@Pontifex_es @Pontifex @valealazraki precisame...,"[nombre, nombre, nombre, precisamente, en, mi,...","[nombre, nombre, nombre, precisamente, en, mi,...",0,0,0,0,0
4,20895,QUE haces con cc mogolica pili — Puta tomi cal...,"[que, haces, con, cc, mogolica, pili, puta, to...","[que, haces, con, cc, mogolica, pili, puta, to...",1,1,1,4,0


In [8]:
# Preview the first test rows (same columns as the training frame, minus `kfold`).
test_df.head()

Unnamed: 0,id,text,preprocessed_text_1,preprocessed_text_2,HS,TR,AG,HTA
0,30344,#CadaMañana cállate la puta que me pario Kohan...,"[htg, cállate, la, puta, que, me, pario, kohan...","[cállate, la, puta, que, me, pario, kohan, vos...",0,0,0,0
1,30466,Estas navidades mi polla mereces,"[estas, navidades, mi, polla, mereces]","[estas, navidades, mi, polla, mereces]",0,0,0,0
2,31084,Si no aprobas te pego una cojida que no te la ...,"[si, no, aprobas, te, pego, una, cojida, que, ...","[si, no, aprobas, te, pego, una, cojida, que, ...",1,1,1,4
3,34552,"@AlecitoGamer @falklands_utd A, no entendiste ...","[nombre, nombre, a, no, entendiste, nada, ud, ...","[nombre, nombre, a, no, entendiste, nada, ud, ...",0,0,0,0
4,32538,"@deborahhq1973 Es normal, en Cataluña los Inde...","[nombre, es, normal, en, cataluña, los, indepe...","[nombre, es, normal, en, cataluña, los, indepe...",1,0,1,2


# Encoding

## Encoding Functions

In [9]:
import numpy as np
# Number of token slots per tweet in the 'EMB-SEQ' encoding: longer tweets are
# truncated to this length and unused slots stay as zero vectors.
MAX_WORDS = 55
# Length of every word vector used by the encoders below; it has to match the
# dimensionality of the embeddings model that is passed in.
EMBEDDINGS_SIZE = 300

def transformTweets(preprocessed_tweets, encoding_format, embeddings_model):
  """Encode a collection of tokenised tweets with a word-embeddings model.

  Parameters
  ----------
  preprocessed_tweets : sized iterable of token lists (one list per tweet).
  encoding_format : str
      'SINGLE-VEC' -> every tweet becomes one vector of length
                      EMBEDDINGS_SIZE (see toSingleVectorEmbedding).
      'EMB-SEQ'    -> every tweet becomes a (MAX_WORDS, EMBEDDINGS_SIZE)
                      matrix (see toEmbedingsSequence).
  embeddings_model : mapping from word to its embedding vector.

  Returns
  -------
  numpy.ndarray
      Shape (n_tweets, EMBEDDINGS_SIZE) for 'SINGLE-VEC' or
      (n_tweets, MAX_WORDS, EMBEDDINGS_SIZE) for 'EMB-SEQ'.

  Raises
  ------
  ValueError
      If ``encoding_format`` is not one of the two supported values. The
      function used to print a message and return None, which callers would
      then silently pickle to disk.
  """
  if encoding_format == 'SINGLE-VEC':
    encode_tweet = toSingleVectorEmbedding
    tweet_shape = (EMBEDDINGS_SIZE,)
  elif encoding_format == 'EMB-SEQ':
    encode_tweet = toEmbedingsSequence
    tweet_shape = (MAX_WORDS, EMBEDDINGS_SIZE)
  else:
    raise ValueError(
        f"Invalid format {encoding_format!r}. (Expected 'SINGLE-VEC' or 'EMB-SEQ')")

  n_tweets = len(preprocessed_tweets)
  encoded_tweets = np.zeros((n_tweets,) + tweet_shape)

  # The per-tweet encoders return arrays with a leading axis of size 1;
  # NumPy drops that axis when the result is stored in row i.
  for i, tweet in enumerate(preprocessed_tweets):
    encoded_tweets[i] = encode_tweet(tweet, embeddings_model)

  print("Tweets were succesfully transformed.")
  return encoded_tweets


def toSingleVectorEmbedding(preprocessed_tweet, embeddings_model):
  """Collapse a tokenised tweet into one vector by summing its word vectors.

  Words missing from ``embeddings_model`` contribute a zero vector (see
  getWordVector). The result has shape (1, EMBEDDINGS_SIZE); an empty tweet
  yields all zeros.
  """
  word_vectors = (getWordVector(token, embeddings_model)
                  for token in preprocessed_tweet)
  # Starting from a zero vector keeps the left-to-right accumulation order.
  tweet_vector = sum(word_vectors, np.zeros((EMBEDDINGS_SIZE,)))
  return tweet_vector.reshape(1, -1)

def toEmbedingsSequence(preprocessed_tweet, embeddings_model):
  """Encode a tokenised tweet as a fixed-length sequence of word vectors.

  Only the first MAX_WORDS tokens are used; positions beyond the end of the
  tweet remain zero. Returns an array of shape
  (1, MAX_WORDS, EMBEDDINGS_SIZE).
  """
  sequence = np.zeros((1, MAX_WORDS, EMBEDDINGS_SIZE))
  kept_tokens = preprocessed_tweet[:MAX_WORDS]
  for position, token in enumerate(kept_tokens):
    vector_row = getWordVector(token, embeddings_model).reshape(1, -1)
    sequence[0, position] = vector_row
  return sequence

def getWordVector(w, embeddings_model):
  """Return the embedding stored for ``w``, or a zero vector if it is unknown.

  The zero vector has length EMBEDDINGS_SIZE. For known words the array held
  by ``embeddings_model`` is returned as-is (not copied).
  """
  if w not in embeddings_model:
    return np.zeros((EMBEDDINGS_SIZE,))
  return embeddings_model[w]

## encodeDataFolds()

In [10]:
def encodeDataFolds(train_df, n_folds, prep_format, encoding_format,
                    embeddings_model, embeddings_label):
  """Encode every fold of the training set and pickle each one to disk.

  For each fold index K in ``range(n_folds)`` the rows of ``train_df`` whose
  ``kfold`` column equals K are selected, their
  ``preprocessed_text_<prep_format>`` column is encoded with
  ``transformTweets`` and the result is written with ``saveEncodedTweets``
  under ``<n_folds>FOLDS/<embeddings_label>_F<K>_P<prep_format>.data``.

  Parameters
  ----------
  train_df : DataFrame with a ``kfold`` column and the preprocessed text column.
  n_folds : number of folds to iterate over.
  prep_format : suffix of the preprocessed text column to encode.
  encoding_format : format forwarded to transformTweets / saveEncodedTweets.
  embeddings_model : mapping from word to its embedding vector.
  embeddings_label : label used in the output file names.

  Returns
  -------
  None. Encoded folds are only written to disk, never kept in memory.
  """
  text_column = f'preprocessed_text_{prep_format}'
  for K in range(n_folds):
    fold_tweets = train_df.loc[train_df.kfold == K, text_column]
    encoded_tweets = transformTweets(fold_tweets,
                                     encoding_format,
                                     embeddings_model)

    saveEncodedTweets(encoded_tweets,
                      encoding_format,
                      file_name=f'{n_folds}FOLDS/{embeddings_label}_F{K}_P{prep_format}.data')

## saveEncodedTweets()

In [11]:
# save encoded tweets into disk
import pickle
def saveEncodedTweets(encoded_tweets, encoding_format, file_name):
  """Pickle ``encoded_tweets`` under ``./dataset_files/Encoded/``.

  The destination is
  ``./dataset_files/Encoded/<encoding_format>/<file_name>``. ``file_name``
  may itself contain a sub-folder (encodeDataFolds passes
  ``<n_folds>FOLDS/...``); any missing directory on the way is created so the
  write does not fail with FileNotFoundError.

  Parameters
  ----------
  encoded_tweets : object to serialise with ``pickle.dump``.
  encoding_format : name of the sub-folder that groups files by encoding.
  file_name : file name (optionally with sub-folders) inside that folder.
  """
  import os  # local import: only needed to prepare the output folder

  path = './dataset_files/Encoded/{}/{}'.format(encoding_format, file_name)
  print("Saving data to:")
  print(path)

  # Make sure the target folder exists before opening the file for writing.
  os.makedirs(os.path.dirname(path), exist_ok=True)

  with open(path, 'wb') as filehandle:
      # store the encoded documents as binary data
      pickle.dump(encoded_tweets, filehandle)

  print("\nCOMPLETE\n")

# GloVe 300-d

In [12]:
# Load the 300-dimensional GloVe vectors: `w_list` is the list of words in file
# order and `GloVe_model` maps each word to its vector.
w_list, GloVe_model = getGloVeModel(d = 300)

Loading GloVe model
OK =>  400001  palabras


In [14]:
# Sanity check: a word vector should have EMBEDDINGS_SIZE (300) components.
GloVe_model['hola'].shape

(300,)

## Single vector encodings

### Train-set

In [None]:
# Encode each of the 7 training folds (preprocessed_text_2) as one summed GloVe
# vector per tweet and pickle every fold under dataset_files/Encoded/SINGLE-VEC/.
encodeDataFolds(train_df, 
                n_folds=7, 
                prep_format=2, 
                encoding_format='SINGLE-VEC', 
                embeddings_model=GloVe_model,
                embeddings_label='GloVe300')

### Test-set

In [16]:
# Encode the test tweets as one vector per tweet and store them on disk.
prep_format = 2

# Column holding the token lists produced by the chosen preprocessing variant.
test_tweets = test_df[f'preprocessed_text_{prep_format}']

encoded_test_tweets = transformTweets(
    test_tweets,
    'SINGLE-VEC',
    GloVe_model,
)

saveEncodedTweets(
    encoded_test_tweets,
    'SINGLE-VEC',
    f'GloVe300_TEST_P{prep_format}.data',
)

Tweets were succesfully transformed.
Saving data to:
./dataset_files/Encoded/SINGLE-VEC/GloVe300_TEST_P2.data

COMPLETE



In [17]:
# Each encoded test tweet should be a single vector of EMBEDDINGS_SIZE components.
encoded_test_tweets[0].shape

(300,)

## Embeddings-sequence encodings

### Train-set

In [None]:
# Encode each of the 7 training folds (preprocessed_text_2) as a
# (MAX_WORDS, EMBEDDINGS_SIZE) matrix per tweet and pickle every fold under
# dataset_files/Encoded/EMB-SEQ/.
encodeDataFolds(train_df, 
                n_folds=7, 
                prep_format=2, 
                encoding_format='EMB-SEQ', 
                embeddings_model=GloVe_model,
                embeddings_label='GloVe300')

### Test-set

In [19]:
# Encode the test tweets as fixed-length sequences of word vectors and store
# them on disk.
prep_format = 2

# Column holding the token lists produced by the chosen preprocessing variant.
test_tweets = test_df[f'preprocessed_text_{prep_format}']

encoded_test_tweets = transformTweets(
    test_tweets,
    'EMB-SEQ',
    GloVe_model,
)

saveEncodedTweets(
    encoded_test_tweets,
    'EMB-SEQ',
    f'GloVe300_TEST_P{prep_format}.data',
)

Tweets were succesfully transformed.
Saving data to:
./dataset_files/Encoded/EMB-SEQ/GloVe300_TEST_P2.data

COMPLETE



In [20]:
# Each encoded test tweet should be a MAX_WORDS x EMBEDDINGS_SIZE matrix (55 x 300).
encoded_test_tweets[0].shape

(55, 300)