# Generate tweet text embeddings

In [23]:
import pickle

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer


In [29]:
# Load the enriched training set, using its `id` column as the index.
df_train = pd.read_csv('./train_enriched.csv', index_col='id')
# Preview the first rows of the `text_content` column.
df_train['text_content'].head()

id
0    Our Deeds are the Reason of this #earthquake M...
1               Forest fire near La Ronge Sask. Canada
2    All residents asked to 'shelter in place' are ...
3    13,000 people receive #wildfires evacuation or...
4    Just got sent this photo from Ruby #Alaska as ...
Name: text_content, dtype: object

In [10]:
# Load the pretrained sentence-embedding model by name; the first run
# downloads its files (see the progress bars in the output below).
model = SentenceTransformer('avsolatorio/GIST-small-Embedding-v0')

# A one-element list holding an example sentence to encode.
sentence = ['This framework generates embeddings for each input sentence']

# model.encode() takes the list of sentences and returns their embeddings;
# `model` is reused by the later cells that embed the tweets.
embedding = model.encode(sentence)

modules.json: 100%|██████████| 349/349 [00:00<00:00, 669kB/s]
config_sentence_transformers.json: 100%|██████████| 124/124 [00:00<00:00, 630kB/s]
README.md: 100%|██████████| 68.0k/68.0k [00:00<00:00, 603kB/s]
sentence_bert_config.json: 100%|██████████| 52.0/52.0 [00:00<00:00, 208kB/s]
config.json: 100%|██████████| 719/719 [00:00<00:00, 4.28MB/s]
model.safetensors: 100%|██████████| 133M/133M [00:04<00:00, 30.8MB/s] 
tokenizer_config.json: 100%|██████████| 1.24k/1.24k [00:00<00:00, 4.09MB/s]
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 1.04MB/s]
tokenizer.json: 100%|██████████| 711k/711k [00:00<00:00, 1.63MB/s]
special_tokens_map.json: 100%|██████████| 695/695 [00:00<00:00, 2.30MB/s]
1_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 1.23MB/s]


In [11]:
# Inspect the raw embedding array produced for the example sentence.
embedding

array([[-2.99634710e-02,  3.97600746e-03, -1.36479279e-02,
        -8.05453584e-03,  2.25224765e-03,  4.80554961e-02,
        -4.25897278e-02, -2.83252122e-03,  3.77881415e-02,
         1.11437235e-02, -5.73201925e-02, -2.43431330e-02,
         9.18903723e-02, -4.43571992e-03,  4.93532382e-02,
         3.56133133e-02, -2.09863000e-02,  3.98139283e-02,
        -2.86895279e-02, -4.34059389e-02,  4.82565835e-02,
        -2.37481967e-02,  2.03373376e-02, -6.73861876e-02,
        -4.68946472e-02,  2.92156897e-02, -1.92536488e-02,
        -2.02594865e-02, -3.21837254e-02, -2.06036225e-01,
         1.74786970e-02, -4.50783558e-02,  8.02152678e-02,
         2.81342752e-02, -3.43690999e-02,  1.16153164e-02,
        -7.23941699e-02,  3.18346545e-02, -1.84725970e-02,
         2.20387336e-02,  1.19550759e-02,  1.62042230e-02,
        -2.09324923e-03, -6.45146668e-02,  3.86935682e-03,
        -5.94287589e-02, -5.76579347e-02,  5.01928007e-05,
        -1.96079332e-02,  2.57641580e-02, -1.69043243e-0

In [15]:
# Euclidean (L2) norm of the example embedding vector.
np.linalg.norm(embedding[0])

0.9999999

In [16]:
# Number of components in the example embedding vector.
len(embedding[0])

384

In [38]:
def to_batches(lst, batch_size):
    """Yield consecutive slices of `lst`, each holding at most `batch_size` items.

    Parameters
    ----------
    lst : sequence
        Any object supporting ``len()`` and slicing (list, str, tuple, ...).
    batch_size : int
        Maximum number of items per slice; must be at least 1.

    Yields
    ------
    Slices of `lst` in order. Every slice has `batch_size` items except
    possibly the last, which holds the remainder. An empty `lst` yields
    nothing.

    Raises
    ------
    ValueError
        If `batch_size` is smaller than 1. Because this is a generator, the
        error surfaces when iteration starts, not when the function is called.
    """
    # A negative step would make range() empty and silently drop every item,
    # and a zero step fails with an unrelated-looking range() error.
    if batch_size < 1:
        raise ValueError(f'batch_size must be a positive integer, got {batch_size!r}')
    for start in range(0, len(lst), batch_size):
        yield lst[start:start + batch_size]

In [62]:
def infer_embedding(data, batch_size=64):
    """Encode `data` with the notebook-level `model`, one batch at a time.

    Parameters
    ----------
    data : sequence
        Items to embed; must support ``len()`` and slicing. The notebook
        passes a list of tweet texts.
    batch_size : int, default 64
        Number of items handed to each ``model.encode`` call.

    Returns
    -------
    list
        The elements returned by ``model.encode`` for every batch,
        concatenated in input order. Empty when `data` is empty.
    """
    text_embedded = []
    for batch in to_batches(data, batch_size=batch_size):
        # extend() iterates over the encode() result, so the per-batch
        # outputs are flattened into a single list.
        text_embedded.extend(model.encode(batch))
    return text_embedded

In [46]:
# Embed every training text. No batch_size argument is passed:
# infer_embedding already encodes in batches of 64.
text_embedded = infer_embedding(df_train['text_content'].to_list())
# Show (number of embeddings, length of the first embedding).
len(text_embedded), len(text_embedded[0])

384

In [60]:
# Write the training embeddings to disk as a pickle file
# (`pickle` comes from the imports cell at the top of the notebook).
with open('./train-text-embeddings.pkl', 'wb') as fout:
    pickle.dump(text_embedded, fout)

In [63]:
# Load the enriched test set, using its `id` column as the index.
df_test = pd.read_csv('./test_enriched.csv', index_col='id')
# Preview the first rows of the `text_content` column.
df_test['text_content'].head()

id
0                   Just happened a terrible car crash
1    Heard about #earthquake is different cities, s...
2    there is a forest fire at spot pond, geese are...
3             Apocalypse lighting. #Spokane #wildfires
4        Typhoon Soudelor kills 28 in China and Taiwan
Name: text_content, dtype: object

In [65]:
# Embed every test text with infer_embedding.
test_text_embedded = infer_embedding(df_test['text_content'].to_list())
# Show (number of embeddings, length of the first embedding).
len(test_text_embedded), len(test_text_embedded[0])

(3263, 384)

In [66]:
# Write the test embeddings to disk as a pickle file.
with open('./test-text-embeddings.pkl', 'wb') as fout:
    pickle.dump(test_text_embedded, fout)