In [1]:
from datasets import load_from_disk
import numpy as np

In [2]:
# Load the preprocessed DatasetDict from the local "PreProcessedData3" directory
# and display its splits and columns.
reloaded_dataset = load_from_disk("PreProcessedData3")
reloaded_dataset

DatasetDict({
    train: Dataset({
        features: ['image', 'company', 'content', 'description', 'textwithoutcompany', 'fulltext', 'textwithoutcompanycombined'],
        num_rows: 33034
    })
    test: Dataset({
        features: ['image', 'company', 'content', 'description', 'textwithoutcompany', 'fulltext', 'textwithoutcompanycombined'],
        num_rows: 14158
    })
})

In [3]:
from collections import Counter
import torch
import torch.nn as nn

In [4]:
# Concatenate every caption from both splits into one string, each caption
# followed by a single " ", so the vocabulary can be built from it afterwards.
# Reading the text column directly avoids loading the other columns of every
# row, and str.join avoids the quadratic cost of repeated `+` on a growing string.
train_texts = reloaded_dataset["train"]["textwithoutcompanycombined"]
merged_sentance = "".join(text + " " for text in train_texts)
print(len(merged_sentance))  # characters contributed by the train split

test_texts = reloaded_dataset["test"]["textwithoutcompanycombined"]
merged_sentance += "".join(text + " " for text in test_texts)
print(len(merged_sentance))  # total characters, train + test

690114
984594


In [5]:
# Tokenise the merged text on single spaces.
words = merged_sentance.split(' ')

# Count the tokens and order the vocabulary from most to least frequent
# (tokens with equal counts keep their first-seen order).
word_counts = Counter(words)
vocab = [token for token, _ in word_counts.most_common()]
vocab_size = len(vocab)

# Two-way lookup tables; numbering starts at 1, so 0 is never assigned to a token.
word2idx = dict(zip(vocab, range(1, vocab_size + 1)))
idx2word = {ind: token for token, ind in word2idx.items()}

In [6]:
#idx2word[0]

In [7]:
# Sanity check: both lookup tables should hold the same number of entries.
len(word2idx),len(idx2word)

(1748, 1748)

In [8]:
# Find the training caption with the most space-separated tokens.
# max() keeps the first caption among ties; (0, "") is the result for an empty split.
train_token_counts = [
    (len(caption.split(' ')), caption)
    for caption in reloaded_dataset["train"]['textwithoutcompanycombined']
]
maxlength, maxstring = max(train_token_counts, key=lambda pair: pair[0], default=(0, ""))
maxstring

'flag South Georgia &amp; South Sandwich Islands'

In [9]:
# Find the test caption with the most space-separated tokens and its token count.
# max() keeps the first caption among ties; (0, "") is the result for an empty split.
test_token_counts = [
    (len(caption.split(' ')), caption)
    for caption in reloaded_dataset["test"]['textwithoutcompanycombined']
]
maxlength, maxstring = max(test_token_counts, key=lambda pair: pair[0], default=(0, ""))
maxstring, maxlength

('flag South Georgia &amp; South Sandwich Islands', 7)

In [10]:
# Take the caption of training example 1 and split it into space-separated tokens.
sample_text = reloaded_dataset["train"][1]['textwithoutcompanycombined']
words = sample_text.split(' ')

In [11]:
# Show the tokens of the sample caption.
words

['spiral', 'calendar']

In [12]:
# Translate each token of the sample caption into its vocabulary index.
encoded_sentences = [word2idx[token] for token in words]

In [13]:
# Show the vocabulary indices of the sample caption.
encoded_sentences

[292, 280]

In [14]:
# Right-pad the index list with zeros to a fixed length of 10.
pad_after = 10 - len(encoded_sentences)
encoded_sentences = np.pad(encoded_sentences, (0, pad_after), mode='constant', constant_values=0)
encoded_sentences

array([292, 280,   0,   0,   0,   0,   0,   0,   0,   0])

In [15]:
e_dim = 20  # length of each token's embedding vector
torch.manual_seed(108)  # fixed seed so the random initial weights are repeatable

# Initialise an Embedding layer from Torch.
# Token indices run from 1 to vocab_size and 0 is used as the padding value,
# so the table needs vocab_size + 1 rows; with only vocab_size rows the
# highest token index would be out of range for the lookup.
# NOTE(review): resizing the table may alter the randomly initialised values,
# so vectors saved by an earlier run should be regenerated.
emb = nn.Embedding(vocab_size + 1, e_dim)

# Look up the vector of every (padded) token index of the sample caption.
word_vectors = emb(torch.LongTensor(encoded_sentences))

# print the shape of word_vectors
print(word_vectors.shape)

torch.Size([10, 20])


In [16]:
# Display the embedded sample; view() is given the tensor's own shape, so the shape is unchanged.
word_vectors.view(word_vectors.shape)

tensor([[ 0.8774,  2.6091, -1.6474,  0.7395,  2.5295,  0.9154,  1.3531, -0.5064,
          0.4786,  1.3478,  0.3787, -1.2011, -0.3320,  0.3511,  0.3902, -0.8412,
         -0.3038,  1.0621, -2.1123, -1.0391],
        [-0.1156, -0.8925,  2.3348,  2.0050,  0.9116, -0.4180,  0.1533,  1.1945,
          1.5877, -1.4791,  0.1501,  0.7314,  1.4814, -1.4480,  0.5282, -0.0815,
          0.0329,  0.4260,  0.1926, -0.8453],
        [-0.7916, -0.7535,  1.7878, -0.3890, -1.1483, -1.3955, -0.3308,  0.4340,
         -1.6240,  0.0469,  0.2348,  1.2304, -0.2808, -0.2349, -0.0063, -1.0051,
         -1.1442, -1.0616, -0.9591,  0.7995],
        [-0.7916, -0.7535,  1.7878, -0.3890, -1.1483, -1.3955, -0.3308,  0.4340,
         -1.6240,  0.0469,  0.2348,  1.2304, -0.2808, -0.2349, -0.0063, -1.0051,
         -1.1442, -1.0616, -0.9591,  0.7995],
        [-0.7916, -0.7535,  1.7878, -0.3890, -1.1483, -1.3955, -0.3308,  0.4340,
         -1.6240,  0.0469,  0.2348,  1.2304, -0.2808, -0.2349, -0.0063, -1.0051,
      

In [17]:
# Flatten the 2-D embedding matrix into a single 1-D NumPy feature vector.
vector = word_vectors.detach().reshape(-1).numpy()
vector

array([ 0.87744504,  2.6091251 , -1.6473674 ,  0.739503  ,  2.5295432 ,
        0.9153646 ,  1.3531317 , -0.5063943 ,  0.478623  ,  1.3478467 ,
        0.37872705, -1.2010821 , -0.3319827 ,  0.3510692 ,  0.39023072,
       -0.8412367 , -0.30378404,  1.0620909 , -2.1123266 , -1.0390626 ,
       -0.11561434, -0.89245903,  2.3348405 ,  2.004957  ,  0.9116171 ,
       -0.41804042,  0.15331705,  1.1944932 ,  1.5876695 , -1.4791347 ,
        0.15005708,  0.7314496 ,  1.4813932 , -1.4479685 ,  0.52821666,
       -0.08152338,  0.0329162 ,  0.42596632,  0.1926001 , -0.84533465,
       -0.7916029 , -0.75350225,  1.7878226 , -0.38895676, -1.1482836 ,
       -1.395541  , -0.3307842 ,  0.4339944 , -1.6239568 ,  0.04687895,
        0.23482616,  1.2304263 , -0.28079075, -0.23490097, -0.00626535,
       -1.0051268 , -1.1442207 , -1.0616112 , -0.95910597,  0.79948807,
       -0.7916029 , -0.75350225,  1.7878226 , -0.38895676, -1.1482836 ,
       -1.395541  , -0.3307842 ,  0.4339944 , -1.6239568 ,  0.04

In [18]:
def get_encoded_sentences(sentance):
    """Convert a sentence into a list of vocabulary indices.

    The sentence is split on single spaces and each token is looked up in
    the module-level ``word2idx`` table.

    Args:
        sentance: Text to encode.

    Returns:
        A list with one index per token, in sentence order.

    Raises:
        KeyError: If a token is not present in ``word2idx``.
    """
    encoded_words = []
    for token in sentance.split(' '):
        encoded_words.append(word2idx[token])
    return encoded_words

def get_decoded_sentences(encoded_words):
    """Convert a sequence of vocabulary indices back into a sentence.

    Each index is looked up in the module-level ``idx2word`` table and the
    resulting tokens are joined with single spaces.

    Args:
        encoded_words: Iterable of vocabulary indices.

    Returns:
        The decoded sentence as one string.

    Raises:
        KeyError: If an index is not present in ``idx2word``.
    """
    tokens = [idx2word[index] for index in encoded_words]
    return ' '.join(tokens)

In [19]:
# Round-trip check: encoding then decoding should give back the original text.
get_decoded_sentences(get_encoded_sentences("dark-skin-tone"))

'dark-skin-tone'

In [20]:
# Inspect one full training row (all columns) as a dict.
reloaded_dataset["train"][2]

{'image': <PIL.PngImagePlugin.PngImageFile image mode=RGBA size=64x64>,
 'company': 'huawei',
 'content': 'BACK arrow',
 'description': '',
 'textwithoutcompany': 'BACK arrow',
 'fulltext': 'huawei BACK arrow',
 'textwithoutcompanycombined': 'BACK arrow'}

In [21]:
# Encode every training caption as one flat embedding vector.
# Only the text column is read, so the other columns (including the image)
# are not loaded for every row.
fulltext_vector = []
with torch.no_grad():  # only the values are needed, not an autograd graph
    for text in reloaded_dataset["train"]["textwithoutcompanycombined"]:
        encoded_sentences = get_encoded_sentences(text)
        # Right-pad with index 0 up to the fixed length of 10 tokens.
        encoded_sentences = np.pad(encoded_sentences, [(0, 10-len(encoded_sentences))], mode='constant', constant_values=0)
        word_vectors = emb(torch.LongTensor(encoded_sentences))
        # Flatten the per-token vectors into a single 1-D NumPy array.
        fulltext_vector.append(word_vectors.reshape(-1).numpy())

In [22]:
# Attach the computed vectors to the train split as a new "fulltext_vector" column.
train_with_vectors = reloaded_dataset["train"].add_column("fulltext_vector", fulltext_vector)
reloaded_dataset["train"] = train_with_vectors
reloaded_dataset

DatasetDict({
    train: Dataset({
        features: ['image', 'company', 'content', 'description', 'textwithoutcompany', 'fulltext', 'textwithoutcompanycombined', 'fulltext_vector'],
        num_rows: 33034
    })
    test: Dataset({
        features: ['image', 'company', 'content', 'description', 'textwithoutcompany', 'fulltext', 'textwithoutcompanycombined'],
        num_rows: 14158
    })
})

In [23]:
# Encode every test caption as one flat embedding vector.
# Only the text column is read, so the other columns (including the image)
# are not loaded for every row.
fulltext_vector = []
with torch.no_grad():  # only the values are needed, not an autograd graph
    for text in reloaded_dataset["test"]["textwithoutcompanycombined"]:
        encoded_sentences = get_encoded_sentences(text)
        # Right-pad with index 0 up to the fixed length of 10 tokens.
        encoded_sentences = np.pad(encoded_sentences, [(0, 10-len(encoded_sentences))], mode='constant', constant_values=0)
        word_vectors = emb(torch.LongTensor(encoded_sentences))
        # Flatten the per-token vectors into a single 1-D NumPy array.
        fulltext_vector.append(word_vectors.reshape(-1).numpy())

In [24]:
# Attach the computed vectors to the test split as a new "fulltext_vector" column.
test_with_vectors = reloaded_dataset["test"].add_column("fulltext_vector", fulltext_vector)
reloaded_dataset["test"] = test_with_vectors
reloaded_dataset

DatasetDict({
    train: Dataset({
        features: ['image', 'company', 'content', 'description', 'textwithoutcompany', 'fulltext', 'textwithoutcompanycombined', 'fulltext_vector'],
        num_rows: 33034
    })
    test: Dataset({
        features: ['image', 'company', 'content', 'description', 'textwithoutcompany', 'fulltext', 'textwithoutcompanycombined', 'fulltext_vector'],
        num_rows: 14158
    })
})

In [25]:
# Persist both splits, including the new "fulltext_vector" column, to a local directory.
reloaded_dataset.save_to_disk("PreProcessedDataWithEmb4")

Saving the dataset (0/1 shards):   0%|          | 0/33034 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/14158 [00:00<?, ? examples/s]

In [26]:
# Load the saved dataset back from disk and display it to confirm the new column is present.
reloaded_dataset = load_from_disk("PreProcessedDataWithEmb4")
reloaded_dataset

DatasetDict({
    train: Dataset({
        features: ['image', 'company', 'content', 'description', 'textwithoutcompany', 'fulltext', 'textwithoutcompanycombined', 'fulltext_vector'],
        num_rows: 33034
    })
    test: Dataset({
        features: ['image', 'company', 'content', 'description', 'textwithoutcompany', 'fulltext', 'textwithoutcompanycombined', 'fulltext_vector'],
        num_rows: 14158
    })
})