In [1]:
"""
@Author: Divyansh.Gupta
"""
from tensorflow.keras.preprocessing.text import one_hot # to get one hot vector for each word in vocab

In [2]:
# Sample paragraph used as the input text for every later cell in this notebook.
para = """Paragraphs are the building blocks of papers. Many students define paragraphs in terms of length: a paragraph is a group of at least five sentences, a paragraph is half a page long, etc. In reality, though, the unity and coherence of ideas among sentences is what constitutes a paragraph. A paragraph is defined as “a group of sentences or a single sentence that forms a unit” (Lunsford and Connors 116). Length and appearance do not determine whether a section in a paper is a paragraph. For instance, in some styles of writing, particularly journalistic styles, a paragraph can be just one sentence long. Ultimately, a paragraph is a sentence or group of sentences that support one main idea. In this handout, we will refer to this as the “controlling idea,” because it controls what happens in the rest of the paragraph."""
# Bare last expression so the notebook echoes the raw text.
para

'Paragraphs are the building blocks of papers. Many students define paragraphs in terms of length: a paragraph is a group of at least five sentences, a paragraph is half a page long, etc. In reality, though, the unity and coherence of ideas among sentences is what constitutes a paragraph. A paragraph is defined as “a group of sentences or a single sentence that forms a unit” (Lunsford and Connors 116). Length and appearance do not determine whether a section in a paper is a paragraph. For instance, in some styles of writing, particularly journalistic styles, a paragraph can be just one sentence long. Ultimately, a paragraph is a sentence or group of sentences that support one main idea. In this handout, we will refer to this as the “controlling idea,” because it controls what happens in the rest of the paragraph.'

# Text Cleaning

In [None]:
import nltk
# Fetch only the sentence-tokenizer models needed by nltk.sent_tokenize below.
# The former bare nltk.download() call was removed: with no arguments it opens
# NLTK's interactive downloader and blocks a non-interactive "Run All".
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead of
# 'punkt' - confirm against the installed NLTK version.
nltk.download('punkt')
import re # regular expressions, used by the text-cleaning cell
from nltk.corpus import stopwords # For removing stopwords
from nltk.stem import PorterStemmer # For doing stemming
from nltk.stem import WordNetLemmatizer # For doing lemmatization

In [7]:
# Instantiate the stemmer and the lemmatizer once, side by side.
# (Neither object is used by the cells visible in this notebook.)
stemm, lemm = PorterStemmer(), WordNetLemmatizer()

In [8]:
# Download the 'wordnet' and 'stopwords' NLTK packages by name
# (presumably the data behind WordNetLemmatizer and nltk.corpus.stopwords).
nltk.download('wordnet')
# Last expression of the cell: its return value is what the notebook displays.
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [9]:
# Normalise the raw paragraph before sentence tokenisation.
# 1) Drop bracketed numeric markers such as "[12]" (the pattern also matches "[]").
text = re.sub(r'\[[0-9]*\]', ' ', para)
# 2) Lower-case everything.
text = text.lower()
# 3) Replace each *run* of digits with a single space.
text = re.sub(r'\d+', ' ', text)
# 4) Collapse whitespace LAST. Previously this ran before the digit removal and
#    the final pass used r'\s' (no '+'), so runs of spaces were left behind,
#    e.g. "connors    )." in the tokenised output.
text = re.sub(r'\s+', ' ', text)

In [10]:
# Split the cleaned text into a list of sentence strings with NLTK's sentence tokenizer.
sentences = nltk.sent_tokenize(text)
# Bare last expression so the notebook displays the resulting list.
sentences

['paragraphs are the building blocks of papers.',
 'many students define paragraphs in terms of length: a paragraph is a group of at least five sentences, a paragraph is half a page long, etc.',
 'in reality, though, the unity and coherence of ideas among sentences is what constitutes a paragraph.',
 'a paragraph is defined as “a group of sentences or a single sentence that forms a unit” (lunsford and connors    ).',
 'length and appearance do not determine whether a section in a paper is a paragraph.',
 'for instance, in some styles of writing, particularly journalistic styles, a paragraph can be just one sentence long.',
 'ultimately, a paragraph is a sentence or group of sentences that support one main idea.',
 'in this handout, we will refer to this as the “controlling idea,” because it controls what happens in the rest of the paragraph.']

In [11]:
# Vocabulary size: passed to one_hot() as the hashing range and to the
# Embedding layer as its input dimension.
voc_size = 10000

# One Hot Representation

In [12]:
# Encode each sentence as a list of integer word indices.
# Keras' one_hot() hashes every word into the range bounded by voc_size, so
# this is an index encoding rather than a literal one-hot vector, and two
# different words may share an index.
# NOTE(review): the default hash is not stable across Python sessions, so the
# printed indices may differ after a kernel restart - confirm if that matters.
one_hot_rep = [
    one_hot(sentence, voc_size)
    for sentence in sentences
]
print(one_hot_rep)  # one list of word indices per sentence

[[280, 2888, 3884, 1520, 7160, 209, 4348], [4500, 6215, 7602, 280, 9241, 6457, 209, 6840, 9246, 9138, 8984, 9246, 1766, 209, 2429, 4446, 5290, 2072, 9246, 9138, 8984, 9807, 9246, 4475, 2972, 608], [9241, 2151, 9037, 3884, 5267, 3961, 5238, 209, 706, 5867, 2072, 8984, 5394, 1880, 9246, 9138], [9246, 9138, 8984, 9901, 2085, 2981, 1766, 209, 2072, 9955, 9246, 9056, 8820, 7225, 4551, 9246, 3301, 7770, 3961, 3544], [6840, 3961, 8453, 1023, 3371, 4765, 6699, 9246, 232, 9241, 9246, 5268, 8984, 9246, 9138], [4457, 277, 9241, 1237, 3139, 209, 2402, 471, 5897, 3139, 9246, 9138, 5951, 5456, 9246, 2138, 8820, 2972], [18, 9246, 9138, 8984, 9246, 8820, 9955, 1766, 209, 2072, 7225, 8576, 2138, 877, 7215], [9241, 6903, 2096, 6066, 9993, 1053, 3611, 6903, 2085, 3884, 3298, 7215, 6019, 5934, 4683, 3368, 5394, 8996, 9241, 3884, 4184, 209, 3884, 9138]]


In [15]:
# Compare the encodings of two sentences that differ only in their final word.
s1 = "define paragraphs in terms of length"
s2 = "define paragraphs in terms of sentences"
s = [s1, s2]
one_hot_s1 = [
    one_hot(sentence, voc_size)
    for sentence in s
]
print("One Hot representation of Sentence 1:", one_hot_s1[0])
print("One Hot representation of Sentence 2:", one_hot_s1[1])
# The two lists agree on every position except the last, where the differing
# final word maps to a different index.

One Hot representation of Sentence 1: [7602, 280, 9241, 6457, 209, 6840]
One Hot representation of Sentence 2: [7602, 280, 9241, 6457, 209, 2072]


# Word Embedding Representation

In [21]:
from tensorflow.keras.layers import Embedding # to create embedding matrix which contains feature representation and words
from tensorflow.keras.preprocessing.sequence import pad_sequences # it is used to add padding to sentences and make all of same length 
from tensorflow.keras.models import Sequential # to create a sequential model
import numpy as np

In [25]:
# Common length that every encoded sentence is brought to.
sent_length = 32
# "pre" padding puts the filler zeros in front of each sentence's indices.
padding = pad_sequences(
    one_hot_rep,
    maxlen=sent_length,
    padding="pre",
)
print(padding)

[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0  280 2888 3884
  1520 7160  209 4348]
 [   0    0    0    0    0    0 4500 6215 7602  280 9241 6457  209 6840
  9246 9138 8984 9246 1766  209 2429 4446 5290 2072 9246 9138 8984 9807
  9246 4475 2972  608]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0 9241 2151 9037 3884 5267 3961 5238  209  706 5867 2072 8984
  5394 1880 9246 9138]
 [   0    0    0    0    0    0    0    0    0    0    0    0 9246 9138
  8984 9901 2085 2981 1766  209 2072 9955 9246 9056 8820 7225 4551 9246
  3301 7770 3961 3544]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0 6840 3961 8453 1023 3371 4765 6699 9246  232 9241 9246
  5268 8984 9246 9138]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
  4457  277 9241 1237 3139  209 2402  471 5897 3139 9246 9138 5951 5456
  9246 2138 8820 2972

# Create Word Embedding Model

In [26]:
# Length of the embedding vector produced for each word index.
dim = 10

In [27]:
# Single-layer model: an Embedding that maps each of the voc_size possible
# indices to a dim-length vector, for inputs of sent_length indices.
model = Sequential([
    Embedding(voc_size, dim, input_length=sent_length),
])
# Optimizer and loss are supplied only so the model is compiled; no fit()
# call appears in the cells visible here.
model.compile(optimizer="adam", loss="mse")

In [28]:
# Print the layer table (output shape and parameter count) for the model built above.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 32, 10)            100000    
Total params: 100,000
Trainable params: 100,000
Non-trainable params: 0
_________________________________________________________________


In [29]:
# Pass the padded index matrix through the model and print the embedding
# vectors it returns. No fit() call appears in the visible cells, so these
# are presumably the layer's initial weights.
print(model.predict(padding))

[[[ 0.03353995  0.03963746 -0.04645773 ...  0.01317262 -0.0237596
   -0.00102959]
  [ 0.03353995  0.03963746 -0.04645773 ...  0.01317262 -0.0237596
   -0.00102959]
  [ 0.03353995  0.03963746 -0.04645773 ...  0.01317262 -0.0237596
   -0.00102959]
  ...
  [-0.03986184 -0.01845573  0.03058202 ... -0.01486502 -0.02003608
   -0.00743703]
  [-0.02975103 -0.04581063 -0.0209465  ...  0.03024281  0.01296455
    0.03867718]
  [-0.00255584  0.03052447 -0.04487751 ...  0.02593489  0.01097616
   -0.02916834]]

 [[ 0.03353995  0.03963746 -0.04645773 ...  0.01317262 -0.0237596
   -0.00102959]
  [ 0.03353995  0.03963746 -0.04645773 ...  0.01317262 -0.0237596
   -0.00102959]
  [ 0.03353995  0.03963746 -0.04645773 ...  0.01317262 -0.0237596
   -0.00102959]
  ...
  [-0.01903179 -0.01659364  0.02973784 ... -0.04506988  0.04727537
    0.01585228]
  [-0.04384639  0.03798742  0.028545   ...  0.01131365 -0.02177833
    0.02681507]
  [ 0.04848531 -0.03570934  0.02473709 ...  0.00170001 -0.03719307
    0.033950

In [32]:
# Word index stored in the last position of the first padded sentence.
padding[0, -1]

4348

In [33]:
# Embedding vector the model returns for that same position
# (first sentence, last token).
print(model.predict(padding)[0, -1])

[-0.00255584  0.03052447 -0.04487751 -0.00218828 -0.03234901  0.00470889
 -0.00185559  0.02593489  0.01097616 -0.02916834]


Here you can see that the word index 4348 has been converted into a vector with 10 dimensions, as specified in the Embedding layer.