In [8]:
from keras.preprocessing.text import Tokenizer
from gensim.models.fasttext import FastText
import numpy as np
import matplotlib.pyplot as plt
import nltk
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
from nltk import WordPunctTokenizer


import nltk
# Fetch the NLTK resources this notebook asks for: the punkt tokenizer
# models, the WordNet corpus and the stop-word lists.
for resource in ('punkt', 'wordnet', 'stopwords'):
    nltk.download(resource)

# English stop words held in a set so membership tests are cheap.
en_stop = set(nltk.corpus.stopwords.words('english'))

%matplotlib inline

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# **fastText pretrained model for generating embeddings**

In [7]:
# Load the pretrained 'fasttext-wiki-news-subwords-300' word vectors through
# gensim's downloader API; `fast_text` is later indexed by token to obtain
# that token's embedding.
# NOTE(review): presumably this downloads the model on first use and reuses a
# local cache afterwards — confirm, since the first run may be slow.
import gensim.downloader as api
fast_text = api.load('fasttext-wiki-news-subwords-300')



In [9]:
import re
from nltk.stem import WordNetLemmatizer

# NOTE: despite its name, `stemmer` is a WordNet *lemmatizer*, not a stemmer.
# The name is kept because other cells may reference it.
stemmer = WordNetLemmatizer()


def preprocess_text(document, min_token_length=4):
    """Normalize raw text into a space-separated string of lemmatized tokens.

    The pipeline, in order:

    1. replace every non-word character with a space;
    2. drop isolated single letters (in the middle and at the start);
    3. collapse runs of whitespace;
    4. drop a leading ``b`` token (left over from ``str(bytes)`` reprs);
    5. lowercase;
    6. lemmatize each token with the module-level ``stemmer``;
    7. discard tokens found in the module-level ``en_stop`` set;
    8. discard tokens shorter than ``min_token_length``.

    Parameters
    ----------
    document : object
        Text to clean. It is converted with ``str()`` first, so non-string
        inputs are accepted.
    min_token_length : int, optional
        Minimum number of characters a token must have to be kept. The
        default of 4 reproduces the original ``len(word) > 3`` filter.

    Returns
    -------
    str
        The surviving tokens joined by single spaces (empty string if none).
    """
    # Remove all the special characters
    text = re.sub(r'\W', ' ', str(document))

    # Remove single characters surrounded by whitespace
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)

    # Remove a single character at the start of the text. The anchor must be
    # an unescaped '^'; the previous pattern r'\^...' searched for a literal
    # caret, which can never be present after the \W substitution above.
    text = re.sub(r'^[a-zA-Z]\s+', ' ', text)

    # Substitute multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text)

    # Remove a prefixed 'b'
    text = re.sub(r'^b\s+', '', text)

    # Convert to lowercase
    text = text.lower()

    # Lemmatize first, then filter on the lemma (stop words and length)
    lemmas = (stemmer.lemmatize(word) for word in text.split())
    tokens = [
        lemma
        for lemma in lemmas
        if lemma not in en_stop and len(lemma) >= min_token_length
    ]

    return ' '.join(tokens)

In [10]:
# Run the preprocessing pipeline on a sample paragraph and print the cleaned,
# space-separated token string that the following cells tokenize and embed.
sent = preprocess_text('NLP drives computer programs that translate text from one language to another respond to spoken commands and summarize large volumes of text rapidly even in real time. There is a good chance you interacted with NLP in the form of voiceoperated GPS systems, digital assistants, speech to text dictation software, customer service chatbots, and other consumer conveniences. But NLP also plays a growing role in enterprise solutions that help streamline business operations, increase employee productivity, and simplify mission-critical business processes.')
print(sent)

drive computer program translate text language another respond spoken command summarize large volume text rapidly even real time good chance interacted form voiceoperated system digital assistant speech text dictation software customer service chatbots consumer convenience also play growing role enterprise solution help streamline business operation increase employee productivity simplify mission critical business process


In [11]:
# Tokenize the preprocessed text and wrap the resulting token list in an outer
# list, i.e. a corpus containing a single tokenized document.
word_punctuation_tokenizer = nltk.WordPunctTokenizer()
word_tokenized_corpus = [word_punctuation_tokenizer.tokenize(sent)]

In [12]:
# Token list of the first (and only) document in the corpus.
corps = word_tokenized_corpus[0]

In [13]:
# Keep at most the first 15 tokens for the embedding demo. Slicing copies the
# tokens into a new list and, unlike indexing with range(15), does not raise
# IndexError when the document has fewer than 15 tokens.
corpus_1 = corps[:15]

In [14]:
# Display the tokens selected for the embedding lookup.
corpus_1

['drive',
 'computer',
 'program',
 'translate',
 'text',
 'language',
 'another',
 'respond',
 'spoken',
 'command',
 'summarize',
 'large',
 'volume',
 'text',
 'rapidly']

In [15]:
# Display the first selected token.
corpus_1[0]

'drive'

In [19]:
# Look up the pretrained fastText vector of every selected token, preserving
# the order of corpus_1 so the two lists stay aligned index by index.
vectors = [fast_text[token] for token in corpus_1]

In [20]:
# Map each token to its vector. A token that occurs more than once in corpus_1
# ends up with a single entry holding the vector from its last occurrence.
word_vectors = {token: vector for token, vector in zip(corpus_1, vectors)}

In [21]:
# Display the embedding stored for the token 'language'.
word_vectors['language']

array([ 0.052443  , -0.04092   ,  0.071409  , -0.050545  ,  0.011134  ,
        0.024363  , -0.003295  , -0.081621  , -0.013807  , -0.0047292 ,
        0.0062077 , -0.047232  , -0.041087  ,  0.030305  , -0.011236  ,
        0.0024607 ,  0.10316   ,  0.026832  ,  0.033888  ,  0.04143   ,
       -0.074284  , -0.064548  ,  0.12541   ,  0.069809  , -0.047025  ,
        0.024811  , -0.038041  ,  0.045696  ,  0.0057668 , -0.05067   ,
        0.020329  ,  0.0065136 , -0.016461  , -0.063523  ,  0.026372  ,
        0.0060635 ,  0.063069  , -0.0024937 ,  0.030853  ,  0.0035123 ,
       -0.016351  , -0.12949   , -0.0024779 , -0.015018  ,  0.023415  ,
       -0.043484  , -0.066284  ,  0.0070623 ,  0.031937  ,  0.006673  ,
        0.015953  ,  0.023214  ,  0.013413  , -0.028797  , -0.030572  ,
        0.032028  ,  0.069296  , -0.025772  , -0.040117  ,  0.015852  ,
        0.02472   , -0.033685  ,  0.12775   ,  0.032704  ,  0.0022198 ,
       -0.021993  ,  0.032521  ,  0.062367  ,  0.019989  , -0.02