In [0]:
import spacy
from spacy import displacy

nlp = spacy.load('en_core_web_sm')

In [0]:
doc1 = nlp("Dhaval Rajput is Senior software Engineer working at Toshiba.")

In [0]:
len(nlp.Defaults.stop_words)

326

In [0]:
displacy.render(doc1, style='dep', jupyter=True)

In [0]:
displacy.render(doc1, style='ent', jupyter=True)

In [0]:
from nltk.stem.porter import PorterStemmer

In [0]:
p_stemmer = PorterStemmer()
words = ['run', 'runner', 'runs', 'ran', 'easily', 'fairly']
for word in words:
  print(word + '-------->' + p_stemmer.stem(word))

run-------->run
runner-------->runner
runs-------->run
ran-------->ran
easily-------->easili
fairly-------->fairli


In [0]:
from nltk.stem.snowball import SnowballStemmer
s_stemmer = SnowballStemmer(language='english')

In [0]:
for word in words:
  print(word + ' ---------> ' + s_stemmer.stem(word))

run ---------> run
runner ---------> runner
runs ---------> run
ran ---------> ran
easily ---------> easili
fairly ---------> fair


In [0]:
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)

In [0]:
pattern1 = [{'LOWER':'solarpower'}]
pattern2 = [{'LOWER':'solar'}, {'IS_PUNCT':True}, {'LOWER':'power'}]
pattern3 = [{'LOWER':'solar'}, {'LOWER':'power'}]

In [0]:
matcher.add('SolarPower', None, pattern1, pattern2, pattern3)

In [0]:
doc = nlp(u"The solar power industry continues to grow due to Solar-Power. solarpower is amazing.")
found_matchs = matcher(doc)

In [0]:
found_matchs

[(8656102463236116519, 1, 3),
 (8656102463236116519, 9, 12),
 (8656102463236116519, 13, 14)]

## **Parts Of Speech Tagging**



In [0]:
doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")
print(doc.text)

The quick brown fox jumped over the lazy dog's back.


In [0]:
print(doc[4].pos_, doc[4].tag_)

VERB VBD


In [0]:
# Format and display text, part of speech, tag and explaination of tag associated with particular text in document.
for token in doc:
    print(f"{token.text:{10}} {token.pos_:{5}} {token.tag_:{5}} {spacy.explain(token.tag_)}")

The        DET   DT    determiner
quick      ADJ   JJ    adjective
brown      ADJ   JJ    adjective
fox        NOUN  NN    noun, singular or mass
jumped     VERB  VBD   verb, past tense
over       ADP   IN    conjunction, subordinating or preposition
the        DET   DT    determiner
lazy       ADJ   JJ    adjective
dog        NOUN  NN    noun, singular or mass
's         PART  POS   possessive ending
back       NOUN  NN    noun, singular or mass
.          PUNCT .     punctuation mark, sentence closer


In [0]:
# Notice difference between Tag and expalined tag
doc = nlp(u"I read books on NLP.")
word = doc[1]
token = word
print(f"{token.text:{10}} {token.pos_:{5}} {token.tag_:{5}} {spacy.explain(token.tag_)}")

read       VERB  VBD   verb, past tense


In [0]:
# Notice difference between Tag and expalined tag with change in tense
doc = nlp(u"I had read a book on NLP.")
word = doc[1]
token = word
print(f"{token.text:{10}} {token.pos_:{5}} {token.tag_:{5}} {spacy.explain(token.tag_)}")

had        AUX   VBD   verb, past tense


In [0]:
# Get parts of speech count in document
doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")
POS_counts = doc.count_by(spacy.attrs.POS)
POS_counts

{84: 3, 85: 1, 90: 2, 92: 3, 94: 1, 97: 1, 100: 1}

In [0]:
doc.vocab[84].text

'ADJ'

In [0]:
doc[2].pos

84

In [0]:
for k, v in sorted(POS_counts.items()):
  print(f"{k:{5}}.  {doc.vocab[k].text:{5}} {v}")

   84.  ADJ   3
   85.  ADP   1
   90.  DET   2
   92.  NOUN  3
   94.  PART  1
   97.  PUNCT 1
  100.  VERB  1


In [0]:
# Get Tag count in document
TAG_counts = doc.count_by(spacy.attrs.TAG)
for k, v in sorted(TAG_counts.items()):
  print(f"{k:{5}}.  {doc.vocab[k].text:{5}} {v}")

   74.  POS   1
1292078113972184607.  IN    1
10554686591937588953.  JJ    3
12646065887601541794.  .     1
15267657372422890137.  DT    2
15308085513773655218.  NN    3
17109001835818727656.  VBD   1


In [0]:
# Get Dependency count in document
DEP_counts = doc.count_by(spacy.attrs.DEP)
for k, v in sorted(DEP_counts.items()):
  print(f"{k:{5}}.  {doc.vocab[k].text:{5}} {v}")

  402.  amod  3
  415.  det   2
  429.  nsubj 1
  439.  pobj  1
  440.  poss  1
  443.  prep  1
  445.  punct 1
8110129090154140942.  case  1
8206900633647566924.  ROOT  1


## **Visualizing Parts of Speech**

In [0]:
from spacy import displacy
doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")
displacy.render(doc, style='dep', jupyter=True)

In [0]:
options = {'distance':110, 'compact':'True', 'color':'Blue', 'bg':'white', 'fonts':'Times'}
displacy.render(doc, style='dep', jupyter=True, options=options)

In [0]:
doc2 = nlp(u"This is a sentence. This is another sentence. This is another sentence possibly longer than other.")
spans = list(doc2.sents)
#displacy.serve(spans, style='dep', options=options)

## **Name Entity Recognition**


*   Name Entity Recognition seeks to locate and classify named entity mentioned in unstructured text into pre-defined categories such as person names, organization, percentages, monetary values, time expressions, locations, medical codes etc..
*   NER with spacy





In [0]:
# Function to show entities in document
def show_ents(doc):
  if doc.ents:
    for ent in doc.ents:
      print(f"{ent.text:{40}} {ent.label_:{30}} {spacy.explain(ent.label_)}")
  else:
    print('No entities found !!!')

In [0]:
doc = nlp('How are you?')
show_ents(doc)

No entities found !!!


In [0]:
doc = nlp('May I go to Washington, DC next May to see the Washington Monuments?')
show_ents(doc)

Washington                               GPE                            Countries, cities, states
next May                                 DATE                           Absolute or relative dates or periods
the Washington Monuments                 ORG                            Companies, agencies, institutions, etc.


In [0]:
doc = nlp('Can i have 500 dollars of Microsoft Stocks?')
show_ents(doc)

500 dollars                              MONEY                          Monetary values, including unit
Microsoft                                ORG                            Companies, agencies, institutions, etc.


In [0]:
doc = nlp('Tesla to build a UK factory for $6 million')
show_ents(doc)

UK                                       GPE                            Countries, cities, states
$6 million                               MONEY                          Monetary values, including unit


In [0]:
from spacy.tokens import Span
ORG = doc.vocab.strings[u"ORG"] 
new_ent = Span(doc, 0 , 1, label=ORG)
doc.ents = list(doc.ents) + [new_ent]
show_ents(doc)

Tesla                                    ORG                            Companies, agencies, institutions, etc.
UK                                       GPE                            Countries, cities, states
$6 million                               MONEY                          Monetary values, including unit


In [0]:
doc = nlp("Out company reated a brand enw vaccum cleaner."
          "This new vaccum-cleaner is best in show.")
show_ents(doc)

enw vaccum                               PERSON                         People, including fictional


In [0]:
from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)
phrase_list = ['vaccum cleaner', 'vaccum-cleaner']
phrase_patterns = [nlp(text) for text in phrase_list]
matcher.add('newproduct', None, *phrase_patterns)
found_matches = matcher(doc)
found_matches

[(2689272359382549672, 6, 8), (2689272359382549672, 11, 14)]

In [0]:
PROD = doc.vocab.strings[u"PRODUCT"]
new_ents = [Span(doc, match[1], match[2], label=PROD) for match in found_matches]
doc.ents = list(doc.ents) + new_ents
show_ents(doc)

In [0]:
# Filtering specific entity from document.
doc = nlp(u"Originally I paid $29.95 for this car toy, but now it is marked down to 10 dollars")
parsed_ent = [ent for ent in doc.ents if ent.label_ == "MONEY"]
parsed_ent

[29.95, 10 dollars]

## **Visualizing Named Entity Recognition**

In [0]:
doc = nlp(u"Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million."
          u"By contrast, Sony only sold 8 thousand Walkman music players.")
displacy.render(doc, style='ent', jupyter=True)

In [0]:
for sent in doc.sents:
  displacy.render(sent, style='ent', jupyter=True)

In [0]:
colors = {'ORG':'radial-gradient(yellow, pink)'}
options = {'ents':['PRODUCT', 'ORG'], 'colors':colors}
displacy.render(doc, style='ent', jupyter=True, options=options)

## **Sentence Segmentation**

In [0]:
doc = nlp(u'"Management is doing right things; leadership is doing the right things." - Peter Drucker')
doc.text

In [0]:
for sent in doc.sents:
  print(sent)
  print('\n')

In [0]:
# Add a segmentation rule
def set_custom_boundaries(doc):
   for token in doc[:-1]:
     if token.text == ';':
       doc[token.i + 1].is_sent_start = True

   return doc

nlp.add_pipe(set_custom_boundaries, before='parser')
nlp.pipe_names

In [0]:
# Sentence separated at semicolon ';'
doc4 = nlp(u'"Management is doing right things; leadership is doing the right things." - Peter Drucker')
for sent in doc4.sents:
  print(sent)

In [0]:
# Change segmentation rule
nlp = spacy.load('en_core_web_sm')

In [0]:
mystring = nlp(u"This is a sentence. This is another.\n\n This ia \n third sentence")
print(mystring)

In [0]:
doc = nlp(u"This is a sentence. This is another.\n\n This ia \nthird sentence")

In [0]:
for sentence in doc.sents:
  print(sentence)

In [0]:
from spacy.pipeline import SentenceSegmenter

def split_on_newline(doc):
  start = 0
  seen_newline = False
  for word in doc:
    if seen_newline:
      yield doc[start : word.i]
      start = word.i 
      seen_newline = False
    elif word.text.startswith('\n'):
      seen_newline = True
  
  yield doc[start:]

In [0]:
sbd = SentenceSegmenter(nlp.vocab, strategy=split_on_newline)
nlp.add_pipe(sbd)

doc = nlp(u"This is a sentence. This is another.\n\n This ia \nthird sentence")
for sentence in doc.sents:
  print(sentence)

## **Text Classification using Machine Learning**

In [0]:
import numpy as np
import pandas as pd

In [0]:
df = pd.read_csv("/content/drive/My Drive/Colab Notebooks/Udemy/smsspamcollection.tsv", sep='\t')
df.head()

In [0]:
df.isnull().sum()

In [0]:
df['label'].value_counts()

In [0]:
from sklearn.model_selection import train_test_split

X = df['message']
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [0]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer()

# Fit vectorizer to data, to build vocabulary and count number of words.
#count_vect.fit(X_train)
#X_train_counts = count_vect.transform(X_train)
# Transform orignal text message to vector
X_train_counts = count_vect.fit_transform(X_train)
X_train_counts

In [0]:
X_train_counts.shape

In [0]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [0]:
X_train_tfidf.shape

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_train_tfidf.shape

In [0]:
from sklearn.svm import LinearSVC
clf = LinearSVC()
clf.fit(X_train_tfidf, y_train)

In [0]:
# Create pipeline

from sklearn.pipeline import Pipeline
text_clf = Pipeline([('tfidf', TfidfVectorizer()), ('clf', LinearSVC())])
text_clf.fit(X_train, y_train)

In [0]:
predictions = text_clf.predict(X_test)

In [0]:
from sklearn.metrics import confusion_matrix, classification_report

print(confusion_matrix(y_test, predictions))

In [0]:
print(classification_report(y_test, predictions))

In [0]:
from sklearn import metrics

print(metrics.accuracy_score(y_test, predictions))

In [0]:
text_clf.predict(["Get free meal and 1000 dollars for winning contest"])

## **Text Classification Project**

In [0]:
import numpy as np
import pandas as pd

In [0]:
df = pd.read_csv("/content/drive/My Drive/Colab Notebooks/Udemy/moviereviews.tsv", sep='\t')
df.head()

In [0]:
df.shape

In [0]:
print(df['review'][2])

In [0]:
df.isnull().sum()

In [0]:
df.dropna(inplace=True)
df.isnull().sum()

In [0]:
blanks = []
mystring = 'hello'
empty = ' '

for i, lb, rv in df.itertuples():
  if rv.isspace():
    blanks.append(i)

In [0]:
blanks

In [0]:
df.drop(blanks, inplace=True)
df.shape

In [0]:
from sklearn.model_selection import train_test_split

X = df['review']
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [0]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

text_clf = Pipeline([('tfidf', TfidfVectorizer()), 
                     ('clf', LinearSVC())])
text_clf.fit(X_train, y_train)
predictions = text_clf.predict(X_test)

In [0]:
from sklearn.metrics import confusion_matrix, classification_report

print(confusion_matrix(y_test, predictions))

In [0]:
print(classification_report(y_test, predictions))

In [0]:
from sklearn import metrics

print(metrics.accuracy_score(y_test, predictions))

## **Semantic and Sentiment Analysis**

In [0]:
!python -m spacy download en_core_web_md

In [0]:
!python -m spacy download en_core_web_lg

In [0]:
import spacy

In [0]:
nlp = spacy.load('en_core_web_lg')

In [0]:
tokens = nlp(u'lion cat pet')
for token1 in tokens:
  for token2 in tokens:
    print(token1.text, token2.text, token1.similarity(token2))

In [0]:
tokens = nlp(u'like love hate')
for token1 in tokens:
  for token2 in tokens:
    print(token1.text, token2.text, token1.similarity(token2))

In [0]:
nlp.vocab.vectors.shape

In [0]:
tokens = nlp(u'dog cat nargle')
for token in tokens:
  print(token.text, token.has_vector, token.vector_norm, token.is_oov)

In [0]:
from scipy import spatial

cosine_similarity = lambda vec1, vec2 : 1-spatial.distance.cosine(vec1, vec2)

In [0]:
king = nlp.vocab['king'].vector
man = nlp.vocab['man'].vector
woman = nlp.vocab['woman'].vector

In [0]:
new_vector = king - man + woman

computed_similarities = []
for word in nlp.vocab:
  if word.has_vector:
    if word.is_lower:
      if word.is_alpha:
        similarity = cosine_similarity(new_vector, word.vector)
        computed_similarities.append((word, similarity))

In [0]:
computed_similarities = sorted(computed_similarities, key=lambda item:-item[1])
print([t[0]. text for t in computed_similarities[:10]])


In [0]:
import nltk

In [0]:
nltk.download('vader_lexicon')

In [0]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()
a = 'This is a good movie'

sid.polarity_scores(a)

In [0]:
a = "This was the best, most awesome movie EVER MADE !!!"
sid.polarity_scores(a)

In [0]:
a = "This was the worst movie that has ever disgraced the screen."
sid.polarity_scores(a)

In [0]:
import pandas as pd
df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Udemy/amazonreviews.tsv', sep='\t')
df.head()

In [0]:
df['label'].value_counts()

In [0]:
df.isnull().sum()

In [0]:
df.dropna(inplace=True)

In [0]:
blanks = []
for i, lb, rv in df.itertuples():
  if type(rv) == str:
    if rv.isspace():
      blanks.append(i)
blanks

df.drop(blanks, inplace=True)

In [0]:
print(df.iloc[0]['review'])
sid.polarity_scores(df.iloc[0]['review'])

In [0]:
df['scores'] = df['review'].apply(lambda review: sid.polarity_scores(review))

In [0]:
df.head()

In [0]:
df['compound'] = df['scores'].apply(lambda d:d['compound'])
df.head()

In [0]:
df['comp_score'] = df['compound'].apply(lambda score: 'pos' if score >= 0 else 'neg')
df.head()

In [0]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

print(accuracy_score(df['label'], df['comp_score']))

In [0]:
print(classification_report(df['label'], df['comp_score']))

In [0]:
print(confusion_matrix(df['label'], df['comp_score']))

## **Sentiment Analysis Project**

In [0]:
import numpy as np
import pandas as pd

In [0]:
df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Udemy/moviereviews.tsv', sep='\t')
df.head()

In [0]:
df.shape

In [0]:
df.dropna(inplace=True)
df.shape

In [0]:
blanks = []
for i, lb, rv in df.itertuples():
  if type(rv) == str:
    if rv.isspace():
      blanks.append(i)
blanks

In [0]:
df.drop(blanks, inplace=True)
df.shape

In [0]:
df['label'].value_counts()

In [0]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer 
sid = SentimentIntensityAnalyzer()

In [0]:
df['scores'] = df['review'].apply(lambda review:sid.polarity_scores(review))

In [0]:
df['compound'] = df['scores'].apply(lambda d: d['compound'])
df['comp_scores'] = df['compound'].apply(lambda score: 'pos' if score > 0 else 'neg')

In [0]:
df.head()

In [0]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

print(accuracy_score(df['label'], df['comp_scores']))

In [0]:
print(confusion_matrix(df['label'], df['comp_scores']))

In [0]:
print(classification_report(df['label'], df['comp_scores']))

# **Topic Modelling**
## **Latent Dirichlet Allocation**

# **Deep Learning for NLP**

In [0]:
import numpy as np
from sklearn.datasets import load_iris

In [0]:
iris = load_iris()
type(iris)

In [0]:
print(iris.feature_names)
X = iris.data
y = iris.target

In [0]:
# Perform one hot encoding
#Class 0 -> [1, 0 , 0]
#Class 1 -> [0, 1 , 0]
#Class 2 -> [0, 0 , 1]
from keras.utils import to_categorical
y = to_categorical(y)
y.shape

In [0]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [0]:
from sklearn.preprocessing import MinMaxScaler
scaler_obj = MinMaxScaler()
scaler_obj.fit(X_train)

In [0]:
scaled_X_train = scaler_obj.transform(X_train)
scaled_X_test  = scaler_obj.transform(X_test)

In [0]:
from keras.models import Sequential
from keras.layers import Dense

In [0]:
model = Sequential()
model.add(Dense(8, input_dim=4, activation='relu'))
model.add(Dense(8, input_dim=4, activation='relu'))
model.add(Dense(3, activation='softmax')) # Returns probability of each class
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [0]:
model.fit(scaled_X_train, y_train, epochs=300, verbose=2)

In [0]:
predictions = model.predict_classes(scaled_X_test)
y_test.argmax(axis=1)

In [0]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

print(confusion_matrix(y_test.argmax(axis=1), predictions))

In [0]:
print(classification_report(y_test.argmax(axis=1), predictions))

In [0]:
print(accuracy_score(y_test.argmax(axis=1), predictions))

In [0]:
model.save('myfirstmodel.h5')

In [0]:
from keras.models import load_model
new_model = load_model('myfirstmodel.h5')

In [0]:
predict = new_model.predict_classes(scaled_X_test)

In [0]:
print(accuracy_score(y_test.argmax(axis=1), predict))

## **Text Generation with LSTM**

In [0]:
def read_file(filepath):
  with open(filepath) as f:
    str_text = f.read()

  return str_text

In [0]:
import spacy
nlp = spacy.load('en', disable=['parser', 'tagger', 'ner'])
nlp.max_length = 1198623

In [0]:
def separate_punc(doc_text):
  return [token.text.lower() for token in nlp(doc_text) if token.text not in '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n ']

In [0]:
d = read_file('/content/drive/My Drive/Colab Notebooks/Udemy/moby_dick_four_chapters.txt')

In [0]:
tokens = separate_punc(d)
len(tokens)

11338

In [0]:
# Pass 25 words and predict 26th word.

train_len = 25 + 1
text_sequences = []

for i in range(train_len, len(tokens)):
  seq = tokens[i-train_len:i]

  text_sequences.append(seq)

In [0]:
text_sequences[0]

['call',
 'me',
 'ishmael',
 'some',
 'years',
 'ago',
 'never',
 'mind',
 'how',
 'long',
 'precisely',
 'having',
 'little',
 'or',
 'no',
 'money',
 'in',
 'my',
 'purse',
 'and',
 'nothing',
 'particular',
 'to',
 'interest',
 'me',
 'on']

In [0]:
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)

In [0]:
sequences = tokenizer.texts_to_sequences(text_sequences)

In [0]:
sequences[0]

[956,
 14,
 263,
 51,
 261,
 408,
 87,
 219,
 129,
 111,
 954,
 260,
 50,
 43,
 38,
 315,
 7,
 23,
 546,
 3,
 150,
 259,
 6,
 2712,
 14,
 24]

In [0]:
for i in sequences[0]:
  print(f"{i} : {tokenizer.index_word[i]}")

956 : call
14 : me
263 : ishmael
51 : some
261 : years
408 : ago
87 : never
219 : mind
129 : how
111 : long
954 : precisely
260 : having
50 : little
43 : or
38 : no
315 : money
7 : in
23 : my
546 : purse
3 : and
150 : nothing
259 : particular
6 : to
2712 : interest
14 : me
24 : on


In [0]:
#tokenizer.word_counts
vocabulary_size = len(tokenizer.word_counts)
vocabulary_size

2717

In [0]:
import numpy as np

sequences = np.array(sequences)
sequences

array([[ 956,   14,  263, ..., 2712,   14,   24],
       [  14,  263,   51, ...,   14,   24,  957],
       [ 263,   51,  261, ...,   24,  957,    5],
       ...,
       [ 952,   12,  166, ...,  262,   53,    2],
       [  12,  166, 2711, ...,   53,    2, 2717],
       [ 166, 2711,    3, ...,    2, 2717,   26]])

1.   **Create the LSTM Based Model**
2.   **Split data into features and labels**
     1. **X features (First n words of sequence)**
     2. **Y lable (Next word after the sequence)**  
3. **Fit the Model**



In [0]:
from keras.utils import to_categorical

# Get all elements except the last one
sequences[:, :-1]

array([[ 956,   14,  263, ...,    6, 2712,   14],
       [  14,  263,   51, ..., 2712,   14,   24],
       [ 263,   51,  261, ...,   14,   24,  957],
       ...,
       [ 952,   12,  166, ...,   11,  262,   53],
       [  12,  166, 2711, ...,  262,   53,    2],
       [ 166, 2711,    3, ...,   53,    2, 2717]])

In [0]:
# Labels for each row
sequences[:, -1]

array([  24,  957,    5, ...,    2, 2717,   26])

In [0]:
X = sequences[:, :-1]
y = sequences[:, -1]
y = to_categorical(y, num_classes=vocabulary_size+1)

In [0]:
seq_len = X.shape[1]
X.shape

(11312, 25)

In [0]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding

In [0]:
def create_model(vocabulary_size, seq_len):
  model = Sequential()
  model.add(Embedding(vocabulary_size, seq_len, input_length=seq_len))
  model.add(LSTM(50,return_sequences=True))
  model.add(LSTM(50))
  model.add(Dense(50, activation='relu'))

  model.add(Dense(vocabulary_size, activation='softmax'))
  model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
  model.summary()
  return model

In [0]:
model = create_model(vocabulary_size+1, seq_len)

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 25, 25)            67950     
_________________________________________________________________
lstm_3 (LSTM)                (None, 25, 50)            15200     
_________________________________________________________________
lstm_4 (LSTM)                (None, 50)                20200     
_________________________________________________________________
dense_3 (Dense)              (None, 50)                2550      
_________________________________________________________________
dense_4 (Dense)              (None, 2718)              138618    
Total params: 244,518
Trainable params: 244,518
Non-trainable params: 0
_________________________________________________________________


In [0]:
from pickle import dump, load

model.fit(X, y, batch_size=128, epochs=2, verbose=1)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f2f713d47f0>

In [0]:
model.save('my_mobydick_model.h5')

In [0]:
dump(tokenizer, open('my_simpletokenizer', 'wb'))

1.   **Generate new text based on seed**



In [0]:
from keras.preprocessing.sequence import pad_sequences
def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):
  output_text = []
  input_text = seed_text

  for i in range(num_gen_words):
    encoded_text = tokenizer.texts_to_sequences([input_text])[0]
    pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')
    pred_word_ind = model.predict_classes(pad_encoded, verbose=0)[0]
    pred_word = tokenizer.index_word[pred_word_ind]
    input_text += ' ' + pred_word
    output_text.append(pred_word)

  return ' '.join(output_text)

In [0]:
import random
random.seed(101)
random_pick = random.randint(0, len(text_sequences))
random_seed_text = text_sequences[random_pick]
random_seed_text

['thought',
 'i',
 'to',
 'myself',
 'the',
 'man',
 "'s",
 'a',
 'human',
 'being',
 'just',
 'as',
 'i',
 'am',
 'he',
 'has',
 'just',
 'as',
 'much',
 'reason',
 'to',
 'fear',
 'me',
 'as',
 'i',
 'have']

In [0]:
seed_text = ' '.join(random_seed_text)
seed_text

"thought i to myself the man 's a human being just as i am he has just as much reason to fear me as i have"

In [0]:
generate_text(model, tokenizer, seq_len, seed_text=seed_text, num_gen_words=25)

'the the the the the the the the the the the the the the the the the the the the the the the the the'

In [0]:
from keras.models import load_model
model = load_model('/content/drive/My Drive/Colab Notebooks/Udemy/epochBIG.h5')