In [1]:
%tensorflow_version 2.3.2
import tensorflow as tf
import string
import requests
import sqlalchemy
import numpy as np
import pandas as pd

`%tensorflow_version` only switches the major version: 1.x or 2.x.
You set: `2.3.2`. This will be interpreted as: `2.x`.


TensorFlow 2.x selected.


In [2]:
#import data
db_name = "actual_news_data.db"
table_name = "News"

engine = sqlalchemy.create_engine("sqlite:///%s" % db_name, execution_options={"sqlite_raw_colnames": True})
df = pd.read_sql_table(table_name, engine)

In [3]:
df.head()

Unnamed: 0,index,Author,Title,Description,URL,Source,Country,Date
0,8,Catherine Shu,Bibit raises another growth round led by Sequo...,Four months after leading a $30 million growth...,https://techcrunch.com/2021/05/02/bibit-raises...,TechCrunch,us,2021-05-03T04:36:17+00:00
1,10,Kim Lyons,California appeals court finds Amazon responsi...,Illustration by Alex Castro / The Verge An app...,https://www.theverge.com/2021/5/1/22414185/cal...,The Verge,us,2021-05-01T16:59:00+00:00
2,24,Darrell Etherington,Firefly Aerospace raises $75M Series A at a $1...,Firefly Aerospace has raised a total of $175 m...,https://techcrunch.com/2021/05/04/firefly-aero...,TechCrunch,us,2021-05-04T13:37:45+00:00
3,25,Mike Butcher,London’s Stride VC raised second $138.6M seed ...,"Stride VC, a London-based seed investment fund...",https://techcrunch.com/2021/05/04/londons-stri...,TechCrunch,us,2021-05-04T15:38:05+00:00
4,35,Connie Loizos,Are we overestimating the ransomware threat?,"On Monday afternoon, the U.S. Justice Departme...",https://techcrunch.com/2021/06/08/are-we-overe...,TechCrunch,us,2021-06-08T08:15:35+00:00


In [4]:
data=df['Description'].to_list()

In [5]:
len(data)

586

In [6]:
data=" ".join(data)

In [7]:
def clean_text(doc):
  tokens=doc.split()
  table=str.maketrans('','',string.punctuation)
  tokens=[w.translate(table) for w in tokens]
  tokens=[word for word in tokens if word.isalpha()]
  tokens=[word.lower() for word in tokens]
  return tokens

In [8]:
tokens=clean_text(data)
print(tokens[:50])

['four', 'months', 'after', 'leading', 'a', 'million', 'growth', 'round', 'in', 'bibit', 'sequoia', 'capital', 'india', 'has', 'doubled', 'down', 'on', 'its', 'investment', 'in', 'the', 'indonesian', 'roboadvisor', 'app', 'bibit', 'announced', 'today', 'that', 'the', 'firm', 'led', 'a', 'new', 'million', 'growth', 'round', 'that', 'also', 'included', 'participation', 'from', 'prosus', 'ventures', 'tencent', 'harvard', 'management', 'company', 'and', 'returning', 'investors']


In [9]:
len(tokens)

36127

In [10]:
len(set(tokens))
##unique words

552

In [11]:
length=5+1
lines=[]

for i in range(length, len(tokens)):
  seq=tokens[i-length:i]
  line= ' '.join(seq)
  lines.append(line)
  
print(len(lines))

36121


In [12]:
lines[0]

'four months after leading a million'

In [13]:
## Build LSTM model and Prepare X and Y
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [14]:
tokenizer=Tokenizer()
tokenizer.fit_on_texts(lines)
sequences=tokenizer.texts_to_sequences(lines)

In [15]:
sequences=np.array(sequences)

In [16]:
X,y=sequences[:,:-1],sequences[:,-1]

In [17]:
vocab_size=len(tokenizer.word_index)+1
vocab_size

553

In [18]:
y=to_categorical(y,num_classes=vocab_size)


In [19]:
seq_length=X.shape[1]
seq_length

5

In [20]:
#LSTM Model

model=Sequential()
model.add(Embedding(vocab_size, 5,input_length=seq_length))
model.add(LSTM(50, return_sequences=True))
model.add(LSTM(500))
model.add(Dense(500,activation="relu"))
model.add(Dense(vocab_size,activation="softmax"))
model.summary()


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 5, 5)              2765      
_________________________________________________________________
lstm (LSTM)                  (None, 5, 50)             11200     
_________________________________________________________________
lstm_1 (LSTM)                (None, 500)               1102000   
_________________________________________________________________
dense (Dense)                (None, 500)               250500    
_________________________________________________________________
dense_1 (Dense)              (None, 553)               277053    
Total params: 1,643,518
Trainable params: 1,643,518
Non-trainable params: 0
_________________________________________________________________


In [21]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X,y,batch_size=64,epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7f62d018f490>

In [22]:
def generate_text_sequence(model,tokenizer,text_seq_length,seed_text, n_words):
  text=[]
  print(len(tokenizer.word_index))
  for _ in range(n_words):
    encoded=tokenizer.texts_to_sequences([seed_text])[0]
    print(encoded)
    encoded=pad_sequences([encoded],maxlen=text_seq_length, truncating='pre')

    # y_predic=model.predict_classes(encoded)
    y_predic=np.argmax(model.predict(encoded), axis=-1)

    predicted_word=''
    for word,index in tokenizer.word_index.items():
      if index==y_predic:
        predicted_word=word
        print(predicted_word)
        break
    seed_text=seed_text+" "+predicted_word
    text.append(predicted_word)
  return ' '.join(text)

In [23]:
output=generate_text_sequence(model,tokenizer,seq_length, "The doctors",10)

552
[1]
the
[1, 1]
the
[1, 1, 1]
which
[1, 1, 1, 15]
owns
[1, 1, 1, 15, 160]
of
[1, 1, 1, 15, 160, 4]
the
[1, 1, 1, 15, 160, 4, 1]
darkside
[1, 1, 1, 15, 160, 4, 1, 336]
of
[1, 1, 1, 15, 160, 4, 1, 336, 4]
rivian
[1, 1, 1, 15, 160, 4, 1, 336, 4, 63]
organization


In [24]:
output

'the the which owns of the darkside of rivian organization'

In [25]:
tf.__version__

'2.5.0'

In [26]:
import pickle

# saving
with open('textgen_tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [27]:
model.save("textgen_model.h5")

In [28]:
from google.colab import files
files.download('textgen_model.h5')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [29]:
files.download('textgen_tokenizer.pickle')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>