In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import string
import os
import tensorflow as tf
import json
from tensorflow.keras.preprocessing import sequence 
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import (LSTM, GRU, Embedding, Dense, Bidirectional, Dropout, BatchNormalization)
from tensorflow.keras.optimizers import Adam, RMSprop 
from tensorflow.keras.models import Model 
from tensorflow.keras import Input, layers 
from tensorflow.keras import optimizers 
from tensorflow.keras.applications.inception_v3 import preprocess_input
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences 
from tensorflow.keras.utils import to_categorical 

In [2]:
medium_data_path = 'medium_data.csv'
medium_data = pd.read_csv(medium_data_path)
medium_data.head()

Unnamed: 0,id,url,title,subtitle,image,claps,responses,reading_time,publication,date
0,1,https://towardsdatascience.com/a-beginners-gui...,A Beginner’s Guide to Word Embedding with Gens...,,1.png,850,8,8,Towards Data Science,2019-05-30
1,2,https://towardsdatascience.com/hands-on-graph-...,Hands-on Graph Neural Networks with PyTorch & ...,,2.png,1100,11,9,Towards Data Science,2019-05-30
2,3,https://towardsdatascience.com/how-to-use-ggpl...,How to Use ggplot2 in Python,A Grammar of Graphics for Python,3.png,767,1,5,Towards Data Science,2019-05-30
3,4,https://towardsdatascience.com/databricks-how-...,Databricks: How to Save Files in CSV on Your L...,When I work on Python projects dealing…,4.jpeg,354,0,4,Towards Data Science,2019-05-30
4,5,https://towardsdatascience.com/a-step-by-step-...,A Step-by-Step Implementation of Gradient Desc...,One example of building neural…,5.jpeg,211,3,4,Towards Data Science,2019-05-30


In [3]:
print(medium_data.shape[0])
print(medium_data.shape[1])

6508
10


In [4]:
medium_data['title']

0       A Beginner’s Guide to Word Embedding with Gens...
1       Hands-on Graph Neural Networks with PyTorch & ...
2                            How to Use ggplot2 in Python
3       Databricks: How to Save Files in CSV on Your L...
4       A Step-by-Step Implementation of Gradient Desc...
                              ...                        
6503    “We” vs “I” — How Should You Talk About Yourse...
6504                     How Donald Trump Markets Himself
6505        Content and Marketing Beyond Mass Consumption
6506    5 Questions All Copywriters Should Ask Clients...
6507               How To Write a Good Business Blog Post
Name: title, Length: 6508, dtype: object

In [5]:
medium_data['title'] = medium_data['title'].apply(lambda x: x.replace(u'\xa0',u' '))
medium_data['title'] = medium_data['title'].apply(lambda x: x.replace('\u200a',' '))

In [6]:
def clean_titles(dataset):
    train_data = []
    table = str.maketrans('', '', string.punctuation)
    for val in dataset:
        val = val.split()
        val = [word.lower() for word in val]
        val = [w.translate(table) for w in val]
        val = [word for word in val if len(word) > 1]
        val = [word for word in val if word.isalpha()]
        train_data.append(' '.join(val))
    return train_data

In [7]:
train_data = clean_titles(medium_data['title'])
print(train_data[0:5])

['guide to word embedding with gensim model', 'handson graph neural networks with pytorch pytorch geometric', 'how to use in python', 'databricks how to save files in csv on your local computer', 'stepbystep implementation of gradient descent and backpropagation']


In [8]:
word_count_threshold = 10
word_counts = {}
nsents = 0
for sent in train_data:
    nsents += 1
    for word in sent.split(' '):
        word_counts[word] = word_counts.get(word, 0) + 1

vocab = [word for word, count in word_counts.items() if count >= word_count_threshold]
print('Preprocessed words: %d -> %d' %(len(word_counts), len(vocab)))

word_to_idx = {word: i + 1 for i, word in enumerate(vocab)}
idx_to_word = {i + 1: word for i, word in enumerate(vocab)}

with open('word_to_idx.json', 'w', encoding='utf-8') as f:
    json.dump(word_to_idx, f, ensure_ascii=False)

with open('idx_to_word.json', 'w', encoding='utf-8') as f:
    json.dump(idx_to_word, f, ensure_ascii=False)

vocab_size = len(vocab) + 1
print('Vocabulary size:', vocab_size)

Preprocessed words: 7940 -> 661
Vocabulary size: 662


In [9]:
def max_length(sentences):
    maxlen = 0
    for sent in sentences:
        maxlen = max(maxlen, len(sent.split()))
    return maxlen
maxlen = max_length(train_data)
print('Max length of sentences:', maxlen)

Max length of sentences: 21


In [10]:
# Load Glove model
glove_dir = 'C:\\Users\\TIN\\PycharmProjects\\Onl2'
embeddings_index = {} 
f = open(os.path.join(glove_dir, 'glove.6B.200d.txt'), encoding="utf-8")

for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Found %s word vectors.' % len(embeddings_index))

print(embeddings_index['the'])

embedding_dim = 200

embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in word_to_idx.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
embedding_matrix.shape

Found 400000 word vectors.
[-7.1549e-02  9.3459e-02  2.3738e-02 -9.0339e-02  5.6123e-02  3.2547e-01
 -3.9796e-01 -9.2139e-02  6.1181e-02 -1.8950e-01  1.3061e-01  1.4349e-01
  1.1479e-02  3.8158e-01  5.4030e-01 -1.4088e-01  2.4315e-01  2.3036e-01
 -5.5339e-01  4.8154e-02  4.5662e-01  3.2338e+00  2.0199e-02  4.9019e-02
 -1.4132e-02  7.6017e-02 -1.1527e-01  2.0060e-01 -7.7657e-02  2.4328e-01
  1.6368e-01 -3.4118e-01 -6.6070e-02  1.0152e-01  3.8232e-02 -1.7668e-01
 -8.8153e-01 -3.3895e-01 -3.5481e-02 -5.5095e-01 -1.6899e-02 -4.3982e-01
  3.9004e-02  4.0447e-01 -2.5880e-01  6.4594e-01  2.6641e-01  2.8009e-01
 -2.4625e-02  6.3302e-01 -3.1700e-01  1.0271e-01  3.0886e-01  9.7792e-02
 -3.8227e-01  8.6552e-02  4.7075e-02  2.3511e-01 -3.2127e-01 -2.8538e-01
  1.6670e-01 -4.9707e-03 -6.2714e-01 -2.4904e-01  2.9713e-01  1.4379e-01
 -1.2325e-01 -5.8178e-02 -1.0290e-03 -8.2126e-02  3.6935e-01 -5.8442e-04
  3.4286e-01  2.8426e-01 -6.8599e-02  6.5747e-01 -2.9087e-02  1.6184e-01
  7.3672e-02 -3.0343e-01

(662, 200)

In [None]:
input_layer = Input(shape=(maxlen,))

node1 = Embedding(vocab_size, embedding_dim, mask_zero=True)(input_layer)
node1 = Dropout(0.2)(node1)
node1 = BatchNormalization()(node1)

node2 = Bidirectional(LSTM(256))(node1)

node3 = Dense(256, activation='relu')(node2)
output = Dense(vocab_size, activation='softmax')(node3)

model = Model(inputs=input_layer, outputs=output)
model.summary()

In [12]:
def data_generator(sequences, max_length, batch_size=32):
    x_batch = []
    y_batch = []
    sample_count = 0

    while True:
        for val in sequences:
            seq = [word_to_idx[w] for w in val.split(' ') if w in word_to_idx]
            if len(seq) > 1: 
                for i in range(1, len(seq)):
                    in_seq = seq[:i]
                    out_seq = seq[i]

                    in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                    out_seq = to_categorical(out_seq, num_classes=vocab_size)

                    x_batch.append(in_seq)
                    y_batch.append(out_seq)
                    sample_count += 1

                    if sample_count == batch_size:
                        yield np.array(x_batch), np.array(y_batch)
                        x_batch = []
                        y_batch = []
                        sample_count = 0

In [None]:
model.layers[1].set_weights([embedding_matrix])
model.layers[1].trainable = False

optimizer = Adam(learning_rate=0.0001)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

epochs = 50
number_pics_per_bath = 6
steps = len(train_data) // number_pics_per_bath

try:
    for i in range(epochs):
        generator = data_generator(train_data, maxlen, number_pics_per_bath)
        history = model.fit(
            generator,         
            epochs=1,         
            steps_per_epoch=steps,  
            verbose=1        
        )
    model.save('model_medium_V2.keras')
except Exception as e:
    print(f"Error during training: {str(e)}")

[1m1084/1084[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 39ms/step - accuracy: 0.0643 - loss: 6.1547
[1m1084/1084[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 36ms/step - accuracy: 0.1342 - loss: 4.9856
[1m1084/1084[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 35ms/step - accuracy: 0.1585 - loss: 4.7184
[1m1084/1084[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 36ms/step - accuracy: 0.1829 - loss: 4.5148
[1m1084/1084[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 31ms/step - accuracy: 0.2024 - loss: 4.3042
[1m1084/1084[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 30ms/step - accuracy: 0.2268 - loss: 4.1036
[1m1084/1084[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 32ms/step - accuracy: 0.2427 - loss: 3.8988
[1m1084/1084[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 39ms/step - accuracy: 0.2644 - loss: 3.6864
[1m1084/1084[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 35ms/step - accuracy: 0.295

In [14]:
def generate_desc(seed_text, next_words):
    in_text = seed_text
    
    for _ in range(next_words):
        sequence = [word_to_idx[word] for word in in_text.split() if word in word_to_idx]
        sequence = pad_sequences([sequence], maxlen=maxlen)[0]
        yhat = model.predict(np.array([sequence]), verbose=0)
        predicted_index = np.argmax(yhat)
        output_word = idx_to_word.get(predicted_index, "")
        if output_word:
            in_text += " " + output_word
    
    return in_text

In [16]:
seed_text = "Can you" 
next_words = 3  
generated_text = generate_desc(seed_text, next_words)
print(generated_text)

Can you may be there
