In [1]:
#Basic Libraries to get started with the eco-system
import pandas as pd
import string
import numpy as np
import json

In [2]:
from keras.preprocessing.sequence import pad_sequences #To pad sequences to same length
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping # Stop training when a monitored metric has stopped improving.

import tensorflow as tf
import keras.utils as ku
from keras.models import Sequential
from numpy.random import seed

tf.random.set_seed(2)
seed(1)

In [3]:
!git clone https://github.com/ehsanayaz/Title-Generator-with-LSTM

Cloning into 'Title-Generator-with-LSTM'...
remote: Enumerating objects: 9, done.[K
remote: Counting objects: 100% (9/9), done.[K
remote: Compressing objects: 100% (5/5), done.[K
remote: Total 9 (delta 3), reused 9 (delta 3), pack-reused 0[K
Unpacking objects: 100% (9/9), done.


In [4]:
df1 = pd.read_csv('/content/Title-Generator-with-LSTM/Data/USvideos.csv')
df2 = pd.read_csv('/content/Title-Generator-with-LSTM/Data/CAvideos.csv')
df3 = pd.read_csv('/content/Title-Generator-with-LSTM/Data/GBvideos.csv')

In [5]:
#load the datasets containing the category names
data1 = json.load(open('/content/Title-Generator-with-LSTM/Data/US_category_id.json'))
data2 = json.load(open('/content/Title-Generator-with-LSTM/Data/CA_category_id.json'))
data3 = json.load(open('/content/Title-Generator-with-LSTM/Data/GB_category_id.json'))

In [6]:
#Helper function to extract categories 
def category_extractor(data):
    i_d = [data['items'][i]['id'] for i in range(len(data['items']))]
    title = [data['items'][i]['snippet']["title"] for i in range(len(data['items']))]
    i_d = list(map(int, i_d))
    category = zip(i_d, title)
    category = dict(category)
    return category

In [7]:
#Calling the helper function
df1['category_title'] = df1['category_id'].map(category_extractor(data1))
df2['category_title'] = df2['category_id'].map(category_extractor(data2))
df3['category_title'] = df3['category_id'].map(category_extractor(data3))

In [8]:
#Dataframe Preparation 
df = pd.concat([df1, df2, df3], ignore_index=True) #Concatenating all 3 dfs
df = df.drop_duplicates('video_id') #'Video_id' has no contextual value for our case

df

Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description,category_title
0,2kyS6SvSYSE,17.14.11,WE WANT TO TALK ABOUT OUR MARRIAGE,CaseyNeistat,22,2017-11-13T17:13:01.000Z,SHANtell martin,748374,57527,2966,15954,https://i.ytimg.com/vi/2kyS6SvSYSE/default.jpg,False,False,False,SHANTELL'S CHANNEL - https://www.youtube.com/s...,People & Blogs
1,1ZAPwfrtAFY,17.14.11,The Trump Presidency: Last Week Tonight with J...,LastWeekTonight,24,2017-11-13T07:30:00.000Z,"last week tonight trump presidency|""last week ...",2418783,97185,6146,12703,https://i.ytimg.com/vi/1ZAPwfrtAFY/default.jpg,False,False,False,"One year after the presidential election, John...",Entertainment
2,5qpjK5DgCt4,17.14.11,"Racist Superman | Rudy Mancuso, King Bach & Le...",Rudy Mancuso,23,2017-11-12T19:05:24.000Z,"racist superman|""rudy""|""mancuso""|""king""|""bach""...",3191434,146033,5339,8181,https://i.ytimg.com/vi/5qpjK5DgCt4/default.jpg,False,False,False,WATCH MY PREVIOUS VIDEO ▶ \n\nSUBSCRIBE ► http...,Comedy
3,puqaWrEC7tY,17.14.11,Nickelback Lyrics: Real or Fake?,Good Mythical Morning,24,2017-11-13T11:00:04.000Z,"rhett and link|""gmm""|""good mythical morning""|""...",343168,10172,666,2146,https://i.ytimg.com/vi/puqaWrEC7tY/default.jpg,False,False,False,Today we find out if Link is a Nickelback amat...,Entertainment
4,d380meD0W0M,17.14.11,I Dare You: GOING BALD!?,nigahiga,24,2017-11-12T18:01:41.000Z,"ryan|""higa""|""higatv""|""nigahiga""|""i dare you""|""...",2095731,132235,1989,17518,https://i.ytimg.com/vi/d380meD0W0M/default.jpg,False,False,False,I know it's been a while since we did this sho...,Entertainment
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120428,9iM7W1Dvl6Q,18.13.06,GLOW: Season 2 | Main Trailer [HD] | Netflix,Netflix,24,2018-06-11T14:30:11.000Z,"Netflix|""Trailer""|""movies""|""streaming""|""movies...",126958,2690,49,185,https://i.ytimg.com/vi/9iM7W1Dvl6Q/default.jpg,False,False,False,GLOW big or GLOW home. The girls are back in t...,Entertainment
120589,g4txv3A9ICQ,18.14.06,5 Seconds of Summer - No Roots (Alice Merton C...,BBCRadio1VEVO,10,2018-06-12T15:50:25.000Z,"5SOS|""5 Seconds of Summer""|""No Roots""|""Cover""|...",92541,12867,39,986,https://i.ytimg.com/vi/g4txv3A9ICQ/default.jpg,False,False,False,5SOS cover Alice Merton's No Roots in the BBC ...,Music
120593,YQJmvXamKYg,18.14.06,Conway: People are bending to the will of Pres...,Fox News,25,2018-06-13T12:56:49.000Z,"Fox News Channel|""FNC""|""Fox News""|""News""|""Late...",99048,2231,151,1294,https://i.ytimg.com/vi/YQJmvXamKYg/default.jpg,False,False,False,The senior counselor to the president talks di...,News & Politics
120623,BOhylL90UVQ,18.14.06,The Chainsmokers - Somebody ft. Drew Love (Ror...,The Chainsmokers,10,2018-06-09T16:00:00.000Z,"The Chainsmokers Somebody|""Chainsmokers Somebo...",402345,32126,427,1895,https://i.ytimg.com/vi/BOhylL90UVQ/default.jpg,False,False,False,The Chainsmokers - Somebody ft. Drew Love (A R...,Music


In [9]:
entertainment = df[df['category_title']=='Entertainment']['title']  #Selecting tittle of videos in 'Entertainment' category only
entertainment #Before

1         The Trump Presidency: Last Week Tonight with J...
3                          Nickelback Lyrics: Real or Fake?
4                                  I Dare You: GOING BALD!?
6                 Roy Moore & Jeff Sessions Cold Open - SNL
11        (SPOILERS) 'Shiva Saves the Day' Talked About ...
                                ...                        
119803    [SHINee - Good Evening] Comeback Stage | M COU...
119910    JUSTICE LEAGUE Is Better Than Infinity Wars | ...
119947    Diddy & King Combs on The Four, Rap Beef, NFL ...
120425    Hilary Duff Is Having a Baby Girl and Her Son ...
120428         GLOW: Season 2 | Main Trailer [HD] | Netflix
Name: title, Length: 9730, dtype: object

In [10]:
entertainment = entertainment.tolist() 
entertainment #After

['The Trump Presidency: Last Week Tonight with John Oliver (HBO)',
 'Nickelback Lyrics: Real or Fake?',
 'I Dare You: GOING BALD!?',
 'Roy Moore & Jeff Sessions Cold Open - SNL',
 "(SPOILERS) 'Shiva Saves the Day' Talked About Scene Ep. 804 | The Walking Dead",
 'SPAGHETTI BURRITO VS SPAGHETTI BURRITO',
 'Amazon Christmas Advert 2017 - Toys & Games',
 "What's Inside a Detectives Car?",
 'People are Awesome & The Pet Collective present Pets are Awesome!',
 'ELDERS REACT TO iPHONE X (Facial Recognition, Animojis)',
 'Will It Watermarble?! Sister Edition | Watermarbling 9 random objects in nail polish!',
 'Jason Momoa Wows Hugh Grant With Some Dothraki | The Graham Norton Show',
 "Daddy's Home 2 - Movie Review",
 '#VeteransDay: Thank You for Everything',
 'Batman: Gotham by Gaslight - Exclusive Trailer (2018)',
 'Watch Norman Reedus Come Face to Face with his ‘Walking Dead’ Double',
 'Reacting to Running a Half Marathon | MEGANBYTES EP. 101',
 'Justice League Reactions, Should Disney Buy 

In [11]:
#Helper function to clean data from punctuations
def clean_text(text):
    text = ''.join(e for e in text if e not in string.punctuation).lower()
    return text

In [12]:
corpus = [clean_text(i) for i in entertainment]
corpus #Cleaned title 

['the trump presidency last week tonight with john oliver hbo',
 'nickelback lyrics real or fake',
 'i dare you going bald',
 'roy moore  jeff sessions cold open  snl',
 'spoilers shiva saves the day talked about scene ep 804  the walking dead',
 'spaghetti burrito vs spaghetti burrito',
 'amazon christmas advert 2017  toys  games',
 'whats inside a detectives car',
 'people are awesome  the pet collective present pets are awesome',
 'elders react to iphone x facial recognition animojis',
 'will it watermarble sister edition  watermarbling 9 random objects in nail polish',
 'jason momoa wows hugh grant with some dothraki  the graham norton show',
 'daddys home 2  movie review',
 'veteransday thank you for everything',
 'batman gotham by gaslight  exclusive trailer 2018',
 'watch norman reedus come face to face with his ‘walking dead’ double',
 'reacting to running a half marathon  meganbytes ep 101',
 'justice league reactions should disney buy netflix  the john campea show',
 'rosie o

In [13]:
#Helper function to:
# 1) Tokenize words in dataset 
# 2) Convert data to seqence of tokens  
tokenizer = Tokenizer()

def get_sequence_of_tokens(corpus):
    tokenizer.fit_on_texts(corpus) #Each word in 'corpus' with be assigned an integer (Integer Encoding) (E.g. This:1, is:2,a:3, program:4)
    total_words = len(tokenizer.word_index) + 1     
    
    #Converting data to Seqence of tokens 
    input_sequences = []

    for line in corpus:
      token_list = tokenizer.texts_to_sequences([line])[0] #(E.g. [12,34,45,565,20]) #Sequence of tokens
      for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)  #N-gram: a sequence of N words 
    return input_sequences, total_words # All N-grame sequences and total number of words (around:16449) 


inp_sequences, total_words = get_sequence_of_tokens(corpus)

In [14]:
get_sequence_of_tokens(corpus)

([[1, 92],
  [1, 92, 4286],
  [1, 92, 4286, 74],
  [1, 92, 4286, 74, 373],
  [1, 92, 4286, 74, 373, 1396],
  [1, 92, 4286, 74, 373, 1396, 11],
  [1, 92, 4286, 74, 373, 1396, 11, 140],
  [1, 92, 4286, 74, 373, 1396, 11, 140, 1156],
  [1, 92, 4286, 74, 373, 1396, 11, 140, 1156, 1713],
  [6592, 1526],
  [6592, 1526, 144],
  [6592, 1526, 144, 95],
  [6592, 1526, 144, 95, 411],
  [35, 1918],
  [35, 1918, 27],
  [35, 1918, 27, 797],
  [35, 1918, 27, 797, 6593],
  [1157, 1056],
  [1157, 1056, 2593],
  [1157, 1056, 2593, 6594],
  [1157, 1056, 2593, 6594, 434],
  [1157, 1056, 2593, 6594, 434, 453],
  [1157, 1056, 2593, 6594, 434, 453, 126],
  [551, 3213],
  [551, 3213, 2210],
  [551, 3213, 2210, 1],
  [551, 3213, 2210, 1, 83],
  [551, 3213, 2210, 1, 83, 4287],
  [551, 3213, 2210, 1, 83, 4287, 62],
  [551, 3213, 2210, 1, 83, 4287, 62, 359],
  [551, 3213, 2210, 1, 83, 4287, 62, 359, 14],
  [551, 3213, 2210, 1, 83, 4287, 62, 359, 14, 4288],
  [551, 3213, 2210, 1, 83, 4287, 62, 359, 14, 4288, 1],
 

In [15]:
from tensorflow.keras.utils import to_categorical

def generate_padded_sequences(input_sequences):
    max_sequence_len = max([len(x) for x in input_sequences]) #Length of largest sequence  #27
    input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')) #Padding sequences (from the start) to make them of same length


    predictors, label = input_sequences[:,:-1],input_sequences[:,-1] #Separating predictors and target label
    label = to_categorical(label, num_classes=total_words)
    return predictors, label, max_sequence_len



In [16]:
predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)

In [17]:
def create_model(max_sequence_len, total_words):
    input_len = max_sequence_len - 1
    model = Sequential()
    
    # Add Input Embedding Layer
    model.add(Embedding(total_words, 10, input_length=input_len))
    
    # Add Hidden Layer 1 - LSTM Layer
    model.add(LSTM(100))
    model.add(Dropout(0.1))
    
    # Add Output Layer
    model.add(Dense(total_words, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam')
    
    return model

model = create_model(max_sequence_len, total_words)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 26, 10)            164490    
_________________________________________________________________
lstm (LSTM)                  (None, 100)               44400     
_________________________________________________________________
dropout (Dropout)            (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 16449)             1661349   
Total params: 1,870,239
Trainable params: 1,870,239
Non-trainable params: 0
_________________________________________________________________


In [31]:
model.fit(predictors, label, epochs=20, verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f4494c89e10>

In [56]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    for i in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]  #(E.g. [12,34,45,565,20]) #Sequence of tokens (Same as before)
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre') #Padding
        predicted = model.predict_classes(token_list, verbose=2)


        output_word = ""
        for word,index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " "+output_word
    return seed_text.title()

In [70]:
print(generate_text("Pakistan", 5, model, max_sequence_len))

1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
Pakistan Drama Mere Bewafa Episode 13


