Workbook for Universal Sentence Encoder.

In [None]:
import pandas as pd

# Load the jokes dataset from a CSV file in the working directory into a DataFrame.
# The preview printed below shows two columns: `text` (the sentence) and `humor` (True/False label).
jokes_df = pd.read_csv('jokes_dataset.csv')

# Print the first 5 rows as a quick sanity check that the file parsed as expected
print(jokes_df.head())


                                                text  humor
0  Joe biden rules out 2020 bid: 'guys, i'm not r...  False
1  Watch: darvish gave hitter whiplash with slow ...  False
2  What do you call a turtle without its shell? d...   True
3      5 reasons the 2016 election feels so personal  False
4  Pasco police shot mexican migrant from behind,...  False


In [None]:
import nltk
from nltk.corpus import stopwords
import string
import re

# Download the NLTK stop-word corpus that preprocess_jokes reads via stopwords.words('english').
# The logged output shows this is a no-op when the corpus is already up to date.
nltk.download('stopwords')

# Define a function to preprocess the jokes
def preprocess_jokes(jokes_df, stop_words=None):
    """Add a lower-cased, stop-word-free ``clean_joke`` column to ``jokes_df``.

    Each entry of ``jokes_df['text']`` is split on whitespace. Every token is
    lower-cased and dropped when it is a stop word or consists solely of
    punctuation characters (e.g. ``"!!"`` or ``"..."``). The surviving tokens
    are re-joined with single spaces. Punctuation attached to a word (such as
    ``"shell?"``) is kept by this function.

    Args:
        jokes_df: DataFrame with a ``text`` column. It is modified in place
            (existing callers ignore the return value) and also returned.
        stop_words: Optional iterable of lower-case words to drop. When
            omitted, NLTK's English stop-word list is used, which requires
            the ``stopwords`` corpus to be downloaded already.

    Returns:
        The same ``jokes_df`` object, now carrying a ``clean_joke`` column.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    stop_words = set(stop_words)
    punctuation_chars = set(string.punctuation)

    def _is_punctuation(token):
        # True when every character is punctuation. A plain `token in string.punctuation`
        # would be a substring test and would miss tokens such as "!!" or "...".
        return all(char in punctuation_chars for char in token)

    def _clean(text):
        # Missing or non-string entries (e.g. NaN) become an empty string instead of
        # raising on .split().
        if not isinstance(text, str):
            return ''
        lowered_tokens = (word.lower() for word in text.split())
        return ' '.join(
            token for token in lowered_tokens
            if token not in stop_words and not _is_punctuation(token)
        )

    jokes_df['clean_joke'] = jokes_df['text'].apply(_clean)
    return jokes_df

# Define a function to remove punctuation
def remove_punctuation(text):
    """Return ``text`` with every punctuation character removed.

    Any character that is neither a word character nor whitespace is deleted.
    The underscore is deleted as well: the regex word class treats it as a
    word character, but it is punctuation (it is part of
    ``string.punctuation``), so it is matched explicitly. Whitespace is left
    untouched, so removing a standalone punctuation token can leave
    consecutive spaces behind.

    Args:
        text: The string to clean.

    Returns:
        A new string without punctuation characters.
    """
    # [^\w\s] matches every non-word, non-space character; "_" is listed separately
    # because \w includes it.
    pattern = r'[^\w\s]|_'

    # Replace every match with an empty string
    cleaned_text = re.sub(pattern, '', text)
    return cleaned_text


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:

# Build the `clean_joke` column; preprocess_jokes writes it onto jokes_df in place,
# so the return value is not needed here.
preprocess_jokes(jokes_df)

# Strip the remaining punctuation characters from every entry of the clean_joke column
jokes_df['clean_joke'] = jokes_df['clean_joke'].map(remove_punctuation)


In [None]:

# Keep only the rows whose `humor` label is True; the original index labels are preserved.
humorous_jokes_df = jokes_df.loc[jokes_df['humor'].eq(True)]


Universal Sentence Encoder:

In [None]:
import tensorflow as tf
import tensorflow_hub as hub

# Load the Universal Sentence Encoder (large, version 5) from TensorFlow Hub.
# The returned object is called directly on batches of jokes in the encoding loop further down.
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/5"
model = hub.load(module_url)


In [None]:
# Number of jokes sent through the encoder per call
BATCH_SIZE = 1000

# One array of embeddings per batch, appended in dataset order
joke_vectors_list = []

# Encode jokes in batches
for i in range(0, len(humorous_jokes_df), BATCH_SIZE):
    # Select rows by position with .iloc: humorous_jokes_df is a filtered frame, so its
    # index labels are not 0..n-1 and must not be confused with row positions.
    # .tolist() hands the encoder a plain list of strings.
    batch_jokes = humorous_jokes_df["clean_joke"].iloc[i:i + BATCH_SIZE].tolist()

    # Encode the batch of jokes into vectors
    batch_vectors = model(batch_jokes).numpy()

    # Append the batch of vectors to the list
    joke_vectors_list.append(batch_vectors)



In [None]:
import numpy as np

    
# Stack the per-batch embedding arrays row-wise into one 2-D array
joke_vectors_array = np.concatenate(joke_vectors_list, axis=0)

# Sanity check: report (number of encoded jokes, embedding width)
print(joke_vectors_array.shape)

(100000, 512)


In [None]:
# Preview the first ten embedding rows (NumPy abbreviates the printout with "...")
print(joke_vectors_array[:10])


[[-0.10272926 -0.01983313 -0.00284022 ... -0.00134287 -0.0245461
   0.02391309]
 [ 0.00263279  0.08414213  0.02961861 ...  0.04268469  0.07342681
   0.04781407]
 [ 0.02592236 -0.01635438 -0.02989341 ... -0.01631698 -0.07111952
   0.03338585]
 ...
 [ 0.04659941 -0.0460685  -0.00383115 ...  0.03140274  0.04096763
   0.05693741]
 [ 0.06997935 -0.06368115 -0.0437464  ... -0.07381413  0.01135412
  -0.00506185]
 [ 0.01967495 -0.00077027  0.06616988 ... -0.01556989  0.04132042
   0.03782702]]


Save the `joke_vectors_array` embeddings so the Streamlit app can load them later.

In [None]:
# Save the embedding array to joke_vectors.npy (NumPy binary format) in the current
# working directory so it can be reloaded without re-running the encoder.
np.save("joke_vectors.npy", joke_vectors_array)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

batch_size = 1000

# NOTE: this is a block-wise similarity, NOT an all-pairs matrix. Each joke is compared only
# with the jokes of its own batch, so row r holds the cosine similarities between joke r and
# the jokes at positions (r // batch_size) * batch_size + 0 .. batch_size - 1.
# The result has shape (n_jokes, batch_size). A true all-pairs matrix would need
# n_jokes ** 2 entries (10**10 for the 100,000 jokes encoded above).
similarity_blocks = []

for i in range(0, len(joke_vectors_array), batch_size):
    batch_vectors = joke_vectors_array[i:i + batch_size]
    block = cosine_similarity(batch_vectors)

    # A short final batch yields a (k, k) block with k < batch_size. Pad only its COLUMNS with
    # zeros so it lines up with the other blocks. Padding the input vectors instead would
    # append rows that do not correspond to any joke.
    missing_columns = batch_size - block.shape[1]
    if missing_columns > 0:
        block = np.pad(block, ((0, 0), (0, missing_columns)), mode="constant")

    similarity_blocks.append(block)

# Concatenate once at the end rather than re-copying the growing matrix on every iteration.
if similarity_blocks:
    similarity_matrix = np.concatenate(similarity_blocks, axis=0)
else:
    # No vectors at all: keep a well-formed empty matrix so the shape check below still works.
    similarity_matrix = np.empty((0, batch_size))

# Print the shape of the similarity matrix
print(similarity_matrix.shape)


(100000, 1000)


Save the `similarity_matrix` array so the Streamlit app can load it later.

In [None]:
  # Save the array as an npy file
# NOTE(review): "simalarity" is a misspelling of "similarity", but the Drive-copy cell later in
# this notebook reads this exact filename, so rename both together or not at all.
np.save("simalarity_matrix.npy", similarity_matrix)

In [None]:
import tensorflow_hub as hub

# Reuse the Universal Sentence Encoder that was already loaded as `model` earlier in this
# notebook: both cells point at the same TensorFlow Hub URL, so loading it a second time only
# costs extra time and memory. Aliasing also guarantees that the query is embedded by the very
# same model object that produced joke_vectors_array.
try:
    embed = model
except NameError:
    # `model` is not defined in this kernel session, so load the encoder from TensorFlow Hub
    embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-large/5")

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so the saved arrays can be copied into it below
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Switch the working directory to the project folder on the mounted Drive.
# NOTE(review): this path spells the folder 'My Drive' while the copy cells below use
# 'MyDrive' — confirm both resolve to the same location.
%cd '/content/drive/My Drive/Jokes-Capstone'


/content/drive/My Drive/Jokes-Capstone


In [None]:
import shutil
# Copy the saved joke embeddings from /content to the Jokes-Capstone folder on Drive.
# As the last expression of the cell, the destination path returned by shutil.copy is displayed.
shutil.copy('/content/joke_vectors.npy', '/content/drive/MyDrive/Jokes-Capstone/joke_vectors.npy')


'/content/drive/MyDrive/Jokes-Capstone/joke_vectors.npy'

In [None]:
import shutil
# Copy the saved similarity matrix from /content to the Jokes-Capstone folder on Drive.
# The misspelled "simalarity" matches the filename written by the np.save cell above.
shutil.copy('/content/simalarity_matrix.npy', '/content/drive/MyDrive/Jokes-Capstone/simalarity_matrix.npy')


'/content/drive/MyDrive/Jokes-Capstone/simalarity_matrix.npy'

In [None]:
# Prompt the user for a free-text query. This cell only reads the text; it is encoded into a
# vector with the Universal Sentence Encoder in a following cell.
user_input = input("Enter a sentence: ")


Enter a sentence: Dr. Seuss cat in the hat


In [None]:
# Embed the query: the encoder returns one row per input string, so take the single row.
user_input_vector = embed([user_input]).numpy()[0]

# Score the query (viewed as a one-row matrix) against every joke embedding;
# the result has one row with one score per joke.
similarity_scores = cosine_similarity(user_input_vector[None, :], joke_vectors_array)

In [None]:
# Get the number of jokes to recommend from the user (int() raises ValueError on non-numeric text)
num_jokes = int(input("How many jokes would you like to see? "))

# A negative count would be interpreted by the `[:, :num_jokes]` slice in the top-n selection
# as "everything except the last k", silently returning almost the whole dataset, so reject it.
if num_jokes < 0:
    raise ValueError(f"Number of jokes must be non-negative, got {num_jokes}")

How many jokes would you like to see? 5


In [None]:
# Rank joke positions from most to least similar: argsort sorts ascending, so reverse each row.
_descending_order = np.argsort(similarity_scores, axis=1)[:, ::-1]

# Keep the first num_jokes positions and flatten to a 1-D array of row positions
top_indices = _descending_order[:, :num_jokes].reshape(-1)

In [None]:
# Look up each selected joke by row position; positions in joke_vectors_array line up with
# row positions in humorous_jokes_df because the vectors were encoded in that order.
top_jokes = []
for row_position in top_indices:
    top_jokes.append(humorous_jokes_df.iloc[row_position])

In [None]:
# Display the selected rows (original text, humor label and cleaned text) for inspection
top_jokes

[text          What did dr. seuss call the book he wrote abou...
 humor                                                      True
 clean_joke          dr seuss call book wrote star wars cat atat
 Name: 151470, dtype: object,
 text          What was schrodinger's favorite childhood book...
 humor                                                      True
 clean_joke    schrodingers favorite childhood book cat box d...
 Name: 42771, dtype: object,
 text          What is dr. seuss' favorite play? green eggs a...
 humor                                                      True
 clean_joke             dr seuss favorite play green eggs hamlet
 Name: 87722, dtype: object,
 text          Did you read dr seuss as a kid because green e...
 humor                                                      True
 clean_joke                    read dr seuss kid green eggs damn
 Name: 150734, dtype: object,
 text          What do you call a magician in a dr. seuss boo...
 humor                               

In [None]:
# Show the recommended jokes as a numbered list, most similar first
print(f"Top {num_jokes} jokes:")
for rank, joke in enumerate(top_jokes, start=1):
    print(f"{rank}. {joke['text']}")


Top 5 jokes:
1. What did dr. seuss call the book he wrote about star wars? the cat in the at-at
2. What was schrodinger's favorite childhood book? the cat in the box by dr. seuss
3. What is dr. seuss' favorite play? green eggs and hamlet
4. Did you read dr seuss as a kid because green eggs and damn
5. What do you call a magician in a dr. seuss book? who-dini
