# Data Preparation

## Setup

In [30]:
# Install Library
!pip install datasets
!pip install annoy



In [None]:
# API Keys
# Secrets are never written into the notebook: a key already present in the
# environment is left untouched, otherwise it is asked for without echo.
import os
from getpass import getpass


def _require_secret(env_name, hint):
  """Make sure `os.environ[env_name]` is set.

  Args:
    env_name: name of the environment variable that holds the secret.
    hint: short human-readable description shown in the prompt.

  A non-empty value that is already set is kept; otherwise the user is
  prompted (input hidden) and the answer is stored in `os.environ`.
  """
  if not os.environ.get(env_name):
    os.environ[env_name] = getpass(f"{env_name} ({hint}): ")


# obtain Hugging Face API Key from https://huggingface.co/settings/tokens
_require_secret("__Hugging-Face-Read", "Hugging Face read token")
_require_secret("__Hugging-Face-Write", "Hugging Face write token")

# obtain GOOGLE AI API KEY from https://makersuite.google.com/
_require_secret("GOOGLE_AI_API_KEY", "Google AI API key")

In [32]:
# clone joke dataset
!git clone https://github.com/taivop/joke-dataset.git
!git clone https://github.com/orionw/rJokesData.git

fatal: destination path 'joke-dataset' already exists and is not an empty directory.
fatal: destination path 'rJokesData' already exists and is not an empty directory.


## Loading Data to a Dataframe

In [33]:
import os
import pandas as pd
import numpy as np

In [34]:
import gzip
import shutil


def gunzip_keep(gz_path, out_path):
  """Decompress `gz_path` into `out_path`, keeping the archive (like `gzip -dk`).

  Args:
    gz_path: path of the gzip archive to read.
    out_path: path of the decompressed file to create.

  Returns:
    True when `out_path` was written; False when `out_path` already exists
    or `gz_path` does not exist (nothing is done in either case).

  The data is first written to a sibling ".part" file and renamed at the
  end, so an interrupted run never leaves a truncated `out_path` that the
  existence check would accept on the next run.
  """
  if os.path.exists(out_path) or not os.path.exists(gz_path):
    return False
  tmp_path = out_path + ".part"
  with gzip.open(gz_path, "rb") as src, open(tmp_path, "wb") as dst:
    shutil.copyfileobj(src, dst)
  os.replace(tmp_path, out_path)
  return True


# Warn (without stopping) when a dataset checkout is missing from the working directory.
DIRS = os.listdir()
if 'joke-dataset' not in DIRS:
  print('joke-dataset not clone yet; `git clone https://github.com/taivop/joke-dataset.git`')
if 'rJokesData' not in DIRS:
  print('rJokesData not clone yet; `git clone https://github.com/orionw/rJokesData.git`')

# Pure-Python extraction: works with paths containing spaces and on systems without a gzip binary.
EXTRACT_rJokesData_PATH = os.path.join(os.getcwd(),"rJokesData/data/fullrjokes.json.gz")
_extracted = gunzip_keep(
    EXTRACT_rJokesData_PATH,
    os.path.join(os.getcwd(),"rJokesData/data/fullrjokes.json"))

In [35]:
# Load the two raw joke dumps into pandas DataFrames.
JOKE1_PATH, JOKE2_PATH = (
    os.path.join(os.getcwd(), relative_path)
    for relative_path in (
        "joke-dataset/reddit_jokes.json",
        "rJokesData/data/fullrjokes.json",
    )
)

# The first file is read as a single JSON document, the second with lines=True.
joke1 = pd.read_json(JOKE1_PATH)
joke2 = pd.read_json(JOKE2_PATH, lines = True)

## Merge in same format

In [36]:
# Bring the second dataset to the column layout of the first one:
# `ups` becomes `score`, `selftext` becomes `body`, and the columns that have
# no counterpart in the first dataset are discarded.
reddit_part = joke1.copy()
rjokes_part = (
    joke2
    .assign(score=joke2['ups'], body=joke2['selftext'])
    .drop(columns=['downs', 'ups', 'selftext', 'name', 'created_utc'])
)

# Stack both parts under a fresh 0..n-1 index.
df_joke_data = pd.concat([reddit_part, rjokes_part], ignore_index=True)

del reddit_part, rjokes_part

df_joke_data

Unnamed: 0,body,id,score,title
0,"Now I have to say ""Leroy can you please paint ...",5tz52q,1,I hate how you cant even say black paint anymore
1,Pizza doesn't scream when you put it in the ov...,5tz4dd,0,What's the difference between a Jew in Nazi Ge...
2,...and being there really helped me learn abou...,5tz319,0,I recently went to America....
3,A Sunday school teacher is concerned that his ...,5tz2wj,1,"Brian raises his hand and says, “He’s in Heaven.”"
4,He got caught trying to sell the two books to ...,5tz1pc,0,You hear about the University book store worke...
...,...,...,...,...
1259476,[removed],einfhp,0,If a lesbian has 15 cupcakes
1259477,[deleted],eingcl,1,A doctor walks into his patients’ house to see...
1259478,It was an ether/oar situation,einj5y,22,Before my surgery my anaesthetist offered to k...
1259479,[deleted],einjvp,1,After my wife was murdered...


In [37]:
# Release the raw frames; `df_joke_data` was built from copies of them in the previous cell.
del joke1, joke2

## Filter Null Data

In [38]:
# Drop `[removed]` and `[deleted]` and `NaN`
# A row is kept only when both its body and its title contain more than one
# whitespace-separated word. `.str.split().str.len()` yields NaN for missing
# text and `NaN > 1` is False, so missing values are filtered out here
# instead of raising a TypeError inside `len()`.
body_word_count = df_joke_data["body"].str.split().str.len()
title_word_count = df_joke_data["title"].str.split().str.len()

# put back in main df
df_joke_data = df_joke_data[(body_word_count > 1) & (title_word_count > 1)].copy()

del body_word_count, title_word_count

df_joke_data

Unnamed: 0,body,id,score,title
0,"Now I have to say ""Leroy can you please paint ...",5tz52q,1,I hate how you cant even say black paint anymore
1,Pizza doesn't scream when you put it in the ov...,5tz4dd,0,What's the difference between a Jew in Nazi Ge...
2,...and being there really helped me learn abou...,5tz319,0,I recently went to America....
3,A Sunday school teacher is concerned that his ...,5tz2wj,1,"Brian raises his hand and says, “He’s in Heaven.”"
4,He got caught trying to sell the two books to ...,5tz1pc,0,You hear about the University book store worke...
...,...,...,...,...
1259470,quatro sinko,ein9fl,4,What do you call four Mexicans in a leaky rowb...
1259473,"I named it “Not all Heroes, We’re Crepes”",einaw4,7,Just opened a Sandwich & Pancakes restaurant!
1259475,Because checkers can't be boozers.,einex7,3,Why are there no alcoholic cashiers?
1259478,It was an ether/oar situation,einj5y,22,Before my surgery my anaesthetist offered to k...


## Deduplication

### Merge Title and Body

In [39]:
import string

import unicodedata


def _end_title(title):
  """Return `title` terminated so that the body can be appended right after it.

  Trailing whitespace is removed first. A '.' is appended unless the title
  already ends with punctuation: either an ASCII character from
  `string.punctuation` or any Unicode punctuation character (for example
  a closing curly quote or an ellipsis). Exactly one separating space is
  added at the end. An empty title yields a single space.
  """
  title = title.rstrip()
  if title:
    last = title[-1]
    if last not in string.punctuation and not unicodedata.category(last).startswith('P'):
      title += '.'
  return title + ' '


# The `title` column itself is left untouched; only `title_body` is added.
df_joke_data['title_body'] = df_joke_data['title'].map(_end_title) + df_joke_data['body']

df_joke_data

Unnamed: 0,body,id,score,title,title_body
0,"Now I have to say ""Leroy can you please paint ...",5tz52q,1,I hate how you cant even say black paint anymore,I hate how you cant even say black paint anymo...
1,Pizza doesn't scream when you put it in the ov...,5tz4dd,0,What's the difference between a Jew in Nazi Ge...,What's the difference between a Jew in Nazi Ge...
2,...and being there really helped me learn abou...,5tz319,0,I recently went to America....,I recently went to America.... ...and being th...
3,A Sunday school teacher is concerned that his ...,5tz2wj,1,"Brian raises his hand and says, “He’s in Heaven.”","Brian raises his hand and says, “He’s in Heave..."
4,He got caught trying to sell the two books to ...,5tz1pc,0,You hear about the University book store worke...,You hear about the University book store worke...
...,...,...,...,...,...
1259470,quatro sinko,ein9fl,4,What do you call four Mexicans in a leaky rowb...,What do you call four Mexicans in a leaky rowb...
1259473,"I named it “Not all Heroes, We’re Crepes”",einaw4,7,Just opened a Sandwich & Pancakes restaurant!,Just opened a Sandwich & Pancakes restaurant! ...
1259475,Because checkers can't be boozers.,einex7,3,Why are there no alcoholic cashiers?,Why are there no alcoholic cashiers? Because c...
1259478,It was an ether/oar situation,einj5y,22,Before my surgery my anaesthetist offered to k...,Before my surgery my anaesthetist offered to k...


### Exact Deduplication

In [40]:
# exact deduplication
# Rows that repeat an earlier row's (score, title_body) pair are removed;
# the first occurrence of every pair is the one that stays.
is_repeat = df_joke_data.duplicated(subset=['score', 'title_body'], keep='first')

df_joke_data = df_joke_data.loc[~is_repeat].copy()

del is_repeat

df_joke_data

Unnamed: 0,body,id,score,title,title_body
0,"Now I have to say ""Leroy can you please paint ...",5tz52q,1,I hate how you cant even say black paint anymore,I hate how you cant even say black paint anymo...
1,Pizza doesn't scream when you put it in the ov...,5tz4dd,0,What's the difference between a Jew in Nazi Ge...,What's the difference between a Jew in Nazi Ge...
2,...and being there really helped me learn abou...,5tz319,0,I recently went to America....,I recently went to America.... ...and being th...
3,A Sunday school teacher is concerned that his ...,5tz2wj,1,"Brian raises his hand and says, “He’s in Heaven.”","Brian raises his hand and says, “He’s in Heave..."
4,He got caught trying to sell the two books to ...,5tz1pc,0,You hear about the University book store worke...,You hear about the University book store worke...
...,...,...,...,...,...
1259470,quatro sinko,ein9fl,4,What do you call four Mexicans in a leaky rowb...,What do you call four Mexicans in a leaky rowb...
1259473,"I named it “Not all Heroes, We’re Crepes”",einaw4,7,Just opened a Sandwich & Pancakes restaurant!,Just opened a Sandwich & Pancakes restaurant! ...
1259475,Because checkers can't be boozers.,einex7,3,Why are there no alcoholic cashiers?,Why are there no alcoholic cashiers? Because c...
1259478,It was an ether/oar situation,einj5y,22,Before my surgery my anaesthetist offered to k...,Before my surgery my anaesthetist offered to k...


### Vector Deduplication

In [13]:
import tensorflow_hub as hub
# import torch
import tensorflow as tf

# TF-Hub handle of the sentence encoder to load. The trailing `#@param` comment lists
# the two handles offered for selection (presumably a Colab form field — confirm).
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4" #@param ["https://tfhub.dev/google/universal-sentence-encoder/4", "https://tfhub.dev/google/universal-sentence-encoder-large/5"]
# `model` is the object that `embed()` calls in the next cell.
model = hub.load(module_url)
print ("module %s loaded" % module_url)

module https://tfhub.dev/google/universal-sentence-encoder/4 loaded


In [14]:
EMBED_BATCH_SIZE = 256  # texts per encoder call; lower it if the encoder runs out of memory


def embed(input):
  """Return the output of the loaded encoder `model` for `input` (a list of strings)."""
  return model(input)


def embed_in_batches(texts, batch_size=EMBED_BATCH_SIZE):
  """Embed `texts` with `embed`, sending `batch_size` strings per call.

  Calling the encoder once per row is dominated by per-call overhead;
  batching keeps the result per text but needs far fewer calls.

  Args:
    texts: iterable of strings.
    batch_size: number of strings passed to `embed` at once.

  Returns:
    A list with one 1-D numpy vector per text, in input order
    (empty list for empty input).
  """
  texts = list(texts)
  vectors = []
  for start in range(0, len(texts), batch_size):
    vectors.extend(embed(texts[start:start + batch_size]).numpy())
  return vectors


cache = df_joke_data.copy()

title_body_vectors = embed_in_batches(cache['title_body'])

# Kept for backward compatibility with cells that may read this column:
# one tensor with a leading batch dimension of 1 per row, as before.
cache['title_body_embed_tensor'] = [tf.constant(vec[np.newaxis, :]) for vec in title_body_vectors]

cache['title_body_embed'] = title_body_vectors

df_joke_data_embed = cache.copy()

del cache, title_body_vectors

df_joke_data_embed.sample(5)

Unnamed: 0,body,id,score,title,title_body,title_body_embed_tensor,title_body_embed
0,"Now I have to say ""Leroy can you please paint ...",5tz52q,1,I hate how you cant even say black paint anymore,I hate how you cant even say black paint anymo...,"((tf.Tensor(0.028685976, shape=(), dtype=float...","[0.028685976, -0.043191195, 0.034018885, 0.017..."
7,"Apparently ""Whatever's low in cholesterol"" was...",5tz04j,1,I walked into a PETA adoption center and the r...,I walked into a PETA adoption center and the r...,"((tf.Tensor(-0.05328251, shape=(), dtype=float...","[-0.05328251, -0.02309264, -0.044626266, -0.02..."
1,Pizza doesn't scream when you put it in the ov...,5tz4dd,0,What's the difference between a Jew in Nazi Ge...,What's the difference between a Jew in Nazi Ge...,"((tf.Tensor(-0.053992394, shape=(), dtype=floa...","[-0.053992394, -0.014222437, 0.06767268, 0.013..."
5,Because the p is silent.,5tz1o1,0,Why is it unknown on how pterodactyls urinate ...,Why is it unknown on how pterodactyls urinate ...,"((tf.Tensor(-0.016438166, shape=(), dtype=floa...","[-0.016438166, 0.06754724, -0.008088531, -0.01..."
4,He got caught trying to sell the two books to ...,5tz1pc,0,You hear about the University book store worke...,You hear about the University book store worke...,"((tf.Tensor(-0.010164504, shape=(), dtype=floa...","[-0.010164504, -0.037262503, 0.07529987, -0.05..."


### Approx. Nearest Neighbor Compute

In [15]:
from annoy import AnnoyIndex

# Vector size, read from the first row BY POSITION. The frame's index has gaps
# after the filtering/deduplication cells, so a label lookup with `[0]` raises
# KeyError as soon as the row labelled 0 has been removed.
DIMENSION = df_joke_data_embed['title_body_embed'].iloc[0].shape[0]

# Embedding vectors and the index labels they belong to, in row order.
LST_DF_JOKE_EMBED_TB = df_joke_data_embed['title_body_embed'].to_numpy().tolist()
LST_DF_JOKE_EMBED_IDX = df_joke_data_embed.index.to_numpy().tolist()

tree = AnnoyIndex(DIMENSION, 'angular')

# The DataFrame index label is used as the Annoy item id, so query results can
# be mapped straight back to rows with `.loc`.
for idx, txt_embed in zip(LST_DF_JOKE_EMBED_IDX, LST_DF_JOKE_EMBED_TB):
  tree.add_item(idx, txt_embed)

tree.build(n_trees=100, n_jobs=-1)

tree.save('annoy_index_100tree.ann', prefault=True)

True

In [None]:
# Real Vector deduplication
# Distance cutoff for the 'angular' Annoy index: neighbours at or under this
# distance are treated as near-duplicates by `get_filter_knn` below.
# NOTE(review): 0.534 is not derived anywhere in this notebook — presumably hand-tuned; confirm.
CLOSE_NEIGHBORS_DIST = 0.534

# Working copy; the near-duplicate rows are dropped from it at the end of this cell.
cache = df_joke_data_embed.copy()

# Embedding vectors and their index labels, in row order.
LST_DF_JOKE_EMBED_TB = df_joke_data_embed['title_body_embed'].to_numpy().tolist()
LST_DF_JOKE_EMBED_IDX = df_joke_data_embed.index.to_numpy().tolist()

# Index labels of the rows to drop, filled by the loop below.
drop_lst = []

def get_filter_knn(idx, k=150, index=None, max_dist=None):
  """Return every indexed item within `max_dist` of item `idx`.

  The index is queried for the `k` nearest neighbours. While the farthest
  returned neighbour is still inside the radius, more items may qualify,
  so `k` is enlarged (`round(k*2+9)`) and the query repeated.

  The search also stops when the index returns fewer than `k` items: there
  is nothing more to fetch. Without this check a query whose whole index
  lies inside the radius would retry forever.

  Args:
    idx: id of the query item inside the index.
    k: initial number of neighbours to request.
    index: object with Annoy's `get_nns_by_item` method; defaults to the
      global `tree`.
    max_dist: inclusive distance cutoff; defaults to the global
      `CLOSE_NEIGHBORS_DIST`.

  Returns:
    `(tree_res, idx_res_near)`: the raw `(ids, distances)` result of the last
    query, and the list of ids whose distance is `<= max_dist`.
  """
  if index is None:
    index = tree
  if max_dist is None:
    max_dist = CLOSE_NEIGHBORS_DIST

  while True:
    tree_res = index.get_nns_by_item(idx, k, search_k=-1, include_distances=True)
    idx_res = tree_res[0]
    dist_res = tree_res[1]

    index_exhausted = len(idx_res) < k
    if not dist_res or index_exhausted or dist_res[-1] > max_dist:
      break
    k = round(k*2+9)

  idx_res_near = [item for item, dist in zip(idx_res, dist_res) if dist <= max_dist]

  return tree_res, idx_res_near

# Greedy near-duplicate removal.
# Within each neighbourhood the joke with the highest score is kept. Ties are
# broken by the smallest index label, which is the same no matter which member
# of the group is being processed. (Picking "the first neighbour with the top
# score" made two equal-score duplicates each keep themselves and drop the
# other, so both disappeared.) Rows that are already marked for removal are
# neither processed nor considered as candidates.
PROGRESS_EVERY = 50000  # print a progress line every this many rows

score_by_idx = cache['score'].to_dict()
dropped = set(drop_lst)


def _rank(label):
  """Sort key: highest score first, then smallest label; a missing score never wins."""
  score = score_by_idx[label]
  if score is None or score != score:  # None or NaN
    score = float('-inf')
  return (-score, label)


for i, idx in enumerate(LST_DF_JOKE_EMBED_IDX):
  if i % PROGRESS_EVERY == 0:
    print(i)
  if idx in dropped:
    continue

  tree_res, idx_res_near = get_filter_knn(idx, 5)

  # Only ids that are rows of `cache` and still alive can be kept or dropped;
  # the query row is always a candidate even if the approximate search missed it.
  candidates = {near for near in idx_res_near if near in score_by_idx and near not in dropped}
  candidates.add(idx)

  keep_idx = min(candidates, key=_rank)
  dropped.update(candidates - {keep_idx})

drop_lst = sorted(dropped)

cache = cache.drop(drop_lst, axis=0)

df_joke_data_embed = cache.copy()

cache

## Funny Joke and Not that Funny Joke Separation

### Score (Upvote) Visualization

In [None]:
import plotly.express as px

# pd.options.plotting.backend = "plotly"

# Number of jokes per score value, ignoring jokes whose score is 0.
nonzero_scores = df_joke_data_embed.loc[df_joke_data_embed['score'] != 0, 'score']
nonzero_score_counts = nonzero_scores.value_counts().sort_index()

px.line(nonzero_score_counts, labels={"value": "Counts", "index": "Scores"})

In [None]:
# Scores limited to the range [0, 78], then the number of jokes per clipped score.
df_joke_data_embed['score_clip'] = df_joke_data_embed['score'].clip(lower=0, upper=78)
clipped_score_counts = df_joke_data_embed['score_clip'].value_counts().sort_index()

px.line(clipped_score_counts, labels={"value": "Counts", "index": "Scores"})

In [None]:
import plotly.express as px

# pd.options.plotting.backend = "plotly"

# Box plot of all scores (sorted ascending before plotting).
scores_sorted = df_joke_data_embed['score'].sort_values()
px.box(scores_sorted)

In [None]:
import plotly.express as px

# pd.options.plotting.backend = "plotly"

# Number of jokes per text length, measured in characters of `title_body`.
char_lengths = df_joke_data_embed['title_body'].map(len)

px.line(char_lengths.value_counts().sort_index(), labels={"value": "Counts", "index": "Length [letter]"})

In [None]:
import plotly.express as px

# pd.options.plotting.backend = "plotly"

# Box plot of `title_body` lengths in characters (sorted ascending before plotting).
char_lengths_sorted = df_joke_data_embed['title_body'].map(len).sort_values()
px.box(char_lengths_sorted)

In [None]:
import plotly.express as px

# pd.options.plotting.backend = "plotly"

# Number of jokes per text length, measured in words of `title_body`.
# Words are separated by any run of whitespace (`split()` without an
# argument), the same rule the null-filter cell uses. `split(' ')` counted an
# empty "word" for every doubled space and did not split on newlines or tabs.
word_counts = df_joke_data_embed['title_body'].str.split().str.len()

px.line(word_counts.value_counts().sort_index(), labels={
                     "value": "Counts",
                     "index": "Length [word]",
                 },)

In [None]:
import plotly.express as px

# pd.options.plotting.backend = "plotly"

# Box plot of `title_body` lengths in words. Words are separated by any run of
# whitespace; `split(' ')` counted empty tokens for doubled spaces and ignored
# newlines and tabs.
px.box(df_joke_data_embed['title_body'].str.split().str.len().sort_values())

## Separation

In [None]:
# Funny is True and not funny is False

# The 0.755 quantile of the non-zero scores is printed for reference only;
# the threshold that is actually applied is the fixed value assigned below.
nonzero_scores = df_joke_data_embed.loc[df_joke_data_embed['score'] != 0, 'score']
sep_score = nonzero_scores.quantile(0.755)
print("sep_score: ",sep_score)

sep_score = 24
# A joke is labelled funny when its score is strictly above the threshold.
df_joke_data_embed['funny'] = df_joke_data_embed['score'] > sep_score

## Get Not Funny message from Gemini Pro

In [25]:
import google.generativeai as genai

genai.configure(api_key=os.environ["GOOGLE_AI_API_KEY"])

# One entry per harm category, all with the same "BLOCK_NONE" threshold.
safety_settings = [
  {"category": harm_category, "threshold": "BLOCK_NONE"}
  for harm_category in (
    "HARM_CATEGORY_HARASSMENT",
    "HARM_CATEGORY_HATE_SPEECH",
    "HARM_CATEGORY_SEXUALLY_EXPLICIT",
    "HARM_CATEGORY_DANGEROUS_CONTENT",
  )
]

# NOTE: this rebinds the global `model`, which until now held the sentence
# encoder that `embed()` calls.
model = genai.GenerativeModel('gemini-pro', safety_settings=safety_settings)
# model1 = genai.GenerativeModel('gemini-pro', safety_settings=safety_settings)

In [None]:
from time import time, sleep

# Create the result column only when it does not exist yet, so re-running this
# cell resumes from the rows that are still empty instead of discarding every
# paraphrase that was already generated.
if 'raw_not_funny_gemini' not in df_joke_data_embed.columns:
  df_joke_data_embed['raw_not_funny_gemini']=np.full((df_joke_data_embed.shape[0],), None)

# Index labels of rows whose request raised an error (a row can appear more than once).
error_idx=[]
# Prefix put in front of every progress message.
state=""

def get_not_funny_gemini(raw_text):
  """Ask the global `model` for a humor-free paraphrase of `raw_text`.

  The request is a fixed instruction, a blank line, then `raw_text`.
  Returns the `.text` attribute of the response from `generate_content`.
  """
  instruction = (
      "Paraphrase text below such that it has no humor in the text anymore. "
      "Answer it in the whole text format and only one version of text is enough!"
      "\n\n"
  )
  return model.generate_content(instruction + raw_text).text

MAX_PASSES = 5             # retry passes over the missing rows before giving up
MIN_SECONDS_PER_CALL = 1   # each request is padded with sleep up to this duration

# Every pass visits only the rows that still have no paraphrase. The number of
# passes is bounded: a row that fails on every attempt would otherwise keep
# this loop running forever.
for _pass in range(MAX_PASSES):
  pending = df_joke_data_embed.index[df_joke_data_embed['raw_not_funny_gemini'].isna()]
  if len(pending) == 0:
    break

  for idx in pending:
    start = time()
    try:
      df_joke_data_embed.at[idx, 'raw_not_funny_gemini'] = get_not_funny_gemini(
          df_joke_data_embed.at[idx, 'title_body'])
      print(f'{state}Done on {idx}; ', end='')
    except Exception as error:
      df_joke_data_embed.at[idx, 'raw_not_funny_gemini'] = None
      error_idx.append(idx)
      print(f'{state}Error {error} on {idx}; ', end="")
    # One clock read serves both the report and the pause, so the value
    # passed to sleep() can never become negative.
    elapsed = time() - start
    print(elapsed)
    sleep(max(0, MIN_SECONDS_PER_CALL - elapsed))

remaining = int(df_joke_data_embed['raw_not_funny_gemini'].isna().sum())
if remaining:
  print(f'{remaining} rows still have no paraphrase after {MAX_PASSES} passes; see error_idx')