In [1]:
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import pandas as pd
from scipy import spatial

In [2]:
# -*- coding: utf-8 -*-
import re
alphabets= "([A-Za-z])"
prefixes = "(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = "(Inc|Ltd|Jr|Sr|Co)"
starters = "(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = "[.](com|net|org|io|gov)"

def split_into_sentences(text):
    text = " " + text + "  "
    text = text.replace("\n"," ")
    text = re.sub(prefixes,"\\1<prd>",text)
    text = re.sub(websites,"<prd>\\1",text)
    if "Ph.D" in text: text = text.replace("Ph.D.","Ph<prd>D<prd>")
    text = re.sub("\s" + alphabets + "[.] "," \\1<prd> ",text)
    text = re.sub(acronyms+" "+starters,"\\1<stop> \\2",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>\\3<prd>",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>",text)
    text = re.sub(" "+suffixes+"[.] "+starters," \\1<stop> \\2",text)
    text = re.sub(" "+suffixes+"[.]"," \\1<prd>",text)
    text = re.sub(" " + alphabets + "[.]"," \\1<prd>",text)
    if "”" in text: text = text.replace(".”","”.")
    if "\"" in text: text = text.replace(".\"","\".")
    if "!" in text: text = text.replace("!\"","\"!")
    if "?" in text: text = text.replace("?\"","\"?")
    text = text.replace(".",".<stop>")
    text = text.replace("?","?<stop>")
    text = text.replace("!","!<stop>")
    text = text.replace("<prd>",".")
    sentences = text.split("<stop>")
    sentences = sentences[:-1]
    sentences = [s.strip() for s in sentences]
    return sentences

In [3]:
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(common_texts)]

In [4]:
model = Doc2Vec(documents, vector_size=512, window=2, min_count=1, workers=4)

In [5]:
vector = model.infer_vector(["hello my name is curtis and this is a document", "this is a doument that I am writing about yay!"])


In [6]:
vector

array([ 9.19610728e-04,  3.01197928e-04,  8.06580880e-04,  6.09276351e-04,
       -4.53169341e-04, -1.15447918e-04,  7.17524323e-04,  8.96177604e-04,
       -7.58598617e-04, -6.51822847e-05, -3.44298955e-04,  3.78788507e-04,
       -2.17471446e-04, -9.49723937e-04, -6.39379839e-04, -4.22645564e-04,
       -8.12766550e-04, -7.75196531e-04,  3.45941808e-04,  3.37760051e-04,
        5.15586464e-04, -4.95488290e-04, -7.61370116e-04,  4.80284332e-04,
        9.03651700e-04, -5.44602808e-04,  5.02727868e-04,  4.38383315e-04,
        2.16900997e-04,  3.31462739e-04,  7.17164017e-04, -6.84669591e-04,
       -6.66361477e-04, -6.26740628e-04, -4.16559706e-05, -5.34589810e-04,
        4.77712398e-04, -7.58381793e-04,  6.29130926e-04,  2.61032372e-04,
       -8.64734699e-04, -1.13082306e-05,  6.41458726e-04,  5.39374196e-05,
        1.72155822e-04,  3.16716090e-04, -8.75335420e-04, -9.09587136e-04,
        3.41704908e-05,  1.03584789e-04,  4.33797017e-04,  5.34874038e-04,
       -1.63303848e-04,  

In [7]:
input = pd.read_csv("../input/shortjokes.csv")
input.head()

Unnamed: 0,ID,Joke
0,1,"[me narrating a documentary about narrators] ""..."
1,2,Telling my daughter garlic is good for you. Go...
2,3,I've been going through a really rough period ...
3,4,"If I could have dinner with anyone, dead or al..."
4,5,Two guys walk into a bar. The third guy ducks.


In [8]:
jokes = input["Joke"].values

In [9]:
jokes[:4]

array(['[me narrating a documentary about narrators] "I can\'t hear what they\'re saying cuz I\'m talking"',
       'Telling my daughter garlic is good for you. Good immune system and keeps pests away.Ticks, mosquitos, vampires... men.',
       "I've been going through a really rough period at work this week It's my own fault for swapping my tampax for sand paper.",
       'If I could have dinner with anyone, dead or alive... ...I would choose alive. -B.J. Novak-'],
      dtype=object)

In [10]:
jokes_documents = []
for joke in jokes:
    jokes_documents.append({
        "joke": joke,
        "vec": model.infer_vector(split_into_sentences(joke))
    })

In [11]:
fb_sentence = """It's really cool tbh
like with most languages you write code then compile it then run it
but with lisp you can do hot-reload
like you load your code the 1st time
run it
if you have a bug
you can rewrite the buggy function
and reload just that function, keeping all your other state""".split("\n")
# I might also get better results
fb_sentence_vec = model.infer_vector(fb_sentence)

In [12]:
closest_dist = -1
closest_joke = ""

for doc in jokes_documents:
    vec = doc["vec"]
    similarity = 1 - spatial.distance.cosine(vec, fb_sentence_vec)
    if(similarity > closest_dist):
        closest_dist = similarity
        closest_joke = doc["joke"]

In [13]:
print(closest_dist)
print(closest_joke)

0.20344072580337524
Do you know why you dont get along with cassiopeia? Cause she is petrifying gays.


Thoughts: maybe split on first sentence, second sentence, right after the last ?. I need to separate it into the setup and the punchline