## Import data, inspect, drop NaN rows

In [52]:
import pandas as pd
import numpy as np

In [109]:
lines = pd.read_csv('simpsons_script_lines.csv')
lines.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0.1,Unnamed: 0,id,episode_id,number,raw_text,timestamp_in_ms,speaking_line,character_id,location_id,raw_character_text,raw_location_text,spoken_words,normalized_text,word_count,lemmas,joined_lemmas
0,0,9549,32,209,"Miss Hoover: No, actually, it was a little of ...",848000,True,464,3.0,Miss Hoover,Springfield Elementary School,"No, actually, it was a little of both. Sometim...",no actually it was a little of both sometimes ...,31.0,"['actually', 'little', 'disease', 'magazine', ...",actually little disease magazine news show nat...
1,1,9550,32,210,Lisa Simpson: (NEAR TEARS) Where's Mr. Bergstrom?,856000,True,9,3.0,Lisa Simpson,Springfield Elementary School,Where's Mr. Bergstrom?,wheres mr bergstrom,3.0,"['s', 'mr', 'bergstrom']",s mr bergstrom
2,2,9551,32,211,Miss Hoover: I don't know. Although I'd sure l...,856000,True,464,3.0,Miss Hoover,Springfield Elementary School,I don't know. Although I'd sure like to talk t...,i dont know although id sure like to talk to h...,22.0,"['not', 'know', 'would', 'sure', 'like', 'talk...",not know would sure like talk not touch lesson...
3,3,9552,32,212,Lisa Simpson: That life is worth living.,864000,True,9,3.0,Lisa Simpson,Springfield Elementary School,That life is worth living.,that life is worth living,5.0,"['life', 'worth', 'live']",life worth live
4,4,9553,32,213,Edna Krabappel-Flanders: The polls will be ope...,864000,True,40,3.0,Edna Krabappel-Flanders,Springfield Elementary School,The polls will be open from now until the end ...,the polls will be open from now until the end ...,33.0,"['poll', 'open', 'end', 'recess', 'case', 'dec...",poll open end recess case decide thought final...


In [110]:
lines.isnull().sum()

Unnamed: 0                0
id                        0
episode_id                0
number                    0
raw_text                  0
timestamp_in_ms           0
speaking_line             0
character_id          17521
location_id             407
raw_character_text    17522
raw_location_text       408
spoken_words          26159
normalized_text       26184
word_count            26159
lemmas                26184
joined_lemmas         29849
dtype: int64

In [111]:
lines.shape

(158248, 16)

In [112]:
lines = lines.dropna()
lines.isnull().sum()

Unnamed: 0            0
id                    0
episode_id            0
number                0
raw_text              0
timestamp_in_ms       0
speaking_line         0
character_id          0
location_id           0
raw_character_text    0
raw_location_text     0
spoken_words          0
normalized_text       0
word_count            0
lemmas                0
joined_lemmas         0
dtype: int64

In [113]:
lines.shape

(128025, 16)

## Import spacy, create a list of docs and a list of their vectors

In [115]:
import spacy

In [116]:
docs = []
nlp = spacy.load('en_core_web_lg')

In [117]:
count = 0
for doc in nlp.pipe(lines['joined_lemmas'].values):
    docs.append(doc)

In [118]:
vectors = [doc.vector for doc in docs]

### Add vectors to lines df

In [119]:
lines['vectors'] = vectors

## Initialize a KNN model and fit on the vectors

In [120]:
from sklearn.neighbors import NearestNeighbors

In [121]:
nn  = NearestNeighbors(n_neighbors=5, algorithm='ball_tree')
nn.fit(vectors)

NearestNeighbors(algorithm='ball_tree', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

## Functions to get vectors/find quotes

In [126]:
def get_vectors_of_string(inp_str):
    lemma_list = get_lemmas(inp_str)
    joined = " ".join(lemma_list)
    return nlp(joined).vector

In [127]:
def find_quotes(inp_str):
    vect = get_vectors_of_string(inp_str)
    closest_quotes = nn.kneighbors([vect])
    return lines.iloc[closest_quotes[1][0]]

## Check if the functions are cromulent

In [128]:
find_quotes("matter of fact, they're all in the hammock complex")

Unnamed: 0.1,Unnamed: 0,id,episode_id,number,raw_text,timestamp_in_ms,speaking_line,character_id,location_id,raw_character_text,raw_location_text,spoken_words,normalized_text,word_count,lemmas,joined_lemmas,vectors
34271,34271,44093,155,199,"Hank Scorpio: That might... Matter of fact, th...",886000,True,2040,1497.0,Hank Scorpio,GLOBEX,"That might... Matter of fact, they're all in t...",that might matter of fact theyre all in the sa...,18.0,"['matter', 'fact', 'complex', 'hammock', 'comp...",matter fact complex hammock complex,"[-0.041675992, 0.15653822, 0.06628001, -0.2048..."
10240,10240,19829,67,67,Homer Simpson: My hammock! Do you understand? ...,321000,True,2,215.0,Homer Simpson,WINFIELD HOUSE,My hammock! Do you understand? Mine! Don't loo...,my hammock do you understand mine dont look th...,10.0,"['hammock', 'understand', 'not', 'look', 'way']",hammock understand not look way,"[0.0172494, 0.10171799, -0.285124, -0.10154042..."
34261,34261,44083,155,189,Hank Scorpio: (IMPRESSED) Hammocks? My goodnes...,870000,True,2040,1497.0,Hank Scorpio,GLOBEX,"Hammocks? My goodness, what an idea. Why didn'...",hammocks my goodness what an idea why didnt i ...,25.0,"['hammock', 'goodness', 'idea', 'not', 'think'...",hammock goodness idea not think hammock homer ...,"[0.24214274, 0.13538934, -0.10292957, -0.08530..."
88587,88587,98567,342,174,Homer Simpson: Well... I'm worried about Marge...,857000,True,2,10.0,Homer Simpson,Springfield Nuclear Power Plant,Well... I'm worried about Marge and Moe. They'...,well im worried about marge and moe theyve dev...,28.0,"['be', 'worry', 'marge', 'moe', 'have', 'devel...",be worry marge moe have develop intimate bond ...,"[-0.04170692, 0.102905124, -0.16318122, 0.0177..."
42568,42568,52417,186,102,"Dr. Julius Hibbert: Now, regardless of what th...",483000,True,332,890.0,Dr. Julius Hibbert,Construction Site,"Now, regardless of what this thing is, it's a ...",now regardless of what this thing is its a pri...,34.0,"['regardless', 'thing', 'priceless', 'scientif...",regardless thing priceless scientific find pre...,"[-0.17448243, -0.02654165, -0.21030657, 0.1130..."


In [24]:
lines.loc[34275]['spoken_words']

"That might... Matter of fact, they're all in the same complex. It's the Hammock Complex, down on Third?"

In [129]:
find_quotes("win friends with salad")

Unnamed: 0.1,Unnamed: 0,id,episode_id,number,raw_text,timestamp_in_ms,speaking_line,character_id,location_id,raw_character_text,raw_location_text,spoken_words,normalized_text,word_count,lemmas,joined_lemmas,vectors
142612,142612,153067,548,31,Bart Simpson: That man sure loved potato salad.,149000,True,8,131.0,Bart Simpson,First Church of Springfield,That man sure loved potato salad.,that man sure loved potato salad,6.0,"['man', 'sure', 'love', 'potato', 'salad']",man sure love potato salad,"[-0.2112926, 0.2741652, 0.02738459, -0.2130744..."
29948,29948,39671,138,161,"Lionel Hutz: Well, I didn't win. Here's your p...",0,True,347,1371.0,Lionel Hutz,Springfield Civic Auditorium,"Well, I didn't win. Here's your pizza.",well i didnt win heres your pizza,7.0,"['not', 'win', 'here', 'pizza']",not win here pizza,"[-0.09418982, 0.055107497, -0.042695, -0.04922..."
139617,139617,150066,536,227,Gary Chalmers: (TURNS TO SKINNER) I'll bet my ...,985000,True,1078,3.0,Gary Chalmers,Springfield Elementary School,I'll bet my baloney sandwich that girl's gonna...,ill bet my baloney sandwich that girls gonna win,9.0,"['ill', 'bet', 'baloney', 'sandwich', 'girl', ...",ill bet baloney sandwich girl go to win,"[-0.1573825, -0.024347529, 0.0022074897, 0.113..."
28558,28558,38280,133,114,Homer Simpson: But all normal people love meat...,536000,True,2,5.0,Homer Simpson,Simpson Home,But all normal people love meat. If I went to ...,but all normal people love meat if i went to a...,38.0,"['normal', 'people', 'love', 'meat', 'go', 'ba...",normal people love meat go barbecue meat yo go...,"[-0.17415681, 0.027866596, -0.018625999, -0.01..."
105387,105387,115494,406,63,Marge Simpson: Why does every kid who stays wi...,287000,True,1,3328.0,Marge Simpson,CRUISE SHIP TERMINAL,"Why does every kid who stays with us bring ""mo...",why does every kid who stays with us bring mom...,23.0,"['kid', 'stay', 'bring', 'mommy', 'meal', 'pin...",kid stay bring mommy meal pineapple potato sal...,"[-0.27995518, 0.17613508, -0.0129324505, -0.08..."


In [130]:
find_quotes("steamed hams")

Unnamed: 0.1,Unnamed: 0,id,episode_id,number,raw_text,timestamp_in_ms,speaking_line,character_id,location_id,raw_character_text,raw_location_text,spoken_words,normalized_text,word_count,lemmas,joined_lemmas,vectors
32670,32670,42488,149,114,"Gary Chalmers: For ""steamed hams.""",593000,True,1078,455.0,Gary Chalmers,Skinner Home,"For ""steamed hams.""",for steamed hams,3.0,"['steamed', 'ham']",steamed ham,"[-0.425248, -0.457865, 0.3288425, 0.460047, 0...."
32660,32660,42479,149,105,"Gary Chalmers: You call hamburgers ""steamed ha...",566000,True,1078,455.0,Gary Chalmers,Skinner Home,"You call hamburgers ""steamed hams?""",you call hamburgers steamed hams,5.0,"['hamburger', 'steam', 'ham']",hamburger steam ham,"[-0.13849701, -0.14111353, 0.40295032, 0.33257..."
110475,110475,120662,426,68,Moe Szyslak: Ham sandwiches!,295000,True,17,3491.0,Moe Szyslak,LICENSE BUREAU,Ham sandwiches!,ham sandwiches,2.0,"['ham', 'sandwich']",ham sandwich,"[-0.353722, -0.42622, 0.651725, 0.635225, 0.19..."
63025,63025,72928,254,98,Marge Simpson: Do you want turkey sausage or ham?,482000,True,1,48.0,Marge Simpson,Springfield Town Hall,Do you want turkey sausage or ham?,do you want turkey sausage or ham,7.0,"['want', 'turkey', 'sausage', 'ham']",want turkey sausage ham,"[-0.4730565, -0.11206575, 0.2597675, 0.4477937..."
2875,2875,12412,41,263,"Waitress: That's ham, sausage and bacon with a...",1016000,True,608,481.0,Waitress,Izzy's Deli,"That's ham, sausage and bacon with a smidge of...",thats ham sausage and bacon with a smidge of mayo,10.0,"['s', 'ham', 'sausage', 'bacon', 'smidge', 'ma...",s ham sausage bacon smidge mayo,"[-0.30809948, -0.021444669, 0.39093003, 0.3375..."


In [131]:
find_quotes("maybe lisa is right that America is the land of opportunity")

Unnamed: 0.1,Unnamed: 0,id,episode_id,number,raw_text,timestamp_in_ms,speaking_line,character_id,location_id,raw_character_text,raw_location_text,spoken_words,normalized_text,word_count,lemmas,joined_lemmas,vectors
152015,152015,3255,11,178,"Homer Simpson: Please, please kids. Stop fight...",694000,True,2,5.0,Homer Simpson,Simpson Home,"Please, please kids. Stop fighting. Maybe Lisa...",please please kids stop fighting maybe lisas r...,34.0,"['kid', 'stop', 'fight', 'maybe', 'lisa', 'rig...",kid stop fight maybe lisa right america land o...,"[-0.242601, 0.1560978, -0.0654051, 0.004721159..."
9266,9266,18878,63,221,"Lisa Simpson: As Little Miss Springfield, it's...",1077000,True,9,670.0,Lisa Simpson,SPRINGFIELD DOCK,"As Little Miss Springfield, it's my pleasure t...",as little miss springfield its my pleasure to ...,16.0,"['little', 'miss', 'springfield', 'pleasure', ...",little miss springfield pleasure welcome ameri...,"[-0.066171385, 0.1839, -0.16130738, -0.0113474..."
62557,62557,72465,252,225,"The Rich Texan: Uh-huh. In Texas, we do traged...",992000,True,1508,25.0,The Rich Texan,Simpson Living Room,"Uh-huh. In Texas, we do tragedy right. That's ...",uh-huh in texas we do tragedy right thats why ...,27.0,"['uh', 'huh', 'texas', 'tragedy', 'right', 's'...",uh huh texas tragedy right s memory poor littl...,"[-0.11148514, 0.15672927, -0.07756445, -0.0282..."
87112,87112,97090,337,95,"Apu Nahasapeemapetilon: Oh Mrs. Simpson, you m...",443000,True,208,136.0,Apu Nahasapeemapetilon,Kwik-E-Mart,"Oh Mrs. Simpson, you must pursue your dreams. ...",oh mrs simpson you must pursue your dreams lik...,35.0,"['oh', 'mrs', 'simpson', 'pursue', 'dream', 'l...",oh mrs simpson pursue dream like old dream com...,"[0.037973415, 0.09380421, -0.043061897, -0.113..."
40260,40260,50101,177,238,Troy McClure: So join America's favorite TV fa...,1324000,True,426,1636.0,Troy McClure,Museum of TV and Television,So join America's favorite TV family... and a ...,so join americas favorite tv family and a tiny...,27.0,"['join', 'america', 'favorite', 'tv', 'family'...",join america favorite tv family tiny green spa...,"[0.044942755, 0.017361527, -0.11628673, 0.0002..."


In [31]:
lines.loc[152056]['spoken_words']

"Please, please kids. Stop fighting. Maybe Lisa's right about America being a land of opportunity, and maybe Adil has a point about the machinery of capitalism being oiled with the blood of the workers."

In [134]:
find_quotes('and this is the snack holder where i can put my beverage or cupcake')

Unnamed: 0.1,Unnamed: 0,id,episode_id,number,raw_text,timestamp_in_ms,speaking_line,character_id,location_id,raw_character_text,raw_location_text,spoken_words,normalized_text,word_count,lemmas,joined_lemmas,vectors
11546,11546,21142,71,169,Homer Simpson: And this is the snack holder wh...,770000,True,2,743.0,Homer Simpson,Monorail Cockpit,And this is the snack holder where I can put m...,and this is the snack holder where i can put m...,19.0,"['snack', 'holder', 'beverage', 'cupcake']",snack holder beverage cupcake,"[0.289251, -0.23025802, 0.3376525, -0.07969624..."
157139,157139,8445,28,306,Head Engineer: (COWED) Extremely large beverag...,1041000,True,420,340.0,Head Engineer,Powell Motors Corporate Headquarters,Extremely large beverage holder.,extremely large beverage holder,4.0,"['extremely', 'large', 'beverage', 'holder']",extremely large beverage holder,"[-0.138504, -0.09092925, 0.004502997, -0.03402..."
137906,137906,148334,530,100,Marge Simpson: I thought I was buying snack ca...,477000,True,1,5.0,Marge Simpson,Simpson Home,I thought I was buying snack cakes!,i thought i was buying snack cakes,7.0,"['think', 'buy', 'snack', 'cake']",think buy snack cake,"[0.015032997, 0.061175875, 0.106795, -0.203697..."
136266,136266,146695,524,130,Lunchlady Dora: Him?! All he takes are sodas a...,681000,True,4715,3.0,Lunchlady Dora,Springfield Elementary School,Him?! All he takes are sodas and desserts!,him all he takes are sodas and desserts,8.0,"['take', 'soda', 'dessert']",take soda dessert,"[-0.031605, 0.07468366, 0.21922666, -0.2043966..."
157137,157137,8443,28,304,"Head Engineer: Sir, the car has a beverage hol...",1029000,True,420,340.0,Head Engineer,Powell Motors Corporate Headquarters,"Sir, the car has a beverage holder.",sir the car has a beverage holder,7.0,"['sir', 'car', 'beverage', 'holder']",sir car beverage holder,"[-0.012304001, -0.010795996, 0.0131974965, -0...."


## Lemmatization with Spacy

In [60]:
# function to get a list of lemmas from a string
def get_lemmas(text):
    
    if(isinstance(text, float)): #return NaN for NaN values
        return np.nan
    
    else:
        lemmas = []
        doc = nlp(text.lower())
    
        for token in doc:
            if (token.is_stop == False and token.is_punct == False) and token.pos_ != "-PRON-":
                lemmas.append(token.lemma_)
        return lemmas

In [61]:
df['lemmas'] = df['normalized_text'].apply(get_lemmas)

In [84]:
def join_lists(x):
    if(isinstance(x, float)): #return NaN for NaN values
        return np.nan
    
    else:
        return " ".join(x)

In [85]:
df['joined_lemmas'] = df['lemmas'].apply(join_lists)

In [108]:
df.to_csv('simpsons_script_lines.csv')

## Pickle the model

In [139]:
import joblib

In [140]:
# save the model to disk
filename = 'knn_model.sav'
joblib.dump(nn, filename)

['knn_model.sav']

In [142]:
# save the model to disk
pickle_filename = 'knn_pickled.sav'
pickle.dump(nn, open(pickle_filename, 'wb'))

Test

In [1]:
import pickle
import spacy
import pandas as pd

In [2]:
# Load Search Model
knn_search_file = open('knn_pickled.sav', 'rb')
knn_search = pickle.load(knn_search_file)

# Load Spacy Model
nlp = spacy.load('en_core_web_md')

# Import Data
lines = pd.read_csv('simpsons_script_lines.csv')
lines = lines.dropna()

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# function to get a list of lemmas from a string
def get_lemmas(text):
    if(isinstance(text, float)): #return NaN for NaN values
        return np.nan
    else:
        lemmas = []
        doc = nlp(text.lower())
    
        for token in doc:
            if (token.is_stop == False and token.is_punct == False) and token.pos_ != "-PRON-":
                lemmas.append(token.lemma_)
        return lemmas
    
# Get Vectors Function
def get_vectors_of_string(inp_str):
    lemma_list = get_lemmas(inp_str)
    joined = " ".join(lemma_list)
    return nlp(joined).vector

# Find Quotes Function
def find_quotes(inp_str):
    vect = get_vectors_of_string(inp_str)
    closest_quotes = knn_search.kneighbors([vect])
    return (lines['raw_character_text'].iloc[closest_quotes[1][0]], lines['spoken_words'].iloc[closest_quotes[1][0]])

In [4]:
# Find Quotes Function
def find_quotes(inp_str):
    vect = get_vectors_of_string(inp_str)
    closest_quotes = knn_search.kneighbors([vect])
    indices = closest_quotes[1][0].tolist()
    results = [(lines['raw_character_text'].iloc[i], lines['spoken_words'].iloc[i]) for i in indices]
    return results

In [7]:
search_results = find_quotes("steamed ham")

In [37]:
search_results

[('Gary Chalmers', 'You call hamburgers "steamed hams?"'),
 ('Seymour Skinner',
  'No, no. I said "steamed hams." That\'s what I call hamburgers.'),
 ('Seymour Skinner',
  "Ah... oh, that isn't smoke. It's steam. Steam from the steamed clams we're having. Mmmm, steamed clams."),
 ('Homer Simpson', 'Steamed Maine cabbages!'),
 ('Gary Chalmers',
  'Well, Seymour, you are an odd fellow, but I must say -- you "steam a good ham."')]

In [42]:
output = {'one': search_results[0],
          'two': search_results[1],
          'three': search_results[2],
          'four': search_results[3],
          'five': search_results[4]}

In [43]:
output

{'one': ('Gary Chalmers', 'You call hamburgers "steamed hams?"'),
 'two': ('Seymour Skinner',
  'No, no. I said "steamed hams." That\'s what I call hamburgers.'),
 'three': ('Seymour Skinner',
  "Ah... oh, that isn't smoke. It's steam. Steam from the steamed clams we're having. Mmmm, steamed clams."),
 'four': ('Homer Simpson', 'Steamed Maine cabbages!'),
 'five': ('Gary Chalmers',
  'Well, Seymour, you are an odd fellow, but I must say -- you "steam a good ham."')}

In [33]:
type(search_results)

list