# Model Tester (v1.1)
Run the code cell below only once. It may take a few minutes; let it finish.

In [8]:
# Import libraries:
import sys
!{sys.executable} -m pip install spacy wget -q
!{sys.executable} -m spacy download en_core_web_lg -q
import os
import pickle
import spacy
from thinc.api import Config
from spacy import Language
from spacy.lang.en import English
import en_core_web_lg
import wget
import json
import math
import pandas as pd
# Build the model. We only do this once per Binder instance.

# NOTE(review): presumably works around the "duplicate OpenMP runtime" abort on
# some platforms -- confirm it is still needed.
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
url = "https://mediacloud-ihop.s3.amazonaws.com/models/spacy_model.p"
model_path = "spacy_model.p"
# Only download when the model is not already on disk. Re-running this cell would
# otherwise fetch the file again (the wget package saves a repeat download under a
# new numbered name instead of overwriting), while the open() below would keep
# reading the first copy.
if os.path.exists(model_path):
	take1 = model_path
else:
	take1 = wget.download(url)
with open(model_path, "rb") as h:
	# NOTE(review): pickle.load can execute arbitrary code; this is only safe as
	# long as the bucket behind `url` is trusted.
	take = pickle.load(h)
# Rebuild an empty pipeline from the config, then restore the trained weights
# from the unpickled bytes.
config = Config().from_disk("./config.cfg")
lang_cls = spacy.util.get_lang_class(config["nlp"]["lang"])
nlp = lang_cls.from_config(config)
nlp = nlp.from_bytes(take)
# The model should now be a file named "spacy_model.p" in the nav pane. It is NOT part of the GitHub repository.


## Helper functions for larger-scale testing.
# build_food_tester takes the story id of a storyjson blog post and returns a DataFrame with the expected and actual counts for each term in the post.
def build_food_tester(convert):
	"""Compare the model's FOOD entities for one story against the manual counts.

	Reads ./storyjsons/<convert>.json, runs the module-level `nlp` pipeline on its
	'content' field and counts how often each FOOD entity text occurs. Those counts
	are lined up with ./csvs/<convert>.csv, whose first column is used as the term
	and whose second column is used as the expected count.

	Args:
		convert: story id, i.e. the file name shared by the .json and .csv files,
			without extension.

	Returns:
		A DataFrame with the columns "term", "expected" and "actual", sorted by
		term. Terms that only appear in the csv get actual = 0; terms that only
		the model found get expected = 0. Both count columns are float64.

	Raises:
		ValueError: if an expected count in the csv cannot be converted to float.
	"""
	# First, collect the model's results from the story json.
	story_path = "./storyjsons/" + convert + ".json"
	with open(story_path, 'rb') as json_file:
		story = json.load(json_file)
	doc = nlp(story['content'])

	actual_counts = dict()
	for ent in doc.ents:
		if ent.label_ == "FOOD":
			actual_counts[ent.text] = actual_counts.get(ent.text, 0) + 1

	# Next, collect the data from the Manual Tests.
	manual = pd.read_csv("./csvs/" + convert + ".csv")
	expected_counts = dict(zip(manual.iloc[:, 0], manual.iloc[:, 1]))

	# Finally, put the two together: every manually listed term first (in csv
	# order), then the terms only the model produced.
	rows = [
		(term, expected, actual_counts.get(term, 0))
		for term, expected in expected_counts.items()
	]
	rows.extend(
		(term, 0, count)
		for term, count in actual_counts.items()
		if term not in expected_counts
	)

	# Build the frame in one go instead of appending row by row. The counts are
	# kept as floats so a missing (NaN) expected value fits in the same column.
	df = pd.DataFrame(rows, columns=["term", "expected", "actual"])
	df = df.astype({"expected": "float64", "actual": "float64"})
	return df.sort_values('term')


#This gives precision and recall numbers for a given dataframe created above.
def precall(data):
	"""Print and return precision and recall for a comparison table.

	Args:
		data: DataFrame whose second column holds the expected count and whose
			third column holds the actual count for each term. Rows where either
			count is NaN are skipped.

	Returns:
		A (precision, recall) tuple of floats. A metric is NaN when it is
		undefined, i.e. when its denominator is zero (for example on an empty
		table), instead of raising ZeroDivisionError.
	"""
	true_positives = 0
	false_positives = 0
	false_negatives = 0
	for expected, actual in zip(data.iloc[:, 1], data.iloc[:, 2]):
		# NaN in either count makes the difference NaN: nothing to compare.
		if math.isnan(expected - actual):
			continue
		# The overlap counts as hits; whatever is left over on one side is
		# either missed by the model or over-reported by it.
		true_positives += min(expected, actual)
		false_negatives += max(expected - actual, 0)
		false_positives += max(actual - expected, 0)

	predicted = true_positives + false_positives
	relevant = true_positives + false_negatives
	precision = float(true_positives / predicted) if predicted else float("nan")
	recall = float(true_positives / relevant) if relevant else float("nan")
	print("PRECISION: " + str(precision))
	print("RECALL: " + str(recall))
	return precision, recall



[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [None]:
# Personal sandbox: change the text below and re-run to see what the model makes of it.
doc = nlp("bacon egg and cheese sandwich")
# Every token with its part-of-speech tag.
for token in doc:
	print(f"{token.text} {token.pos_}")
# Every noun chunk with its root word.
for chunk in doc.noun_chunks:
	print(f"{chunk.text} {chunk.root.text}")
# Every recognised entity with its label.
for ents in doc.ents:
	print(f"{ents.text} {ents.label_}")

In [3]:
# Testing random sentences: each sentence is paired with the tags we expect back.
testsentences = [
    '',
    'get your car insurance at 50% average rates today by calling 334-808-1992',
    'today i ate a bacon cheeseburger with lettuce, onion and tomato. the salsa added to the top was too runny, so i would add some lemon juice on top.',
    'red pepper flakes are a great seasoning to add to many dishes.',
    'just one egg is fine, but i think two eggs will help the millefeuille maintain structure',
]
testanswers = [
    [],
    [],
    ['bacon', 'cheeseburger', 'lettuce', 'onion', 'tomato', 'salsa', 'lemon juice'],
    ['red pepper flakes', 'seasoning'],
    ['egg', 'egg', 'millefeuille'],
]
# Walk both lists side by side and show the model's tags next to the answer key.
for sentence, correct_answer in zip(testsentences, testanswers):
    doc = nlp(sentence)
    nlp_answers = [entity.text for entity in doc.ents]
    print("Computer tags:")
    print(nlp_answers)
    print("Correct answer:")
    print(correct_answer)
    

Computer tags:
[]
Correct answer:
[]
Computer tags:
['50%']
Correct answer:
[]
Computer tags:
['bacon', 'cheeseburger', 'onion', 'tomato', 'salsa', 'lemon', 'juice']
Correct answer:
['bacon', 'cheeseburger', 'lettuce', 'onion', 'tomato', 'salsa', 'lemon juice']
Computer tags:
['pepper', 'seasoning']
Correct answer:
['red pepper flakes', 'seasoning']
Computer tags:
['egg', 'eggs']
Correct answer:
['egg', 'egg', 'millefeuille']


### Manual Tests (from manually-entered csvs)

In [9]:
# Only stories that have BOTH a ./storyjsons/<id>.json file and a ./csvs/<id>.csv answer file are tested.
# This also means that I haven't created tests yet for Instagram captions and the magazine covers.
# j_to_c maps every story id found in ./storyjsons to its position in the sorted listing.
j_to_c = dict()
contain = sorted(os.listdir("storyjsons"))
for index, filename in enumerate(contain):
	# Split on the real extension instead of chopping a fixed number of
	# characters, so stray files (checkpoints, .DS_Store, ...) are ignored.
	story_id, extension = os.path.splitext(filename)
	if extension == ".json":
		j_to_c[story_id] = index


# Sorted so the tables always come out in the same order.
item_list = sorted(os.listdir("csvs"))
# This will run the main system for each storyjson file, and create a comparison table each time.
for element in item_list:
	convert, extension = os.path.splitext(element)
	if extension != ".csv" or convert not in j_to_c:
		continue
	df = build_food_tester(convert)
	print(df)
	precall(df)

                        term  expected  actual
27                     azuki       1.0     0.0
37                     broth       1.0     1.0
51                      cake       0.0     1.0
29         candied chestnuts       1.0     0.0
40                      chew       0.0     2.0
42                     chewy       0.0     2.0
28              chiffon cake       1.0     0.0
56                     chips       0.0     1.0
55                 chocolate       0.0     2.0
38  chocolate chip ice cream       1.0     0.0
39           chocolate chips       1.0     0.0
45                confection       0.0     1.0
26                cornflakes       1.0     0.0
44                     cream       0.0     3.0
4                      dango       9.0     0.0
34                     dashi       1.0     0.0
52                   dessert       0.0     2.0
0                  dumplings       3.0     2.0
36                       egg       1.0     1.0
7                      flour       NaN     4.0
47           

                term  expected  actual
11             achar       1.0     0.0
0       aloo paratha       2.0     0.0
1              bread       2.0     3.0
36            butter       0.0     1.0
27           cheddar       0.0     1.0
13    cheddar cheese       1.0     0.0
28            cheese       0.0     1.0
30             chili       0.0     3.0
32             chill       0.0     1.0
18          cilantro       3.0     2.0
5         condiments       1.0     0.0
3               dals       1.0     0.0
7              dough       5.0     0.0
24            fennel       1.0     0.0
17      fennel seeds       2.0     0.0
21             flesh       1.0     0.0
15             flour       4.0     6.0
37              ghee       0.0     1.0
35            ginger       0.0     1.0
34         jalepenos       0.0     1.0
25   mashed potatoes       1.0     0.0
26               oil       3.0     8.0
33             onion       0.0     1.0
10           paratha       4.0     0.0
2           parathas     

             term  expected  actual
7      appetizers       1.0     0.0
11          bread       1.0     1.0
4         chicken       1.0     1.0
6          citrus       1.0     0.0
1    diced onions       1.0     0.0
12         garlic       0.0     1.0
3          juices       1.0     0.0
0            meat       1.0     1.0
2   minced garlic       1.0     0.0
9             oil       1.0     1.0
13         smooth       0.0     1.0
8           steak       1.0     1.0
10        vinegar       1.0     1.0
5      watermelon       1.0     1.0
PRECISION: 0.7777777777777778
RECALL: 0.5833333333333334
                     term  expected  actual
33                arugula       0.0     1.0
21          arugula salad       1.0     0.0
12                   beef       1.0     1.0
16                  beefy       1.0     0.0
11           black pepper       1.0     0.0
26                burgers       0.0     1.0
30                   cake       0.0     1.0
28                   chew       0.0     1.0
1      

                       term  expected  actual
54             BeanRiceRice       0.0     1.0
0          Diri Kole ak Pwa       1.0     0.0
29             basmati rice       1.0     0.0
1                     beans       8.0    12.0
11              bell pepper       2.0     0.0
32             black pepper       1.0     0.0
50                bratwurst       0.0     1.0
42                    broth       3.0     5.0
33                   celery       1.0     1.0
14             celery stalk       1.0     0.0
34                    chile       2.0     0.0
49                    chill       0.0     1.0
51                 chipotle       0.0     1.0
48                   chunky       0.0     1.0
36                 cilantro       1.0     2.0
20          cilantro leaves       1.0     0.0
10                   citrus       1.0     0.0
19                    clove       2.0     0.0
43                     cook       1.0     0.0
24  distilled white vinegar       1.0     0.0
27       dried kidney beans       

          term  expected  actual
3     beverage       2.0     2.0
21   carbonara       1.0     0.0
20      cheese       1.0     1.0
17       chili       1.0     1.0
5      dim sum       1.0     0.0
23      drinks       0.0     1.0
4       fruits       2.0     0.0
18      greens       1.0     0.0
8          ham       1.0     1.0
0     kombucha       9.0    11.0
13   kombuchas       2.0     0.0
10   mushrooms       1.0     1.0
16     noodles       1.0     1.0
6       orange       1.0     1.0
11      oyster       1.0     0.0
9       pollen       1.0     0.0
19       salad       1.0     1.0
24       spicy       0.0     1.0
12  sugarplums       1.0     0.0
2         tart       1.0     1.0
1          tea       6.0     6.0
22        teas       1.0     0.0
15        tofu       1.0     1.0
14  tteokbokki       1.0     0.0
7         wine       2.0     0.0
PRECISION: 0.8666666666666667
RECALL: 0.65
