# Model Tester (v1.1)
Run the cell below only once per session. It installs the dependencies and downloads the model, which can take a few minutes; let it finish.

In [1]:
# Import libraries:
import sys
!{sys.executable} -m pip install spacy wget -q
!{sys.executable} -m spacy download en_core_web_lg -q
import os
import pickle
import spacy
from thinc.api import Config
from spacy import Language
from spacy.lang.en import English
import en_core_web_lg
import wget
import json
import math
import pandas as pd
# Build the model. We only do this once per Binder instance.

os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
url = "https://mediacloud-ihop.s3.amazonaws.com/models/spacy_model.p"
# Download the pickled model only when it is not already on disk. Calling
# wget.download() again would fetch the whole file a second time and store it
# under a new numbered file name instead of reusing the existing copy.
take1 = "spacy_model.p"
if not os.path.exists(take1):
    take1 = wget.download(url)
# NOTE(security): pickle.load can execute arbitrary code contained in the file
# it reads. Only run this against a model file from a source you trust.
with open(take1, "rb") as h:
    take = pickle.load(h)
# Create the pipeline described by the local config file, then load the
# unpickled model data into it.
config = Config().from_disk("./config.cfg")
lang_cls = spacy.util.get_lang_class(config["nlp"]["lang"])
nlp = lang_cls.from_config(config)
nlp = nlp.from_bytes(take)
# The model should now be a file named "spacy_model.p" in the nav pane. This will NOT be in the Github.


##Here are some functions for bigger testing.

#This prints precision and recall numbers for a dataframe returned by build_food_tester (defined below).
def precall(data):
    """Print and return precision and recall for a term-count comparison table.

    Columns are addressed by position: column 1 of ``data`` is read as the
    expected count for a term and column 2 as the actual count. For each row
    the overlap ``min(expected, actual)`` counts as true positives, any
    shortfall (``expected > actual``) as false negatives and any excess
    (``actual > expected``) as false positives. Rows where
    ``expected - actual`` is NaN are skipped.

    Args:
        data: pandas DataFrame with at least three columns.

    Returns:
        A ``(precision, recall)`` tuple; the same two values are printed.
        A metric whose denominator is zero (for example an empty table) is
        reported as ``nan`` instead of raising ``ZeroDivisionError``.
    """
    false_positives = 0
    false_negatives = 0
    true_positives = 0
    for expected, actual in zip(data.iloc[:, 1], data.iloc[:, 2]):
        if math.isnan(expected - actual):
            continue
        true_positives += min(expected, actual)
        if expected > actual:
            false_negatives += expected - actual
        elif actual > expected:
            false_positives += actual - expected

    # Guard both divisions: with no counted positives the metric is undefined.
    predicted = true_positives + false_positives
    relevant = true_positives + false_negatives
    precision = true_positives / predicted if predicted else float("nan")
    recall = true_positives / relevant if relevant else float("nan")
    print("PRECISION: " + str(precision))
    print("RECALL: " + str(recall))
    return precision, recall

#This function takes the storyid for a storyjson blog post, and returns a dataframe with expected and actual results for each term in the post.
def build_food_tester(convert):
    """Compare the model's FOOD entities for one story with its manual counts.

    Reads ``./storyjsons/<convert>.json``, runs the module-level ``nlp``
    pipeline over its ``'content'`` field and tallies every entity labelled
    ``"FOOD"`` by its text. Then reads ``./csvs/<convert>.csv`` and uses its
    first column as terms and its second column as the expected count.

    Returns a DataFrame with columns ``term``, ``expected`` and ``actual``,
    sorted by ``term``. It has one row per CSV term (``actual`` is 0 when the
    model did not tag that term) and one row per model-only term with
    ``expected`` set to 0.
    """
    # Model side: count the FOOD entities found in the story text.
    filepath = "./storyjsons/" + convert + ".json"
    with open(filepath, 'rb') as json_file:
        story = json.load(json_file)
        doc = nlp(story['content'])

    model_counts = dict()
    for ent in doc.ents:
        if ent.label_ == "FOOD":
            model_counts[ent.text] = model_counts.get(ent.text, 0) + 1

    # Manual side: term -> expected count from the hand-entered CSV.
    filepath = "./csvs/" + convert + ".csv"
    manual = pd.read_csv(filepath)
    expected_counts = dict(zip(manual.iloc[:, 0], manual.iloc[:, 1]))

    # Merge both sides, starting with every manually listed term.
    table = pd.DataFrame().reindex(columns=["term", "expected", "actual"])
    matched = set()
    for term, expected in expected_counts.items():
        if term in model_counts:
            matched.add(term)
        table.loc[len(table.index)] = [term, expected, model_counts.get(term, 0)]

    # Then add the terms only the model produced, with an expected count of 0.
    for term, found in model_counts.items():
        if term not in matched:
            table.loc[len(table.index)] = [term, 0, found]

    return table.sort_values('term')




[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


### User-Input Tests

In [None]:
# Here is a set of code for you to test everything out on your own; a personal sandbox!
# Change the string below to run the pipeline on any text you like.
doc = nlp("bacon egg and cheese sandwich")
# Each token with its part-of-speech tag.
for token in doc:
	print(token.text, token.pos_)
# Each noun chunk with the text of its root token.
for chunk in doc.noun_chunks:
	print(chunk.text, chunk.root.text)
# Each entity the pipeline found, with its label.
for ents in doc.ents:
	print(ents.text, ents.label_)

In [3]:
# Testing random sentences.
# testsentences[i] is the input text and testanswers[i] is the hand-written
# list of terms we expect for it, so the two lists must stay the same length.
testsentences = ['', 
                 'get your car insurance at 50% average rates today by calling 334-808-1992', 
                 'today i ate a bacon cheeseburger with lettuce, onion and tomato. the salsa added to the top was too runny, so i would add some lemon juice on top.', 
                 'red pepper flakes are a great seasoning to add to many dishes.', 
                 'just one egg is fine, but i think two eggs will help the millefeuille maintain structure'
                ]
testanswers = [[], 
               [], 
               ['bacon', 'cheeseburger', 'lettuce', 'onion', 'tomato', 'salsa', 'lemon juice'], 
               ['red pepper flakes', 'seasoning'], 
               ['egg', 'egg', 'millefeuille']
              ]
# Walk both lists in lockstep instead of keeping a separate index counter.
for sentence, expected_answers in zip(testsentences, testanswers):
    doc = nlp(sentence)
    # Every entity the pipeline returns is listed, whatever its label.
    nlp_answers = [ents.text for ents in doc.ents]
    print("Computer tags:")
    print(nlp_answers)
    print("Correct answer:")
    print(expected_answers)
    

Computer tags:
[]
Correct answer:
[]
Computer tags:
['50%']
Correct answer:
[]
Computer tags:
['bacon', 'cheeseburger', 'onion', 'tomato', 'salsa', 'lemon', 'juice']
Correct answer:
['bacon', 'cheeseburger', 'lettuce', 'onion', 'tomato', 'salsa', 'lemon juice']
Computer tags:
['pepper', 'seasoning']
Correct answer:
['red pepper flakes', 'seasoning']
Computer tags:
['egg', 'eggs']
Correct answer:
['egg', 'egg', 'millefeuille']


### Manual Tests (from manually-entered csvs)

In [6]:
stories_dict = dict()
#j_to_c maps each story id found in the storyjsons folder to its position in the
#directory listing. It exists so that a csv of our answers is only tested when a
#story json with the same id exists.
#This also means that I haven't created tests yet for Instagram captions and the magazine covers.
j_to_c = dict()
contain = os.listdir("storyjsons")
for index, filename in enumerate(contain):
    story_id, extension = os.path.splitext(filename)
    # Ignore anything that is not a story file (for example hidden folders),
    # rather than blindly chopping the last five characters off its name.
    if extension == ".json":
        j_to_c[story_id] = index



item_list = os.listdir("csvs")
#This will run the main system for each storyjson file, and create a comparison table each time.
for element in item_list:
    convert, extension = os.path.splitext(element)
    if extension != ".csv":
        continue
    if convert in j_to_c:
        df = build_food_tester(convert)
        stories_dict[convert] = df
        #UNCOMMENT THE TWO LINES BELOW TO PRINT ALL RESULTS:
        #print(df)
        #precall(df)




#Modify the array below with the story ids (file names without extension) you would like to look at,
#or leave it blank and uncomment above if you want all!:
observe = ["ff401e9686de3bec35052aa33a8382c5", "e443c33f56c338bc50653946ce88460b"]
for element in observe:
    # Reuse the table built in the loop above when there is one, so the model
    # is not run over the same story twice.
    output = stories_dict.get(element)
    if output is None:
        output = build_food_tester(element)
    print(element + "\n")
    print(output)
    precall(output)

ff401e9686de3bec35052aa33a8382c5

                        term  expected  actual
27                     azuki       1.0     0.0
37                     broth       1.0     1.0
51                      cake       0.0     1.0
29         candied chestnuts       1.0     0.0
40                      chew       0.0     2.0
42                     chewy       0.0     2.0
28              chiffon cake       1.0     0.0
56                     chips       0.0     1.0
55                 chocolate       0.0     2.0
38  chocolate chip ice cream       1.0     0.0
39           chocolate chips       1.0     0.0
45                confection       0.0     1.0
26                cornflakes       1.0     0.0
44                     cream       0.0     3.0
4                      dango       9.0     0.0
34                     dashi       1.0     0.0
52                   dessert       0.0     2.0
0                  dumplings       3.0     2.0
36                       egg       1.0     1.0
7                      flo