# Model Tester (v1.1)
Run the code below only once. It might take a few minutes, let it.

In [17]:
# Import libraries:
import sys
!{sys.executable} -m pip install spacy wget -q
!{sys.executable} -m spacy download en_core_web_lg -q
!{sys.executable} -m pip install pandas
import os
import pickle
import spacy
from thinc.api import Config
from spacy import Language
from spacy.lang.en import English
import en_core_web_lg
import wget
import json
import math
import pandas as pd
# Build the model. We only do this once per Binder instance.

os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
url = "https://mediacloud-ihop.s3.amazonaws.com/models/spacy_model.p"
take1 = wget.download(url)
with open("spacy_model.p", "rb") as h:
	take = pickle.load(h)
config = Config().from_disk("./config.cfg")
lang_cls = spacy.util.get_lang_class(config["nlp"]["lang"])
nlp = lang_cls.from_config(config)
nlp = nlp.from_bytes(take)
# The model should now be a file named "spacy_model.p" in the nav pane. This will NOT be in the Github.


##Here are some functions for bigger testing.

#This gives precision and recall numbers for a given dataframe created above.
def precall(data):
	false_positives = 0
	false_negatives = 0
	true_positives = 0
	for x in range(len(data)):
		expected = data.iloc[x,1]
		actual = data.iloc[x,2]
		if math.isnan(expected - actual): continue
		else:
			if expected == actual:
				true_positives += actual
			elif expected > actual:
				true_positives += actual
				false_negatives += (expected - actual)
			else:
				true_positives += expected
				false_positives += (actual - expected)
	precision = true_positives / (true_positives + false_positives)
	recall = true_positives / (true_positives + false_negatives)
	print("PRECISION: " + str(precision))
	print("RECALL: " + str(recall))

#This function takes the storyid for a storyjson blog post, and returns a dataframe with expected and actual results for each term in the post.
def build_food_tester(convert):

	#First, do the json results.
	jsons_export = dict()
	url = "https://mediacloud-ihop.s3.amazonaws.com/models/storyjsons/" + convert + ".json"
	filepath = convert + ".json"
	if (filepath) not in os.listdir():
		filepath = wget.download(url)
	with open(filepath, 'rb') as json_file:
		foo = json.load(json_file)
		bar = foo['content']
		doc = nlp(bar) #Don't worry about any of these :)

		for ents in doc.ents:
			if(ents.label_ != "FOOD"): continue
			elif jsons_export.get(ents.text, -1) != -1:
				jsons_export[ents.text] += 1
			else:
				jsons_export[ents.text] = 1
	
	#Next, collect the data from the Manual Tests
	filepath = "./csvs/" + convert + ".csv"
	csvs_export = pd.read_csv(filepath)
	csvs_export = dict(zip(csvs_export.iloc[:,0], csvs_export.iloc[:,1]))
	#print(csvs_export)

	#Finally, put the lists together.
	df = pd.DataFrame()
	df = df.reindex(columns = ["term", "expected", "actual"])
	for key in csvs_export:
		value3 = 0
		if jsons_export.get(key, -1) != -1:
			value3 = jsons_export[key]
			jsons_export[key] = -2
		df.loc[len(df.index)] = [key, csvs_export[key], value3]

	for key in jsons_export:
		if jsons_export[key] != -2:
			df.loc[len(df.index)] = [key, 0, jsons_export[key]]
	df = df.sort_values('term')
	return df




[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


### User-Input Tests

In [None]:
# Here is a set of code for you to test everything out on your own; a personal sandbox!
doc = nlp("bacon egg and cheese sandwich")
for token in doc:
	print(token.text, token.pos_)
for chunk in doc.noun_chunks:
	print(chunk.text, chunk.root.text)
for ents in doc.ents:
	print(ents.text, ents.label_)

In [3]:
# Testing random sentences.
testsentences = ['', 
                 'get your car insurance at 50% average rates today by calling 334-808-1992', 
                 'today i ate a bacon cheeseburger with lettuce, onion and tomato. the salsa added to the top was too runny, so i would add some lemon juice on top.', 
                 'red pepper flakes are a great seasoning to add to many dishes.', 
                 'just one egg is fine, but i think two eggs will help the millefeuille maintain structure'
                ]
testanswers = [[], 
               [], 
               ['bacon', 'cheeseburger', 'lettuce', 'onion', 'tomato', 'salsa', 'lemon juice'], 
               ['red pepper flakes', 'seasoning'], 
               ['egg', 'egg', 'millefeuille']
              ]
counter = 0
for sentence in testsentences:
    doc = nlp(sentence)
    nlp_answers = []
    print("Computer tags:")
    for ents in doc.ents:
        nlp_answers.append(ents.text)
    print(nlp_answers)
    print("Correct answer:")
    print(testanswers[counter])
    counter += 1


Computer tags:
[]
Correct answer:
[]
Computer tags:
['50%']
Correct answer:
[]
Computer tags:
['bacon', 'cheeseburger', 'onion', 'tomato', 'salsa', 'lemon', 'juice']
Correct answer:
['bacon', 'cheeseburger', 'lettuce', 'onion', 'tomato', 'salsa', 'lemon juice']
Computer tags:
['pepper', 'seasoning']
Correct answer:
['red pepper flakes', 'seasoning']
Computer tags:
['egg', 'eggs']
Correct answer:
['egg', 'egg', 'millefeuille']


In [13]:
# Use this if you want to see the results from an entire storyjson file.
def test_from_sjs(convert, print_text = False):
    jsons_export = dict()
    url = "https://mediacloud-ihop.s3.amazonaws.com/models/storyjsons/" + convert + ".json"
    filepath = convert + ".json"
    if filepath not in os.listdir("csvs"):
        filepath = wget.download(url)
    with open(filepath, 'rb') as json_file:
        foo = json.load(json_file)
        bar = foo['content']
        if print_text: print(bar)
        doc = nlp(bar)

        for ents in doc.ents:
            if(ents.label_ != "FOOD"): continue
            elif jsons_export.get(ents.text, -1) != -1:
                jsons_export[ents.text] += 1
            else:
                jsons_export[ents.text] = 1
    return jsons_export

print(test_from_sjs("e443c33f56c338bc50653946ce88460b"))

{'sushi': 5, 'sauce': 1, 'yogurt': 2, 'coffee': 2, 'oatmeal': 1, 'milk': 4, 'butter': 2, 'tea': 1, 'egg': 2, 'salad': 2, 'eggs': 5, 'diet': 1}


### Manual Tests (from manually-entered csvs)

In [29]:
stories_dict = dict()

#This will run the main system for each storyjson file, and create a comparison table each time. This might take a while.
for element in os.listdir("csvs"):
    convert = element[0:-4]
    if element[0:2] != "BA" and element[0:9] != "instagram" and element != ".DS_Store": #Weird # of exceptions...
        df = build_food_tester(convert)
        stories_dict[convert] = df
        #UNCOMMENT TO PRINT ALL RESULTS: print(df)
        #^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ precall(df)      
        
#Modify the array below with URLs you would like to look at, or leave it blank and uncomment above if you want all!:
observe = ["ff401e9686de3bec35052aa33a8382c5", "e443c33f56c338bc50653946ce88460b"]
for element in observe:
    output = build_food_tester(element)
    print(element + "\n")
    print(output)
    precall(output)

instagram.com_p_CbYHj4cB_S2_.csv
ff401e9686de3bec35052aa33a8382c5.csv
a5d7a468b900da50f787f11e10673224.csv
instagram.com_p_BmwXCK2nAK8_.csv
c332838cb3ab544ce16aaaf0d30d1587.csv
instagram.com_p_CY4XZ16Bvyf_.csv
7a2421f687bf90732cf4ba727ab6b09b.csv
613aa03ee246323cd6ffcc4ea03e68cc.csv
.DS_Store
a85cbd99b2dba1cd1dbc31b46952ac76.csv
703a2a8217ed493e0ae29b81386aa9b4.csv
instagram.com_p_CautFXhhpil_.csv
instagram.com_p_CAsvIekHOVG_.csv
ca6f92e3a1f112f6714cfa69bbe90cd0.csv
BA June-July 2019.csv
BA Dec 2019 - Jan 2020 .csv
instagram.com_p_CbgKjuGh4Ir_.csv
0acff7d82116e5c0f1b3eb1b6b5bade8.csv
instagram.com_p_Bnt5-sRndK7_.csv
BA June-July 2018.csv
BA June-July 2020.csv
instagram.com_p_BnR5DxyHioP_.csv
BA Dec 2021 - Jan 2022 .csv
BA June-July 2021.csv
c3f04ed913c48f33238d9764c3817cfb.csv
0cf67eb7f52cd235509b530bf7c3ea9b.csv
instagram.com_p_CbSUwTSMqwC_.csv
f3d1c80cb0263594aadf4903d4ff448b.csv
fa262c03a3f536e5165fc0e9cd061b64.csv
BA March 2019.csv
instagram.com_p_CbQSUT9Bnu0_.csv
cc834cc2d8291bcc4

In [27]:
#Of course, if you'd like, you can run this function against any manually checked ID:
print(build_food_tester("a94ef4308e8cae75766fea73531918c1"))

                         term  expected  actual
8           all-purpose flour       2.0     0.0
4                      butter       1.0     2.0
34                      chard       1.0     0.0
43                      chill       0.0     1.0
16  crushed red pepper flakes       1.0     0.0
44                     delish       0.0     1.0
54        delish!AnonymousSan       0.0     1.0
24                       dill       1.0     0.0
46                        dip       0.0     1.0
7                       dough       7.0     0.0
21                        egg       1.0     3.0
12     extra-virgin olive oil       2.0     0.0
49                       feta       0.0     1.0
28                      flour       1.0     2.0
36                    floured       1.0     0.0
1                     galette       3.0     6.0
32                     garlic       2.0     3.0
15              garlic cloves       1.0     0.0
20                     grated       1.0     0.0
22               greek yogurt       1.0 