In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports edited modules
# before each cell executes.
%load_ext autoreload
%autoreload 2

In [2]:
import torch
from datasets import load_from_disk
from transformers import AutoTokenizer

import os
import itertools
import json
import numpy as np
from tqdm import tqdm
from collections import defaultdict
import matplotlib.pyplot as plt

Matplotlib created a temporary config/cache directory at /tmp/pbs.2925400.pbsha.ib.sockeye/matplotlib-dijnb8mr because the default path (/home/gregdeon/.config/matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.


In [3]:
# Workaround for the Hugging Face `datasets` library: lower its logging
# verbosity to error level (presumably to silence the messages it emits while
# loading data - confirm what exactly this was meant to suppress).
from datasets.utils.logging import set_verbosity_error
set_verbosity_error()

In [5]:
# from spotlight.datasets import *
# NOTE(review): wildcard imports hide where names come from. `setupPlots` is
# not defined in this notebook, and neither are `loadInferenceResults` /
# `loadSpotlightResults` used further down, so they presumably come from one
# of these two modules - consider importing them explicitly.
from torch_spotlight.utils import *
from torch_spotlight.plot_utils import *

# Project helper, called once with no arguments before any analysis cell runs.
setupPlots()

In [9]:
# Root directories for datasets and models, published as environment
# variables; the path-building cell below reads them back by name.
os.environ.update({
    'DATA_DIR': '/arc/project/st-kevinlb-1/gregdeon/spotlight/datasets',
    'MODEL_DIR': '/arc/project/st-kevinlb-1/gregdeon/spotlight/models',
})

In [None]:
# Take the model root from MODEL_DIR (the same variable the next cell reads)
# so this cell cannot drift from the configured location; the literal is kept
# only as a fallback for when the variable is not set.
model_dir = os.environ.get('MODEL_DIR', '/arc/project/st-kevinlb-1/gregdeon/spotlight/models')
amazon_model_path = os.path.join(model_dir, 'amazon')

In [11]:
# Both roots must already be present in the environment (KeyError otherwise).
data_dir, model_dir = (os.environ[var] for var in ('DATA_DIR', 'MODEL_DIR'))

# Each dataset gets a same-named subdirectory under the data root and under
# the model root.
amazon_dir, amazon_model_path = (os.path.join(root, 'amazon') for root in (data_dir, model_dir))
squad_dir, squad_model_path = (os.path.join(root, 'squad') for root in (data_dir, model_dir))

# Amazon reviews

## Dataset + tokenizer

In [12]:
# Read the saved dataset from `amazon_dir` and keep only its 'test' split.
amazon_splits = load_from_disk(amazon_dir)
dataset = amazon_splits['test']

In [13]:
# Peek at a single record; per the output below, each one carries the fields
# 'content', 'label' and 'title'.
dataset[1]

{'content': "Despite the fact that I have only played a small portion of the game, the music I heard (plus the connection to Chrono Trigger which was great as well) led me to purchase the soundtrack, and it remains one of my favorite albums. There is an incredible mix of fun, epic, and emotional songs. Those sad and beautiful tracks I especially like, as there's not too many of those kinds of songs in my other video game soundtracks. I must admit that one of the songs (Life-A Distant Promise) has brought tears to my eyes on many occasions.My one complaint about this soundtrack is that they use guitar fretting effects in many of the songs, which I find distracting. But even if those weren't included I would still consider the collection worth it.",
 'label': 1,
 'title': "One of the best game music soundtracks - for a game I didn't really play"}

In [14]:
# Size of the subset analysed in this notebook: the first `num_examples`
# records of `dataset`, looked up one index at a time.
num_examples = 20000
amazon_labels = [
    record['label']
    for record in map(dataset.__getitem__, range(num_examples))
]

In [15]:
# Tokenizer options; `local_files_only=True` means the files must already be
# present under `amazon_model_path`, which serves as the cache directory.
tokenizer_options = dict(
    cache_dir=amazon_model_path,
    do_lower_case=True,
    do_basic_tokenize=True,
    local_files_only=True,
)
tokenizer = AutoTokenizer.from_pretrained(
    "distilbert-base-uncased-finetuned-sst-2-english",
    **tokenizer_options,
)

In [11]:
# unique_tokens[k] holds the distinct token ids of review k (its 'content'
# field), so a token is counted at most once per review later on.
unique_tokens = [
    torch.unique(
        tokenizer(dataset[example_idx]['content'], padding=True, return_tensors='pt')['input_ids']
    )
    for example_idx in tqdm(range(num_examples))
]

100%|██████████| 20000/20000 [00:09<00:00, 2015.61it/s]


## Model outputs

In [13]:
# Load the saved inference results; the helper's return value unpacks into
# embeddings, outputs and losses.
# NOTE(review): the file name says 'train' while `dataset` was taken from the
# 'test' split - confirm both index the same examples.
inference_results_path = 'inference_results/amazon_train_sst_20k.pkl'
embeddings, outputs, losses = loadInferenceResults(inference_results_path)

In [14]:
# Example indices ordered from highest to lowest loss; show the ten worst
# as review text, label and loss, separated by blank lines.
loss_idx = list(reversed(losses.argsort().tolist()))
for idx in loss_idx[:10]:
    example = dataset[idx]
    for shown in (example['content'], example['label'], losses[idx]):
        display(shown)
    print()

"This production is so awesome we can't stop watching it over and over. It is one of the finest vocal performances I have ever seen."

0

9.071007




'Very nice peridot necklace. I bought the 3-piece set for a birthday gift, and I was proud to give it as a gift.'

0

8.975364




'love the movie. made me cry. I was expecting the original movie but it still is a wonderful story to watch'

0

8.92529




'This was my first encounter with the magical realism style and I love it! The Buendia family is quite possibly the most intriguing I have encountered in any book. A must read.'

0

8.909518




'This one gift I could give my grown daughter. I read this book over and over to her as a child. It was in wonderful condition!'

0

8.891371




"I watched the movie on Cartoon Network about two years ago. It's an excellent movie, I tell you. It's facinating!!! ^_^I loved the characters and their personalities. I seriously loved the art designs animation. It brings out the true life of the movie. In my opinion, this DVD is great! You should get this DVD and soon. It ROCKS! ^_^!"

0

8.877153




'Like it. Love it. Live in Paris will always be in my top 5. She is a star with this incredible band. Some of her originals are too melancholy for me, but she is truly unique in her talents. Who else can compose, sing and play jazz piano like that? Who was the girl?'

0

8.789619




'This is Hawaiian music done very well. Darnall is a master arranger and player of string instruments. Most songs could be background music. However! Over the Rainbow is pure genius. Unforgettable. I would use the word, haunting. I think Judy Garland would approve.'

0

8.738428




'Jerry Garcia was a stone cold genius. The Grateful Dead was his backup band. Bob Weir\'s solo work is easily as good as the solo work of Bill Wyman, Keith Moon, Dave Davies, and Clarence Clemmons. From a 40 year career this is the "best" they could come up with?'

0

8.728806




'The background music is not clear and the CD was a waste of money. One star is too high.'

1

8.652259




## Spotlights

In [15]:
# One saved spotlight result per run number 1..5, keyed by that number.
spotlights = {
    count: loadSpotlightResults(f'spotlight_results/amazon_train_sst_20k_0.05_{count}.pkl')
    for count in np.arange(1, 6)
}

In [24]:
# The 50 reviews that spotlight 3 weights most heavily. Element 0 of a
# spotlight result is used as the per-example weight vector. For each review
# show: index, text, label, loss and model output.
# NOTE(review): judging by the values shown, `outputs[idx]` looks like the
# predicted probability of label 1 - confirm.
spotlight = spotlights[3]
weight_idx = list(reversed(spotlight[0].argsort().tolist()))
for idx in weight_idx[:50]:
    example = dataset[idx]
    for shown in (idx, example['content'], example['label'], losses[idx], outputs[idx]):
        display(shown)
    print()

5253

'I gave one star because i could not play game.Im glad i only paid $6.95.Thair is a way to get the game to play on windows 7,you have to set you boot advanced options to 1 core every time you want to play game.'

0

0.0050201556

0.0050075655




5171

"I run my own insurance business and was always inundated by a sea of paperwork. This little scanner fits into my BJC-85 printer and serves double duty - I can scan and then print with the same machine. Genius! I would have liked the ability to scan documents into a pdf format, but you can't have everything. This little scanner is the main reason I will not upgrade my BJC-85, as it is not compatible with later models."

1

5.110242

0.006034624




7564

"I futzed with this for about a week. Had d-link tech support's number programmed into my phone (power down the router, press the reset key, replug everything, yada yada yada). I bought this router and the PCI wireless card for my desktop because of the excellent reviews I read on this item at CNET and PC Magazine. The PCI card froze my desktop EVERY time I turned the network on and the router would not connect to my laptop even when the units were a few feet apart. Exchanged for Linksys and have had not one call to tech support. I suppose this will work for a lot of people, but for me this operated as a 1 star product. Do buy from a place you can easily return (like Amazon!) because if your experience is like mine, you will get fed up talking with the tech support people and try something else!"

0

0.0053183027

0.005304181




3504

'This player is great for DVDs, unfortunately I bought the player for its 5 disc cdr/mp3 playability. The player makes 2 second jumps on every song of every cdr I own. I took it in to get fixed. When I got it back, I plugged it in and still does the same thing.'

0

0.008454129

0.0084185405




8227

"The previous reviews are all correct. Inexpensive, silent and easy to load watches. However, I have had this unit overwind two watches. These are $200 Japan automatic movements so I wasn't very happy. Might be better on a swiss movement, but why would you want to risk a more expensive watch on this winder."

0

0.0055729738

0.005557475




9872

'I purchased this item last year. The first one wasn\'t even completely assembled. The water leaked out all over the counter and floor. My husband took it apart and the hoses weren\'t attached. It made ice once and I returned my new unit to Sunpentown for another. That one worked for several months without a problem. Then the "needs water" light kept coming on and it would not make ice. I did just read on another site review that if you tip it backwards, sometimes that will help. They should enclose that info in the instructions. I finally gave it away on freecycle. I love the concept of this appliance, but would like to find one that worked without have to do rocket surgery.'

0

0.004948867

0.004936638




13181

"I noticed this a while after I purchased the player and I attributed it to the CD, not the player. You know, being a Tivoli this should be Great! However now it's been two years an I barely use it because of that and using USB memory music. http://www.amazon.com/gp/product/B000083GPS/ref=cm_cr_rev_prod_title"

0

0.0085005835

0.008464535




19032

"I have always wanted a DVD of this Nutcracker ballet company. But when reading the specs they said it won't play on recordable DVD's or PC's. I have a home theater system plus another DVD recorder/player so I don't want to take a chance it could be ruined or I wouldn't be able to play. Bummer!!!"

0

0.005839313

0.0058222776




14227

"As a student chemical engineer I found the book somewhat helpful, but not really. My teacher taught the same material but in a different way and his way seemed much simpler than the books. I returned the book at the end of the semester because none of my classes 'needed' the book. Surprise! I now need the book and amazon apparently takes 1 to 3 months to ship this book...aka I'll get the book at the end of the semester."

0

0.005284152

0.005270215




9671

"Works fine but don't buy this if you buy the seven-speed mixer. It's already included with the mixer and you only need one!"

0

0.00637853

0.006358221




3496

'I bought the nailer over a year ago and use it every few weeks when I get some finish work. It is light weight and it is easy to clear jams. One day it would not fire. I took it to be rebuilt and had an estimate of $100. The guy called and told me it needs a new firing mechanism and it would cost $189. So I am ordering the Hitachi today.'

0

0.0067539504

0.006731216




11586

'The Rio 800 has had some poor press, but the problem is not the player but the documentation accompanying it. The battery seems to be a major problem until you figure out that it must be calibrated several times until the screen says, "the battery is calibrated" and not that there are 0 minutes left to charge. By the third time, I finally got 10hrs. playing time. The sounds tends to be a little "tinny" but that is a headphone problem and the sound can be adjusted on the device. I\'m not a "tech" type, but you just need to play with this for a while to learn what it will do. Then, you\'ll be more than happy for the price.'

1

5.5415316

0.0039205174




2364

"The HDMI cable works OK with our plasma (someone was asking if was only for LCD before) but if you are using it with your XBOX 360 and you have a headset (Turtle Beach-type, not the regular ones) you'll also need the adaptor for audio OUT. I should have checked before, because the adaptor by itself is more expensive than cable and adaptor together, at least here in Amazon. Otherwise, no problem. Recommended, especially for the price."

1

5.5492454

0.0038903928




7263

"I bought this book because I am working on a software project that involves Exchange 2000. However, the book's author does not use proper terminology. For example, they talk about SAM replication times but Exchange 2000 uses Active Directory. They talk about site connectors but Exchange 2000 uses Routing Group Connectors. I know this product is still in Beta, but RC2 is feature complete. This book could have been better. Only buy this if you absolutely need to implement Exchange 2000 ASAP. If you read Microsoft's documentation first, you can weed through this book for more details."

0

0.004425731

0.004415952




1237

"I was looking for a coffee machine with a thermal caraffe. At first, the cofee was great. Then a couple of weeks later, coffee started to pour directly out of the machine onto the floor! I thought maybe I had a bad machine, so I took it back to the store and exchanged it for a new one. A month later, same problem! The coffee pours around the caraffe, not into it. I burned myself trying to clean it up, and it stained my nice kitchen floors. I'm taking it back for a different brand coffee maker, I won't be fooled three times!"

0

0.00485622

0.00484449




9970

'theres probably going to be a 2-disc edition available in october or november, but im just saying that because high school musical released on 1-disc dvd and everyone bought it, and then it came out on 2-disc dvd a few monthes later.'

1

5.184576

0.0056023113




2208

"I am sorry but this album was not what I was hoping it would be. I purchased it as a child and own it now as an adult. However, the only reason I do own it was because they attached their videos on DVD. We like it (Wild & Loose) and Juicy Gothca Krazy are great songs but only in there video forms. If you are looking for those sames songs as released on video on CD you aren't going to find it. Sad to say but I feel the only reason this album really sold was because many saw the videos first and then purchased the album hoping to hear the same thing."

0

0.0052269944

0.0052133575




1651

'This eight pound bag lasts our cockatoo 2-3 months. Way more economical to buy here on Amazon than in the store. We reseal it for storage, and use it well before it goes stale (colours and smells still vibrant).'

1

5.2000904

0.0055160653




18971

"Probably I got a defective one, but was very hard tu lock/unlock, key wouldn't turn, was frustrating, I purchased it based on other people's review"

0

0.004613351

0.0046027116




14270

"Got a pack of 50 of these to burn playlists for X-Mas gifts thinking Sony would be better than the Staples or Memorex brands, which I did not recognize. WOW, did I ever go wrong. These made a racket in my CD burner before ejecting. Eventually, some worked, most didn't.Returning these tomorrow."

0

0.0039791465

0.003971287




12139

'I use my 8GB MicroSD card in a USB adapter that is roughly half the size of this device. There is a small loop for a keychain on it which is useful when pulling it out of a usb port... not to mention that pulling out the chip allows use in a cellphone or digital camera (via adapter), digital camcorder, etc...There are smaller out there, if you are looking for small you can do a bit better!'

0

0.005333006

0.0053188056




9164

"When my old Zyliss gave up the ghost I looked about for a replacement. I have other OXO products and am pleased with them, so I gave this a flyer. On the plus side, it's sturdy and very easy to operate. However, I found that most of the time, about four cans out of five, it wouldn't cut the lid completely off. I'd be left with a lid slightly immersed in the can's contents and attached by a small strand of metal. I'd then have to get a small knife or fork to pry the lid out of the can and was still left with the task of getting the lid off the can without cutting myself.Based on the other reviews I'm figuring I just got a dud, so I'll replace it with another OXO opener, but not this one."

0

0.0071697086

0.0071441117




3868

"This brand was my 2nd Delonghi thermal carafe model in 2 months, after the first one broke after 5 weeks. This one worked wonderfully for about 4 weeks until one morning. I came in and found coffee all over the counter. I thought I had not aligned the pot properly or something. After another week of dealing with this, the electrical system shorted out due to all the coffee spills. This coffee maker made the best coffee and the carafe kept it hot all day. I don't know what happened and we couldn't get it resolved. We purchased a Cuisinart with a 3 yr warranty (not the one that grinds coffee) and has worked great for 6 weeks."

0

0.00568772

0.0056715803




496

"I received this as a Christmas gift from a friend who knows I like hot tea. ALL of the tea was some variation on flavors of green tea. I avoid drinking caffein, which green tea contains, so I gave away all of the tea and kept the chest to fill with my own selections. The chest is kind of cheap (paper glued over something, as described by another reviewer). It's pretty, but I don't expect it to hold up."

0

0.004648474

0.0046377284




11258

"Received as gift for holidays, and am very used to electronic handheld games and their use. This one's screen was very difficult to use or see, despite changes in the contrast. It also has no back-light, with limited instructions. Not user-friendly, don't waste your time with this one. I returned mine to the store of purchase. Thanks,"

0

0.003727276

0.0037202872




16778

"My DH's favorite TV show....so of course, I started getting him the collection. Don't know what we'll do when I've got all of the seasons..."

1

5.083794

0.0061963536




8849

"This thing scoops out rock hard ice cream so easily. I've gone through many ice cream scoopers and I stuggled with all of them. Yes, it is more like a slab of ice cream than a scoop, but if it's going in a bowl and not a cone, then so what. No more letting the ice cream sit out for 10 minutes.*UPDATE*I should have updated this a while ago, as I threw this thing out a long time ago. It's coated in some non-stick material that I noticed flakes of in my ice cream. Since it's only used for ice cream, I can't imagine what caused it to flake off. It never goes in the dishwasher, either. I still love the design, but I'm not interested in eating teflon, or whatever they use to coat this. One star, not five!"

0

0.004810902

0.0047993124




11621

"Owning a Rowenta that faied within the first year I went the nearest Target and asked for the black Rowenta 6900 man's iron, only to find that it and model D2030 were not in stock as they described.Not ready to leave I saw a Black and Decker digital advantage model. I could have bought either of the other displayed Rowenta's but I choose the B/D model w/o a box. It was a 75% off close out.Returning home I logged on to Con. Rep. site to find that this model wasa best buy. Plenty of steam at a good price. It also shuts off the steam with setting less than 4 .......1-acryllic 7.- linen.All this for $14.95 drive out!Berne AbriolaWest Columbia, Tx"

0

0.004317724

0.0043084365




16539

'It took me a few days to open the package and check the memory. Outside it looked okay but when I put this memory stick into my computer, my computer would not boot. I put the original memory back in and it booted. So I do not recommend anyone to buy memory from this seller.'

0

0.0062699043

0.0062502804




14501

'I loved this movie when I saw it on cable many years back, so I ordered it on BluRay from Amazon. This is the only BluRay I have ever purchased that would not play in any of my BluRay players. I returned it to Amazon and asked for a replacement, which had the same problem. This time, I gave up and asked for a refund.I would hope that Amazon would check their stock and let those of use who got bad discs know when they got good stock in.I would love to purchase a non-defective BluRay of this movie!'

1

4.973938

0.00691586




6671

'This is a great book (5 star content), however, the kindle edition does not have the new redesigned cover (reason for 1 star rating). I really hope this gets fixed soon. The new covers are the only reason I am purchasing these books on the kindle. All the other WOT books I have purchased for the Kindle have the nice redesigned cover. If you want to know what the cover should look like (from the Tor Website): [...]'

0

0.007897817

0.007866734




10427

'I was SO dissapointed recieving my new Swan Lake DVD; a HD DVD....I have no HD DVD player and I did not notice that is was a HD movie......I really suggest that Amazon warn about the format - HD....who has a HD DVD player these days???'

0

0.005506111

0.0054909303




13228

'So this sorter is okay, but has a MAJOR PROBLEM, fortunately, it is easy to fix.The major problem is the lid won\'t stay on, at all. Here is how you fix it:You will need: 8" of twine or cord, 1 elastic hair band.Cut the twin in half. Tie one length into a loop with a knot going through the triangle and circle holes.Loop the hair band in on itself through the twine so they are now connected.Put the other length of twine through the hair band and blue plastic handle of the base box, and tie it into a loop going through the square and plus holes.Now the lid is held securely to the box by the tension of the elastic, and can easily be pulled open to removed the shape toys after the child puts them in.See pictures.'

0

0.0064845383

0.006463591




10526

"I got this to use it as a crimper. When I got it, I was excited to see how it worked. After I plugged it in, I forgot about it so when I went to use it, it had been heating up for about 45 minutes. It worked pretty well this time, so I thought it was a good product. However, now when I wake up in the morning, I don't have 45 minutes to wait for it to heat up. It hardly crimps my hair at all, even when I let it heat up for 30 minutes. Also, it is made of really cheap plastic."

0

0.008441245

0.008405685




649

"Camera is only usable outside in broad daylight. Trying anything inside your home/place of business and you'll have nothing but black on your screen.I am returning the product and will probably try lifeview camera with flash to see if camera aperture allows more light in.I was in rooms that were pretty well lit up...and viewfinder is dark as night.Should never have been released as a product because of these flaws...and it doesn't have to be this way. Laptop usb cameras offer incredibly bright pictures in low light conditions."

0

0.0038038772

0.0037967097




6086

'This was a great cooler out of the box, it is silent and cooled nicely. However, it\'s cheaply made. The power cords are very cheap, and the cooler itself is flemsy plastic. The pictures make it look much more substantial than it actually is. Our quit working after about 3 months. Targus makes a "new & improved" version. We tried it as well. Much better made, but it was noisey (so noisey it gave me a headache). So we opted for another brand and much happier.'

0

0.00581395

0.005797038




1666

"We outgrew our 8x8 tents with the kids, so we bought this tent preparing for our next camp trip. We test it out at our back yard right after we bought it. It's easy to set up, and roommy. However, the zippers are done in very low end quality. The window zippers work ok, but both the door zipper did not work well! The mesh zipper will open up after you zipped it. The door zipper is not well stiched on one side! And it's hard to zip it around.We are return it back to the store tomorrow!"

0

0.004966067

0.0049537085




5718

'I bought this book for $250 and I also bought "How to Create and Manage a Mutual Fund or Exchange-Traded Fund" 9780470120552 for ~$70. This book was paper back and the "How to create..." book was hardback and copy written 3 years later (2008 compared to 2005 of this book.) The words are mostly the same on the mutual fund side, but they have added exchange traded funds into the book. All the graphs are updated and it is written by the same author.'

0

0.005566335

0.0055508194




5012

'Bought this product after drinking great coffee in Amsterdam that was pressure brewed,[senseo one cup],and this machine is close but no cigar.The taste from this machine is better than drip machines but not as frothy as the Senseo.Senseo draw-back is buying your coffee in those pods,very expensive.This machine is low pressure,maybe the difference in taste.'

1

5.3091965

0.0049458994




3934

"my second yamaha A/V reciever my first was A 2nd from top rxv-2090 it came out on the fringe of home theatre it had a excelent amp but a little out of date as far as home theatredecoding beeing its based on 5 channels not 5.1knowing i needed a updated after hearing my girl friends get this! 300$ home theatre in a box i realized while my 2090 sounded better the timming was better on her cheep system . Still I am surprised at how much better the HTR-5590 sounded my first dolbly digital movie blew my mind don't forget i had a 1500$ 1995 model yamaha witch was very good too ,i would give it a 4.5 but no option for it here"

1

5.0957894

0.006122471




17883

"I've yet to find a soft dog toy that my Springer Spaniel won't destroy. I bought a few of these hoping they would work. My dog did love them, but he had the first one shredded within minutes of me giving it to him. It's not like I have some supper dog - he's only 45 pounds, but he had no problem chewing right through them.If you're dog isn't a big chewer then these would be a good toy, but if he chews through everything you've given him - this will be no different."

0

0.0074094725

0.0073820935




2256

'This phone was fantastic for about 8 months. I could go about 100 yards from my house while talking without any interference. However in the last month the handset started "Channel Searching" no matter how close I was to the base and how good the signal. Two days ago the handset went into channel search mode and literally has not stopped searching since. I have done everything I can think of including change the battery and it continues to search and while it searches the answering machine doesn\'t seem to work. Oh well, no more Sony for me.'

0

0.0059761885

0.0059584077




10171

"Letter Factory, Talking Words Factory, Code Word Caper, Fridge Phonics, Fridge Words Magnetic Word BuilderWhen my son was 22 months old he could tell me the letters and their sounds because of this system. We started with the DVDs. Then we added Fridge Phonics. After a couple months I added the Word Builder. I only let him have about 10 or 12 letters at a time so he doesn't get overwhelmed. When I had too many out, he would knock them to the floor. He's 28 months old now. He doesn't make real words yet. But that's coming soon!But watch out for the fridge magnets. Leapfrog changed the sizes. The Word Whammer and the older Phonics with the Frog have the letters that match. The ones with Scout the Dog and the Sun match the Jet (word builder)."

1

5.5787406

0.0037773203




18352

'Waiting in line to buy the new Madden every year is like waiting in line to buy a new calendar. Sure, there are enough differences to technically call it "new". There might be newer or prettier pictures but it\'s still just a calendar, just like the year before - same stuff only with slight changes. I know all the Maddenphiles and football fans say that there really are additions and changes that justify another $50/$60 drop. I think they tell themselves that just so they don\'t feel ripped off. Now that online play is a regular part of console gaming, EA should just have yearly download updates and charge $15 or so. Oh but wait, there\'s no money in not screwing over the consumer.'

0

0.006475182

0.006454313




11643

'I purchased a WAP and 2 usb adapters a year ago. It worked fine for windows 98 but on windows xp it has given me nothing but trouble. I updated the drivers and firmware on the wusb11 and have been rewarded with a non-reliable connection. I am waiting for the price to drop on 802.11a cards and wap to replace these...'

0

0.006072879

0.006054494




9242

'For the price, this is a great little vacuum. The handle adjusts up and down, and when you put it down to it\'s shortest position, you can do the stairs very quickly, going side to side. This vacuum is very powerful, and when using the attachments this is especially apparent. Great for cleaning cars with hose attachment. Works on hard surface floors, when you turn off the spinning brushes. We have cats and dogs, and lots of hair, hard wood and tile floors and I have found that instead of sweeping everywhere, I an just vacuum now, quickly, and use the attachment easily to get into corners into small places. Works great on carpets too.My only complaint- the dust accumulates towards the top of the canister, so it has to be emptied more often than it should if the dust/lint/hair would just drop into the canister. It all sort of "cakes" around the top. I would still buy it again. Very easy to empty canister, just push a button, it pops out, and then just dump it out.'

1

5.038231

0.006485212




17645

'i was really disapointed with this product. i bought it because it was a 2 disk dvd with bonus bluray disc. but that was not so. it is a 2 disc bluray with 1 dvd packaged in the dvd packageing'

0

0.007540573

0.0075122593




14119

'I SPECIFICALLY PURCHASED AN HD DVD PLAYER SO THAT I COULD ENJOY SOME OF MY FAVORITE FLICKS ON MY KILLER NEW BIG SCREEN TV, ONLY PROBLEM IS IS THAT THIS DISC DOES NOT, I REPEAT DOES NOT PLAY IN THE HD DVD PLAYER, IT IS A BRAND NEW SONY 1080 PIXEL DVD PLAYER AND THIS DISC IS MEANT TO PLAY IN IT AND IT DOES NOT, I AM HESITANT NOW TO PURCHASE ANY OTHER MOVIES WITH THE HD DVD LOGO ON THEM, I PURCHASED A COUPLE FOR CHRISTMAS WITH THE SAME LOGO AND I FEAR THEY WILL NOT PLAY EITHER, WHO DO I SEE ABOUT THIS??'

0

0.005028933

0.0050163497




3513

'Received this keyboard today and was very dissapointed. I can\'t believe IBM actually allowed Micro Innovations to put their logo on this item. When typing, the keystrokes either were not recognized or would display seconds later. Receiver "data" light blinks showing that are you pressing keys, however, the keystrokes are not sent to the computer.Returning item now as I type on a good/working LOGITECH wireless RF keyboard.'

0

0.004011086

0.004002994




11176

"Haven't really been able to use as hoped. Have tried on several occasions but before I really get started the nozzle clogs. Using coal slag as the media which is pretty darn fine...but the second you stop the flow of material with stop it backs up and plugs. Air flow settings are key and just haven't found the settings to make it work yet."

0

0.004660577

0.004649722




In [19]:
def getCommonTokens(weights, smoothing=0.01, print_first=50):
    """Print the tokens most over-represented under `weights`.

    For every token id two frequencies are accumulated over the examples:
    a baseline in which each example counts 1 / len(weights), and a weighted
    one in which example i counts weights[i]. Tokens are ranked by the
    smoothed ratio (smoothing + weighted) / (smoothing + baseline), largest
    first, and the top `print_first` are printed.

    Reads two notebook-level globals: `unique_tokens` (indexed by example
    position; `.item()` is called on each element of an entry) and
    `tokenizer` (used to decode token ids for printing). Both must belong to
    the same dataset that `weights` was computed on.

    Args:
        weights: one weight per example; converted to a float array.
        smoothing: constant added to numerator and denominator of the ratio.
        print_first: how many of the top-ranked tokens to print.

    Prints one line per token: decoded token, baseline frequency, weighted
    frequency, ratio. Returns None.
    """
    # Convert up front and keep the baseline as a plain float: np.full_like
    # inherits the dtype of its input, so a boolean mask or integer weights
    # would otherwise corrupt the uniform baseline.
    weights = np.asarray(weights, dtype=float)
    num_examples = len(weights)
    uniform_weight = 1.0 / num_examples

    token_frequencies = defaultdict(float)
    token_frequencies_spotlight = defaultdict(float)

    for i in tqdm(range(num_examples)):
        example_weight = float(weights[i])
        for token in unique_tokens[i]:
            token_id = token.item()
            token_frequencies[token_id] += uniform_weight
            token_frequencies_spotlight[token_id] += example_weight

    token_lrs = {
        k: (smoothing + token_frequencies_spotlight[k]) / (smoothing + token_frequencies[k])
        for k in token_frequencies
    }
    # Ascending stable sort, then reversed: largest ratio first.
    tokens_sorted = sorted(token_lrs, key=token_lrs.get)[::-1]
    for token in tokens_sorted[:print_first]:
        print('%15s %.4f %.4f %4.2f' % (tokenizer.decode(token), token_frequencies[token], token_frequencies_spotlight[token], token_lrs[token]))

In [20]:
# Top 20 tokens for the weights stored under spotlights[1]
getCommonTokens(spotlights[1][0], smoothing=0.005, print_first=20)

100%|██████████| 20000/20000 [00:03<00:00, 6419.12it/s]


            que 0.0028 0.0112 2.08
            est 0.0023 0.0083 1.83
            las 0.0016 0.0069 1.80
           como 0.0011 0.0059 1.78
           ##as 0.0057 0.0135 1.72
              y 0.0071 0.0153 1.68
            tod 0.0011 0.0053 1.67
             si 0.0045 0.0109 1.66
             la 0.0080 0.0162 1.63
            dry 0.0088 0.0174 1.63
            por 0.0017 0.0059 1.62
             es 0.0060 0.0128 1.61
             ot 0.0015 0.0054 1.59
        regular 0.0070 0.0138 1.57
           ##lu 0.0034 0.0083 1.57
        however 0.0503 0.0810 1.55
           ##sa 0.0090 0.0168 1.55
           tiny 0.0038 0.0086 1.54
           para 0.0029 0.0071 1.53
       returned 0.0091 0.0166 1.53


In [21]:
# Top 20 tokens for the weights stored under spotlights[2]
getCommonTokens(spotlights[2][0], smoothing=0.005, print_first=20)

100%|██████████| 20000/20000 [00:03<00:00, 6399.25it/s]


         bigger 0.0035 0.0090 1.65
          super 0.0104 0.0203 1.64
           hang 0.0028 0.0075 1.60
         prefer 0.0057 0.0120 1.58
        killing 0.0028 0.0074 1.58
            job 0.0239 0.0405 1.57
       discover 0.0029 0.0075 1.57
           slip 0.0026 0.0070 1.57
            rev 0.0039 0.0089 1.56
           easy 0.0415 0.0676 1.56
        wearing 0.0035 0.0083 1.56
          ##ves 0.0019 0.0058 1.56
         ##ling 0.0059 0.0119 1.55
             bt 0.0013 0.0048 1.54
           heat 0.0049 0.0103 1.54
           thus 0.0037 0.0084 1.54
           trip 0.0090 0.0165 1.54
            ##x 0.0118 0.0209 1.54
          handy 0.0021 0.0059 1.53
         source 0.0043 0.0092 1.53


In [22]:
# Top 20 tokens for the weights stored under spotlights[3]
getCommonTokens(spotlights[3][0], smoothing=0.005, print_first=20)

100%|██████████| 20000/20000 [00:03<00:00, 6372.25it/s]

        problem 0.0279 0.0531 1.76
        ##point 0.0113 0.0236 1.76
      returning 0.0048 0.0120 1.73
          ##ssa 0.0082 0.0173 1.69
          ##und 0.0071 0.0153 1.67
              $ 0.0234 0.0420 1.66
         hoping 0.0092 0.0182 1.63
       returned 0.0091 0.0180 1.63
  unfortunately 0.0158 0.0280 1.58
           okay 0.0088 0.0167 1.57
         ##ware 0.0072 0.0142 1.56
            ##0 0.0057 0.0118 1.56
            box 0.0181 0.0310 1.55
         unless 0.0141 0.0244 1.54
          sadly 0.0051 0.0103 1.52
             ok 0.0174 0.0291 1.52
          maybe 0.0297 0.0477 1.52
        however 0.0503 0.0788 1.52
           half 0.0210 0.0344 1.51
           warn 0.0022 0.0060 1.51





In [144]:
# Uniform weights over the examples whose loss exceeds 6.13, zero elsewhere.
# The mask built on the first line is the one that gets normalised, so the
# cell does not depend on a variable left over from another cell.
loss_weights = (losses > 6.13)
loss_weights = loss_weights / loss_weights.sum()
getCommonTokens(loss_weights, 0.005, 20)

100%|██████████| 20000/20000 [00:03<00:00, 5456.58it/s]

          ##ven 0.0025 0.0174 2.98
            sen 0.0033 0.0174 2.71
             64 0.0010 0.0099 2.51
         length 0.0052 0.0199 2.42
       outdated 0.0022 0.0124 2.42
         potter 0.0022 0.0124 2.40
         bubble 0.0012 0.0099 2.39
              = 0.0024 0.0124 2.34
            rom 0.0014 0.0099 2.33
        contact 0.0037 0.0149 2.30
 cinematography 0.0026 0.0124 2.29
      adjusting 0.0006 0.0074 2.22
      functions 0.0018 0.0099 2.19
          stock 0.0042 0.0149 2.16
         versus 0.0008 0.0074 2.15
             bu 0.0032 0.0124 2.14
            fix 0.0044 0.0149 2.12
      confirmed 0.0010 0.0074 2.09
           wars 0.0021 0.0099 2.09
          ##nes 0.0010 0.0074 2.07





# SQuAD

## Dataset + tokenizer

In [260]:
# Load the 'validation' split stored under squad_dir. This rebinds `dataset`,
# which previously held the Amazon 'test' split.
dataset = load_from_disk(squad_dir)['validation']

In [261]:
# Distinct values of the 'title' column; built via a set, so the order is arbitrary.
categories = list(set(dataset['title']))

In [262]:
def filter_short_examples(example):
    """Return True when the tokenized question/context pair has fewer than 384 token ids.

    NOTE(review): this reads the notebook-level `tokenizer`. The SQuAD
    tokenizer is loaded in the cell after this one, so on a fresh
    top-to-bottom run this appears to use whichever tokenizer was bound
    earlier in the notebook -- confirm the intended cell order.
    """
    encoded = tokenizer(example["question"], example["context"])
    return len(encoded['input_ids']) < 384
# Keep only the examples for which filter_short_examples returns True
# (tokenized question + context shorter than 384 token ids).
short_dataset = dataset.filter(filter_short_examples)

Token indices sequence length is longer than the specified maximum sequence length for this model (629 > 512). Running this sequence through the model will result in indexing errors


In [263]:
# Load the tokenizer for the distilled-SQuAD DistilBERT checkpoint, using
# squad_model_path as the cache directory. local_files_only=True restricts
# loading to files already present locally.
# This rebinds the global `tokenizer` read by getCommonTokens and
# filter_short_examples.
tokenizer = AutoTokenizer.from_pretrained(
    "distilbert-base-uncased-distilled-squad", 
    cache_dir=squad_model_path, 
    local_files_only=True
)

In [264]:
# One tensor of distinct token ids per example (question + context encoded together).
# getCommonTokens looks these up by example position.
unique_tokens = []
for example in tqdm(short_dataset):
    encoding = tokenizer(example["question"], example["context"], return_tensors='pt')
    unique_tokens.append(torch.unique(encoding['input_ids']))

100%|██████████| 10386/10386 [00:11<00:00, 934.09it/s]


## Model outputs

In [265]:
# Rebind embeddings / outputs / losses to the saved SQuAD validation results.
# NOTE(review): later cells index short_dataset with positions taken from
# `losses`, so these results are assumed to be row-aligned with
# short_dataset -- confirm.
embeddings, outputs, losses = loadInferenceResults('inference_results/squad_val_bert.pkl')

In [266]:
# Examples of high-loss questions
loss_idx = losses.argsort().tolist()[::-1]
for idx in loss_idx[0:10]:
    example = short_dataset[idx]
#     display(example['title'])
#     display(example['context'])
    display(example['question'])
    display(example['answers']['text'])
    display(losses[idx])
    print()

'Who disliked the affiliate program?'

['passed',
 'Several University of Chicago professors',
 'Several University of Chicago professors']

15.388167




'In which year were the North and South Courts opened?'

['Secretariat Wing', 'June 1862', 'June 1862']

14.664542




"Dutch architect Janjaap Ruijssenaars's performative architecture 3D-printed building is scheduled to be built when?"

['Working versions of 3D-printing building technology are already printing',
 '2014',
 '2014']

14.02876




'Did Tesla graduate from the university?'

['no', 'not', 'He never graduated']

13.218931




'What position does Jerricho Cotchery play?'

['receivers', 'receivers', 'receivers']

12.920362




'What is attributed to the income inequality in the United States?'

['less willing to travel or relocate',
 'women not taking jobs due to marriage or pregnancy',
 'not taking jobs due to marriage or pregnancy']

12.913456




'Who organized the Britain Can Make It exhibition?'

['Festival of Britain (1951)',
 'Council of Industrial Design',
 'the Council of Industrial Design']

12.896971




'If polynomial time can be utilized within an NP-complete problem, what does the imply P is equal to?'

['NP', 'NP', 'NP']

12.88658




'What is one way of digital civil disobedience that can have far reaching consequences?'

['sending an email', 'email', 'Supreme Court case', 'broadcasting']

12.751001




'How has civil disobedience evolved in current times?'

['code-word describing the activities of muggers, arsonists, draft evaders',
 'utterly debased',
 'become utterly debased',
 'become utterly debased',
 'become utterly debased']

12.487267




## Spotlights

In [269]:
# Load the five saved SQuAD spotlight result files, keyed by their numeric suffix (1..5).
spotlights = {
    count: loadSpotlightResults(f'spotlight_results/squad_val_bert_0.05_spherical_{count}.pkl')
    for count in np.arange(1, 6)
}

### Common tokens

In [271]:
# Top 20 tokens for the weights stored under spotlights[1]
getCommonTokens(spotlights[1][0], smoothing=0.005, print_first=20)

100%|██████████| 10386/10386 [00:02<00:00, 3513.94it/s]


         packet 0.0048 0.0114 1.67
     networking 0.0033 0.0088 1.67
            pad 0.0041 0.0100 1.64
        packets 0.0032 0.0082 1.62
      switching 0.0047 0.0105 1.60
  communication 0.0066 0.0127 1.52
          ##bed 0.0181 0.0290 1.47
       messages 0.0026 0.0062 1.47
       ignition 0.0016 0.0047 1.46
          ##lee 0.0032 0.0068 1.45
        circuit 0.0039 0.0079 1.44
      bandwidth 0.0013 0.0041 1.44
        ##ience 0.0184 0.0285 1.43
          ##ncy 0.0020 0.0051 1.43
       paradigm 0.0007 0.0031 1.43
           pure 0.0053 0.0097 1.42
      protocols 0.0016 0.0044 1.42
          alice 0.0025 0.0056 1.41
      energetic 0.0013 0.0038 1.41
         inputs 0.0026 0.0057 1.40


In [272]:
# Top 20 tokens for the weights stored under spotlights[2]
getCommonTokens(spotlights[2][0], smoothing=0.005, print_first=20)

100%|██████████| 10386/10386 [00:02<00:00, 3546.26it/s]

          ##bed 0.0181 0.0300 1.52
            why 0.0238 0.0380 1.49
         ##ient 0.0086 0.0151 1.48
        ##ience 0.0184 0.0295 1.48
           know 0.0065 0.0113 1.41
           ##so 0.0236 0.0351 1.40
         arrest 0.0052 0.0093 1.40
          ##lor 0.0292 0.0427 1.40
     collective 0.0034 0.0065 1.38
          ##las 0.0271 0.0390 1.37
             ch 0.0285 0.0410 1.37
         packet 0.0048 0.0085 1.37
          ##lak 0.0089 0.0136 1.34
       membrane 0.0097 0.0146 1.33
           wage 0.0042 0.0073 1.33
        happens 0.0040 0.0070 1.33
          might 0.0167 0.0236 1.32
     protesters 0.0015 0.0036 1.32
           thor 0.0056 0.0090 1.32
         guilty 0.0034 0.0060 1.32





In [273]:
# Top 20 tokens for the weights stored under spotlights[3]
getCommonTokens(spotlights[3][0], smoothing=0.005, print_first=20)

100%|██████████| 10386/10386 [00:02<00:00, 3587.59it/s]

          ##bed 0.0181 0.0257 1.33
         packet 0.0048 0.0080 1.32
            why 0.0238 0.0327 1.31
        ##ience 0.0184 0.0256 1.31
         ##ient 0.0086 0.0123 1.28
        packets 0.0032 0.0054 1.27
           ##so 0.0236 0.0306 1.24
     punishment 0.0066 0.0093 1.23
           know 0.0065 0.0092 1.23
          ##lor 0.0292 0.0365 1.21
         arrest 0.0052 0.0074 1.21
       messages 0.0026 0.0042 1.21
             ch 0.0285 0.0354 1.21
      switching 0.0047 0.0067 1.20
        circuit 0.0039 0.0058 1.20
     collective 0.0034 0.0050 1.20
          ##las 0.0271 0.0332 1.19
          wages 0.0072 0.0096 1.19
         guilty 0.0034 0.0050 1.19
     networking 0.0033 0.0048 1.19





In [274]:
# getCommonTokens(spotlights[4][0], 0.005, 20)

In [275]:
# getCommonTokens(spotlights[5][0], 0.005, 20)

In [229]:
# 0.98 quantile of the losses; the high-loss cells below use 7.58 as their cutoff.
np.quantile(losses, 0.98)

7.583634710311874

In [230]:
# Uniform weights over the examples whose loss exceeds 7.58, zero elsewhere.
high_loss_mask = losses > 7.58
loss_weights = high_loss_mask / high_loss_mask.sum()
getCommonTokens(loss_weights, smoothing=0.005, print_first=20)

100%|██████████| 10386/10386 [00:02<00:00, 4065.87it/s]


          sacks 0.0079 0.0529 4.49
            ##½ 0.0065 0.0433 4.18
        tackles 0.0065 0.0433 4.18
       confused 0.0020 0.0240 4.14
          ##bed 0.0181 0.0865 3.96
        ##ience 0.0184 0.0865 3.91
          yards 0.0161 0.0769 3.89
       behavior 0.0052 0.0337 3.79
     touchdowns 0.0065 0.0385 3.76
     defendants 0.0014 0.0192 3.76
     protesters 0.0015 0.0192 3.70
     cornerback 0.0041 0.0288 3.70
  interceptions 0.0081 0.0433 3.69
         fumble 0.0094 0.0481 3.68
         ##ling 0.0057 0.0337 3.62
         ##ient 0.0086 0.0433 3.56
      touchdown 0.0078 0.0385 3.40
          judge 0.0050 0.0288 3.38
         dallas 0.0037 0.0240 3.35
      defensive 0.0065 0.0337 3.35


### Topic distributions

In [276]:
def getTopicDistribution(weights, smoothing=0.01):
    """Print how each article title's share of the data changes under `weights`.

    For every value of the 'title' column two frequencies are accumulated:
    a baseline in which each example counts 1 / len(weights), and a weighted
    one in which example i counts weights[i]. Titles are printed in
    descending order of the smoothed ratio
    (smoothing + weighted) / (smoothing + baseline).

    Reads the notebook-level global `short_dataset`; `weights` must be
    aligned with its rows.

    Args:
        weights: one weight per example; converted to a float array.
        smoothing: constant added to numerator and denominator of the ratio.

    Prints one line per title: baseline frequency, weighted frequency,
    ratio, title. Returns None.
    """
    # Convert up front and keep the baseline as a plain float: np.full_like
    # inherits the dtype of its input, so a boolean mask or integer weights
    # would otherwise corrupt the uniform baseline.
    weights = np.asarray(weights, dtype=float)
    num_examples = len(weights)
    uniform_weight = 1.0 / num_examples

    # Read the title column once instead of loading every full example
    # just to look at its title.
    titles = short_dataset['title']

    topic_frequencies = defaultdict(float)
    topic_frequencies_spotlight = defaultdict(float)
    for i in range(num_examples):
        category = titles[i]
        topic_frequencies[category] += uniform_weight
        topic_frequencies_spotlight[category] += float(weights[i])

    topic_ratios = {
        c: (smoothing + topic_frequencies_spotlight[c]) / (smoothing + topic_frequencies[c])
        for c in topic_frequencies
    }

    for category in sorted(topic_ratios, key=topic_ratios.get, reverse=True):
        print('%.3f %.3f %.2f %s' % (topic_frequencies[category], topic_frequencies_spotlight[category], topic_ratios[category], category))

In [277]:
# Title distribution for the weights stored under spotlights[1], ratios unsmoothed
getTopicDistribution(weights=spotlights[1][0], smoothing=0)

0.010 0.023 2.36 Packet_switching
0.019 0.031 1.61 Computational_complexity_theory
0.025 0.040 1.60 Teacher
0.019 0.030 1.57 Civil_disobedience
0.010 0.015 1.47 Intergovernmental_Panel_on_Climate_Change
0.017 0.023 1.40 French_and_Indian_War
0.024 0.031 1.29 Oxygen
0.021 0.026 1.25 Huguenot
0.020 0.025 1.24 Force
0.012 0.015 1.23 Pharmacy
0.010 0.012 1.19 Normans
0.012 0.014 1.18 Victoria_(Australia)
0.049 0.057 1.16 Nikola_Tesla
0.046 0.053 1.16 Martin_Luther
0.022 0.025 1.12 Kenya
0.030 0.033 1.10 Doctor_Who
0.010 0.011 1.08 Black_Death
0.028 0.031 1.08 Chloroplast
0.009 0.010 1.07 Jacksonville,_Florida
0.017 0.018 1.06 Southern_California
0.015 0.016 1.06 Prime_number
0.028 0.029 1.06 Economic_inequality
0.023 0.024 1.02 Genghis_Khan
0.011 0.011 1.01 Private_school
0.018 0.018 1.00 Scottish_Parliament
0.076 0.075 0.99 Super_Bowl_50
0.023 0.023 0.97 Apollo_program
0.018 0.016 0.92 Imperialism
0.010 0.009 0.90 Sky_(United_Kingdom)
0.021 0.019 0.87 Yuan_dynasty
0.028 0.024 0.87 Rhine
0

In [278]:
# Title distribution for the weights stored under spotlights[2], ratios unsmoothed
getTopicDistribution(weights=spotlights[2][0], smoothing=0)

0.019 0.032 1.67 Civil_disobedience
0.010 0.015 1.57 Packet_switching
0.028 0.041 1.45 Chloroplast
0.017 0.022 1.35 French_and_Indian_War
0.015 0.020 1.34 Prime_number
0.012 0.016 1.33 Pharmacy
0.010 0.013 1.24 Black_Death
0.017 0.020 1.23 Ctenophora
0.023 0.027 1.18 Genghis_Khan
0.020 0.023 1.17 Force
0.024 0.027 1.15 Oxygen
0.028 0.032 1.15 Economic_inequality
0.019 0.022 1.14 Computational_complexity_theory
0.046 0.052 1.13 Martin_Luther
0.010 0.011 1.10 Intergovernmental_Panel_on_Climate_Change
0.022 0.024 1.09 Kenya
0.021 0.023 1.09 Yuan_dynasty
0.021 0.022 1.07 Huguenot
0.009 0.010 1.07 Jacksonville,_Florida
0.018 0.019 1.07 Imperialism
0.025 0.025 0.99 Teacher
0.028 0.028 0.99 Rhine
0.030 0.029 0.99 Doctor_Who
0.018 0.018 0.97 Scottish_Parliament
0.011 0.011 0.96 Geology
0.010 0.010 0.94 Sky_(United_Kingdom)
0.011 0.010 0.93 Private_school
0.023 0.021 0.93 Immune_system
0.049 0.046 0.93 Nikola_Tesla
0.023 0.021 0.92 Apollo_program
0.010 0.009 0.91 1973_oil_crisis
0.010 0.009 0.9

In [280]:
# Title distribution for the weights stored under spotlights[3], ratios unsmoothed
getTopicDistribution(weights=spotlights[3][0], smoothing=0)

0.010 0.015 1.53 Packet_switching
0.019 0.027 1.43 Civil_disobedience
0.012 0.015 1.24 Pharmacy
0.028 0.035 1.24 Chloroplast
0.019 0.023 1.22 Computational_complexity_theory
0.017 0.020 1.22 French_and_Indian_War
0.025 0.030 1.19 Teacher
0.020 0.023 1.14 Force
0.015 0.017 1.14 Prime_number
0.010 0.011 1.13 Intergovernmental_Panel_on_Climate_Change
0.010 0.012 1.13 Black_Death
0.046 0.051 1.11 Martin_Luther
0.017 0.018 1.10 Ctenophora
0.024 0.026 1.10 Oxygen
0.028 0.030 1.09 Economic_inequality
0.023 0.025 1.06 Genghis_Khan
0.022 0.024 1.06 Kenya
0.021 0.022 1.04 Huguenot
0.009 0.010 1.03 Jacksonville,_Florida
0.049 0.050 1.02 Nikola_Tesla
0.021 0.022 1.01 Yuan_dynasty
0.018 0.018 1.01 Imperialism
0.030 0.030 1.00 Doctor_Who
0.018 0.018 1.00 Scottish_Parliament
0.010 0.010 0.99 Normans
0.012 0.012 0.96 Victoria_(Australia)
0.010 0.010 0.95 Sky_(United_Kingdom)
0.017 0.016 0.94 Southern_California
0.076 0.071 0.94 Super_Bowl_50
0.011 0.010 0.94 Private_school
0.028 0.026 0.93 Rhine
0.023

In [235]:
# Uniform weights over the examples whose loss exceeds 7.58, zero elsewhere.
high_loss_mask = losses > 7.58
loss_weights = high_loss_mask / high_loss_mask.sum()
getTopicDistribution(loss_weights, smoothing=0)

0.019 0.091 4.82 Civil_disobedience
0.010 0.029 2.83 1973_oil_crisis
0.019 0.053 2.79 Computational_complexity_theory
0.076 0.168 2.21 Super_Bowl_50
0.018 0.038 2.15 Imperialism
0.012 0.024 2.01 Pharmacy
0.024 0.043 1.82 Oxygen
0.010 0.014 1.47 Packet_switching
0.010 0.014 1.44 Intergovernmental_Panel_on_Climate_Change
0.027 0.038 1.43 Victoria_and_Albert_Museum
0.017 0.024 1.42 University_of_Chicago
0.028 0.034 1.22 Economic_inequality
0.046 0.053 1.16 Martin_Luther
0.013 0.014 1.14 Fresno,_California
0.028 0.029 1.03 Rhine
0.028 0.029 1.02 Chloroplast
0.015 0.014 0.98 Prime_number
0.020 0.019 0.97 Force
0.025 0.024 0.96 Teacher
0.010 0.010 0.92 Normans
0.021 0.019 0.90 Yuan_dynasty
0.011 0.010 0.88 Private_school
0.023 0.019 0.84 Immune_system
0.049 0.038 0.78 Nikola_Tesla
0.021 0.014 0.69 Huguenot
0.017 0.010 0.58 French_and_Indian_War
0.018 0.010 0.53 United_Methodist_Church
0.009 0.005 0.52 Jacksonville,_Florida
0.009 0.005 0.51 Construction
0.030 0.014 0.48 Doctor_Who
0.010 0.005