# Caption Contest

# captions_json file

In [2]:
import json

def get_path(basename):
    """Return the path under which a data file of this notebook is opened.

    No directory prefix is added: the name is returned as given, formatted
    as a string.

    Args:
        basename: File name of the data file, e.g. ``'captions.json'``.

    Returns:
        str: ``basename`` formatted as a string.
    """
    return f'{basename}'

# Read the caption records from the JSON file (opened as UTF-8 text).
with open(get_path('captions.json'), 'rt', encoding='utf-8') as fp:
    captions_json = json.load(fp)
    
# Report how many records were loaded; the last expression displays the first four.
print(f"==> The dataset contains {len(captions_json)} captions. The first four are:")
captions_json[:4]

==> The dataset contains 2458 captions. The first four are:


[{'target_id': 0,
  'primary_type': 'text',
  'primary_description': 'I told you not to pick the one from the pilot experiment...'},
 {'target_id': 1,
  'primary_type': 'text',
  'primary_description': 'Well that explains the gas station'},
 {'target_id': 2,
  'primary_type': 'text',
  'primary_description': "The dairy-free vegan soy cheese doesn't seem to be having the same effect..."},
 {'target_id': 3,
  'primary_type': 'text',
  'primary_description': 'Repeatedly, cheese demonstrated characteristics of a performance enhancing drug'}]

## Basic Cleaning: get_captions from list of dictionaries
- The input, `captions_json`, is an object just like the one loaded above (a list of dictionaries with the given keys and values).
- The function returns a list of just the text (string) captions.

In [3]:
def get_captions(captions_json):
    """Pull the caption text out of every record.

    Args:
        captions_json: Iterable of dictionaries, each holding the caption
            string under the key ``'primary_description'``.

    Returns:
        list: The ``'primary_description'`` values, in input order.
    """
    captions = []
    for record in captions_json:
        captions.append(record['primary_description'])
    return captions
        
# Extract the caption strings from the loaded records and preview the first four.
captions_orig = get_captions(captions_json)
captions_orig[:4]

['I told you not to pick the one from the pilot experiment...',
 'Well that explains the gas station',
 "The dairy-free vegan soy cheese doesn't seem to be having the same effect...",
 'Repeatedly, cheese demonstrated characteristics of a performance enhancing drug']

## Cleaning: return clean list of words from caption: REGEX
- Converts `s` to lowercase.
- Returns a list of the words in `s`, where a word is a run of letters, including up to one apostrophe if that apostrophe is sandwiched between two letters (see the hint and examples below).

***Hint: Heed the definition, "... including up to one apostrophe if that apostrophe is sandwiched between two letters."***

```python
assert clean("Please sir, that's obviously a clip-on.") \
       == ['please', 'sir', "that's", 'obviously', 'a', 'clip', 'on']
assert clean("I'm sorry, sir, but this is a 'gluten-free' restaurant. We don't serve bread.") \
       == ["i'm", 'sorry', 'sir', 'but', 'this', 'is', 'a', 'gluten', 'free', 'restaurant', 'we', "don't", 'serve', 'bread']
```


In [48]:
def clean(s):
    """Lowercase a caption and split it into words.

    A word is one or more letters ``a``-``z``, optionally followed by a single
    apostrophe and one or more further letters (e.g. ``"don't"``). Everything
    else acts as a separator.

    Args:
        s: The caption string.

    Returns:
        list: The words of the lowercased caption, in order of appearance.
    """
    import re
    # The optional apostrophe group is non-capturing so that findall returns
    # the whole match rather than just the group.
    word_pattern = re.compile(r"[a-z]+(?:'[a-z]+)?")
    lowered = s.lower()
    return word_pattern.findall(lowered)
    
# Demo: the second example from the exercise statement; the cell displays the returned word list.
clean("I'm sorry, sir, but this is a 'gluten-free' restaurant. We don't serve bread.")

["i'm",
 'sorry',
 'sir',
 'but',
 'this',
 'is',
 'a',
 'gluten',
 'free',
 'restaurant',
 'we',
 "don't",
 'serve',
 'bread']

## Cleaning: remove stopwords: Captions as "bags of words"

For example, if `c` is the cleaned caption,

```python
    c == "you're in luck a slot for you just opened up in our kitchen"
```

then its bag of words representation is a Python set of the form,

```python
    {'just', 'kitchen', 'luck', 'opened', 'slot'}
```

In [49]:
# Set of lowercase stopwords (contractions included) that `bag_of_words` removes from a caption.
stopwords = {'with', 'theirs', 'their', 'its', 'his', "didn't", 'in', 'through', 'be', 'and', "i'll", "shouldn't", 'a', "she'll", "wasn't", 'own', 'would', 'it', 'how', 'during', 'under', "don't", 'down', 'for', 'about', 'over', "i've", "haven't", 'so', 'ourselves', 'if', 'your', "weren't", 'should', 'some', "mustn't", "you'd", "they've", 'by', "she's", "they'll", 'my', 'was', 'here', 'before', 'at', 'itself', 'more', 'from', 'am', 'very', 'that', "when's", "shan't", 'not', 'whom', "hadn't", 'those', "here's", 'has', "what's", "isn't", 'this', 'an', 'few', 'me', "aren't", 'are', 'yourselves', 'ought', "we'll", 'do', 'having', 'yours', 'until', 'then', 'as', 'because', 'himself', 'herself', 'only', 'they', 'who', "let's", "you'll", 'no', 'i', "we've", 'the', 'against', 'both', 'each', 'them', 'any', 'been', 'had', 'which', 'being', 'why', "you're", 'you', "that's", "wouldn't", "i'd", 'have', 'we', 'cannot', 'does', 'doing', 'is', 'after', "we're", 'further', "where's", 'off', 'below', 'he', 'yourself', 'up', "it's", 'these', 'than', "she'd", 'did', 'but', 'she', 'between', 'hers', 'of', 'were', "doesn't", "he'll", "how's", "we'd", 'above', 'on', 'nor', 'out', "why's", "who's", "they'd", "hasn't", 'ours', 'into', "won't", "he'd", 'again', "he's", 'same', 'other', 'or', 'when', 'once', "can't", 'her', 'such', 'most', 'what', "they're", 'there', 'themselves', 'all', "you've", 'could', 'while', "couldn't", 'our', "there's", 'too', 'him', 'to', "i'm", 'where', 'myself'}

In [50]:
def bag_of_words(c):
    """Convert a cleaned caption into its bag (set) of non-stopword words.

    Args:
        c: A cleaned caption string whose words are separated by whitespace.

    Returns:
        set: The distinct words of ``c`` that are not in ``stopwords``.
    """
    # split() without an argument splits on any run of whitespace and never
    # yields empty strings, so an empty caption or repeated spaces cannot put
    # '' into the bag (split(" ") would).
    return set(c.split()) - stopwords
        
# Demo on the cleaned caption from the example above; the cell displays the resulting set.
bag_of_words("you're in luck a slot for you just opened up in our kitchen")

{'just', 'kitchen', 'luck', 'opened', 'slot'}

## ASIDE: Counter

In [51]:
from collections import Counter

# Three example sets:
A = {'cat', 'hat', 'fish', 'red', 'blue'}
B = {'dog', 'beret', 'one', 'fish', 'two', 'red', 'blue'}
C = {'dog', 'cat', 'fish'}

# Count value occurrences in `A`, `B`, and `C`
# (each element of a set occurs once, so every count is 1)
K_A = Counter(A)
K_B = Counter(B)
K_C = Counter(C)

# Looking up a key that was never counted (e.g. K_A['dog']) gives 0 rather than raising KeyError.
print(K_A, "==>", K_A['fish'], K_A['dog'])
print(K_B, "==>", K_B['fish'], K_B['dog'])
print(K_C, "==>", K_C['fish'], K_C['dog'])

# Combine occurrence counts by simple addition!
K_D = K_A + K_B + K_C
print(K_D, "==>", K_D['fish'], K_D['dog'])

Counter({'fish': 1, 'red': 1, 'blue': 1, 'hat': 1, 'cat': 1}) ==> 1 0
Counter({'dog': 1, 'beret': 1, 'one': 1, 'two': 1, 'fish': 1, 'red': 1, 'blue': 1}) ==> 1 1
Counter({'fish': 1, 'dog': 1, 'cat': 1}) ==> 1 1
Counter({'fish': 3, 'red': 2, 'blue': 2, 'cat': 2, 'dog': 2, 'hat': 1, 'beret': 1, 'one': 1, 'two': 1}) ==> 3 2


## Count word frequencies across a list of bags (sets of words): `Counter.update`

In [52]:
def count_freq(bags):
    """Tally how often each word occurs across a collection of bags.

    Args:
        bags: Iterable of bags; each bag is fed to ``Counter.update``.

    Returns:
        Counter: The accumulated counts over all bags.
    """
    totals = Counter()
    for words in bags:
        # update() adds to the existing counts instead of replacing them.
        totals.update(words)
    return totals

In [53]:
# The same three sets as A, B and C in the Counter aside, now passed as one list of bags.
bags = [{'cat', 'hat', 'fish', 'red', 'blue'},
        {'dog', 'beret', 'one', 'fish', 'two', 'red', 'blue'},
        {'dog', 'cat', 'fish'}]
print(count_freq(bags))

Counter({'fish': 3, 'red': 2, 'blue': 2, 'cat': 2, 'dog': 2, 'hat': 1, 'beret': 1, 'one': 1, 'two': 1})


# Scoring Captions

In [54]:
def score_bag_common(bag, freq):
    """Score a bag by adding up the frequencies of its words.

    Args:
        bag: Iterable of words.
        freq: Mapping from word to frequency, indexed as ``freq[word]``.

    Returns:
        The sum of ``freq[word]`` over the words in ``bag`` (0 for an empty bag).
    """
    total = 0
    for word in bag:
        total = total + freq[word]
    return total

# Example frequency table; the expected scores are 2 + 2 + 3 = 7 and 1 + 1 + 1 + 1 + 2 = 6.
K = Counter({'fish': 3, 'cat': 2, 'blue': 2, 'red': 2, 'dog': 2, 'hat': 1, 'one': 1, 'two': 1, 'beret': 1})
print(score_bag_common({'blue', 'cat', 'fish'}, K))
print(score_bag_common({'one', 'two', 'beret', 'hat', 'dog'}, K))

7
6


In [55]:
# `caption_bags` is otherwise only assigned by the `load_bags()` cell further down,
# so on a fresh kernel (Restart & Run All) this cell used to fail with a NameError.
# Load the same pickle file here so the cell no longer depends on leftover state.
# NOTE: unpickling can execute arbitrary code; only load this file from a trusted source.
import pickle

with open(get_path("caption_bags.pickle"), "rb") as bags_file:
    caption_bags = pickle.load(bags_file)

# Word frequencies over all caption bags, plus the original caption strings
# that the scoring cells print next to each score.
word_freq = count_freq(caption_bags)
captions_orig = get_captions(captions_json)


print("Top 5 (word, frequency) pairs:")
print(word_freq.most_common(5))

# most_common() with no argument returns every pair, most frequent first,
# so the last five entries are the least frequent words.
print("\nLeast frequent words:")
print(word_freq.most_common()[-5:])

Top 5 (word, frequency) pairs:
[('sorry', 819), ('sir', 509), ('toast', 451), ('bread', 223), ('gluten', 193)]

Least frequent words:
[('allergies', 1), ('fancy', 1), ('odd', 1), ('millenials', 1), ('y', 1)]


In [56]:
def load_bags(infilename="caption_bags.pickle"):
    """Read the pickled caption bags from disk.

    Prints a progress message containing the resolved path before loading.

    Args:
        infilename: Name of the pickle file; it is resolved with ``get_path``.

    Returns:
        The object stored in the pickle file.
    """
    import pickle
    full_path = get_path(infilename)
    print(f"Loading caption bags from '{full_path}'...")
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(full_path, "rb") as pickle_file:
        return pickle.load(pickle_file)

# Load the bags from the default pickle file and show two sample bags.
caption_bags = load_bags()
print("- Bag for caption 123:", caption_bags[123])
print("- Bag for caption 314:", caption_bags[314])

Loading caption bags from 'caption_bags.pickle'...
- Bag for caption 123: {'obviously', 'clip', 'sir', 'please'}
- Bag for caption 314: {'sorry', 'meetings', 'toastmasters', 'sir', 'tuesday'}


In [57]:

def score_all(bags, scoring_fun, freq):
    """Apply a scoring function to every bag.

    Args:
        bags: Iterable of bags.
        scoring_fun: Callable invoked as ``scoring_fun(bag, freq)``.
        freq: Frequency table handed unchanged to ``scoring_fun``.

    Returns:
        list: One score per bag, in the order of ``bags``.
    """
    scores = []
    for bag in bags:
        scores.append(scoring_fun(bag, freq))
    return scores

def get_order(scores, max_rank=None):
    """Return the indices of ``scores`` ordered from highest to lowest score.

    Args:
        scores: Indexable sequence of comparable scores.
        max_rank: Upper bound of the slice taken from the ordered indices;
            ``None`` (the default) means ``len(scores)``.

    Returns:
        list: Indices into ``scores``, best score first, sliced to ``max_rank``.
    """
    limit = len(scores) if max_rank is None else max_rank

    def score_at(index):
        return scores[index]

    # Sort positions rather than values so callers can look up the matching captions.
    # https://stackoverflow.com/questions/7851077/how-to-return-index-of-a-sorted-list
    ranked = sorted(range(len(scores)), key=score_at, reverse=True)
    return ranked[:limit]

def print_top_captions(scores, captions, max_rank=10):
    """Print the highest-scoring captions, one per line.

    Each line shows the 1-based rank, the score with three decimals, the
    caption's index and the caption text.

    Args:
        scores: Sequence of scores, parallel to ``captions``.
        captions: Sequence of caption strings.
        max_rank: Passed to ``get_order`` to limit how many captions are printed.
    """
    top_indices = get_order(scores, max_rank)
    rank = 0
    for k in top_indices:
        print(f"{rank+1} [{scores[k]:.3f}]: '{k}. {captions[k]}'")
        rank += 1
        
# Score every bag with `score_bag_common` and print the five highest-scoring captions.
scores_common = score_all(caption_bags, score_bag_common, word_freq)
print_top_captions(scores_common, captions_orig, max_rank=5)

1 [2252.000]: '1440. I'm sorry sir, we cannot seat you until the rest of your party arrives. Without avocado, you're just toast.'
2 [2134.000]: '1481. I'm sorry, sir, but this is a gluten-free restaurant. We don't serve bread.'
3 [2099.000]: '1778. I'm very sorry, sir, but we only serve gluten free bread in this establishment.'
4 [2094.000]: '808. I'm sorry, sir, but even being the toast of the town won't get you a table tonight.'
5 [2061.000]: '134. I'm sorry sir, the only reservation I have under "Toast" checked in 15 minutes ago with a palate of butter and some strawberry jam.'


In [58]:
def score_bag_rare(b, freq):
    """Score a bag by adding up the inverse frequencies of its words.

    Rare words (small ``freq[w]``) contribute more than common ones.

    Args:
        b: Iterable of words.
        freq: Mapping from word to frequency, indexed as ``freq[w]``.

    Returns:
        The sum of ``1/freq[w]`` over the words in ``b`` (0 for an empty bag).

    Raises:
        ZeroDivisionError: If ``freq[w]`` is 0 for some word in ``b``.
    """
    total = 0
    for word in b:
        total = total + 1/freq[word]
    return total

# Score every bag with `score_bag_rare` and print the five highest-scoring captions.
scores_rare = score_all(caption_bags, score_bag_rare, word_freq)
print_top_captions(scores_rare, captions_orig, max_rank=5)

1 [9.372]: '1732. Party of one... Woah, I think I finally understand super-symmetry, the duality of infinity, and the entire universe... What? No, they're not putting LSD in the sauce or anything, in case that's what you were wondering, Mr. what did you say it was?'
2 [8.746]: '1690. And now, to close the twenty-fourth annual meeting of the "Literalists Society", the dry crisp stylings of our Toastmaster Emeritus, Toasty McToast.'
3 [8.658]: '2385. I'm sorry, this is a restaurant. We don't arrange “murder the homeless” Most Dangerous Game island getaways for sentient slices of bread. I can't begin to imagine why you thought I could do that for you.'
4 [8.656]: '212. I would like to practice my speaking skills with a sad story about my family being roasted alive and eaten right in front of me.  I will try not to cry so I can avoid getting soggy and moldy.'
5 [8.381]: '461. Okay, so here's how it's going to go. Forty-seven people will write captions that are some variant on the most obvi

## Exercise 4
**Exercise 4** (2 points). Let's try to implement a score that rewards a mix of common and surprising words, along with captions that are both not too short and not too long.

In particular, here is what your colleague has suggested.

- Suppose a bag has $m$ words.
- Suppose these $m$ words have frequencies $f = [f_0, f_1, \ldots, f_{m-1}]$.
- Let $\frac{1}{f} \equiv \left[\frac{1}{f_0}, \frac{1}{f_1}, \ldots, \frac{1}{f_{m-1}}\right]$ denote the inverse of these frequencies.

From these, let $s(f)$ be the score of the bag, defined using the following two-part formula:

$$
\begin{eqnarray}
  s(f) & = & \mathrm{pstdev}\left(\frac{1}{f}\right) \cdot w(m) \\
  w(m) & = & \frac{m}{\mu} \exp\left( - \frac{|m - \mu|}{\mu} \right),
\end{eqnarray}
$$

where $\mathrm{pstdev}(1 / f)$ is the _(population) standard deviation_ of the inverse-frequencies, and $\mu$ is a constant. Note the use of an absolute value in the definition of $w(m)$.

Here is your colleague's reasoning.

- The standard deviation rewards bags that have highly variable frequencies, i.e., a diverse mix of rare and common words.
- The $w(m)$ factor penalizes overly short or overly long sentences. It equals 1 when $m = \mu$, which you can interpret as a "target" caption length. It is less than 1 for all other values. (The function $\exp(x)$ is $e^x$, where $e$ is the base of the natural logarithm.)

Translate your colleague's formula into code, implementing it as `score_bag_mixed(bag, freq, mu)`, below, where

- `bag` is the bag of a caption;
- `freq` is a `Counter()` object, where `freq[w]` is the frequency of word `w` for any `w` in `bag`;
- `mu` is the target caption length, whose default value is 5.

> **Hint 0:** The code cell imports functions to compute the population standard deviation (`statistics.pstdev`) and the exponential (`math.exp`).
>
> **Hint 1:** The population standard deviation is only well-defined when computed on at least one value. So if there are no values, then let $\mathrm{pstdev}(\cdot) = 0$.

In [59]:
from statistics import pstdev # https://docs.python.org/3/library/statistics.html#statistics.stdev
from math import exp

def score_bag_mixed(bag, freq, mu=5):
    """Score a bag by the spread of its inverse word frequencies, weighted by length.

    The score is ``pstdev(1/f) * w(m)`` with
    ``w(m) = (m/mu) * exp(-|m - mu| / mu)``, where ``m`` is the number of
    words in the bag and ``1/f`` are their inverse frequencies.

    Args:
        bag: The bag of words of a caption.
        freq: Mapping from word to frequency, indexed as ``freq[w]``.
        mu: Target caption length (default 5).

    Returns:
        The score; 0 for an empty bag, where the standard deviation is undefined.
    """
    m = len(bag)
    if not m:
        # pstdev needs at least one value, so an empty bag scores 0 by definition.
        return 0
    inverse_freqs = [1/freq[word] for word in bag]
    # Length weight: equals 1 when m == mu.
    length_weight = (m/mu)*exp(-abs(m-mu)/mu)
    return pstdev(inverse_freqs)*length_weight

In [60]:
# Demo:
# Score every bag with `score_bag_mixed` (default `mu`) and print the top captions
# (`print_top_captions` prints ten by default).
scores_mixed = score_all(caption_bags, score_bag_mixed, word_freq)
print_top_captions(scores_mixed, captions_orig)

# If you are on the right track, the top three captions should be:
#
# 1 [0.488]: '297. I'm sorry, but this is the gluten-free keynote address.'
# 2 [0.485]: '1575. I'm afraid it's one of those modern weddings and there won't be a toast.'
# 3 [0.484]: '796. Sorry,Captain Picard's dog's name is not "Toast".'

1 [0.488]: '297. I'm sorry, but this is the gluten-free keynote address.'
2 [0.485]: '1575. I'm afraid it's one of those modern weddings and there won't be a toast.'
3 [0.484]: '796. Sorry,Captain Picard's dog's name is not "Toast".'
4 [0.484]: '1805. Sir, I think you misunderstand what the Toastmasters organization is all about...'
5 [0.482]: '912. Sorry. Toastmasters met last Monday.'
6 [0.481]: '587. Now I don't wanna see you crumble under pressure'
7 [0.480]: '497. Mr. Peanut?  I'm so sorry.  He recently donated all he had.'
8 [0.478]: '629. I can squeeze you in if you take off the feet, the arms and the tie.'
9 [0.478]: '606. You lied about your experience in the service industry?! Oh, you're toast.'
10 [0.477]: '1305. Mr. SquarePants, please spell "SpongBob."'
