In [1]:
# Configure the root logger so each record prints as "time : level : message".
import logging

LOG_FORMAT = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(format=LOG_FORMAT)

In [11]:
import pandas as pd
import numpy as np

## Define preprocessor

In [2]:
# Import NLTK's stopword corpus, its downloader and a regex-based tokenizer.
from nltk.corpus import stopwords
from nltk import download
from nltk.tokenize import RegexpTokenizer
download('stopwords')  # Download stopwords list.

# English stopwords to filter out in `preprocess`. Stored as a frozenset
# because `preprocess` checks every token for membership: a set lookup is
# O(1), whereas the list returned by stopwords.words() is scanned linearly.
stop_words = frozenset(stopwords.words('english'))
# Tokens are runs of word characters (\w+); everything else acts as a separator.
tokenizer = RegexpTokenizer(r'\w+')

[nltk_data] Downloading package stopwords to /Users/jenya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
def preprocess(text):
    """Turn a raw sentence into a list of lowercase, non-stopword tokens.

    The text is lowercased, split with the module-level ``tokenizer`` and
    every token found in the module-level ``stop_words`` is discarded.
    Token order is preserved.
    """
    lowered = text.lower()
    kept = []
    for token in tokenizer.tokenize(lowered):
        if token in stop_words:
            continue
        kept.append(token)
    return kept

## Load word2vec model

[GoogleNews pretrained model](https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit)

In [4]:
%%time
# Load the pretrained vectors stored in binary word2vec format.
# Since gensim 1.0 this loader lives on KeyedVectors; calling it through
# Word2Vec no longer works there.
from gensim.models import KeyedVectors
model = KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin.gz', binary=True)

CPU times: user 2min 26s, sys: 4.78 s, total: 2min 31s
Wall time: 2min 33s


In [5]:
%%time
# Normalize the word2vec vectors before computing distances.
# NOTE(review): per the gensim docs, replace=True overwrites the raw vectors
# with their normalized versions (saves memory, not reversible) -- confirm
# against the installed gensim version.
model.init_sims(replace=True)  # Normalizes the vectors held by `model`.

CPU times: user 18.7 s, sys: 2.55 s, total: 21.3 s
Wall time: 21.5 s


#### Test WMDistance

In [6]:
# Hand-picked sentences for a quick sanity check of wmdistance:
# one candidate caption and six reference captions it is scored against
# in the next cell. The strings are used verbatim as test data.
candidate = "A young child is wearing blue goggles and sitting in a float in a pool."
ref0 = "A blond woman in a blue shirt appears to wait for a ride."
ref1 = "A blond woman is on the street hailing a taxi."
ref2 = "A woman is signaling is to traffic , as seen from behind."
ref3 = "A woman with blonde hair wearing a blue tube top is waving on the side of the street."
ref4 = "The woman in the blue dress is holding out her arm at oncoming traffic."
ref5 = "Sooners football player weas the number 28 and black armbands."

In [7]:
# Score the candidate against each sample reference, one distance per line.
# The candidate is tokenized once instead of once per comparison, and it is
# always passed as the first argument (the original call for ref0 had the
# arguments swapped). WMD is expected to be symmetric, so this should not
# change ref0's score beyond floating-point noise.
candidate_tokens = preprocess(candidate)
for ref in (ref0, ref1, ref2, ref3, ref4, ref5):
    print(model.wmdistance(candidate_tokens, preprocess(ref)))

1.019768842436988
1.232557323921767
1.2774999511754035
0.9651754374781372
1.0468171477841213
1.227218955335274


## Load candidates and refs

In [8]:
# Tokenize the candidate captions: one caption per line of the file.
# There is deliberately no empty-list pre-initialisation: if the file cannot
# be opened, the name stays unbound and later cells fail loudly instead of
# silently computing distances over an empty list.
# NOTE: the misspelled name `candiadates` is kept because the distance
# computation further down refers to it.
with open('flickr8k/candidates') as f:
    candiadates = [preprocess(text) for text in f]

In [9]:
# Load the five reference files (flickr8k/ref-0 ... flickr8k/ref-4).
# refs[k] holds the tokenized lines of file k, in file order.
refs = {}
for ref_idx in range(5):
    ref_path = 'flickr8k/ref-' + str(ref_idx)
    with open(ref_path) as ref_file:
        tokenized_lines = []
        for line in ref_file:
            tokenized_lines.append(preprocess(line))
    refs[ref_idx] = tokenized_lines

## Calculate distances

In [10]:
# For every reference set k, compute the WMD between the j-th candidate and
# the j-th reference of that set. distances[k][j] is that distance.
distances = {}
for ref_idx in range(5):
    ref_docs = refs[ref_idx]
    per_candidate = []
    for position, candidate_tokens in enumerate(candiadates):
        per_candidate.append(model.wmdistance(candidate_tokens, ref_docs[position]))
    distances[ref_idx] = per_candidate

In [12]:
# Tabulate the distances: one column per key of `distances` (reference sets
# 0-4), one row per candidate.
distances_df = pd.DataFrame(distances)

In [14]:
def normalize(distance):
    """Map a distance onto a similarity score via ``1 / (1 + distance)``.

    A distance of 0 yields 1, and the score shrinks towards 0 as the
    distance grows.
    """
    shifted = 1 + distance
    return 1 / shifted

In [15]:
# Convert every distance into a score with `normalize` (1 / (1 + distance)).
# The frame is rebuilt from the raw `distances` dict rather than updated in
# place, so re-running this cell can never normalize already-normalized
# values a second time.
distances_df = pd.DataFrame(distances).apply(lambda column: column.map(normalize))

#### Aggregate scores across references (mean and max)

In [17]:
def calculate_metrics(raw, n_refs=5):
    """Return a copy of a score row extended with its mean and max.

    Parameters
    ----------
    raw : pandas.Series
        One row of scores; the first ``n_refs`` entries (by position) are
        the per-reference scores to aggregate.
    n_refs : int, default 5
        Number of leading entries that take part in the aggregation.

    Returns
    -------
    pandas.Series
        A new Series with the entries of ``raw`` plus ``'mean'`` and
        ``'max'`` of the first ``n_refs`` values. ``raw`` itself is left
        untouched, because mutating the object handed over by
        ``DataFrame.apply`` is not safe.
    """
    # Select by position explicitly so the integer labels cannot be mistaken
    # for positions. Applying the function to an already extended row only
    # refreshes 'mean' and 'max', since they sit after the first n_refs slots.
    scores = raw.iloc[:n_refs]
    result = raw.copy()
    result['mean'] = scores.mean()
    result['max'] = scores.max()
    return result

In [20]:
# Run calculate_metrics on each row (axis=1), which adds the 'mean' and
# 'max' entries computed from the row's first five values.
distances_df = distances_df.apply(calculate_metrics, axis=1)

In [22]:
# Expose the row index as an explicit `id` column and add an empty `text`
# column; both are written out by the CSV export.
distances_df = distances_df.assign(id=distances_df.index, text='')

#### Save to csv

In [29]:
# Write the scores as a tab-separated file (despite the .csv extension),
# without the index and with a fixed column order.
output_columns = ["id", "text", 0, 1, 2, 3, 4, "mean", "max"]
distances_df.to_csv('scores.csv', sep='\t', columns=output_columns, index=False)