# Imports

In [1]:
import gensim.models
import pandas as pd
import numpy as np

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords 
from nltk import jaccard_distance
from nltk import word_tokenize
from nltk.corpus import wordnet

from functools import reduce
from itertools import product

# Read data

## Load WE
Download "wiki-news-300d-1M.vec.zip" from https://fasttext.cc/docs/en/english-vectors.html and unzip it so that `wiki-news-300d-1M.vec` is in the working directory.

In [2]:
# Load the pre-trained word vectors from the text-format .vec file.
model = gensim.models.KeyedVectors.load_word2vec_format(
    'wiki-news-300d-1M.vec',
    binary=False,
)

## Movies data
Download the movies1 data from https://sites.google.com/site/anhaidgroup/useful-stuff/data and extract it so that the CSV files are available under `google/movies1/csv_files/`.

In [3]:
# Read each movie table, index it by the 'Unnamed: 0' column and replace
# missing values with empty strings.
data_imdb = (
    pd.read_csv('google/movies1/csv_files/imdb.csv', index_col=False)
    .set_index('Unnamed: 0')
    .fillna('')
)
data_rt = (
    pd.read_csv('google/movies1/csv_files/rotten_tomatoes.csv', index_col=False)
    .set_index('Unnamed: 0')
    .fillna('')
)

# get column names
data_imdb_c = data_imdb.columns.tolist()
print(data_imdb_c)
data_rt_c = data_rt.columns.tolist()
print(data_rt_c)

['Id', 'Name', 'YearRange', 'ReleaseDate', 'Director', 'Creator', 'Cast', 'Duration', 'RatingValue', 'ContentRating', 'Genre', 'Url', 'Description']
['Id', 'Name', 'Year', 'Release Date', 'Director', 'Creator', 'Actors', 'Cast', 'Language', 'Country', 'Duration', 'RatingValue', 'RatingCount', 'ReviewCount', 'Genre', 'Filming Locations', 'Description']


# Algorithms

## Preprocess data

In [4]:
def preprocess(e):
    """Split an identifier-like string into lower-cased, stop-word-free tokens.

    Each token is a run of upper-case letters followed by a run of lower-case
    letters/digits, so a name such as 'CellType' yields 'cell' and 'type'.

    Parameters
    ----------
    e : str
        Text to tokenize (e.g. a column name).

    Returns
    -------
    list of str
        Lower-cased tokens, without empty matches and English stop words.
    """
    # Raw string: '\d' inside a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    capword_tokenizer = RegexpTokenizer(r'[A-Z]*[a-z\d]*')
    stop_words = set(stopwords.words('english'))

    tokens = capword_tokenizer.tokenize(e)
    lower_case_tokens = map(str.lower, tokens)
    # The pattern can match the empty string, hence the `w` truthiness check.
    filtered_tokens = [w for w in lower_case_tokens if w and w not in stop_words]

    return filtered_tokens

## Coherency factor

In [5]:
def coherency_factor(tokens1, tokens2):
    """Sum of embedding similarities between two token lists, length-normalised.

    Every pair (w1, w2) of *different* tokens that are both known to the global
    word-vector ``model`` contributes ``model.similarity(w1, w2)``; the total is
    divided by ``len(tokens1) + len(tokens2)``.

    Parameters
    ----------
    tokens1, tokens2 : list of str
        Pre-processed tokens of the two elements being compared.

    Returns
    -------
    float
        The coherency factor; 0.0 when both token lists are empty.
    """
    total_tokens = len(tokens1) + len(tokens2)
    if total_tokens == 0:
        # Nothing to compare; avoids a ZeroDivisionError below.
        return 0.0

    cf = 0.0
    for w1 in tokens1:
        for w2 in tokens2:
            if w1 == w2:
                continue
            # `in model` is supported by KeyedVectors in both gensim 3.x and
            # 4.x, whereas the `.vocab` attribute was removed in gensim 4.
            if w1 in model and w2 in model:
                cf += model.similarity(w1, w2)

    return cf / total_tokens

## Coherent group

tau in [0.2, 0.4, 0.6, 0.8]

In [6]:
def coherent_group(e1, e2, tau):
    """Label a pair of token lists as coherent (1) or not coherent (-1).

    The pair is coherent when its coherency factor is strictly greater than
    the threshold ``tau``.
    """
    if coherency_factor(e1, e2) > tau:
        return 1
    return -1

## Instance-based Matcher
Jaccard distance (1 − Jaccard similarity) between the token sets of two collections of data values. 0 represents a perfect match.

In [7]:
def jaccard_similarity(c1, c2):
    """Return the Jaccard distance between the token sets of two value collections.

    Despite the name, the result is a *distance*: 0 means both token sets are
    identical, larger values mean less overlap.

    Parameters
    ----------
    c1, c2 : iterable of str
        Data values (e.g. a DataFrame column). Each value is split with
        ``word_tokenize`` and the tokens are pooled into one set per side.

    Returns
    -------
    float
        ``jaccard_distance`` of the two token sets; 0.0 when both are empty.
    """
    # Build the sets directly instead of going through np.concatenate, which
    # raises on an empty collection and mixes empty float arrays with string
    # arrays whenever a value (such as '') yields no tokens.
    set_c1 = {token for value in c1 for token in word_tokenize(value)}
    set_c2 = {token for value in c2 for token in word_tokenize(value)}

    # Two empty sets are identical; the distance formula would divide by the
    # size of an empty union.
    if not set_c1 and not set_c2:
        return 0.0

    return jaccard_distance(set_c1, set_c2)

In [235]:
# Compare the 'Creator' column of the two movie tables.
c1, c2 = data_imdb['Creator'], data_rt['Creator']

jaccard_similarity(c1, c2)

0.542307266895255

## Syntactic matcher

In [8]:
def synm(e1, e2):
    """Best Wu-Palmer similarity between the WordNet senses of two token lists.

    For each side, the synsets of every token and the direct hypernyms of those
    synsets are collected. The result is the highest ``wup_similarity`` over
    all cross pairs of *different* synsets; pairs where the same synset occurs
    on both sides are skipped.

    Parameters
    ----------
    e1, e2 : iterable of str
        Pre-processed tokens of the two elements being compared.

    Returns
    -------
    float
        The best similarity found, or 0 when there is no pair to compare
        (e.g. a side has no tokens or none of its tokens has a synset).
    """
    def _synsets_with_hypernyms(words):
        # Synsets of every word plus their direct hypernyms; each word is
        # looked up in WordNet only once.
        related = set()
        for word in words:
            for synset in wordnet.synsets(word):
                related.add(synset)
                related.update(synset.hypernyms())
        return related

    all1 = _synsets_with_hypernyms(e1)
    all2 = _synsets_with_hypernyms(e2)

    # `or 0` turns a missing (None) similarity into 0 so scores stay comparable.
    scores = (
        wordnet.wup_similarity(s1, s2) or 0
        for s1, s2 in product(all1, all2)
        if s1 != s2
    )
    # default=0 keeps max() from raising ValueError when no pair exists.
    return max(scores, default=0)

In [9]:
# Tokenize two example attribute names and score them with the matcher.
e1, e2 = map(preprocess, ('CellType', 'BellType'))

synm(e1, e2)

0.9473684210526315