In [24]:
import csv, string, nltk, collections, os
import pandas as pd
import numpy as np

from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import euclidean_distances

from scipy.sparse import csr_matrix
from scipy.stats import wasserstein_distance

from timeit import default_timer as timer

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /Users/andra/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/andra/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/andra/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Phase one
Computing distribution clusters

### Read data

In [2]:
# Read data
data_imdb = pd.read_csv('movies3/csv_files/imdb.csv')
data_rt = pd.read_csv('movies3/csv_files/rotten_tomatoes.csv')

# Clean data
data_imdb = data_imdb.fillna(0)
data_rt = data_rt.fillna(0)
data_rt = data_rt.replace({'Rating': ['N', '.']}, {'Rating': 0})

# Store data for future processing 
data1 = data_imdb
data2 = data_rt

# Get the columns
columns1 = data1.columns
columns2 = data2.columns

### Dummy numerical data based on the imdb data

In [6]:
dataX = pd.DataFrame(data1.Rating)
dataX.columns = ['Score']
dataX = dataX.reindex(np.random.permutation(dataX.index)).reset_index()
dataX['Rating'] = data1.Rating

### Small test data and CountVectorizer test

In [13]:
d1 = "Obama speaks to the media in Illinois"
d2 = "The President addresses the press in Chicago"

vect = CountVectorizer(stop_words="english").fit([d1, d2])
print("Features:",  ", ".join(vect.get_feature_names()))

Features: addresses, chicago, illinois, media, obama, president, press, speaks


### Get the google news vocabulary and store it as a memory map
The function returns an array:
- result[0] the data map
- result[1] the vocabulary map

Download the data from: https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit

In [17]:
def get_vocabulary():
    if not os.path.exists("data/embed.dat"):
        print("Caching word embeddings in memmapped format...")
        from gensim.models import KeyedVectors
        wv = KeyedVectors.load_word2vec_format(
            "GoogleNews-vectors-negative300.bin.gz", binary=True)
        wv.init_sims()
        from tempfile import mkdtemp
        import os.path as path
        dat_file = path.join(mkdtemp(), 'embed.dat')
        vocab_file = path.join(mkdtemp(), 'embed.vocab')
        fp = np.memmap(dat_file, dtype=np.double, mode='w+', shape=wv.vectors_norm.shape)
        fp[:] = wv.vectors_norm[:]
        with open(vocab_file, "w+") as f:
            for _, w in sorted((voc.index, word) for word, voc in wv.vocab.items()):
                print(w, file=f)
        del fp, wv

    W = np.memmap(dat_file, dtype=np.double, mode="r", shape=(3000000, 300))
    with open(vocab_file) as f:
        vocab_list = map(str.strip, f.readlines())

    return [W, {w: k for k, w in enumerate(vocab_list)}]

In [18]:
result = get_vocabulary()
W = result[0]
vocab_dict = result[1]

Caching word embeddings in memmapped format...


### Algorithm 1 from the paper 
Algorithmically identifying the cutoff EMD threshold for a column C, given a global threshold. 

In [48]:
def compute_cutoff_threshold(C, threshold):
    t = {}
    t['e'] = threshold
    t['c'] = 0
    C.append(t)
    C = sorted(C, key = lambda i: i['e']) 
    cutoff = 0
    gap = 0.0
    i = 0
    while C[i + 1]['e'] <= threshold:
        if gap < (C[i+1]['e'] - C[i]['e']):
            gap = C[i+1]['e'] - C[i]['e']
            cutoff = C[i]['e']
        i += 1

    return cutoff      

### Get the neighbors of a column
The neighborhood NC of column C consists of all columns C′ with EMD(C, C′) ≤ cutoff.

In [47]:
def get_neighbors(C, cutoff):
    return [i['c'] for i in C if i['e'] <= cutoff]

### Algorithm 2 from the paper
Compute distribution graph and distribution cluster. 

In [5]:
def compute_distribution_clusters(data, columns, threshold):
    graph = {}
    A = {}
#     vocab_dict = get_vocabulary()
  
    for i in range(0, len(columns)):
        for j in range(i + 1, len(columns)):
            try:
                e = wasserstein_distance(data[columns[i]], data[columns[j]])
            except ValueError:
                e = word_emd(data[columns[i]], data[columns[j]], vocab_dict)
            item_j = {}
            item_j['e'] = e
            item_j['c'] = columns[j]
            if columns[i] not in A:
                A[columns[i]] = []
            A[columns[i]].append(item_j)

            item_i = {}
            item_i['e'] = e
            item_i['c'] = columns[i]
            if columns[j] not in A:
                A[columns[j]] = []
            A[columns[j]].append(item_i)
        graph[columns[i]] = []
    
    for i in range(len(columns)):
        theta = compute_cutoff_threshold(A[columns[i]], threshold)
        Nc = get_neighbors(A[columns[i]], theta)

        for c in Nc:
            graph[columns[i]] = c
      
    return graph


### Word EMD 
Word EMD algortihm according to: https://vene.ro/blog/word-movers-distance-in-python.html which is based on the paper: http://mkusner.github.io/publications/WMD.pdf

In [42]:
def word_emd(d1, d2, vocab_dict):
    corpus = d1 + d2
    l1 = len(d1)

    vect = CountVectorizer(stop_words="english").fit(corpus)
    print(len(vect.get_feature_names()))
    W_ = W[[vocab_dict[w] if w in vocab_dict else vocab_dict['unk'] for w in vect.get_feature_names()] ]
    D_ = euclidean_distances(W_)
    D_ = D_.astype(np.double)
    D_ /= D_.max() 
    
    v_ = vect.transform(corpus)
    v_1 = v_[:l1,:]
    v_2 = v_[l1:, :]
    
    print(v_1.shape)
    print(v_2.shape)
    v_1 = v_1.toarray().ravel()
    v_2 = v_2.toarray().ravel()
    v_1 = v_1.astype(np.double)
    v_2 = v_2.astype(np.double)
    v_1 /= v_1.sum()
    v_2 /= v_2.sum()
    
     
    
    print(v_1.shape)
    print(v_2.shape)
    print(D_.shape)
    
    from pyemd import emd

    return emd(v_1, v_2, D_)

### Test the methods given different data

In [9]:
threshold = 0.14

# g = compute_distribution_clusters(dataX, dataX.columns, threshold)
g = compute_distribution_clusters(data1, data1.columns, threshold)

print(g)

Caching word embeddings in memmapped format...


KeyboardInterrupt: 

In [45]:
e = word_emd(data1['Director'].astype(str).tolist(), data1['Creators'].astype(str).tolist(), vocab_dict)
# e = word_emd([d1], [d2], vocab_dict)
# data1['Creators'].tolist() + data1['Director'].tolist()

4317
(2961, 4317)
(2961, 4317)
(12782637,)
(12782637,)
(4317, 4317)


ValueError: Histogram lengths cannot be greater than the number of rows or columns of the distance matrix