## Deduplicate titles 

In [1]:
import os
import re
import json
import warnings
import numpy as np
import pandas as pd 

from gensim.models import KeyedVectors
from simhash import Simhash, SimhashIndex

data_path = '../data/'
model_path = '../model/'

warnings.filterwarnings("ignore")

def load_webhouse_data(data_name): 
    data_file = data_path + data_name + '.json'
    with open(data_file) as json_data:
        data = json.load(json_data)
        return data

def cleanup_text(text):
    text = re.sub(r'http\S+', '', text)
    text = text.replace("'s", " ")
    text = text.replace("n't", " not ")
    text = text.replace("'ve", " have ")
    text = text.replace("'re", " are ")
    text = text.replace("I'm"," I am ")
    text = text.replace("you're"," you are ")
    text = text.replace("You're"," You are ")
    text = text.replace("-"," ")
    text = text.replace("/"," ")
    text = text.replace("("," ")
    text = text.replace(")"," ")
    text = text.replace("%"," percent ")
    text = re.sub(r'[^a-zA-Z0-9 ]', '', text)
    return text

def load_wordvec_model(modelName, modelFile, flagBin):
    model = KeyedVectors.load_word2vec_format(model_path + modelFile, binary=flagBin)
    return model

# data = load_webhouse_data('microsoft')
data = load_webhouse_data('microsoft_0419_0519')
model_w2v_AP = load_wordvec_model('Word2Vec Google News', 'GoogleNews-vectors-negative300.bin.gz', True)

### Get titles, cleanup text, and apply Simhash

In [2]:
tot_title = len(data)
title_list = [cleanup_text(str(data[i]['title'])) for i in range(tot_title)]
objs = [(i, Simhash(title_list[i])) for i in range(tot_title)]

In [80]:
# function checks whether the input words are present in the vocabulary for the model
def vocab_check(vectors, words):
    output = list()
    for word in words:
        if word in vectors.vocab:
            output.append(word.strip())
    return output

# function calculates similarity between two strings using a particular word vector model
def calc_similarity(input1, input2, vectors):
    s1words = set(vocab_check(vectors, input1.split()))
    s2words = set(vocab_check(vectors, input2.split()))
    
    output = vectors.n_similarity(s1words, s2words)
    return output

def remove_duplicate(index, title_list, index_simahs, model_w2v_AP):
    duplicates = set()
    title = title_list[index]
    
    # calculate  hash value
    title_hash = Simhash(title)
    
    # find all duplicate indices
    dup_indices = index_simahs.get_near_dups(title_hash)

    # apply word2vec 
    for dupi in dup_indices:
        if int(dupi) == index: 
            continue
        try:
            score = calc_similarity(title, title_list[int(dupi)], model_w2v_AP)
        except:
            score = 0
        if score > 0.55:
            duplicates.add(int(dupi))
    return duplicates    

In [92]:
distance = 15
index_simahs = SimhashIndex(objs, k=distance)

index = 2231
print('Title: \n' + title_list[index] + '\n\nDuplicates:')
for i in remove_duplicate(index, title_list, index_simahs, model_w2v_AP): 
    print(title_list[i])

Title: 
Middle Easts sustainability pioneer Beeah selects Johnson Controls and Microsoft for its office of the future

Duplicates:
Middle Easts sustainability pioneer Beeah selects Johnson Controls and Microsoft for its office of the future
Middle Easts sustainability pioneer Beeah selects Johnson Controls and Microsoft for its office of the future
Middle Easts sustainability pioneer Beeah selects Johnson Controls and Microsoft for its office of the future
Middle East  sustainability pioneer Beeah selects Johnson Controls and Microsoft for its Office of the Future
Middle East  sustainability pioneer Beeah selects Johnson Controls and Microsoft for its Office of the Future
Middle Easts sustainability pioneer Beeah selects Johnson Controls and Microsoft for its office of the future
Middle Easts sustainability pioneer Beeah selects Johnson Controls and Microsoft for its Office of the Future
Middle Easts sustainability pioneer Beeah selects Johnson Controls and Microsoft for its office of 

In [93]:
start = time.clock()
duplicates = set()
for index in range(tot_title): 
    if index in duplicates: 
        continue 
    new_duplicates = remove_duplicate(index, title_list, index_simahs, model_w2v_AP);
    duplicates = duplicates.union(new_duplicates)
    if index % 2000 == 0 and index != 0: 
        print(str(index) + '/' + str(tot_title), len(duplicates), time.clock() - start)
    
new_feeds = data.copy()
for dup in sorted([int(dup) for dup in duplicates], reverse=True):
    del new_feeds[dup]

2000/24140 1741 362.4728809999997
6000/24140 3895 1071.9081139999998
10000/24140 5262 1793.492303
12000/24140 6049 2087.508836
14000/24140 6806 2402.7792899999995
18000/24140 7773 3063.233639
22000/24140 8959 3662.3776609999995


In [94]:
def store_data(data_name, feeds): 
    data_file = data_path + data_name + '.json'
    if os.path.isfile(data_file): 
        raise ValueError("{} file already exists".format(data_file))
    with open(data_file, 'w') as outfile:
        json.dump(feeds, outfile)
        
    
data = store_data('microsoft_clean', new_feeds)