In [1]:
import pandas as pd
import gensim
import numpy as np
import re
import multiprocessing
from gensim.models import Word2Vec
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)



In [2]:
# Load the publication table; the first line of the CSV is the header row.
# NOTE: the name `input` shadows Python's builtin of the same name. It is kept
# because the following cells refer to the frame by this name.
input = pd.read_csv(
    'result-with-delimiter.csv',
    sep=',',
    header=0,
    encoding='utf-8',
)
# Preview the first rows to confirm the file parsed as expected.
input.head()

Unnamed: 0,publication,title,keywords
0,http://linkeddata.econstor.eu/beta/resource/pu...,The desirability of workfare as a welfare orde...,welfareXXXYYYworkfareXXXYYYmisreportingXXXYYYm...
1,http://linkeddata.econstor.eu/beta/resource/pu...,"Ageing, Care Need and Long-Term Care Workforce...",DeutschlandXXXYYYPflegeberufeXXXYYYGesundheits...
2,http://linkeddata.econstor.eu/beta/resource/pu...,The experience of developing countries with ma...,Wirtschaftliche AnpassungXXXYYYEntwicklungslän...
3,http://linkeddata.econstor.eu/beta/resource/pu...,"Private information, human capital, and optima...",WeltXXXYYYfinancial marketsXXXYYYPortfolio-Man...
4,http://linkeddata.econstor.eu/beta/resource/pu...,Surveys of Informal Sector Enterprises - Some ...,informal sectorXXXYYYinformal sector enterpris...


In [3]:
# Transliteration table: German umlauts and sharp s -> ASCII digraphs.
umlaute_dict = {
    'ä': 'ae',  # U+00E4 (UTF-8: C3 A4)
    'ö': 'oe',  # U+00F6 (UTF-8: C3 B6)
    'ü': 'ue',  # U+00FC (UTF-8: C3 BC)
    'Ä': 'Ae',  # U+00C4 (UTF-8: C3 84)
    'Ö': 'Oe',  # U+00D6 (UTF-8: C3 96)
    'Ü': 'Ue',  # U+00DC (UTF-8: C3 9C)
    'ß': 'ss',  # U+00DF (UTF-8: C3 9F)
}


def replace_german_umlaute(unicode_string):
    """Return ``unicode_string`` with German umlauts and sharp s transliterated.

    Every character listed in ``umlaute_dict`` is replaced by its ASCII
    digraph (for example ``'ä'`` becomes ``'ae'``); all other characters are
    left untouched.

    Parameters
    ----------
    unicode_string : str
        Text to transliterate.

    Returns
    -------
    str
        The transliterated text.
    """
    transliterated = unicode_string
    # The replacement values contain ASCII letters only, so one substitution
    # can never produce input for another and the order is irrelevant.
    for umlaut, ascii_form in umlaute_dict.items():
        transliterated = transliterated.replace(umlaut, ascii_form)
    return transliterated

In [4]:
def _normalise_keywords(raw_keywords):
    """Turn one raw keyword cell into a space-separated list of tokens.

    Steps, in order:
    1. spaces inside a keyword become ``'_'`` so multi-word keywords stay
       one token,
    2. the ``'XXXYYY'`` delimiter between keywords becomes a single space,
    3. German umlauts / sharp s are transliterated to ASCII.

    The value is passed through ``str()`` first, so a missing cell (NaN)
    becomes the literal token ``'nan'``.
    """
    text = str(raw_keywords)
    text = text.replace(' ', '_')
    text = text.replace('XXXYYY', ' ')
    return replace_german_umlaute(text)


# Rewrite the keywords column (column position 2) in a single pass.
# Assigning by column avoids using the `iterrows` index label as a row
# position, which is only correct while the frame has a default 0..n-1 index.
# NOTE: run this cell once per load. A second run would turn the spaces that
# now separate keywords into underscores and merge every keyword into one token.
keywords_column = input.columns[2]
input[keywords_column] = input[keywords_column].map(_normalise_keywords)

In [5]:
# Add an empty-string placeholder column named 'similarity'; a later cell
# overwrites it with each document's Jaccard score against the query document.
input['similarity'] = ''

In [6]:
# Inspect the first rows again to check the cleaned keywords and the new column.
input.head()

Unnamed: 0,publication,title,keywords,similarity
0,http://linkeddata.econstor.eu/beta/resource/pu...,The desirability of workfare as a welfare orde...,welfare workfare misreporting means-testing ut...,
1,http://linkeddata.econstor.eu/beta/resource/pu...,"Ageing, Care Need and Long-Term Care Workforce...",Deutschland Pflegeberufe Gesundheitsberufe Pfl...,
2,http://linkeddata.econstor.eu/beta/resource/pu...,The experience of developing countries with ma...,Wirtschaftliche_Anpassung Entwicklungslaender ...,
3,http://linkeddata.econstor.eu/beta/resource/pu...,"Private information, human capital, and optima...",Welt financial_markets Portfolio-Management As...,
4,http://linkeddata.econstor.eu/beta/resource/pu...,Surveys of Informal Sector Enterprises - Some ...,informal_sector informal_sector_enterprises me...,


In [7]:
# Persist the cleaned table. No `index=` argument is passed, so pandas' default applies.
# NOTE(review): presumably the row index written by default is the "id" the
# file name refers to — confirm.
input.to_csv('input_with_ids.csv')

In [8]:
# Load a saved Word2Vec model from disk.
# NOTE(review): gensim model files are pickle-based, so only load files from a
# trusted source.
model = Word2Vec.load('defaults-without-gensim-preprocessing.model')

INFO - 16:52:45: loading Word2Vec object from defaults-without-gensim-preprocessing.model
INFO - 16:52:45: loading wv recursively from defaults-without-gensim-preprocessing.model.wv.* with mmap=None
INFO - 16:52:45: setting ignored attribute vectors_norm to None
INFO - 16:52:45: loading vocabulary recursively from defaults-without-gensim-preprocessing.model.vocabulary.* with mmap=None
INFO - 16:52:45: loading trainables recursively from defaults-without-gensim-preprocessing.model.trainables.* with mmap=None
INFO - 16:52:45: setting ignored attribute cum_table to None
INFO - 16:52:45: loaded defaults-without-gensim-preprocessing.model


In [9]:
# Keep a handle on the model's keyed vectors and show how many entries the
# vocabulary holds.
# NOTE(review): `.vocab` appears to be the gensim 3.x attribute; gensim 4
# exposes `key_to_index` instead — confirm before upgrading gensim.
word_vectors = model.wv
len(word_vectors.vocab)

12898

In [10]:
def get_most_similar(words, threshold=0.98, vectors=None):
    """Expand a whitespace-separated keyword string with its nearest neighbours.

    Every token of ``words`` is lower-cased and stored with score ``1``.
    For tokens known to the embedding, each neighbour returned by
    ``most_similar`` whose similarity is strictly greater than ``threshold``
    is added as well (lower-cased) with its similarity as the score.

    Parameters
    ----------
    words : str
        Whitespace-separated tokens.
    threshold : float, optional
        Minimum (exclusive) similarity a neighbour needs to be included.
        Defaults to ``0.98``.
    vectors : object, optional
        Anything exposing ``most_similar(word)`` that returns
        ``(neighbour, similarity)`` pairs and raises ``KeyError`` for unknown
        words. Defaults to the notebook-level ``model.wv``.

    Returns
    -------
    dict
        Maps each lower-cased term to its score. Later writes win, so a
        neighbour that coincides with an earlier token replaces that token's
        score of ``1``.
    """
    if vectors is None:
        vectors = model.wv

    results = {}
    for word in words.split():
        token = word.lower()
        results[token] = 1
        try:
            neighbours = vectors.most_similar(token)
        except KeyError:
            # Out-of-vocabulary token: keep the token itself, skip expansion.
            # Only KeyError is caught so that real failures (and Ctrl-C) are
            # no longer silently swallowed.
            continue
        for neighbour, score in neighbours:
            if score > threshold:
                results[neighbour.lower()] = score
    return results

In [11]:
def get_jaccard_similarty(most_similar_original_doc, comparison_doc):
    """Jaccard similarity between a document's own tokens and a keyword string.

    Only the entries of ``most_similar_original_doc`` whose score equals ``1``
    are taken as the original document's tokens. ``comparison_doc`` is split
    on whitespace and compared token by token, so a short token no longer
    counts as a match merely because it is a substring of the comparison text.

    Parameters
    ----------
    most_similar_original_doc : dict
        Maps term -> score.
    comparison_doc : str
        Whitespace-separated tokens of the document to compare against.

    Returns
    -------
    float
        ``|A & B| / |A | B|``; ``0.0`` when both token sets are empty.
    """
    original_tokens = {
        term for term, score in most_similar_original_doc.items() if score == 1
    }
    # Use a set so repeated tokens in the comparison text count once.
    comparison_tokens = set(comparison_doc.split())

    union = original_tokens | comparison_tokens
    if not union:
        # Nothing to compare: avoid dividing by zero.
        return 0.0
    return len(original_tokens & comparison_tokens) / len(union)

In [12]:
def get_jaccard_similarty_with_word2vec(most_similar_original_doc, most_similar_comparison_doc, threshold=0.9):
    """Jaccard-style similarity between two expanded term dictionaries.

    A term counts as shared when it is a key of both dictionaries and its
    score in ``most_similar_original_doc`` is strictly greater than
    ``threshold``.

    Parameters
    ----------
    most_similar_original_doc : dict
        Maps term -> score for the query document.
    most_similar_comparison_doc : dict
        Maps term -> score for the document being compared; only its keys
        are used.
    threshold : float, optional
        Minimum (exclusive) original-side score for a term to count as
        shared. Defaults to ``0.9``.

    Returns
    -------
    float
        ``shared / (shared + original_only + comparison_only)``;
        ``0.0`` when both dictionaries are empty.
    """
    original_terms = set(most_similar_original_doc)
    comparison_terms = set(most_similar_comparison_doc)

    shared = {
        term
        for term, score in most_similar_original_doc.items()
        if score > threshold and term in comparison_terms
    }

    # A term present on both sides but at or below the threshold is not
    # "shared", so it is counted once on each side of the denominator.
    denominator = (
        len(shared)
        + len(comparison_terms - shared)
        + len(original_terms - shared)
    )
    if denominator == 0:
        # Both inputs are empty: avoid dividing by zero.
        return 0.0
    return len(shared) / denominator

In [13]:
# Query document: the row at position QUERY_ROW, whose keywords (column
# position 2) are expanded once and then compared against every other row.
QUERY_ROW = 9000
inputdoc = input.iat[QUERY_ROW, 2]
# Convert to str *before* lower-casing so a non-string cell (e.g. NaN) cannot
# raise AttributeError.
my_most_similar = get_most_similar(str(inputdoc).lower())

INFO - 16:52:57: precomputing L2-norms of word weight vectors
  if np.issubdtype(vec.dtype, np.int):


In [14]:
# Score every document against the query document.
# Each row's keywords (column position 2) are lower-cased, expanded with
# get_most_similar and compared with `my_most_similar`. Assigning the whole
# column at once avoids using the `iterrows` index label as a row position and
# avoids relying on 'similarity' being at column position 3.
# NOTE: this expands the keywords of every row and may take a while on the
# full table.
input['similarity'] = [
    get_jaccard_similarty_with_word2vec(
        my_most_similar,
        get_most_similar(str(keywords).lower()),
    )
    for keywords in input.iloc[:, 2]
]

In [15]:
# Rank all documents by their similarity to the query document, best match first.
values_sorted = input.sort_values(by='similarity', ascending=False)

In [16]:
# Keywords (column position 2) of the best-ranked document.
# `.iat` addresses row and column by position directly, instead of indexing
# the label-indexed row Series with an integer.
values_sorted.iat[0, 2]

'fertility marriage divorce suicide child_outcomes joint_custody'

In [17]:
# Keywords (column position 2) of the second-ranked document.
# `.iat` addresses row and column by position directly, instead of indexing
# the label-indexed row Series with an integer.
values_sorted.iat[1, 2]

'marriage divorce family_law Joint_custody marital-specific_investment'

In [18]:
# Title (column position 1) of the best-ranked document.
# `.iat` addresses row and column by position directly, instead of indexing
# the label-indexed row Series with an integer.
values_sorted.iat[0, 1]

'Do joint custody laws improve family well-being?'

In [19]:
# Title (column position 1) of the second-ranked document.
# `.iat` addresses row and column by position directly, instead of indexing
# the label-indexed row Series with an integer.
values_sorted.iat[1, 1]

'The Effect of Joint Custody on Marriage and Divorce'

In [20]:
# Show only the ten best matches rather than dumping the whole ranked table.
values_sorted.head(10)

Unnamed: 0,publication,title,keywords,similarity
9000,http://linkeddata.econstor.eu/beta/resource/pu...,Do joint custody laws improve family well-being?,fertility marriage divorce suicide child_outco...,1
21836,http://linkeddata.econstor.eu/beta/resource/pu...,The Effect of Joint Custody on Marriage and Di...,marriage divorce family_law Joint_custody mari...,0.440678
65587,http://linkeddata.econstor.eu/beta/resource/pu...,Which Children Stabilize Marriage?,children marriage divorce IV_approach,0.27451
22029,http://linkeddata.econstor.eu/beta/resource/pu...,Political Risk and Capital Flight,human_capital institutions marriage divorce,0.27451
28426,http://linkeddata.econstor.eu/beta/resource/pu...,Social security and divorce decisions,Marriage Social_Security Divorce,0.27451
38168,http://linkeddata.econstor.eu/beta/resource/pu...,Does the Welfare State Destroy the Family? Evi...,fertility risk_sharing Marriage welfare_state ...,0.25
70067,http://linkeddata.econstor.eu/beta/resource/pu...,Does the Welfare State Destroy the Family? Evi...,fertility risk_sharing Marriage welfare_state ...,0.25
62326,http://linkeddata.econstor.eu/beta/resource/pu...,The effect of joint custody on marriage and di...,USA Oekonomischer_Anreiz marriage divorce Maen...,0.25
68831,http://linkeddata.econstor.eu/beta/resource/pu...,Does the welfare state destroy the family? Evi...,fertility risk_sharing marriage welfare_state ...,0.25
35325,http://linkeddata.econstor.eu/beta/resource/pu...,Does the Welfare State Destroy the Family? Evi...,fertility risk_sharing Marriage welfare_state ...,0.25
