In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
%matplotlib inline

In [16]:
# Field-of-study table: tab-separated with no header row, so column names are supplied here.
topics = pd.read_csv(
    '../All data/advanced/FieldsOfStudy.txt',
    sep='\t',
    header=None,
    names=[
        'topic', 'Rank', 'NormalizedName', 'name', 'MainType',
        'Level', 'PaperCount', 'CitationCount', 'CreatedDate',
    ],
)

In [17]:
# Load the paper / field-of-study rows; the first CSV column is used as the row index.
papers = pd.read_csv('../Factorization_Machine/paper_citation_lowlevel_cs.csv', index_col = 0)

  mask |= (ar1 == a)


In [18]:
# Distinct topic ids whose 'Level' column equals 3.
is_level3 = topics['Level'] == 3
level3_topics = topics.loc[is_level3, 'topic'].unique()

In [19]:
# Keep only the paper rows whose FieldOfStudyId is one of the level-3 topic ids.
has_level3_topic = papers['FieldOfStudyId'].isin(level3_topics)
paper_with_level3 = papers[has_level3_topic]

## Construct a weighted static graph

In [20]:
# Preview the first rows of the level-3 subset.
paper_with_level3.head()

Unnamed: 0,PaperId,FieldOfStudyId,Year,CitationCount
5,53694153,160030872,2002.0,0.0
12,107407592,2992554003,2000.0,4.0
17,107407604,161821725,2006.0,50.0
25,107407604,188598960,2006.0,50.0
41,107407649,43803900,2008.0,29.0


In [21]:
def find_link_all(target_pIds = None, paper_topics = None):
    """Count, for every pair of topics, how many papers are tagged with both.

    Parameters
    ----------
    target_pIds : iterable of paper ids, optional
        When given, only papers whose id is in this collection are counted.
        ``None`` (the default) counts every paper.
    paper_topics : pandas.DataFrame, optional
        Frame with one row per (paper, topic) pair in the columns
        ``"PaperId"`` and ``"FieldOfStudyId"``. Defaults to the notebook-level
        ``paper_with_level3`` frame.

    Returns
    -------
    dict
        ``share_count[fId][fId2]`` is the number of co-occurrences of the two
        topics on the same paper. Every topic seen on a selected paper gets a
        key, even when it never co-occurs with another topic (empty inner
        dict). The mapping is symmetric and never pairs a topic with itself.
    """
    if paper_topics is None:
        paper_topics = paper_with_level3

    # Materialise the filter as a set: membership tests become O(1), and a
    # pandas Series filter is matched on its values rather than its index.
    wanted = None if target_pIds is None else set(target_pIds)

    # Group topic ids by paper. Reading the two columns directly (instead of
    # iterrows) keeps the ids in their own dtype rather than upcasting them
    # to float alongside the frame's float columns.
    pId_fIds = {}  # key pId, value = list of fId (keywords)
    paper_ids = paper_topics["PaperId"].tolist()
    topic_ids = paper_topics["FieldOfStudyId"].tolist()
    for pId, fId in zip(paper_ids, topic_ids):
        if wanted is not None and pId not in wanted:
            continue
        pId_fIds.setdefault(pId, []).append(fId)

    share_count = {}
    for fIds in pId_fIds.values():
        for fId in fIds:
            neighbours = share_count.setdefault(fId, {})
            for fId2 in fIds:
                if fId == fId2:
                    continue
                neighbours[fId2] = neighbours.get(fId2, 0) + 1

    return share_count

In [22]:
# Count shared-paper co-occurrences for every topic pair, with no paper filter.
share_count = find_link_all(target_pIds = None)

In [23]:
from copy import deepcopy
# Deep copy (nested dicts included) so later in-place edits leave share_count untouched.
share_count_copy = deepcopy(share_count)

In [24]:
# Drop topic pairs that share fewer than 10 papers, then drop any topic
# left without a single remaining pair. Edits share_count_copy in place.
for topic_id, neighbours in list(share_count_copy.items()):
    weak_links = [other_id for other_id, count in neighbours.items() if count < 10]
    for other_id in weak_links:
        del neighbours[other_id]

    if not neighbours:
        del share_count_copy[topic_id]

In [25]:
import networkx as nx

In [26]:
# Undirected topic graph: one edge per topic pair that survived pruning.
# Ids are cast to int so the node labels are plain integers.
graph = nx.Graph()
graph.add_edges_from(
    (int(src_id), int(dst_id))
    for src_id, neighbours in share_count_copy.items()
    for dst_id in neighbours
)

In [27]:
# Store each pair's shared-paper count as the edge's 'weight' attribute.
for src_id, dst_id in graph.edges:
    graph[src_id][dst_id]['weight'] = share_count_copy[src_id][dst_id]

In [28]:
nx.write_edgelist(graph, "topic_cs.edgelist")

In [30]:
# Run node2vec

%%bash

module swap python3/intel  python/intel/2.7.12
# module load python/intel/2.7.12
python node2vec/src/main.py --input topic_cs.edgelist --output topic_cs.emd

Walk iteration:
1 / 10
2 / 10
3 / 10
4 / 10
5 / 10
6 / 10
7 / 10
8 / 10
9 / 10
10 / 10


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [31]:
# Identify embedding topics


# Build the topic id -> display name lookup once, instead of filtering the
# whole `topics` frame for every embedding row.
topic_name_by_id = dict(zip(topics['topic'], topics['name']))

emb_dict = {}    # topic name -> embedding vector (list of floats)
topic_list = []  # topic names, in the order they appear in the file
# NOTE(review): emb_dict is keyed by name, so two topic ids sharing a name
# would overwrite each other - confirm names are unique.
with open('topic_cs.emd', 'r') as topic_embedding:
    # The first line is skipped; presumably it is the "<count> <dim>" header.
    next(topic_embedding, None)
    for line in topic_embedding:
        fields = line.split()
        if not fields:
            continue
        # First field is the topic id, the rest are the vector components.
        # An id missing from `topics` raises KeyError.
        topic_name = topic_name_by_id[int(fields[0])]
        topic_list.append(topic_name)
        emb_dict[topic_name] = [float(value) for value in fields[1:]]



In [32]:
# Tabulate the embeddings: column 0 holds the topic name, column 1 its vector.
pd.DataFrame(emb_dict.items())

Unnamed: 0,0,1
0,Radio signal strength,"[0.129276, 0.208627, -0.00491, -0.017112, 0.13..."
1,Wind power generation,"[-0.155464, -0.187098, 0.256471, 0.003277, 0.2..."
2,Image Identifier,"[0.015208, -0.011035, -0.18426, 0.085069, 0.24..."
3,Broadcast time,"[0.084968, 0.319746, -0.073701, -0.202103, 0.0..."
4,Mandarin speech recognition,"[0.009018, 0.062095, -0.257845, 0.08299, 0.322..."
...,...,...
13072,Radio broadcasting,"[-0.025745, 0.318107, -0.252779, -0.202191, -0..."
13073,Magnetic line,"[-0.005062, 0.043075, 0.044661, 0.061611, 0.17..."
13074,Identity matrix,"[0.095931, 0.231467, -0.214167, -0.033197, 0.3..."
13075,Credential,"[0.11215, -0.022872, 0.107568, -0.162411, -0.0..."


In [34]:
# Save the name / vector table as CSV.
# NOTE(review): each vector is a Python list in one cell, so it is presumably
# written as its string form and must be parsed back on load - confirm.
pd.DataFrame(emb_dict.items()).to_csv('topic_embedding_cs.csv')

In [4]:
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

In [None]:
## Compute cosine similarity between two words

# Stack the vectors once, row i belonging to topic_list[i], so each topic is
# compared against all later topics in a single call instead of one call per pair.
embedding_matrix = np.array([emb_dict[name] for name in topic_list], dtype=float)

# Keys are (earlier topic, later topic) in topic_list order; each unordered
# pair appears exactly once.
two_word_similarity = {}
for i in range(len(topic_list) - 1):
    row_similarities = cosine_similarity(embedding_matrix[i:i + 1],
                                         embedding_matrix[i + 1:])[0]
    for j, similarity in enumerate(row_similarities.tolist(), start=i + 1):
        two_word_similarity[(topic_list[i], topic_list[j])] = similarity

In [None]:
# One row per topic pair: the (topic, topic) tuple key and its cosine similarity.
two_word_similarity_dataframe = pd.DataFrame(two_word_similarity.items(), columns = ['Topics', 'Similarity'])

In [None]:
# Save the pairwise similarities as a tab-separated file.
two_word_similarity_dataframe.to_csv('static_two_words_similarity.tsv', sep = '\t')