In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from multiprocessing import Pool
from functools import partial
%matplotlib inline

In [2]:
# Load the fields-of-study table. The file is read with header=None, so the
# column names are supplied explicitly.
topics = pd.read_csv(
    '../All data/advanced/FieldsOfStudy.txt',
    sep='\t',
    header=None,
    names=[
        'topic', 'Rank', 'NormalizedName', 'name', 'MainType',
        'Level', 'PaperCount', 'CitationCount', 'CreatedDate',
    ],
)

In [19]:
# Distinct 'topic' ids of the rows whose 'Level' column equals 3.
level3_topics = topics.loc[topics['Level'] == 3, 'topic'].unique()

In [20]:
# Number of distinct topic ids selected for Level == 3.
len(level3_topics)

289894

In [9]:
# Open the paper/field-of-study assignments as a chunked reader that yields
# one million rows at a time instead of loading the whole file.
chunks = pd.read_csv(
    '../All data/advanced/PaperFieldsOfStudy.txt',
    sep='\t',
    header=None,
    names=['PaperId', 'FieldOfStudyId', 'Score'],
    chunksize=1000000,
)

In [10]:
# Keep only the paper/field rows whose field is one of the level-3 topics.
# Matching rows are gathered in a list and concatenated once: calling
# pd.concat inside the loop re-copies the accumulated frame on every chunk.
level3_parts = [chunk[chunk['FieldOfStudyId'].isin(level3_topics)] for chunk in chunks]

# An empty reader yields no parts; fall back to an empty frame in that case,
# which is what the loop-based version produced.
paper_with_level3 = pd.concat(level3_parts) if level3_parts else pd.DataFrame()

In [14]:
# Open the papers table as a chunked reader (one million rows per chunk).
# The file is read with header=None, so all 23 column names are given here.
chunks = pd.read_csv(
    '../All data/mag/Papers.txt',
    sep='\t',
    header=None,
    names=[
        'PaperId', 'Rank', 'Doi', 'DocType', 'PaperTitle', 'OriginalTitle',
        'BookTitle', 'Year', 'Date', 'Publisher', 'JournalId',
        'ConferenceSeriesId', 'ConferenceInstanceId', 'Volume', 'Issue',
        'FirstPage', 'LastPage', 'ReferenceCount', 'CitationCount',
        'EstimatedCitation', 'OriginalVenue', 'FamilyId', 'CreatedDate',
    ],
    chunksize=1000000,
    low_memory=False,
)

In [None]:
# Restrict the papers table to papers that carry a level-3 topic and whose
# year lies between 1990 and 2017 (both bounds inclusive).
level3_paper_ids = paper_with_level3['PaperId'].unique()  # loop-invariant: computed once

paper_year_parts = []
for chunk in chunks:
    # 'Year' may be missing or non-numeric: keep the first four characters,
    # coerce to a number, and map anything unparseable to 0 so that it fails
    # the year-range test below.
    year_text = chunk['Year'].fillna('0').astype('str').str[:4]
    chunk['Year'] = pd.to_numeric(year_text, errors = 'coerce').fillna(0)

    keep = (
        chunk['PaperId'].isin(level3_paper_ids)
        & (chunk['Year'] >= 1990)
        & (chunk['Year'] <= 2017)
    )
    paper_year_parts.append(chunk[keep])

# Concatenate once at the end instead of re-copying the growing frame on
# every chunk; an empty reader still yields an empty frame.
paper_year = pd.concat(paper_year_parts) if paper_year_parts else pd.DataFrame()

In [5]:
# paper_year = pd.read_csv('paper_year.tsv', sep = '\t')

  interactivity=interactivity, compiler=compiler, result=result)


In [25]:
# (rows, columns) of the filtered papers frame.
paper_year.shape

(92749936, 23)

In [14]:
# Keep only the topic assignments whose paper is present in paper_year.
paper_with_level3 = paper_with_level3.loc[
    lambda frame: frame['PaperId'].isin(paper_year['PaperId'])
]

In [16]:
# paper_with_level3.to_csv('paper_with_level3.tsv', sep = '\t')

## Construct a weighted graph

In [7]:
# paper_with_level3 = pd.read_csv('paper_with_level3.tsv', sep = '\t', index_col = 0)

  mask |= (ar1 == a)


In [8]:
# Preview the first rows of the paper/topic assignment table.
paper_with_level3.head()

Unnamed: 0,PaperId,FieldOfStudyId,Score
0,217773392,2778063415,0.501878
1,178181438,2776662696,0.469016
2,36070428,2779234561,0.485536
4,107407429,163904656,0.716992
6,89287845,108578832,0.788253


In [10]:
def find_link_all(target_pIds = None):
    """Count, for every pair of fields of study, how many papers list both.

    Reads the module-level ``paper_with_level3`` frame (columns ``PaperId``
    and ``FieldOfStudyId``).

    Parameters
    ----------
    target_pIds : iterable of paper ids, optional
        When given, only papers whose id is in this collection are counted.
        ``None`` (the default) counts every paper.

    Returns
    -------
    dict
        ``share_count[fId][fId2]`` is the number of papers tagged with both
        ``fId`` and ``fId2`` (``fId != fId2``). Every pair is recorded in both
        directions, and every field seen gets an entry, possibly an empty one.
    """
    # Build the membership set once. The previous test indexed
    # ``target_pIds[pId]`` instead of checking membership, which raised for
    # any list or set of ids; ``is not None`` also avoids an element-wise
    # comparison if an array is passed.
    wanted = None if target_pIds is None else set(target_pIds)

    # Walk the two id columns directly: iterrows() builds a Series per row,
    # which is much slower and upcasts the ids when the frame holds floats.
    pId_fIds = {}  # key pId, value = list of fId (keywords)
    id_pairs = zip(paper_with_level3["PaperId"], paper_with_level3["FieldOfStudyId"])
    for pId, fId in id_pairs:
        if wanted is not None and pId not in wanted:
            continue
        pId_fIds.setdefault(pId, []).append(fId)

    share_count = {}
    for fIds in pId_fIds.values():
        for fId in fIds:
            # Register the field even when it has no partner in this paper.
            links = share_count.setdefault(fId, {})
            for fId2 in fIds:
                if fId == fId2:
                    continue
                links[fId2] = links.get(fId2, 0) + 1

    return share_count

In [None]:
# Count topic co-occurrences over every paper (no paper-id filter).
share_count = find_link_all()

In [19]:
# Work on a deep copy so that removing links with a count below 10 (done in
# the following cell) leaves the original `share_count` untouched.

from copy import deepcopy
share_count_copy = deepcopy(share_count)

In [20]:
# Prune weak links in place: drop every pair whose count is below 10, then
# drop any topic that is left without neighbours.
for source_id in list(share_count_copy):
    neighbours = share_count_copy[source_id]
    weak_targets = [target_id for target_id, count in neighbours.items() if count < 10]
    for target_id in weak_targets:
        del neighbours[target_id]

    if not neighbours:
        del share_count_copy[source_id]

In [65]:
import networkx as nx

In [72]:
# Build the undirected topic graph: one edge per surviving (topic, topic) pair,
# with node ids cast to plain ints.
graph = nx.Graph()
graph.add_edges_from(
    (int(source_id), int(target_id))
    for source_id, neighbours in share_count_copy.items()
    for target_id in neighbours
)

In [73]:
# Attach the co-occurrence count to each edge as its 'weight' attribute.
for source_id, target_id in graph.edges:
    graph[source_id][target_id]['weight'] = share_count_copy[source_id][target_id]

In [74]:
# Persist the graph as an edge list; this file is the --input of the node2vec run.
# NOTE(review): with the default data=True, networkx writes each edge's
# attribute dict (e.g. {'weight': 12}) as the third field. Confirm this is the
# format the downstream reader expects.
nx.write_edgelist(graph, "topic.edgelist")

In [76]:
# Run node2vec

%%bash

# Swap the loaded python3 module for Python 2.7.12 before running node2vec.
module swap python3/intel  python/intel/2.7.12
# module load python/intel/2.7.12
# Embed the topic graph; only --input/--output are given, so every other
# option falls back to main.py's defaults.
# NOTE(review): no --weighted flag is passed. Presumably the edge weights in
# topic.edgelist are not used by the walk; confirm against main.py.
python node2vec/src/main.py --input topic.edgelist --output test.emd

Walk iteration:
1 / 10
2 / 10
3 / 10
4 / 10
5 / 10
6 / 10
7 / 10
8 / 10
9 / 10
10 / 10


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [3]:
# Identify embedding topics

# Map topic id -> name once. Filtering the whole `topics` frame for every
# embedding line repeated a full-column scan per line. An id missing from
# `topics` now raises KeyError (the frame lookup raised ValueError).
topic_name_by_id = dict(zip(topics['topic'], topics['name']))

emb_dict = {}    # topic name -> embedding vector (list of floats)
topic_list = []  # topic names, in the order they appear in the file
with open('test.emd', 'r') as topic_embedding:
    # The first line is skipped, as before (presumably the "count dimension"
    # header of the embedding file; confirm).
    next(topic_embedding, None)
    for line in topic_embedding:
        fields = line.split()
        if not fields:
            continue  # blank line: nothing to record
        # First token is the topic id, the remaining tokens are the vector.
        topic_name = topic_name_by_id[int(fields[0])]
        topic_list.append(topic_name)
        emb_dict[topic_name] = [float(num) for num in fields[1:]]



In [4]:
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

In [None]:
## Compute cosine similarity between two words

# Stack the embeddings once, in topic_list order, instead of rebuilding two
# arrays for every pair.
embedding_matrix = np.array([emb_dict[name] for name in topic_list], dtype=float)

two_word_similarity = {}
for i in range(len(topic_list) - 1):
    # One call scores topic i against every later topic, which replaces
    # len(topic_list) - i - 1 separate single-pair calls. Result shape is
    # (1, number_of_later_topics); values may differ from the per-pair calls
    # only by floating-point rounding.
    row = cosine_similarity(embedding_matrix[i:i + 1], embedding_matrix[i + 1:])[0]
    for j, similarity in enumerate(row.tolist(), start=i + 1):
        two_word_similarity[(topic_list[i], topic_list[j])] = similarity

In [None]:
# Materialise the (pair, similarity) items as a list: a dict view is not a
# sequence, and older pandas releases reject it with "DataFrame constructor
# not properly called!". A list is accepted by every version.
two_word_similarity_dataframe = pd.DataFrame(
    list(two_word_similarity.items()),
    columns = ['Topics', 'Similarity'],
)

In [None]:
# Save the pairwise similarities as a tab-separated file (row index included).
two_word_similarity_dataframe.to_csv(
    'static_two_words_similarity.tsv',
    sep='\t',
)