In [1]:
import pathlib
import numpy as np
import scipy.sparse
import scipy.io
import os
import sys
import pandas as pd
import pickle
from sklearn.feature_extraction.text import CountVectorizer
import networkx as nx
# import utils.preprocess
from sklearn.model_selection import train_test_split
# The stop-word list is exposed by the public `text` module; the private
# `stop_words` module was deprecated in scikit-learn 0.22 and removed in 0.24.
# The old location is kept as a fallback for legacy installs.
try:
    from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as sklearn_stopwords
except ImportError:
    from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS as sklearn_stopwords
from nltk import word_tokenize
from nltk.corpus import stopwords as nltk_stopwords
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /home/ddatta/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/ddatta/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
def _read_dblp_table(path, column_names, encoding='utf-8'):
    """Read one headerless, tab-separated raw table into a DataFrame.

    Default NaN markers are disabled (``keep_default_na=False``) and the
    columns are named from ``column_names``.
    """
    return pd.read_csv(
        path,
        sep='\t',
        header=None,
        names=column_names,
        keep_default_na=False,
        encoding=encoding,
    )


author_label = _read_dblp_table('raw/DBLP/author_label.txt', ['author_id', 'label', 'author_name'])
paper_author = _read_dblp_table('raw/DBLP/paper_author.txt', ['paper_id', 'author_id'])
paper_conf = _read_dblp_table('raw/DBLP/paper_conf.txt', ['paper_id', 'conf_id'])
paper_term = _read_dblp_table('raw/DBLP/paper_term.txt', ['paper_id', 'term_id'])
# paper.txt is the only table read as cp1252 instead of utf-8.
papers = _read_dblp_table('raw/DBLP/paper.txt', ['paper_id', 'paper_title'], encoding='cp1252')
terms = _read_dblp_table('raw/DBLP/term.txt', ['term_id', 'term'])
confs = _read_dblp_table('raw/DBLP/conf.txt', ['conf_id', 'conf'])

In [None]:
# Narrow the corpus to papers written by authors listed in author_label, then
# restrict the paper-keyed tables and the term vocabulary to match.
authors = author_label['author_id'].to_list()
has_known_author = paper_author['author_id'].isin(authors)
paper_author = paper_author.loc[has_known_author].reset_index(drop=True)
valid_papers = paper_author['paper_id'].unique()
print('Number of papers :', len(valid_papers))

papers = papers.loc[papers['paper_id'].isin(valid_papers)].reset_index(drop=True)
paper_conf = paper_conf.loc[paper_conf['paper_id'].isin(valid_papers)].reset_index(drop=True)
# This count is the number of paper_conf rows that survived the filter.
print('Number of papers :', len(paper_conf))

paper_term = paper_term.loc[paper_term['paper_id'].isin(valid_papers)].reset_index(drop=True)
# Only terms still referenced by a remaining paper are kept.
valid_terms = paper_term['term_id'].unique()
terms = terms.loc[terms['term_id'].isin(valid_terms)].reset_index(drop=True)

# term lemmatization and grouping
# Terms that share a WordNet lemma are merged into a single term whose id is
# the term_id of the first term (in `terms` row order) producing that lemma.
lemmatizer = WordNetLemmatizer()
terms['lemma'] = [lemmatizer.lemmatize(term) for term in terms['term']]

lemma_id_mapping = {}
for term_id, lemma in zip(terms['term_id'], terms['lemma']):
    # setdefault keeps the id of the first term seen for each lemma.
    lemma_id_mapping.setdefault(lemma, term_id)
terms['lemma_id'] = [lemma_id_mapping[lemma] for lemma in terms['lemma']]

# Re-key every (paper, term) pair to the id of the term's lemma. A plain dict
# lookup is used on purpose: a term_id missing from `terms` raises KeyError
# instead of silently becoming NaN.
term_lemma_mapping = dict(zip(terms['term_id'], terms['lemma_id']))
paper_term['lemma_id'] = [term_lemma_mapping[term_id] for term_id in paper_term['term_id']]

# From here on the lemma id/text replace the original term id/text; merging
# lemmas can create duplicate rows, which are dropped.
paper_term = (
    paper_term[['paper_id', 'lemma_id']]
    .rename(columns={'lemma_id': 'term_id'})
    .drop_duplicates()
)
terms = (
    terms[['lemma_id', 'lemma']]
    .rename(columns={'lemma_id': 'term_id', 'lemma': 'term'})
    .drop_duplicates()
)

# filter out stopwords from terms
# A term is dropped when its text appears in either the scikit-learn or the
# NLTK English stop-word list; paper-term links to dropped terms go with it.
stopwords = sklearn_stopwords.union(set(nltk_stopwords.words('english')))
is_stopword = terms['term'].isin(stopwords)
stopword_id_list = terms.loc[is_stopword, 'term_id'].to_list()
paper_term = paper_term[~paper_term['term_id'].isin(stopword_id_list)].reset_index(drop=True)
terms = terms[~is_stopword].reset_index(drop=True)

# Put every entity table in ascending id order with a fresh 0..n-1 index.
author_label, papers, terms, confs = [
    frame.sort_values(id_column).reset_index(drop=True)
    for frame, id_column in (
        (author_label, 'author_id'),
        (papers, 'paper_id'),
        (terms, 'term_id'),
        (confs, 'conf_id'),
    )
]

# Report the size of each entity table.
for caption, frame in (
    ('Number of conferences ', confs),
    ('Number of authors ', author_label),
    ('Number of terms ', terms),
    ('Number of papers ', papers),
):
    print(caption, len(frame))

# Plain Python lists of the original ids, in sorted order.
authors_list = author_label['author_id'].tolist()
papers_list = papers['paper_id'].tolist()
term_list = terms['term_id'].tolist()
conf_list = confs['conf_id'].tolist()
# Total number of entities across all four types.
dim = sum(len(group) for group in (authors_list, papers_list, term_list, confs))
print(' Total entities :: ', dim)


# Map each original id to a serial id. A row's serial id is its index label
# plus an offset equal to the combined size of the preceding entity tables,
# in the order: authors, papers, terms, conferences.
paper_offset = len(author_label)
term_offset = paper_offset + len(papers)
conf_offset = term_offset + len(terms)

author_id_mapping = dict(zip(author_label['author_id'], author_label.index))
paper_id_mapping = dict(zip(papers['paper_id'], papers.index + paper_offset))
term_id_mapping = dict(zip(terms['term_id'], terms.index + term_offset))
conf_id_mapping = dict(zip(confs['conf_id'], confs.index + conf_offset))


# Long-format lookup table: one row per entity holding its type ('domain'),
# its original id ('entity_id') and its serial id ('serial_id').
type_dict = {
    'author': author_id_mapping,
    'paper': paper_id_mapping,
    'term': term_id_mapping,
    'conf': conf_id_mapping,
}
# Build one frame per domain and concatenate once. DataFrame.append was
# removed in pandas 2.0 and copied the whole table on every call.
domain_frames = []
for _type, _dict in type_dict.items():
    _df = pd.DataFrame({'entity_id': list(_dict.keys()), 'serial_id': list(_dict.values())})
    _df['domain'] = _type
    domain_frames.append(_df)
# Select the columns explicitly to keep the original column order.
entity_id_map = pd.concat(domain_frames, ignore_index=True)[['domain', 'entity_id', 'serial_id']]

    
# ======================================================
# Save data
# ======================================================
data_save_path = 'processed_data/DBLP'
# makedirs creates 'processed_data' and 'processed_data/DBLP' in one call and
# does nothing when they already exist (no check-then-create race).
os.makedirs(data_save_path, exist_ok=True)
# NOTE: the row index is written as the first column here (no index=False),
# unlike the other CSVs in this notebook; kept so existing readers of
# entity_id_mapping.csv see the same layout.
entity_id_map.to_csv(os.path.join(data_save_path, 'entity_id_mapping.csv'))

# Create graph data
# One single-column frame per node type, holding that type's serial ids.
nodes_author_df = pd.DataFrame({'author': list(author_id_mapping.values())})
nodes_paper_df = pd.DataFrame({'paper': list(paper_id_mapping.values())})
nodes_term_df = pd.DataFrame({'term': list(term_id_mapping.values())})
nodes_conf_df = pd.DataFrame({'conf': list(conf_id_mapping.values())})

# Write each node table to its own CSV, without the row index.
for file_name, frame in (
    ('nodes_author.csv', nodes_author_df),
    ('nodes_paper.csv', nodes_paper_df),
    ('nodes_term.csv', nodes_term_df),
    ('nodes_conf.csv', nodes_conf_df),
):
    frame.to_csv(os.path.join(data_save_path, file_name), index=False)

def _build_edge_list(frame, source_col, source_map, target_col, target_map):
    """Translate two id columns of ``frame`` into (source, target) serial-id pairs.

    Parameters
    ----------
    frame : pandas.DataFrame
        Link table with one row per edge.
    source_col, target_col : str
        Columns of ``frame`` holding the original ids of the two endpoints.
    source_map, target_map : dict
        Original id -> serial id lookups for the two endpoints.

    Returns
    -------
    list of tuple
        One ``(source_serial_id, target_serial_id)`` tuple per row, in row order.

    Raises
    ------
    KeyError
        If an id in ``frame`` is missing from the corresponding mapping.
    """
    return [
        (source_map[source_id], target_map[target_id])
        for source_id, target_id in zip(frame[source_col], frame[target_col])
    ]


def _save_edge_list(edge_list, file_name):
    """Write ``edge_list`` to ``data_save_path/file_name`` as a source,target CSV."""
    # Building the frame from the tuple list (not np.array) also works when the
    # list is empty: np.array([]) is 1-D and cannot be given two column names.
    edges_df = pd.DataFrame(edge_list, columns=['source', 'target'])
    edges_df.to_csv(os.path.join(data_save_path, file_name), index=False)


# paper -> author edges
PA_edge_list = _build_edge_list(paper_author, 'paper_id', paper_id_mapping, 'author_id', author_id_mapping)
_save_edge_list(PA_edge_list, 'PA_edges.csv')

# paper -> term edges
PT_edge_list = _build_edge_list(paper_term, 'paper_id', paper_id_mapping, 'term_id', term_id_mapping)
_save_edge_list(PT_edge_list, 'PT_edges.csv')

# paper -> conference edges
PC_edge_list = _build_edge_list(paper_conf, 'paper_id', paper_id_mapping, 'conf_id', conf_id_mapping)
_save_edge_list(PC_edge_list, 'PC_edges.csv')

In [8]:
# ==============================
# Create data for HIN2Vec
# ==============================

# One row per edge. `rel` encodes which edge list the row came from:
# 0 = PA_edge_list, 1 = PT_edge_list, 2 = PC_edge_list.
# The records are collected first and the frame is built once; appending to a
# DataFrame row by row copies the whole frame on every call, and
# DataFrame.append no longer exists in pandas 2.0.
hin2vec_records = [
    (edge[0], edge[1], rel)
    for rel, edge_list in enumerate((PA_edge_list, PT_edge_list, PC_edge_list))
    for edge in edge_list
]
df = pd.DataFrame(hin2vec_records, columns=['node1', 'node2', 'rel'])
  

In [9]:
# Cast the node ids and the relation code to int before writing.
df = df.astype({'node1': int, 'node2': int, 'rel': int})
fpath = os.path.join(data_save_path, 'hin2vec_dblp_input.txt')
# index=False is the documented way to omit the row index; the previous
# index=None only worked because None is falsy.
df.to_csv(fpath, index=False, sep=',')

In [10]:
# Show the HIN2Vec edge table (node1, node2, rel) as the cell output.
df

Unnamed: 0,node1,node2,rel
0,4057,262,0
1,4058,263,0
2,4059,263,0
3,4059,264,0
4,4060,266,0
...,...,...,...
119778,18380,26127,2
119779,18381,26127,2
119780,18382,26127,2
119781,18383,26127,2
