# paper_author_relationship

In [1]:
import pandas as pd
import numpy as np
from itertools import combinations, chain

data = pd.read_csv('/home/swryu/GM_Project/dataset/paper_author_relationship.csv', sep='\t', header=None)
data.columns = ['author_id']

In [2]:
data.shape # 449006 papers

(449006, 1)

In [3]:
data.head(10)

Unnamed: 0,author_id
0,2098387954495
1,17912721496211
2,21109241085591
3,301841321262
4,21540482003386891741
5,1564904445108
6,671937101628
7,409024661602114086652496
8,4243461180628
9,8507121366915


In [2]:
def split_author_ids_and_make_list(row):
    result = row.tolist()[0].split(',')
    result = [int(x) for x in result]
    result = [str(x) for x in sorted(result)]
    return result

data['author_id_list'] = data.apply(split_author_ids_and_make_list, axis=1)
del data['author_id']

data.head(5)

Unnamed: 0,author_id_list
0,"[954495, 2098387]"
1,"[1496211, 1791272]"
2,"[1085591, 2110924]"
3,"[301841, 321262]"
4,"[891741, 2003386, 2154048]"


In [18]:
def find_num(row, interest):
    if isinstance(interest, int):
        interest = str(interest)
    item = row.author_id_list
    result = interest in item
    return result

data.loc[data.apply(find_num, interest = 90220, axis=1),:]

Unnamed: 0,author_id_list
17915,"[90220, 371734, 374545, 794773, 1141998, 1847602]"
20646,"[90220, 93203, 371734, 497245, 498743]"
25026,"[90220, 1007888]"
29068,"[90220, 1207974]"
32010,"[90220, 1207974, 1360094]"
...,...
429517,"[90220, 1007888, 1842914]"
430841,"[90220, 497245, 498743, 1007888]"
438501,"[90220, 732217, 1007888]"
442055,"[90220, 497245, 1078038]"


In [25]:
data.index.values

array([     0,      1,      2, ..., 449003, 449004, 449005])

In [11]:
data.author_id_list.values

array([list(['954495', '2098387']), list(['1496211', '1791272']),
       list(['1085591', '2110924']), ..., list(['604830', '1232756']),
       list(['947867', '1827957']), list(['288212', '1273964'])],
      dtype=object)

In [21]:
from itertools import chain

min(int(x) for x in list(chain(*data.author_id_list.values)))

38

In [22]:
max(int(x) for x in list(chain(*data.author_id_list.values)))

2293782

In [15]:
list(zip(*(data.index.values.astype(str), data.author_id_list.values)))

[('0', ['954495', '2098387']),
 ('1', ['1496211', '1791272']),
 ('2', ['1085591', '2110924']),
 ('3', ['301841', '321262']),
 ('4', ['891741', '2003386', '2154048']),
 ('5', ['445108', '1564904']),
 ('6', ['101628', '671937']),
 ('7', ['114086', '409024', '652496', '661602']),
 ('8', ['424346', '1180628']),
 ('9', ['850712', '1366915']),
 ('10', ['648138', '1192744', '1322437']),
 ('11', ['177031', '204311', '233245', '1471407']),
 ('12', ['459682', '1446575']),
 ('13', ['1664662', '2049040']),
 ('14', ['950551', '1622916', '1688634']),
 ('15', ['793313', '2046153']),
 ('16', ['184856', '2141978']),
 ('17', ['805617', '1026099']),
 ('18', ['27872', '339943', '1496471']),
 ('19', ['307641', '2003246']),
 ('20', ['10002', '2093917']),
 ('21', ['554961', '720993']),
 ('22', ['130685', '955004']),
 ('23', ['526305', '1348294']),
 ('24', ['1045411', '1132715']),
 ('25', ['774561', '930136']),
 ('26', ['249938', '1479275']),
 ('27', ['318402', '554740', '711107', '1179874', '2117503', '21437

In [33]:
len(set(list(chain(*data.author_id_list.values))))

61442

In [5]:
all_aid = np.hstack(data.author_id_list)

print(f'The number of author_id_list values: {len(all_aid)}')
print(f'The number of unique author_id_list values: {len(np.unique(all_aid).tolist())}')

The number of author_id_list values: 1142106
The number of unique author_id_list values: 61442


---
---




# train_dataset

In [19]:
train_data = pd.read_csv('/home/swryu/GM_Project/dataset/train_dataset.csv', sep=',')
train_data.columns = ['ID1', 'ID2', 'label']

def get_right_label(row):
    result = row.label.lstrip()
    result = (result=='True')
    return result

train_data['label'] = train_data.apply(get_right_label, axis=1)
train_data.head(5)


Unnamed: 0,ID1,ID2,label
0,1483127,2059226,True
1,90220,1837844,True
2,1114856,1167164,True
3,1034527,2187998,True
4,314932,75253,True


---
---




# valid_dataset

In [7]:
valid_data = pd.read_csv('/home/swryu/GM_Project/dataset/valid_dataset.csv', sep=',')
valid_data.columns = ['ID1', 'ID2', 'label']

valid_data['label'] = valid_data.apply(get_right_label, axis=1)
valid_data.head(5)

Unnamed: 0,ID1,ID2,label
0,1812733,1079034,True
1,303538,1581670,True
2,2265991,468383,True
3,1827317,1988611,True
4,1672018,1369014,True


In [8]:
valid_data['label'].sum()

500

---
---




# query_dataset

In [9]:
query_data = pd.read_csv('/home/swryu/GM_Project/dataset/query_dataset.csv', sep=',')
query_data.columns = ['ID1', 'ID2']

In [10]:
query_data

Unnamed: 0,ID1,ID2
0,1192880,1245611
1,372775,47462
2,1171864,1851718
3,410597,625748
4,998018,119791
...,...,...
995,746283,1124006
996,1760293,658554
997,146420,1432044
998,232228,1622720


In [4]:
import pickle

with open('/home/swryu/GM_Project/paper_author_id_pairs.pkl', 'rb') as f:
    data = pickle.load(f)
f.close()

In [6]:
import random
random.shuffle(data)

In [7]:
extracted = data[:1200]

In [8]:
extracted

[('2026523', '650289'),
 ('1806583', '1186207'),
 ('577804', '1772299'),
 ('749195', '594778'),
 ('1062631', '857385'),
 ('1557152', '2289300'),
 ('888455', '1283786'),
 ('956549', '1678947'),
 ('1469993', '1064759'),
 ('873101', '1502177'),
 ('555214', '2229417'),
 ('1542389', '2250797'),
 ('1188928', '2284054'),
 ('1563082', '752178'),
 ('323549', '2168253'),
 ('1974390', '1965854'),
 ('84027', '615768'),
 ('1895148', '786524'),
 ('1504820', '1618576'),
 ('1311113', '1140104'),
 ('1055408', '1053544'),
 ('923739', '1027078'),
 ('556120', '2147842'),
 ('189261', '298376'),
 ('712445', '681288'),
 ('1314881', '2243853'),
 ('719294', '2145699'),
 ('276415', '771184'),
 ('656031', '1800127'),
 ('1341973', '967586'),
 ('1696613', '906549'),
 ('1890857', '593997'),
 ('5018', '1623659'),
 ('819280', '1448454'),
 ('1303184', '558725'),
 ('974647', '1075347'),
 ('1882431', '459947'),
 ('1215790', '1984738'),
 ('1229992', '247202'),
 ('897144', '1101942'),
 ('906197', '120418'),
 ('488878', '7

In [None]:
import os

pred_true_list = [', '.join(x) for x in extracted]
with open(os.path.join('/home/swryu/GM_Project/answer', 'same_author.csv'), 'w') as f:
    for line in pred_true_list:
        f.write(line)
        f.write('\n')
f.close()