In [1]:
import pandas as pd
from tqdm import tqdm
import pickle
from glob import glob
from nltk.tokenize import sent_tokenize
from collections import Counter, defaultdict
# from google.colab import drive
# drive.mount('content/')
pd.set_option('display.max_columns', None)

# 1. pickle로 저장한 pybliometrics 논문 객체를 불러와서 BERT 학습할 Corpus 구축

In [None]:
# Integrate dictionaries of pybliometrics class
files = glob(r"D:\myproject\research_work\citation_prediction\rsc\preparation_data\*.pickle")
data_dict = {}
for file in files:
    with open(file, 'rb') as f:   
        data_dict.update(pickle.load(f))

In [26]:
# Define functions for data preprocessing
def copyright_tag_remove(sentence): # Remove copyright tag in abstract
    # Remove from © to end
    front_idx = sentence.find('©')
    if front_idx == -1:
        return sentence
    end_idx = len(sentence)
    erase_target = sentence[front_idx:end_idx]
    replace_sentence = sentence.replace(erase_target, "")
    # Remove 'Copyright' specified
    replace_sentence = replace_sentence.replace('Copyright', "")
    return  replace_sentence

def sentence_tokenize(sentences): # Seperate corpus in sentences
    st = sent_tokenize(sentences)
    if len(st) == 1:
        return sentences
    else:
        return '\n'.join(st)

# Define functions for data postprocessing
def build_token_dict_from_vocabs(vocab_path):
    token_dict = {}
    with open(vocab_path, 'r', encoding='utf-8') as f:
        for line in f.readlines():
            token = line.strip()
            token_dict[token] = len(token_dict)
    return token_dict

In [None]:
# Make dataframe from class dictionary data
df = pd.DataFrame()
for obj in tqdm(data_dict.values()):
    df = df.append(pd.DataFrame(obj.results))

# Extract corpus needed from full dataframe
df_treated = df.dropna(subset=['title', 'description'])
df_treated = df_treated.reset_index(drop=True)
corpus = df_treated[['title', 'description']]

# Preprocessing
corpus['description'] = corpus['description'].apply(copyright_tag_remove)

In [None]:
# Save to corpus.text from dataframe corpus
with open("rsc/conf/corpus/corpus.txt", 'w', encoding='utf-8') as f:
    for i in tqdm(range(len(corpus))):
        # Title 이 2개 문장 이상일 경우를 대비하여 문장 토큰화
        f.write(sentence_tokenize(corpus.loc[i, 'title']))
        f.write("\n")
        f.write(sentence_tokenize(corpus.loc[i, 'description']))
        f.write("\n\n")

In [21]:
# Make vocab.txt
!python src/make_vocab/wordpiece.py --corpus=rsc/conf/corpus/corpus.txt --iter=1000 --fname=rsc/conf/vocab/vocab.txt

^C


# 2. 구축한 Vocab을 통해 학습 데이터 생성

In [23]:
token_dict = build_token_dict_from_vocabs('rsc/conf/vocab/vocab.txt')
len(token_dict)

In [27]:
token_dict

{'[PAD]': 0,
 '[unused1]': 1,
 '[unused2]': 2,
 '[unused3]': 3,
 '[unused4]': 4,
 '[unused5]': 5,
 '[unused6]': 6,
 '[unused7]': 7,
 '[unused8]': 8,
 '[unused9]': 9,
 '[unused10]': 10,
 '[unused11]': 11,
 '[unused12]': 12,
 '[unused13]': 13,
 '[unused14]': 14,
 '[unused15]': 15,
 '[unused16]': 16,
 '[unused17]': 17,
 '[unused18]': 18,
 '[unused19]': 19,
 '[unused20]': 20,
 '[unused21]': 21,
 '[unused22]': 22,
 '[unused23]': 23,
 '[unused24]': 24,
 '[unused25]': 25,
 '[unused26]': 26,
 '[unused27]': 27,
 '[unused28]': 28,
 '[unused29]': 29,
 '[unused30]': 30,
 '[unused31]': 31,
 '[unused32]': 32,
 '[unused33]': 33,
 '[unused34]': 34,
 '[unused35]': 35,
 '[unused36]': 36,
 '[unused37]': 37,
 '[unused38]': 38,
 '[unused39]': 39,
 '[unused40]': 40,
 '[unused41]': 41,
 '[unused42]': 42,
 '[unused43]': 43,
 '[unused44]': 44,
 '[unused45]': 45,
 '[unused46]': 46,
 '[unused47]': 47,
 '[unused48]': 48,
 '[unused49]': 49,
 '[unused50]': 50,
 '[unused51]': 51,
 '[unused52]': 52,
 '[unused53]': 53

# 3. 학습 데이터 준비

In [40]:
files = glob(r"D:\BERT-based-Paper-Impact-Prediction\rsc\training_data\*.pickle")

eids = []
for file in files:
    with open(file, 'rb') as f:   
        df = pd.read_pickle(f)
        eids.extend(df.eid.tolist())
pickle.dump(eids, 'rsc/preparation_data/eids.pickle')

TypeError: file must have a 'write' attribute

In [39]:
pickle.load(eids, 'rsc/`preparation_data/eids.pickle')

TypeError: load() takes exactly 1 positional argument (2 given)

In [34]:
eids

['2-s2.0-77955927984',
 '2-s2.0-77955926968',
 '2-s2.0-77955926720',
 '2-s2.0-77955926410',
 '2-s2.0-77955926208',
 '2-s2.0-77955926195',
 '2-s2.0-77955924559',
 '2-s2.0-77955924402',
 '2-s2.0-77955924296',
 '2-s2.0-77955923475',
 '2-s2.0-77955921838',
 '2-s2.0-77955921721',
 '2-s2.0-77955921491',
 '2-s2.0-77955920480',
 '2-s2.0-77953364289',
 '2-s2.0-77953364243',
 '2-s2.0-77953362241',
 '2-s2.0-77953361445',
 '2-s2.0-77953360763',
 '2-s2.0-77953359341',
 '2-s2.0-77953359284',
 '2-s2.0-77953359101',
 '2-s2.0-77953359013',
 '2-s2.0-77953358967',
 '2-s2.0-77953358785',
 '2-s2.0-77953358640',
 '2-s2.0-77953357627',
 '2-s2.0-77953356855',
 '2-s2.0-77953356361',
 '2-s2.0-77952675249',
 '2-s2.0-77952675066',
 '2-s2.0-77952674606',
 '2-s2.0-77952674573',
 '2-s2.0-77952673899',
 '2-s2.0-77952673094',
 '2-s2.0-77952673044',
 '2-s2.0-77952669691',
 '2-s2.0-77952669448',
 '2-s2.0-77952669202',
 '2-s2.0-77952668797',
 '2-s2.0-77952668621',
 '2-s2.0-77952293338',
 '2-s2.0-77952293182',
 '2-s2.0-77