# 전체 코드

In [1]:
import os
from glob import glob
import re
import json
from collections import defaultdict, OrderedDict


In [2]:
class Paper:
    """ 각 Paper의 tex file과 bbl file
    """
    def __init__(self, path):
        self.path = path
        self.tex = []
        self.bbl = []
        for file in glob(path + '/*'):
            if file[-4:] == '.tex':
                self.tex.append(file)
            elif file[-4:] == '.bbl':
                self.bbl.append(file)
        
        self.ref_id2title = []
        self.pairs = []
        self.paper_ref_ids = []
        
        self.get_ref_id2title()
        
        self.paper_ref_ids = [list(id_info.keys())[0] for id_info in self.ref_id2title]
        
        self.get_pairs()
        
        self.convert_all_references_to_title()
        
    def get_pairs(self):
        """ Reference가 포함된 문장과 Reference의 쌍을 얻는 함수.
        """
        # tex 파일 한 개인 경우에만 처리
        if len(self.tex) == 1:
            texfile = self.tex[0]
            
            pattern = re.compile('\\\\cite\[?[-\s.,:0-9a-zA-Z]*\]?\{[-\s,:0-9a-zA-Z]*\}')
            
            try:
                with open(texfile, 'r') as f:
                    all_lines = f.readlines()
                    for line in all_lines:
                        if line.count('\cite') == 1:
                            citation = re.findall(pattern, line)    # ['\\cite{Kai, Lichtenbaum, Skorobogatov}']
                                                                  # ['\\cite[Remark 21.3.2]{Patnaik}']
                            if citation:  # regex expression으로 찾아진 부분이 존재하면
                                info = {}
                                try:
                                    end = citation[0].index(']')
                                    ref = citation[0][end+2:-1]
                                except:
                                    ref = citation[0][6:-1]        # ref: 'Kai, Lichtenbaum, Skorobogatov'
                                
                                ref_list = ref.split(',')
                                ref_list = [x.strip() for x in ref_list]
                                
                                if len(ref_list) >= 1:
                                    line = line.strip()
                                    
                                    info["sentence"] = line
                                    info["reference"] = ref_list
                                
                                self.pairs.append(info)
            except:
                print('file을 열 수 없습니다(Encoding error)')
        return self.pairs

    def get_ref_id2title(self):
        """ tex file 안의 ref id와 실제 paper title을 대응시켜주는 함수
        """
        # bbl 파일 한 개인 경우에만 처리
        if len(self.bbl) == 1:
            bblfile = self.bbl[0]
            
            with open(bblfile, 'r') as f:
                all_lines = f.read()
                
                bib_items = all_lines.split('\\bibitem')[1:-1]
                
                for item in bib_items:
                    h, t = item.index('{'), item.index('}')
                    
                    ref_dict = dict()
                    
                    key = item[h+1:t]
                    value = item[t+1:].strip()
                    
                    ref_dict[key] = value
                    
                    self.ref_id2title.append(ref_dict)
                    
        return self.ref_id2title
    
    def convert_all_references_to_title(self):
        """ 각 tex file 안의 ref id를 실제 paper title로 변환하여 self.pairs에 저장하는 함수
        """
        for i, items in enumerate(self.pairs):
            if 'reference' in items.keys():
                ref_id_list = items['reference']

                title_reference = []
    
                for ref_id in ref_id_list:
                    for id_info in self.ref_id2title:
                        if ref_id == list(id_info.keys())[0]:    
                            title_reference.append(list(id_info.values())[0])
                
                self.pairs[i]['reference'] = title_reference
            
        return self.pairs 

In [34]:
def all_paper_dirs():
    """ 모든 paper directory 이름을 가져오는 함수.
    """
    subdirs = []
    for dirs in glob('../data/*'):
        if os.path.isdir(dirs):
            for path in glob(dirs + '/*'):
                if os.path.isdir(path):
                    subdirs.append(path)
    return subdirs

def main():
    subdirs = all_paper_dirs()

    data = OrderedDict()
    new_data = OrderedDict()
    final_data = OrderedDict()
    
    for path in subdirs:
        print(path)
        paper = Paper(path)
        data[path] = paper.pairs

    for paper_dir in data.keys():
        pairs = data[paper_dir]
        new_data[paper_dir] = []
        for item in pairs:
            if len(item['reference']) > 0:
                new_data[paper_dir].append(item)
        if len(new_data[paper_dir]) > 0:
            final_data[paper_dir] = new_data[paper_dir]

    with open('sentence_reference_pairs.json', 'w', encoding='utf-8') as f:
        json.dump(final_data, f, indent='\t')

main()

2101/2101.00044
2101/2101.00162
2101/2101.00020
2101/2101.00139
2101/2101.00147
2101/2101.00004
2101/2101.00078
2101/2101.00037
2101/2101.00089
2101/2101.00112
2101/2101.00116
2101/2101.00174
2101/2101.00180
2101/2101.00038
2101/2101.00190
2101/2101.00063
2101/2101.00001
2101/2101.00097
2101/2101.00199
2101/2101.00118
2101/2101.00173
2101/2101.00191
2101/2101.00057
2101/2101.00049
2101/2101.00015
2101/2101.00098
2101/2101.00007
2101/2101.00018
2101/2101.00026
file을 열 수 없습니다(Encoding error)
2101/2101.00036
2101/2101.00030
2101/2101.00029
2101/2101.00157
2101/2101.00082
2101/2101.00111
2101/2101.00042
2101/2101.00043
2101/2101.00027
2101/2101.00178
2101/2101.00150
2101/2101.00024
2101/2101.00200
2101/2101.00011
2101/2101.00023
2101/2101.00040
2101/2101.00134
2101/2101.00124
2101/2101.00041
2101/2101.00062
2101/2101.00152
2101/2101.00108
2101/2101.00045
2101/2101.00002
2101/2101.00012
2101/2101.00123
2101/2101.00126
2101/2101.00003
file을 열 수 없습니다(Encoding error)
2101/2101.00022
2101/2101.

In [4]:
with open('sentence_reference_pairs.json', 'r') as f:
    jsons = json.load(f)
print(json.dumps(jsons, indent='\t'))

{
	"2101/2101.00044": [
		{
			"sentence": "We show that Deligne's line bundle $\\< {L,M}$ can be obtained from the $\\cK_2$-gerbe $G_{L,M}$ constructed in \\cite{ER} via an integration along the fiber map for gerbes that categorifies the well known one arising from the Leray spectral sequence of $\\pi$. Our construction provides a full account of the biadditivity properties of $\\< {L,M}$.",
			"reference": [
				"E.~Aldrovandi and N.~Ramachandran, \\emph{Cup products, the {H}eisenberg group,\n  and codimension two algebraic cycles}, Doc. Math. \\textbf{21} (2016),\n  1313--1344. \\MR{3578206}"
			]
		},
		{
			"sentence": "Let $S$ be a smooth variety over a field $F$, and let $\\pi:X \\to S$ be a smooth projective morphism of relative dimension one.  Deligne \\cite{SGA4, MR902592} has constructed a bi-additive functor of Picard categories",
			"reference": [
				"\\emph{Th\\'{e}orie des topos et cohomologie \\'{e}tale des sch\\'{e}mas. {T}ome\n  3}, Lecture Notes in Mathematics, Vol.

In [5]:
for i in jsons.keys():
    print(i, len(jsons[i]))

2101/2101.00044 13
2101/2101.00020 132
2101/2101.00004 6
2101/2101.00037 12
2101/2101.00001 23
2101/2101.00097 26
2101/2101.00057 25
2101/2101.00029 23
2101/2101.00043 40
2101/2101.00150 42
2101/2101.00024 15
2101/2101.00041 21
2101/2101.00062 11
2101/2101.00002 10
2101/2101.00012 4
2101/2101.00095 9
2101/2101.00127 10
2101/2101.00129 18
2101/2101.00059 13
2101/2101.00137 2
2101/2101.00025 8
2101/2101.00193 25
2101/2101.00143 15
2101/2101.00064 1
2101/2101.00068 12
2101/2101.00085 31
2101/2101.00047 9
2101/2101.00079 5
2101/2101.00052 9
2101/2101.00054 11
2101/2101.00102 1
2101/2101.00055 6
2101/2101.00110 39
2101/2101.00072 20
2101/2101.00184 32
2101/2101.00031 59
2101/2101.00065 11
2101/2101.00159 6
2101/2101.00142 21
2101/2101.00154 43
2101/2101.00081 23


# 논문 tex 파일과 bbl 파일이 존재하는 경로 가져오기

In [6]:
subdirs = []
for dirs in glob('*'):
    if os.path.isdir(dirs):
        print(dirs)
        for path in glob(dirs + '/*'):
            
            if os.path.isdir(path):
                subdirs.append(path)

# paper = Paper(subdirs)
# pairs = paper.convert_all_references_to_title()
# print(subdirs)
# print(paper.path)
# print(paper.paper_ref_ids)
# print(pairs)

2101


In [7]:
subdirs

['2101/2101.00044',
 '2101/2101.00162',
 '2101/2101.00020',
 '2101/2101.00139',
 '2101/2101.00147',
 '2101/2101.00004',
 '2101/2101.00078',
 '2101/2101.00037',
 '2101/2101.00089',
 '2101/2101.00112',
 '2101/2101.00116',
 '2101/2101.00174',
 '2101/2101.00180',
 '2101/2101.00038',
 '2101/2101.00190',
 '2101/2101.00063',
 '2101/2101.00001',
 '2101/2101.00097',
 '2101/2101.00199',
 '2101/2101.00118',
 '2101/2101.00173',
 '2101/2101.00191',
 '2101/2101.00057',
 '2101/2101.00049',
 '2101/2101.00015',
 '2101/2101.00098',
 '2101/2101.00007',
 '2101/2101.00018',
 '2101/2101.00026',
 '2101/2101.00036',
 '2101/2101.00030',
 '2101/2101.00029',
 '2101/2101.00157',
 '2101/2101.00082',
 '2101/2101.00111',
 '2101/2101.00042',
 '2101/2101.00043',
 '2101/2101.00027',
 '2101/2101.00178',
 '2101/2101.00150',
 '2101/2101.00024',
 '2101/2101.00200',
 '2101/2101.00011',
 '2101/2101.00023',
 '2101/2101.00040',
 '2101/2101.00134',
 '2101/2101.00124',
 '2101/2101.00041',
 '2101/2101.00062',
 '2101/2101.00152',


# 최종 데이터 형태

In [12]:
new_data = OrderedDict()
final_data = OrderedDict()

for paper_dir in data.keys():
    pairs = data[paper_dir]
    new_data[paper_dir] = []
    for item in pairs:
        if len(item['reference']) > 0:
            new_data[paper_dir].append(item)
    if len(new_data[paper_dir]) > 0:
        final_data[paper_dir] = new_data[paper_dir]

In [13]:
final_data

OrderedDict([('2101/2101.00044',
              [{'sentence': "We show that Deligne's line bundle $\\< {L,M}$ can be obtained from the $\\cK_2$-gerbe $G_{L,M}$ constructed in \\cite{ER} via an integration along the fiber map for gerbes that categorifies the well known one arising from the Leray spectral sequence of $\\pi$. Our construction provides a full account of the biadditivity properties of $\\< {L,M}$.",
                'reference': ['E.~Aldrovandi and N.~Ramachandran, \\emph{Cup products, the {H}eisenberg group,\n  and codimension two algebraic cycles}, Doc. Math. \\textbf{21} (2016),\n  1313--1344. \\MR{3578206}']},
               {'sentence': 'Let $S$ be a smooth variety over a field $F$, and let $\\pi:X \\to S$ be a smooth projective morphism of relative dimension one.  Deligne \\cite{SGA4, MR902592} has constructed a bi-additive functor of Picard categories',
                'reference': ["\\emph{Th\\'{e}orie des topos et cohomologie \\'{e}tale des sch\\'{e}mas. {T}ome\n  3}

In [14]:
for paper_dir in new_data.keys():
    print(paper_dir, len(new_data[paper_dir]))
final_data = OrderedDict()

for paper_dir in new_data.keys():
    if len(new_data[paper_dir]) > 0:
        final_data[paper_dir] = new_data[paper_dir]

2101/2101.00044 13
2101/2101.00162 0
2101/2101.00020 132
2101/2101.00139 0
2101/2101.00147 0
2101/2101.00004 6
2101/2101.00078 0
2101/2101.00037 12
2101/2101.00089 0
2101/2101.00112 0
2101/2101.00116 0
2101/2101.00174 0
2101/2101.00180 0
2101/2101.00038 0
2101/2101.00190 0
2101/2101.00063 0
2101/2101.00001 23
2101/2101.00097 26
2101/2101.00199 0
2101/2101.00118 0
2101/2101.00173 0
2101/2101.00191 0
2101/2101.00057 25
2101/2101.00049 0
2101/2101.00015 0
2101/2101.00098 0
2101/2101.00007 0
2101/2101.00018 0
2101/2101.00026 0
2101/2101.00036 0
2101/2101.00030 0
2101/2101.00029 23
2101/2101.00157 0
2101/2101.00082 0
2101/2101.00111 0
2101/2101.00042 0
2101/2101.00043 40
2101/2101.00027 0
2101/2101.00178 0
2101/2101.00150 42
2101/2101.00024 15
2101/2101.00200 0
2101/2101.00011 0
2101/2101.00023 0
2101/2101.00040 0
2101/2101.00134 0
2101/2101.00124 0
2101/2101.00041 21
2101/2101.00062 11
2101/2101.00152 0
2101/2101.00108 0
2101/2101.00045 0
2101/2101.00002 10
2101/2101.00012 4
2101/2101.0012

In [15]:
final_data.keys()

odict_keys(['2101/2101.00044', '2101/2101.00020', '2101/2101.00004', '2101/2101.00037', '2101/2101.00001', '2101/2101.00097', '2101/2101.00057', '2101/2101.00029', '2101/2101.00043', '2101/2101.00150', '2101/2101.00024', '2101/2101.00041', '2101/2101.00062', '2101/2101.00002', '2101/2101.00012', '2101/2101.00095', '2101/2101.00127', '2101/2101.00129', '2101/2101.00059', '2101/2101.00137', '2101/2101.00025', '2101/2101.00193', '2101/2101.00143', '2101/2101.00064', '2101/2101.00068', '2101/2101.00085', '2101/2101.00047', '2101/2101.00079', '2101/2101.00052', '2101/2101.00054', '2101/2101.00102', '2101/2101.00055', '2101/2101.00110', '2101/2101.00072', '2101/2101.00184', '2101/2101.00031', '2101/2101.00065', '2101/2101.00159', '2101/2101.00142', '2101/2101.00154', '2101/2101.00081'])

## reference가 비어있는 paper 존재하는 딕셔너리

In [18]:
data

OrderedDict([('2101/2101.00044',
              [{'sentence': "We show that Deligne's line bundle $\\< {L,M}$ can be obtained from the $\\cK_2$-gerbe $G_{L,M}$ constructed in \\cite{ER} via an integration along the fiber map for gerbes that categorifies the well known one arising from the Leray spectral sequence of $\\pi$. Our construction provides a full account of the biadditivity properties of $\\< {L,M}$.",
                'reference': ['E.~Aldrovandi and N.~Ramachandran, \\emph{Cup products, the {H}eisenberg group,\n  and codimension two algebraic cycles}, Doc. Math. \\textbf{21} (2016),\n  1313--1344. \\MR{3578206}']},
               {'sentence': 'Let $S$ be a smooth variety over a field $F$, and let $\\pi:X \\to S$ be a smooth projective morphism of relative dimension one.  Deligne \\cite{SGA4, MR902592} has constructed a bi-additive functor of Picard categories',
                'reference': ["\\emph{Th\\'{e}orie des topos et cohomologie \\'{e}tale des sch\\'{e}mas. {T}ome\n  3}

# json 파일로 저장

In [21]:
import json

json_data = json.dumps(data, ensure_ascii=False, indent='\t')
print(json_data)

{
	"2101/2101.00044": [
		{
			"sentence": "We show that Deligne's line bundle $\\< {L,M}$ can be obtained from the $\\cK_2$-gerbe $G_{L,M}$ constructed in \\cite{ER} via an integration along the fiber map for gerbes that categorifies the well known one arising from the Leray spectral sequence of $\\pi$. Our construction provides a full account of the biadditivity properties of $\\< {L,M}$.",
			"reference": [
				"E.~Aldrovandi and N.~Ramachandran, \\emph{Cup products, the {H}eisenberg group,\n  and codimension two algebraic cycles}, Doc. Math. \\textbf{21} (2016),\n  1313--1344. \\MR{3578206}"
			]
		},
		{
			"sentence": "Let $S$ be a smooth variety over a field $F$, and let $\\pi:X \\to S$ be a smooth projective morphism of relative dimension one.  Deligne \\cite{SGA4, MR902592} has constructed a bi-additive functor of Picard categories",
			"reference": [
				"\\emph{Th\\'{e}orie des topos et cohomologie \\'{e}tale des sch\\'{e}mas. {T}ome\n  3}, Lecture Notes in Mathematics, Vol.

In [22]:
with open('sentence_reference_pairs.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, indent='\t')

# 저장된 json 파일 로드하여 잘 저장되었는지 확인

In [23]:
with open('sentence_reference_pairs.json', 'r') as f:
    jsons = json.load(f)
print(json.dumps(jsons, indent='\t'))

{
	"2101/2101.00044": [
		{
			"sentence": "We show that Deligne's line bundle $\\< {L,M}$ can be obtained from the $\\cK_2$-gerbe $G_{L,M}$ constructed in \\cite{ER} via an integration along the fiber map for gerbes that categorifies the well known one arising from the Leray spectral sequence of $\\pi$. Our construction provides a full account of the biadditivity properties of $\\< {L,M}$.",
			"reference": [
				"E.~Aldrovandi and N.~Ramachandran, \\emph{Cup products, the {H}eisenberg group,\n  and codimension two algebraic cycles}, Doc. Math. \\textbf{21} (2016),\n  1313--1344. \\MR{3578206}"
			]
		},
		{
			"sentence": "Let $S$ be a smooth variety over a field $F$, and let $\\pi:X \\to S$ be a smooth projective morphism of relative dimension one.  Deligne \\cite{SGA4, MR902592} has constructed a bi-additive functor of Picard categories",
			"reference": [
				"\\emph{Th\\'{e}orie des topos et cohomologie \\'{e}tale des sch\\'{e}mas. {T}ome\n  3}, Lecture Notes in Mathematics, Vol.

## example for 1 file

In [26]:
path = "2101/2101.00044"
paper = Paper(path)


In [27]:
paper.pairs

[{'sentence': "We show that Deligne's line bundle $\\< {L,M}$ can be obtained from the $\\cK_2$-gerbe $G_{L,M}$ constructed in \\cite{ER} via an integration along the fiber map for gerbes that categorifies the well known one arising from the Leray spectral sequence of $\\pi$. Our construction provides a full account of the biadditivity properties of $\\< {L,M}$.",
  'reference': ['E.~Aldrovandi and N.~Ramachandran, \\emph{Cup products, the {H}eisenberg group,\n  and codimension two algebraic cycles}, Doc. Math. \\textbf{21} (2016),\n  1313--1344. \\MR{3578206}']},
 {'sentence': 'Let $S$ be a smooth variety over a field $F$, and let $\\pi:X \\to S$ be a smooth projective morphism of relative dimension one.  Deligne \\cite{SGA4, MR902592} has constructed a bi-additive functor of Picard categories',
  'reference': ["\\emph{Th\\'{e}orie des topos et cohomologie \\'{e}tale des sch\\'{e}mas. {T}ome\n  3}, Lecture Notes in Mathematics, Vol. 305, Springer-Verlag, Berlin-New York,\n  1973, S\\'

In [28]:
paper.paper_ref_ids

['ER',
 'ER2',
 'MR772054',
 'MR991974',
 'MR516914',
 'MR823233',
 'BreenAst',
 'MR2362847',
 'MR962493',
 'MR902592',
 'MR1114212',
 'Duskin2001',
 'MR1005159',
 'Emsalem2017',
 'Eriksson',
 'MR1078860',
 'MR1085257',
 'Gillet',
 'Giraud',
 'MR2562455',
 'MR0354656-VII',
 'MR0491680',
 'JardineLHT',
 'Kai',
 'KS',
 'Lichtenbaum',
 'MilneEC',
 'Milne',
 'Patnaik',
 'Rost',
 'SGA4',
 'Skorobogatov']

In [29]:
b = paper.convert_all_references_to_title()
b

[{'sentence': "We show that Deligne's line bundle $\\< {L,M}$ can be obtained from the $\\cK_2$-gerbe $G_{L,M}$ constructed in \\cite{ER} via an integration along the fiber map for gerbes that categorifies the well known one arising from the Leray spectral sequence of $\\pi$. Our construction provides a full account of the biadditivity properties of $\\< {L,M}$.",
  'reference': []},
 {'sentence': 'Let $S$ be a smooth variety over a field $F$, and let $\\pi:X \\to S$ be a smooth projective morphism of relative dimension one.  Deligne \\cite{SGA4, MR902592} has constructed a bi-additive functor of Picard categories',
  'reference': []},
 {'sentence': 'Let $\\cK_{2}$ be the usual Zariski sheaf attached to the presheaf $U \\mapsto K_2(U)$ on $X$. Our main result is the following:\\footnote{Theorem \\ref{Main} was conjectured by M.~Patnaik \\cite[Remark 21.3.2]{Patnaik}.}',
  'reference': []},
 {'sentence': 'This is essentially proved in \\cite{ER}, but for the biadditivity, which we addre

In [30]:
b[0]

{'sentence': "We show that Deligne's line bundle $\\< {L,M}$ can be obtained from the $\\cK_2$-gerbe $G_{L,M}$ constructed in \\cite{ER} via an integration along the fiber map for gerbes that categorifies the well known one arising from the Leray spectral sequence of $\\pi$. Our construction provides a full account of the biadditivity properties of $\\< {L,M}$.",
 'reference': []}

In [31]:
b[1]

{'sentence': 'Let $S$ be a smooth variety over a field $F$, and let $\\pi:X \\to S$ be a smooth projective morphism of relative dimension one.  Deligne \\cite{SGA4, MR902592} has constructed a bi-additive functor of Picard categories',
 'reference': []}