# wikidata parsing
위키데이터에서 id를 이용해 entity와 relation을 구분하여 entities, relations 리스트에 저장한다.

In [24]:
import bz2
import json
import pandas as pd

def wikidata_parsing(filename):
    '''
    Lazily parse a bz2-compressed JSON dump, one JSON document per line.

    Each line is stripped of trailing commas/newlines and decoded with
    json.loads; every successfully decoded object is yielded in file order.
    Lines that are not valid JSON on their own (for example the array
    brackets that wrap the dump) are skipped silently.
    '''
    with bz2.open(filename, mode='rt', encoding='utf-8') as dump:
        for raw_line in dump:
            # Records are elements of one big JSON array, so each line
            # carries a trailing ',' that must go before decoding.
            payload = raw_line.rstrip(',\n')
            try:
                record = json.loads(payload)
            except json.JSONDecodeError:
                continue
            yield record
                
# Accumulators shared across every split file of the dump.
entities = []
relations = []

# number of split files to read (data/split-0000.json.bz2, ...)
num_of_files = 5

for num in range(num_of_files):
    filename = 'data/split-' + format(num, '04d') + '.json.bz2'

    for i, data in enumerate(wikidata_parsing(filename)):
        # The first character of the id decides where the record goes:
        # 'Q' -> entity, 'P'/'p' -> relation; anything else is dropped.
        prefix = data['id'][0]
        if prefix == 'Q':
            entities.append(data)
        elif prefix in ('P', 'p'):
            relations.append(data)

        # Progress report after every 5000th record of the current file.
        if (i + 1) % 5000 == 0:
            print(f'total entities: {len(entities)}, total relations: {len(relations)}')

total entities: 5000, total relations: 0
total entities: 10000, total relations: 0
total entities: 15000, total relations: 0
total entities: 20000, total relations: 0
total entities: 25000, total relations: 0
total entities: 30000, total relations: 0
total entities: 35000, total relations: 0
total entities: 40000, total relations: 0
total entities: 45000, total relations: 0
total entities: 50000, total relations: 0
total entities: 55000, total relations: 0
total entities: 60000, total relations: 0
total entities: 65000, total relations: 0
total entities: 70000, total relations: 0
total entities: 75000, total relations: 0
total entities: 80000, total relations: 0
total entities: 85000, total relations: 0
total entities: 90000, total relations: 0
total entities: 95000, total relations: 0
total entities: 104999, total relations: 0
total entities: 109999, total relations: 0
total entities: 114999, total relations: 0
total entities: 119999, total relations: 0
total entities: 124999, total r

In [35]:
# Final tally after loading: (number of entities, number of relations).
len(entities), len(relations)

(499994, 5)

# extract entity information
parsing한 wikidata에서 필요한 정보만을 추출하여 딕셔너리 형태로 저장한다.
<br/> 저장하는 정보는 entity의 'id','labels','descriptions','aliases','claims'로 5가지이다.
<br/>영어와 한국어에 대해 따로 ent_info_list_en, ent_info_list_ko 리스트에 저장한다(한국어로 존재하지 않는 property가 있음).

In [36]:
from collections import defaultdict

def extract_entity_info(data, lang='en'):
    '''
    Reduce one parsed wikidata record to the fields used downstream.

    The result is a defaultdict(list) that always contains 'id' and
    'claims' (the list of property ids present on the record). 'labels'
    and 'descriptions' (plain strings) and 'aliases' (list of strings) are
    set only when the record has an entry for `lang`; reading one of them
    when absent yields an empty list because of the defaultdict.
    '''
    info = defaultdict(list)
    info['id'] = data['id']

    # labels and descriptions are both read as data[field][lang]['value'].
    for field in ('labels', 'descriptions'):
        localized = data[field]
        if lang in localized.keys():
            info[field] = localized[lang]['value']

    # aliases hold a list of {'value': ...} entries per language.
    alias_map = data['aliases']
    if lang in alias_map.keys():
        info['aliases'] = [alias['value'] for alias in alias_map[lang]]

    # Only the property ids are kept, not the statements themselves.
    info['claims'] = list(data['claims'].keys())

    return info

In [37]:
# One extracted record per entity, built separately for English (the
# default language of extract_entity_info) and for Korean.
ent_info_list_en = [extract_entity_info(entity) for entity in entities]
ent_info_list_ko = [extract_entity_info(entity, lang='ko') for entity in entities]

In [38]:
# Both lists get one entry per entity, so the two lengths should match.
print(len(ent_info_list_en), len(ent_info_list_ko))

499994 499994


In [39]:
# Inspect the English record extracted for the first entity.
ent_info_list_en[0]

defaultdict(list,
            {'id': 'Q31',
             'labels': 'Belgium',
             'descriptions': 'country in western Europe',
             'aliases': ['Kingdom of Belgium', 'BEL', 'be', '🇧🇪', 'BE'],
             'claims': ['P2924',
              'P1344',
              'P1082',
              'P1667',
              'P1151',
              'P3348',
              'P1333',
              'P1546',
              'P5125',
              'P349',
              'P38',
              'P3365',
              'P3221',
              'P2581',
              'P1566',
              'P227',
              'P487',
              'P1792',
              'P2852',
              'P3916',
              'P2853',
              'P395',
              'P3238',
              'P1335',
              'P2633',
              'P298',
              'P1448',
              'P1313',
              'P3417',
              'P3529',
              'P417',
              'P998',
              'P17',
              'P486',
           

In [40]:
# Inspect the Korean record extracted for the same first entity.
ent_info_list_ko[0]

defaultdict(list,
            {'id': 'Q31',
             'labels': '벨기에',
             'descriptions': '서유럽에 위치한 국가',
             'aliases': ['벨기에 왕국', '벨기에왕국'],
             'claims': ['P2924',
              'P1344',
              'P1082',
              'P1667',
              'P1151',
              'P3348',
              'P1333',
              'P1546',
              'P5125',
              'P349',
              'P38',
              'P3365',
              'P3221',
              'P2581',
              'P1566',
              'P227',
              'P487',
              'P1792',
              'P2852',
              'P3916',
              'P2853',
              'P395',
              'P3238',
              'P1335',
              'P2633',
              'P298',
              'P1448',
              'P1313',
              'P3417',
              'P3529',
              'P417',
              'P998',
              'P17',
              'P486',
              'P2959',
              'P3106',
           

# relation counting
각 entity마다 존재하는 relation을 확인하여 count하고 이를 rel_cnt_en, rel_cnt_ko 리스트에 저장한다.

In [41]:
from collections import Counter

def get_relation_count(entity_info_list):
    '''
    Count how many entities carry each relation (claim property id).

    entity_info_list: iterable of entity records, each exposing its
        relation ids under the 'claims' key.

    Returns a 2-tuple (rel_cnt_en, rel_cnt_ko). Each element is a list of
    (relation id, count) pairs sorted by descending count; relations with
    equal counts keep first-seen order. Claims do not depend on language,
    so both elements are the very same list object.
    '''
    relation_counter = Counter()

    # Feed each entity's claims straight into the counter rather than first
    # materializing one flat list of every claim across all entities.
    for info in entity_info_list:
        relation_counter.update(info['claims'])

    # most_common() with no argument returns every pair, most frequent
    # first, using a stable sort (ties stay in first-seen order).
    ranked = relation_counter.most_common()

    # relation count is same for en and ko
    return ranked, ranked

In [42]:
# Count relation frequencies from the English records; get_relation_count
# returns the same ranking for both names, since claims do not vary by language.
rel_cnt_en, rel_cnt_ko = get_relation_count(ent_info_list_en)

In [43]:
# Display the (relation id, count) pairs, most frequent first.
rel_cnt_en

[('P31', 463154),
 ('P646', 212974),
 ('P17', 154666),
 ('P18', 140776),
 ('P373', 137764),
 ('P2671', 125261),
 ('P21', 122705),
 ('P569', 118078),
 ('P106', 116256),
 ('P27', 110987),
 ('P735', 108656),
 ('P131', 103463),
 ('P625', 102166),
 ('P19', 102069),
 ('P214', 98454),
 ('P7859', 79596),
 ('P570', 64513),
 ('P734', 63378),
 ('P244', 61894),
 ('P227', 60438),
 ('P171', 59680),
 ('P225', 59676),
 ('P105', 59676),
 ('P213', 56070),
 ('P846', 54904),
 ('P1412', 51528),
 ('P20', 50022),
 ('P856', 48589),
 ('P641', 48554),
 ('P910', 45270),
 ('P571', 43556),
 ('P1566', 43023),
 ('P5055', 42200),
 ('P268', 36699),
 ('P421', 35997),
 ('P281', 34691),
 ('P136', 32668),
 ('P830', 32143),
 ('P2044', 31949),
 ('P2046', 31294),
 ('P815', 29814),
 ('P6766', 29603),
 ('P269', 29444),
 ('P345', 29311),
 ('P1559', 29254),
 ('P69', 28489),
 ('P1082', 28454),
 ('P577', 26128),
 ('P1006', 25844),
 ('P166', 25108),
 ('P495', 24472),
 ('P7902', 22659),
 ('P3151', 22132),
 ('P2163', 22099),
 ('P1207

In [49]:
# Put the ranked (relation id, count) pairs in a two-column DataFrame and
# show it transposed, so each relation becomes a column.
df = pd.DataFrame(rel_cnt_en, columns=['relation id', 'count'])
df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,6529,6530,6531,6532,6533,6534,6535,6536,6537,6538
relation id,P31,P646,P17,P18,P373,P2671,P21,P569,P106,P27,...,P4051,P2490,P3821,P6062,P6420,P8373,P8377,P6101,P3996,P5436
count,463154,212974,154666,140776,137764,125261,122705,118078,116256,110987,...,1,1,1,1,1,1,1,1,1,1


In [52]:
# Show the first 20 rows, i.e. the 20 most frequent relations.
df.head(20)

Unnamed: 0,relation id,count
0,P31,463154
1,P646,212974
2,P17,154666
3,P18,140776
4,P373,137764
5,P2671,125261
6,P21,122705
7,P569,118078
8,P106,116256
9,P27,110987


entity 499994개에 대해 조사했을 때 relation의 종류는 6539가지.

| P31 | P646 | P17 | P18 | P373 | P2671 | P21 | P569 | P106 | P27 |
|:--- |:----:|:---:|:---:|:----:|:-----:|:---:|:----:|:----:|----:|
|instance of| Freebase ID | country | image | Commons category | Google Knowledge Graph ID | sex or gender | date of birth | occupation | country of citizenship |
|다음 종류에 속함 | Freebase 식별자 | 다음 나라의 것임 | 모습 | 이 주제를 다루는 공용 분류 | 구글 지식 그래프 ID | 성별 | 태어난 날 | 직업 | 국적 |

In [56]:
# Raw statements stored under property P373 for the first parsed entity.
entities[0]['claims']['P373']

[{'mainsnak': {'snaktype': 'value',
   'property': 'P373',
   'datavalue': {'value': 'Belgium', 'type': 'string'},
   'datatype': 'string'},
  'type': 'statement',
  'id': 'q31$912C9A63-F124-4F42-8C0C-8A3E5AC8833B',
  'rank': 'normal'}]

In [59]:
# English label extracted for the second entity.
ent_info_list_en[1]['labels']

'happiness'

In [60]:
# Raw statements stored under property P373 for the second parsed entity.
entities[1]['claims']['P373']

[{'mainsnak': {'snaktype': 'value',
   'property': 'P373',
   'datavalue': {'value': 'Happiness', 'type': 'string'},
   'datatype': 'string'},
  'type': 'statement',
  'id': 'q8$0FC7FB9A-B5CA-4762-98AE-1B0BDC1EEF39',
  'rank': 'normal'}]

# 남은 works
- 각 relation id를 label로 치환하는 문제 - API를 사용할지?
- 특정 domain의 free text에서 relation extraction을 할 때, 필요한 relation과 불필요한 relation을 어떻게 구분할지?