<a href="https://colab.research.google.com/github/cuhkrsdi/newspaper-analytics/blob/main/GeoTemCo_dataitems_(Public).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Prepare the data set for GeoTemCo to display

- Read the result of named-entity recognition (NER)
- Select specific words of a specific entity type (GPE) and turn them into a per-date frequency dictionary
- Restructure the data into the JSON format required by GeoTemCo

### Import libraries

In [1]:
import json
from urllib.request import urlopen
# WARNING: the assignment below replaces the ssl module's default HTTPS context
# factory with the unverified one, so server certificates are NOT checked for
# HTTPS requests that rely on the default context (such as the urlopen calls in
# this notebook). This was done for the URL demo only; remove it when the data
# hosts present certificates that verify correctly.
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
import random

### Read the NER result and the filters

In [2]:
# The result shares EXACTLY THE SAME structure as the sample demo in "data_preprocessing_and_NLP (Public).ipynb"; the only difference is the total amount of data.
# url_result = "https://dsprojects.lib.cuhk.edu.hk/media/filer_public/ea/69/ea69bab5-552f-4dd3-a5ef-02aa79faa7a7/ymd_frequency001.json" # No longer works

url_result = "https://researchdata.cuhk.edu.hk/api/access/datafile/:persistentId?persistentId=doi:10.48668/JYYEBG/Y9L8HX"

# Open the response as a context manager so the HTTP connection is closed as
# soon as the payload has been read, instead of staying open for the session.
with urlopen(url_result) as result_file:
  ymd_freq_dict = json.loads(result_file.read().decode("utf-8"))

In [3]:
# Nesting of the loaded data: ymd_freq_dict = {ymd: {word: {ner_type: freq, ...}}}
# Print how many dates (top-level keys) were loaded.
print(len(ymd_freq_dict))

3250


In [4]:
# Read the filters. Expected keys: 'PERSON_list', 'PERSON_dict', 'ORG_list', 'ORG_dict', 'EVENT_list', 'EVENT_dict', 'ymd_to_pid_dict', 'pid_to_parent', 'GPE_dict', 'GPE_list'
# url_filterdict = "https://dsprojects.lib.cuhk.edu.hk/media/filer_public/f6/71/f671a0da-1533-409a-a014-ec31d4aeac1c/filteranddict.json" # No longer works

url_filterdict = "https://researchdata.cuhk.edu.hk/api/access/datafile/:persistentId?persistentId=doi:10.48668/JYYEBG/ADMALJ"

# Open the response as a context manager so the HTTP connection is closed as
# soon as the payload has been read, instead of staying open for the session.
with urlopen(url_filterdict) as filter_file:
  dict_list_filters = json.loads(filter_file.read().decode("utf-8"))

In [5]:
# structure:    dict_list_filters = {'list_name': [...], 'dict_name': {...}}
# Display the top-level keys to confirm which filters were loaded.
dict_list_filters.keys()

dict_keys(['PERSON_list', 'PERSON_dict', 'ORG_list', 'ORG_dict', 'EVENT_list', 'EVENT_dict', 'ymd_to_pid_dict', 'pid_to_parent', 'GPE_dict', 'GPE_list'])

### Select words of type GPE (mainly countries) and generate a frequency dictionary from the data set

In [None]:

# Input : ymd_freq_dict = {ymd: {word: {ner_type: freq, ner_type: freq, ...}}}
# Output: ymd_gpe_dict  = {"ymd1": freq, "ymd2": freq, "ymd3": freq}
#         i.e. per date, the summed frequency of the selected words wherever
#         they carry the selected NER type.

# Modify the list of selected words and the NER type before running!
word_to_filter = ['蘇聯', '俄', '蘇俄', '蘇']
type_to_filter = "GPE"

ymd_gpe_dict = dict()
error_count = 0

for day, words_of_day in ymd_freq_dict.items():
  for candidate, freq_by_type in words_of_day.items():
    # Only the selected words count, and only when tagged with the selected type.
    if candidate not in word_to_filter or type_to_filter not in freq_by_type:
      continue

    hits = freq_by_type[type_to_filter]
    if day in ymd_gpe_dict:
      # A selected word was already seen on this date: accumulate.
      ymd_gpe_dict[day] = ymd_gpe_dict[day] + hits
    else:
      # First selected word found on this date.
      ymd_gpe_dict[day] = hits

In [None]:
# Check the length of the result dictionary, i.e. the number of dates on which
# at least one selected word occurred with the selected type.
len(ymd_gpe_dict)

3067

### GeoTemCo standard JSON structure for dataitems


```
[
    {
        "id": someId,
        "weight": someWeight,
        "name": someName,
        "description": someDescription,
        "lon": someLongitude,
        "lat": someLatitude,
        "place": somePlaceName,
        "time": someDate,
        "tableContent": someValuesList
    }
    .
    .
]
```



### Restructure the data into GeoTemCo form, adding HTML code for thumbnails, related words and href links

In [None]:
# Build the GeoTemCo dataitems: one item per date in ymd_gpe_dict. Each item embeds
# HTML for the newspaper thumbnail (linked to the repository item page) and for the
# related PERSON / ORG / EVENT keywords (each linked to a repository search).

# Change MANUALLY before run
# longitude, latitude, place for dataitem
lon = 99.084489
lat = 61.7295
place = "蘇聯"

# read filters
PERSON_list = dict_list_filters['PERSON_list']
PERSON_dict = dict_list_filters['PERSON_dict']
ORG_list = dict_list_filters['ORG_list']
ORG_dict = dict_list_filters['ORG_dict']
EVENT_list = dict_list_filters['EVENT_list']
EVENT_dict = dict_list_filters['EVENT_dict']
ymd_to_pid_dict = dict_list_filters['ymd_to_pid_dict']
pid_to_parent = dict_list_filters['pid_to_parent']
GPE_list = dict_list_filters['GPE_list']
GPE_dict = dict_list_filters['GPE_dict']


def _as_lookup(words):
  """Return the string entries of `words` as a frozenset for constant-time membership tests.

  The lookups below are made with keys of a JSON-decoded dict, which are always
  strings, so keeping only the string entries gives the same answers as scanning
  the original list.
  """
  return frozenset(entry for entry in words if isinstance(entry, str))


def _collect_related(day_freqs, ner_type, allowed, canonical):
  """Return (display_name, freq) pairs for one date, highest frequency first.

  day_freqs: {word: {ner_type: freq}} for a single date.
  ner_type:  the NER tag to keep, e.g. "PERSON".
  allowed:   collection of words that may be kept.
  canonical: {word: display_name}; words without an entry are shown as-is.
  """
  pairs = [
      (canonical.get(word, word), freq_by_type[ner_type])
      for word, freq_by_type in day_freqs.items()
      if ner_type in freq_by_type and word in allowed
  ]
  return sorted(pairs, key=lambda pair: pair[1], reverse=True)


def _render_keyword_links(ranked_pairs):
  """Return (html, count) for a random selection of the keywords in `ranked_pairs`.

  len(ranked_pairs) // 2 random indices are drawn; a keyword that was already
  picked is skipped, so `count` (the number of distinct keywords rendered) can be
  smaller than half. Each keyword becomes a link to a repository search for that
  exact phrase, followed by one space.

  NOTE(review): indices are drawn with replacement and uniformly, so the frequency
  ranking of `ranked_pairs` does not favour frequent keywords -- presumably the
  ranking is kept for later use; confirm whether a top-N selection was intended.
  """
  picked = []
  html_parts = []
  for _ in range(len(ranked_pairs) // 2):
    keyword = ranked_pairs[random.randint(0, len(ranked_pairs) - 1)][0]
    if keyword in picked:
      continue
    link_open = f"<a href='https://repository.lib.cuhk.edu.hk/en/islandora/search/\"{keyword}\"?type=edismax&cp=cuhk%3Ahk-tabloid' target='_blank'>"
    html_parts.append(link_open + keyword + '</a>' + ' ')
    picked.append(keyword)
  return "".join(html_parts), len(picked)


# Set-based lookups: the membership test runs once per word per date.
person_lookup = _as_lookup(PERSON_list)
org_lookup = _as_lookup(ORG_list)
event_lookup = _as_lookup(EVENT_list)

geotemco_json_list = list()

# count the longest samples for the record
longest_person = 0
longest_org = 0
longest_event = 0

for each_ymd, weight in ymd_gpe_dict.items():
  # A date missing from ymd_freq_dict simply yields no related keywords.
  day_freqs = ymd_freq_dict.get(each_ymd, {})

  sorted_person_to_insert = _collect_related(day_freqs, "PERSON", person_lookup, PERSON_dict)
  sorted_org_to_insert = _collect_related(day_freqs, "ORG", org_lookup, ORG_dict)
  sorted_event_to_insert = _collect_related(day_freqs, "EVENT", event_lookup, EVENT_dict)

  # Rendered in the fixed order PERSON, ORG, EVENT so the sequence of random draws stays the same.
  related_PERSON, person_count = _render_keyword_links(sorted_person_to_insert)
  related_ORG, org_count = _render_keyword_links(sorted_org_to_insert)
  related_EVENT, event_count = _render_keyword_links(sorted_event_to_insert)

  longest_person = max(longest_person, person_count)
  longest_org = max(longest_org, org_count)
  longest_event = max(longest_event, event_count)

  # Only the keyword groups that have at least one link get a labelled line.
  related_keywords = "".join(
      f"{label}: {links}<br>"
      for label, links in (("Person", related_PERSON), ("Org", related_ORG), ("Event", related_EVENT))
      if links
  )

  # id for dataitem
  # NOTE(review): word_to_filter is a list, so its repr (brackets and quotes) ends up
  # in the id, e.g. "19501009_['蘇聯', ...]". Kept as-is; confirm whether a plain
  # label was intended.
  item_id = f"{each_ymd}_{word_to_filter}"

  # Repository identifiers for the thumbnail and its link; empty string when unknown.
  pid = ymd_to_pid_dict.get(each_ymd, "")
  parent = pid_to_parent.get(pid, "")

  # name for dataitem
  # write html into JSON for thumbnails and href links
  name = (
      f"<a href='https://repository.lib.cuhk.edu.hk/en/item/cuhk-{parent}' target='_blank'>"
      f"<image title='Newspaper Thumbnail' alt='Newspaper Thumbnail Image' "
      f"src='https://repository.lib.cuhk.edu.hk/en/islandora/object/cuhk%3A{pid}/datastream/TN/view'/></a>"
  )

  # time for dataitem: "YYYYMMDD" -> "YYYY-MM-DD"
  item_date = f"{each_ymd[:4]}-{each_ymd[4:6]}-{each_ymd[6:]}"

  # table content combined for dataitem
  tableContent = {"當日報章": name, "關聯字": related_keywords, "日期": item_date}

  # slot in
  geotemco_json_list.append({
      "id": item_id,
      "weight": weight,
      "name": name,
      "description": None,
      "lon": lon,
      "lat": lat,
      "place": place,
      "time": item_date,
      "tableContent": tableContent,
  })

print(len(geotemco_json_list))
print(longest_person)
print(longest_org)
print(longest_event)




3067
22
21
7


In [None]:
# Write the finished nested structure as JSON to /content/<place>.json,
# from where the file can be downloaded.
output_path = f'/content/{place}.json'
with open(output_path, 'w') as fp:
  json.dump(geotemco_json_list, fp)

In [None]:
# Preview the restructured result. Only the first few dataitems are displayed:
# every item embeds long HTML strings, so showing the whole list would flood
# the notebook output and bloat the saved file.
geotemco_json_list[:3]

[{'description': None,
  'id': "19501009_['蘇聯', '俄', '蘇俄', '蘇']",
  'lat': 61.7295,
  'lon': 99.084489,
  'name': "<a href='https://repository.lib.cuhk.edu.hk/en/item/cuhk-2592821' target='_blank'><image title='Newspaper Thumbnail' alt='Newspaper Thumbnail Image' src='https://repository.lib.cuhk.edu.hk/en/islandora/object/cuhk%3A2592824/datastream/TN/view'/></a>",
  'place': '蘇聯',
  'tableContent': {'日期': '1950-10-09',
   '當日報章': "<a href='https://repository.lib.cuhk.edu.hk/en/item/cuhk-2592821' target='_blank'><image title='Newspaper Thumbnail' alt='Newspaper Thumbnail Image' src='https://repository.lib.cuhk.edu.hk/en/islandora/object/cuhk%3A2592824/datastream/TN/view'/></a>",
   '關聯字': 'Person: <a href=\'https://repository.lib.cuhk.edu.hk/en/islandora/search/"孔明"?type=edismax&cp=cuhk%3Ahk-tabloid\' target=\'_blank\'>孔明</a> <a href=\'https://repository.lib.cuhk.edu.hk/en/islandora/search/"周恩來"?type=edismax&cp=cuhk%3Ahk-tabloid\' target=\'_blank\'>周恩來</a> <a href=\'https://repository.l