In [1]:
# Mount Google Drive inside the Colab runtime at /content/drive so that the
# project files referenced by the absolute paths below are reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Import modules

In [2]:
# Switch the working directory to the project folder on Drive and echo it.
# NOTE(review): presumably required so that `from util import list2json`
# resolves to the project's local util module -- confirm.
%cd '/content/drive/My Drive/Colab Notebooks/opinion-lab-group-2.3/refactorization'
%pwd 

/content/drive/My Drive/Colab Notebooks/opinion-lab-group-2.3/refactorization


'/content/drive/My Drive/Colab Notebooks/opinion-lab-group-2.3/refactorization'

In [3]:
import numpy as np
import json
from util import list2json

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Load global ID of desired pre-processed sentences
i.e. only sentences with length > 15; the total number of desired pre-processed sentences is 328091.

In [4]:
# Folder on Drive that holds the pre-computed .npy / .json inputs of this notebook.
directory = '/content/drive/My Drive/Colab Notebooks/opinion-lab-group-2.3/refactorization/data/'

# Global IDs of the retained sentences, one array per language.
en_global_ids = np.load(directory + 'en_sentence_global_id.npy')
de_global_ids = np.load(directory + 'de_sentence_global_id.npy')
global_ids = np.concatenate((en_global_ids, de_global_ids))

# choose k = 15 as optimal number of clusters from kmean clustering
kmean_labels = np.load(directory + '15_kmean_labels.npy')

# The slicing below assumes the label array is laid out as "all English
# sentences, then all German sentences". A label file with extra entries would
# otherwise be truncated silently, so check the total length up front.
assert kmean_labels.shape[0] == global_ids.shape[0], (
  'kmean label count %d does not match number of sentences %d'
  % (kmean_labels.shape[0], global_ids.shape[0])
)

n_en = en_global_ids.shape[0]
n_de = de_global_ids.shape[0]
en_kmean_labels = kmean_labels[:n_en]
de_kmean_labels = kmean_labels[n_en : n_en + n_de]

assert en_kmean_labels.shape == en_global_ids.shape
assert de_kmean_labels.shape == de_global_ids.shape

print('English global IDs: ', en_global_ids)
print(type(en_global_ids), 'shape:', en_global_ids.shape)
print()
print('German global IDs:', de_global_ids)
print(type(de_global_ids), 'shape:', de_global_ids.shape)
print()
print('kmean labels:', kmean_labels)
print(type(kmean_labels), 'shape:', kmean_labels.shape)

English global IDs:  [     0      1      2 ... 136422 136423 136424]
<class 'numpy.ndarray'> shape: (127464,)

German global IDs: [136425 136426 136427 ... 353783 353784 353785]
<class 'numpy.ndarray'> shape: (200627,)

kmean labels: [ 1  1  3 ... 12 10  7]
<class 'numpy.ndarray'> shape: (328091,)


# Get related information for each pre-processed sentence

In [5]:
# Fetch the per-sentence metadata from the processed index file (all sentences).
with open(directory + 'all_sentences_index_with_date_cluster_senti.json', 'r') as f:
  loaded_data = json.load(f)

source, document_id, comment_id, date, sentiment = [], [], [], [], []
for item in loaded_data:
  source.append(item['corpus_name'])
  document_id.append(item['doc_id'])
  # Sentences that belong to an article instead of a comment have no comment
  # id (null in the JSON); store -1 for them.
  comment_id.append(item['com_id'] if item['com_id'] is not None else -1)
  date.append(item['date'])
  sentiment.append(item['sentiment'])

# Convert every column to an array once, then reuse it for both languages.
# NOTE(review): indexing by global id assumes the position of a record in
# loaded_data equals its 'global_id' -- confirm.
source_all = np.array(source)
document_id_all = np.array(document_id)
comment_id_all = np.array(comment_id)
date_all = np.array(date)
sentiment_all = np.array(sentiment)

# Obtain the small subset of information, by language, for the desired
# pre-processed sentences only.
en_source_sm = source_all[en_global_ids]
en_document_id_sm = document_id_all[en_global_ids]
en_comment_id_sm = comment_id_all[en_global_ids]
en_date_sm = date_all[en_global_ids]
en_sentiment_sm = sentiment_all[en_global_ids]

de_source_sm = source_all[de_global_ids]
de_document_id_sm = document_id_all[de_global_ids]
de_comment_id_sm = comment_id_all[de_global_ids]
de_date_sm = date_all[de_global_ids]
de_sentiment_sm = sentiment_all[de_global_ids]

# Every per-language column, sentiment included, must have one entry per id.
assert (en_global_ids.shape == en_source_sm.shape == en_document_id_sm.shape
        == en_comment_id_sm.shape == en_date_sm.shape == en_sentiment_sm.shape)
assert (de_global_ids.shape == de_source_sm.shape == de_document_id_sm.shape
        == de_comment_id_sm.shape == de_date_sm.shape == de_sentiment_sm.shape)


# Save sentence-related information as .npy files

In [None]:
# Write the per-language arrays into `directory`, the folder they are read back
# from later. Bare file names would land in the current working directory,
# which the notebook sets to the project root rather than to the data folder.
np.save(directory + 'en_sources.npy', en_source_sm)
np.save(directory + 'en_document_id.npy', en_document_id_sm)
np.save(directory + 'en_comment_id.npy', en_comment_id_sm)
np.save(directory + 'en_date.npy', en_date_sm)
# NOTE(review): sentiment export is disabled -- confirm whether it should be written.
# np.save(directory + 'en_sentiment.npy', en_sentiment_sm)

np.save(directory + 'de_sources.npy', de_source_sm)
np.save(directory + 'de_document_id.npy', de_document_id_sm)
np.save(directory + 'de_comment_id.npy', de_comment_id_sm)
np.save(directory + 'de_date.npy', de_date_sm)
# np.save(directory + 'de_sentiment.npy', de_sentiment_sm)

/content/drive/My Drive/Colab Notebooks/opinion-lab-group-2.3/refactorization/data


In [None]:
# Reload the saved arrays and verify that they match the in-memory data and the
# original index, instead of relying on a visual comparison of printed values.
directory = '/content/drive/My Drive/Colab Notebooks/opinion-lab-group-2.3/refactorization/data/'

tmp_en_source_sm = np.load(directory + 'en_sources.npy')
tmp_en_document_id_sm = np.load(directory + 'en_document_id.npy')
tmp_en_comment_id_sm = np.load(directory + 'en_comment_id.npy')
tmp_en_date_sm= np.load(directory + 'en_date.npy')
# tmp_en_sentiment_sm = np.load(directory + 'en_sentiment.npy')

tmp_de_source_sm = np.load(directory + 'de_sources.npy')
tmp_de_document_id_sm = np.load(directory + 'de_document_id.npy')
tmp_de_comment_id_sm = np.load(directory + 'de_comment_id.npy')
tmp_de_date_sm = np.load(directory + 'de_date.npy')
# tmp_de_sentiment_sm = np.load(directory + 'de_sentiment.npy')

# Round-trip check: what is on disk must be exactly what this run computed.
# A failure here means the files in `directory` are stale or missing updates.
assert np.array_equal(tmp_en_source_sm, en_source_sm)
assert np.array_equal(tmp_en_document_id_sm, en_document_id_sm)
assert np.array_equal(tmp_en_comment_id_sm, en_comment_id_sm)
assert np.array_equal(tmp_en_date_sm, en_date_sm)
assert np.array_equal(tmp_de_source_sm, de_source_sm)
assert np.array_equal(tmp_de_document_id_sm, de_document_id_sm)
assert np.array_equal(tmp_de_comment_id_sm, de_comment_id_sm)
assert np.array_equal(tmp_de_date_sm, de_date_sm)

# Global id of the record used for the spot check against the original index.
check_global_id = 353785
check_record = loaded_data[check_global_id]

print('A random record from original dictionary:')
print(check_record)
print()
print('Corresponding records from smaller set of sentences, i.e. sentences with length <= 15 is ignored:')
i = np.where(de_global_ids == check_global_id)[0]
print('index:', i)
print('global id:', de_global_ids[i])
print('source:', tmp_de_source_sm[i])
print('document id:', tmp_de_document_id_sm[i])
print('comment_id:', tmp_de_comment_id_sm[i])
print('date:', tmp_de_date_sm[i])
# print('sentiment:', tmp_de_sentiment_sm[i])

# The chosen id must occur exactly once among the German ids, and the reloaded
# values at that position must agree with the original record (a missing
# comment id is stored as -1).
assert i.size == 1, 'global id %d not found exactly once' % check_global_id
expected_com_id = check_record['com_id'] if check_record['com_id'] is not None else -1
assert tmp_de_source_sm[i][0] == check_record['corpus_name']
assert tmp_de_document_id_sm[i][0] == check_record['doc_id']
assert tmp_de_comment_id_sm[i][0] == expected_com_id
assert tmp_de_date_sm[i][0] == check_record['date']

A random record from original dictionary:
{'global_id': 353785, 'corpus_name': 'spiegel', 'doc_id': 151, 'com_id': None, 'date': '2008-03-29', 'cluster': 11, 'sentiment': 0}

Corresponding records from smaller set of sentences, i.e. sentences with length <= 15 is ignored:
index: [200626]
global id: [353785]
source: ['spiegel']
document id: [151]
comment_id: [-1]
date: ['2008-03-29']


# Save sentence-related information as .json files

In [8]:
def _sentence_records(ids, sources, doc_ids, com_ids, dates, clusters, sentiments):
  """Build one metadata dictionary per sentence from parallel, equally long arrays.

  The arguments are walked in lockstep; each position yields a dictionary with
  the keys 'global_id', 'corpus_name', 'doc_id', 'com_id', 'date', 'cluster'
  and 'sentiment', in that order.
  """
  return [
    {
      'global_id': gid,
      'corpus_name': src,
      'doc_id': doc,
      'com_id': com,
      'date': day,
      'cluster': label,
      'sentiment': senti,
    }
    for gid, src, doc, com, day, label, senti
    in zip(ids, sources, doc_ids, com_ids, dates, clusters, sentiments)
  ]


# English records first, German records second.
list_of_dictionary = _sentence_records(
  en_global_ids, en_source_sm, en_document_id_sm, en_comment_id_sm,
  en_date_sm, en_kmean_labels, en_sentiment_sm,
)
list_of_dictionary.extend(_sentence_records(
  de_global_ids, de_source_sm, de_document_id_sm, de_comment_id_sm,
  de_date_sm, de_kmean_labels, de_sentiment_sm,
))

In [9]:
# Hand the combined English + German records to the project helper list2json.
# The target file name is relative, so the location depends on the current
# working directory at the time this cell runs.
# NOTE(review): list2json lives in the local util module and is not visible
# here; presumably it serialises the list to JSON and copes with the NumPy
# scalar values held in the records -- confirm.
list2json(list_of_dictionary, 'sentence_cluster_sentiment_dict.json')