Skip to content

Commit

Permalink
Work log is written.
Browse files Browse the repository at this point in the history
  • Loading branch information
dvmorozov committed Mar 4, 2023
1 parent 013b695 commit 5019433
Show file tree
Hide file tree
Showing 5 changed files with 74 additions and 37 deletions.
16 changes: 10 additions & 6 deletions ArxivNavigator/common/estimated_time.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@


import datetime
from common.log import *


class EstimatedTime(object):
Expand All @@ -24,13 +25,16 @@ def print_estimate_time(self, item_processed):
if item_processed % item_step == 0:
processed_percents: float = item_processed * 100.0 / self.item_count
elapsed_sec = (datetime.datetime.now() - self.started_time).total_seconds()
# print('elapsed_sec', elapsed_sec, 'item_processed', item_processed, 'item_count', item_count)
estimated_sec = datetime.timedelta(
seconds=elapsed_sec * (self.item_count - item_processed) / item_processed)
estimated_time = datetime.datetime(1, 1, 1) + estimated_sec
print(self.caption, '%.2f' % processed_percents, '%, estimated time=',
"%d:%d:%d:%d" % (
estimated_time.day - 1, estimated_time.hour, estimated_time.minute, estimated_time.second),
', processed', str(item_processed), '.')
text = self.caption + ' {:.2f} %, estimated time={}:{}:{}:{}, processed {}.'.\
format(processed_percents,
estimated_time.day-1,
estimated_time.hour,
estimated_time.minute,
estimated_time.second,
item_processed)
write_log_to_file(get_work_log(), text)
else:
print(self.caption, 'finished. Item number', str(item_processed), '.')
write_log_to_file(get_work_log(), self.caption + ' finished. Item number {0}.'.format(item_processed))
18 changes: 18 additions & 0 deletions ArxivNavigator/common/log.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,21 @@ def write_log_to_file(self, text):
textfile.write(text)
textfile.write('\n')
textfile.close()


def write_log_to_file(log, text):
    """Hand `text` to the given log object's own writer.

    :param log: object exposing ``write_log_to_file(text)``; must not be None.
    :param text: message forwarded to the log unchanged.
    :raises AssertionError: if ``log`` is None (while assertions are enabled).
    """
    assert log is not None
    writer = log.write_log_to_file
    writer(text)


# Module-wide work log. It starts out as a Log built from an empty path and is
# rebound by init_work_log().
work_log = Log("")


def init_work_log(path_to_work_log):
    """Rebind the module-level work log to a new Log for `path_to_work_log`."""
    global work_log
    work_log = Log(path_to_work_log)


def get_work_log():
    """Return the Log currently bound to the module-level `work_log`."""
    current_log = work_log
    return current_log
19 changes: 11 additions & 8 deletions ArxivNavigator/topic-mining/collect_dictionary.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
from nltk.stem.wordnet import WordNetLemmatizer

from common.estimated_time import *
from common.log import *


nltk.download('wordnet')
nltk.download('omw-1.4')
Expand Down Expand Up @@ -123,7 +125,8 @@ def get_bag_of_words_from_file(file_path, encoding):
# Reports how many items the module-level `dictionary` holds and where it is being
# saved, then writes it to `file_path` with `save_as_text`.
# NOTE(review): the print(...) and write_log_to_file(...) calls appear to be the removed
# and added sides of one diff hunk - confirm which one is current.
def write_dictionary_to_file(file_path):
global dictionary

print('Dictionary having', str(len(dictionary)), 'items is saved into', file_path, '.')
write_log_to_file(get_work_log(), 'Dictionary having {0} items is saved into {1}.'.format(len(dictionary),
file_path))
dictionary.save_as_text(file_path)


Expand All @@ -132,15 +135,15 @@ def read_dictionary_from_file(file_path):

# The dictionary file is expected to be encoded in UTF-8.
dictionary = dictionary.load_from_text(file_path)
print(dictionary)
write_log_to_file(get_work_log(), 'Dictionary: {0}.'.format(dictionary))


# Returns the corpus directory taken from the first command-line argument
# (sys.argv[1]) and reports it. Asserts that at least one argument was passed.
# NOTE(review): the print(...) and write_log_to_file(...) calls appear to be the removed
# and added sides of one diff hunk - confirm which one is current.
def get_corpus_directory():
assert (len(sys.argv) > 1)

result = sys.argv[1]

print('Corpus directory is', result, '.')
write_log_to_file(get_work_log(), 'Corpus directory is {0}.'.format(result))
return result


Expand All @@ -152,7 +155,7 @@ def get_path_to_dictionary():
script_path = sys.argv[0]
result = os.path.abspath(os.path.join(os.path.dirname(script_path), '../data', dictionary_file_name))

print('Path to dictionary is', result, '.')
write_log_to_file(get_work_log(), 'Path to dictionary is {0}.'.format(result))
return result


Expand All @@ -162,7 +165,7 @@ def get_corpus_encoding():
else:
result = 'utf8'

print('Corpus encoding is', result, '.')
print('Corpus encoding is {0}.'.format(result))
return result


Expand Down Expand Up @@ -191,12 +194,13 @@ def get_text_file_list(path_to_texts):

total_text_file_count += 1

print('Number of files to process', str(len(result)), 'from total', str(total_text_file_count))
write_log_to_file(get_work_log(), 'Number of files to process {0} from total {1}.'.format(len(result),
total_text_file_count))
return result


def collect_corpus_dictionary(corpus_directory, path_to_dictionary, encoding):
print('Collecting corpus dictionary...')
write_log_to_file(get_work_log(), 'Collecting corpus dictionary...')

processed_files_count = 0
text_file_list = get_text_file_list(corpus_directory)
Expand All @@ -221,7 +225,6 @@ def collect_corpus_dictionary(corpus_directory, path_to_dictionary, encoding):


def file_to_bow(file_path, encoding):
# print('File', file_path, 'is processed.')
text = read_file(file_path, encoding)
words = do_preprocessing(text)
vector = dictionary.doc2bow(words)
Expand Down
37 changes: 23 additions & 14 deletions ArxivNavigator/topic-mining/collect_topics.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,10 @@
# N4 - path to model.
########################################################################################################################


from corpus_iterator import *
from common.log import *


logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

Expand All @@ -21,15 +24,16 @@ def get_path_to_model(model_file_name):
script_path = sys.argv[0]
path_to_model = os.path.abspath(os.path.join(os.path.dirname(script_path), '../data', model_file_name))

print('Path to model is', path_to_model, '.')
write_log_to_file(get_work_log(), 'Path to model is {0}.'.format(path_to_model))
return path_to_model


def read_model_from_file(file_path, lsi_model, num_topics):
    """Load a model from `file_path` and report its topics to console and work log.

    :param file_path: path passed to ``lsi_model.load``.
    :param lsi_model: model object providing ``load`` and ``print_topics``.
    :param num_topics: number of topics requested from ``print_topics``.
    """
    # NOTE(review): the paired print(...) / write_log_to_file(...) calls look like the
    # removed and added sides of a diff hunk; both are kept - confirm which is current.
    header = '======================================== Model topics ========================================'

    print('Model is read from file', file_path)
    write_log_to_file(get_work_log(), 'Model is read from file {0}.'.format(file_path))
    # NOTE(review): the value returned by load() is discarded; if load() returns a new
    # model instead of updating lsi_model in place, the loaded model is lost - confirm.
    lsi_model.load(file_path)

    topics = lsi_model.print_topics(num_topics)
    print(header)
    print(topics)
    write_log_to_file(get_work_log(), header)
    # The log writer hands its text straight to a file write(), which accepts only
    # strings, so the topic listing is converted explicitly before logging.
    write_log_to_file(get_work_log(), str(topics))


def topic_items_to_js(topic_items):
Expand Down Expand Up @@ -65,11 +69,12 @@ def expression_to_array(topic_expression):


def get_topics_from_model(model, num_topics):
print('======================================== Model topics ========================================')
write_log_to_file(get_work_log(),
'======================================== Model topics ========================================')

result = []
for topic in model.print_topics(num_topics):
print(topic)
write_log_to_file(get_work_log(), 'Topic: {0}.'.format(topic))

topic_name = str(topic[0])
topic_expression = topic[1]
Expand All @@ -81,8 +86,9 @@ def get_topics_from_model(model, num_topics):


def write_topics_to_js_file(file_path, topics):
print('======================================== Model topics ========================================')
print('Topics are written to JavaScript file', file_path)
write_log_to_file(get_work_log(),
'======================================== Model topics ========================================')
write_log_to_file(get_work_log(), 'Topics are written to JavaScript file {0}.'.format(file_path))

first_topic = True
topics_js = 'var flare = {name: "main topics", children: ['
Expand All @@ -101,17 +107,18 @@ def write_topics_to_js_file(file_path, topics):

topics_js += ']};'

print(topics_js)
write_log_to_file(get_work_log(), 'Topics: {0}.'.format(topics_js))
text_file = open(file_path, "w", encoding='utf8')
text_file.write(topics_js)
text_file.close()


def write_model_to_file(file_path, model, num_topics):
    """Report the model's topics to console and work log, then save it to `file_path`.

    :param file_path: destination passed to ``model.save``.
    :param model: model object providing ``print_topics`` and ``save``.
    :param num_topics: number of topics requested from ``print_topics``.
    """
    # NOTE(review): the paired print(...) / write_log_to_file(...) calls look like the
    # removed and added sides of a diff hunk; both are kept - confirm which is current.
    topics = model.print_topics(num_topics)

    print('======================================== Model topics ========================================')
    print(topics)

    print('Model is written to file', file_path)
    write_log_to_file(get_work_log(),
                      '======================================== Model topics '
                      '===========================================')
    # The log writer hands its text straight to a file write(), which accepts only
    # strings, so the topic listing is converted explicitly before logging.
    write_log_to_file(get_work_log(), str(topics))
    write_log_to_file(get_work_log(), 'Model is written to file {0}.'.format(file_path))
    model.save(file_path)


Expand All @@ -126,7 +133,9 @@ def create_model(corpus_directory, corpus_encoding, num_topics):
# lsi_model = models.LsiModel(tfidf_iterator, id2word=get_dictionary(), num_topics=num_topics)
# return lsi_model

print('========================================= LDA ==========================================')
write_log_to_file(get_work_log(),
'======================================== LDA '
'===================================================')
lda_model = models.LdaMulticore(corpus_iterator, id2word=get_dictionary(), num_topics=num_topics)
'''
Multicore algorithm is crashed on the test dataset.
Expand Down
21 changes: 12 additions & 9 deletions ArxivNavigator/topic-mining/collect_topics_by_months.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,19 +23,18 @@ def get_temporary_directory():
assert (len(sys.argv) > 2)

result = sys.argv[2]
print('Temporary directory is', result)
print('Temporary directory is {0}.'.format(result))
return result


# Reports that `directory_path` is being cleaned, creates the directory when it does
# not exist, then removes every path returned by get_text_file_list(directory_path).
# NOTE(review): the print(...) and write_log_to_file(...) calls appear to be the removed
# and added sides of one diff hunk - confirm which one is current.
def remove_text_files_from_directory(directory_path):
print('Directory', directory_path, 'is cleaned.')
write_log_to_file(get_work_log(), 'Directory {0} is cleaned.'.format(directory_path))

if not os.path.exists(directory_path):
os.makedirs(directory_path, exist_ok=True)

files_to_delete = get_text_file_list(directory_path)
for file_path in files_to_delete:
# print('File', file_path, 'is removed.')
os.remove(file_path)


Expand All @@ -48,7 +47,6 @@ def get_file_name_from_article_id(article_id):

def copy_articles_into_directory(directory_path, article_ids):
global copy_log
assert(copy_log is not None)

corpus_directory = get_corpus_directory()
copied_files_count = 0
Expand All @@ -59,13 +57,11 @@ def copy_articles_into_directory(directory_path, article_ids):
src_path = os.path.join(corpus_directory, file_name)
dst_path = os.path.join(directory_path, file_name)
if os.path.exists(src_path):
text = 'File {0} is copied to {1}.'.format(src_path, directory_path)
copy_log.write_log_to_file(text)
write_log_to_file(copy_log, 'File {0} is copied to {1}.'.format(src_path, directory_path))
shutil.copyfile(src_path, dst_path)
copied_files_count += 1
else:
text = 'File {0} does not exits. Article id. is {1}.'.format(src_path, article_id)
copy_log.write_log_to_file(text)
write_log_to_file(copy_log, 'File {0} does not exits. Article id. is {1}.'.format(src_path, article_id))
not_existing_files_count += 1

return copied_files_count, not_existing_files_count
Expand All @@ -79,15 +75,21 @@ def mine_topics_month_by_month():
corpus_encoding = get_corpus_encoding()
path_to_dictionary = os.path.join(temporary_directory, 'dictionary.txt')
path_to_copy_log = os.path.join(temporary_directory, 'copy.log.txt')
path_to_work_log = os.path.join(temporary_directory, 'work.log.txt')
path_to_topic_by_months_js = "../data/topic_by_months.js"

copy_log = Log(path_to_copy_log)
init_work_log(path_to_work_log)

clear_months()
read_months_from_json('../data/months.json')

for month_name in months.keys():
month = months[month_name]
write_log_to_file(get_work_log(),
'======================================== Month {0} ========================================'.
format(month_name))

article_ids = month.get_article_ids()
if len(article_ids) == 0:
continue
Expand All @@ -98,7 +100,8 @@ def mine_topics_month_by_month():
if copied_files_count == 0:
continue

print('Number of not existing files', str(not_existing_files_count), 'for month', month_name, '.')
write_log_to_file(get_work_log(), 'Number of not existing files {0} for month {1} .'.format(
not_existing_files_count, month_name))

# It's OK to collect a dictionary from each month's bunch of files.
collect_corpus_dictionary(corpus_directory, path_to_dictionary, corpus_encoding)
Expand Down

0 comments on commit 5019433

Please sign in to comment.