Work log is written.

#111
dvmorozov · Mar 4, 2023 · 5019433
1 parent 013b695
commit 5019433
Show file tree

Hide file tree

Showing 5 changed files with 74 additions and 37 deletions.
diff --git a/ArxivNavigator/common/estimated_time.py b/ArxivNavigator/common/estimated_time.py
@@ -6,6 +6,7 @@
 
 
 import datetime
+from common.log import *
 
 
 class EstimatedTime(object):
@@ -24,13 +25,16 @@ def print_estimate_time(self, item_processed):
             if item_processed % item_step == 0:
                 processed_percents: float = item_processed * 100.0 / self.item_count
                 elapsed_sec = (datetime.datetime.now() - self.started_time).total_seconds()
-                # print('elapsed_sec', elapsed_sec, 'item_processed', item_processed, 'item_count', item_count)
                 estimated_sec = datetime.timedelta(
                     seconds=elapsed_sec * (self.item_count - item_processed) / item_processed)
                 estimated_time = datetime.datetime(1, 1, 1) + estimated_sec
-                print(self.caption, '%.2f' % processed_percents, '%, estimated time=',
-                      "%d:%d:%d:%d" % (
-                          estimated_time.day - 1, estimated_time.hour, estimated_time.minute, estimated_time.second),
-                      ', processed', str(item_processed), '.')
+                text = self.caption + ' {:.2f} %, estimated time={}:{}:{}:{}, processed {}.'.\
+                    format(processed_percents,
+                           estimated_time.day-1,
+                           estimated_time.hour,
+                           estimated_time.minute,
+                           estimated_time.second,
+                           item_processed)
+                write_log_to_file(get_work_log(), text)
         else:
-            print(self.caption, 'finished. Item number', str(item_processed), '.')
+            write_log_to_file(get_work_log(), self.caption + ' finished. Item number {0}.'.format(item_processed))
diff --git a/ArxivNavigator/common/log.py b/ArxivNavigator/common/log.py
@@ -21,3 +21,21 @@ def write_log_to_file(self, text):
         textfile.write(text)
         textfile.write('\n')
         textfile.close()
+
+
+def write_log_to_file(log, text):
+    assert (log is not None)
+    log.write_log_to_file(text)
+
+
+work_log = Log("")
+
+
+def init_work_log(path_to_work_log):
+    global work_log
+
+    work_log = Log(path_to_work_log)
+
+
+def get_work_log():
+    return work_log
diff --git a/ArxivNavigator/topic-mining/collect_dictionary.py b/ArxivNavigator/topic-mining/collect_dictionary.py
@@ -18,6 +18,8 @@
 from nltk.stem.wordnet import WordNetLemmatizer
 
 from common.estimated_time import *
+from common.log import *
+
 
 nltk.download('wordnet')
 nltk.download('omw-1.4')
@@ -123,7 +125,8 @@ def get_bag_of_words_from_file(file_path, encoding):
 def write_dictionary_to_file(file_path):
     global dictionary
 
-    print('Dictionary having', str(len(dictionary)), 'items is saved into', file_path, '.')
+    write_log_to_file(get_work_log(), 'Dictionary having {0} items is saved into {1}.'.format(len(dictionary),
+                                                                                              file_path))
     dictionary.save_as_text(file_path)
 
 
@@ -132,15 +135,15 @@ def read_dictionary_from_file(file_path):
 
     # The dictionary is supposed to be encoded by UTF-8.
     dictionary = dictionary.load_from_text(file_path)
-    print(dictionary)
+    write_log_to_file(get_work_log(), 'Dictionary: {0}.'.format(dictionary))
 
 
 def get_corpus_directory():
     assert (len(sys.argv) > 1)
 
     result = sys.argv[1]
 
-    print('Corpus directory is', result, '.')
+    write_log_to_file(get_work_log(), 'Corpus directory is {0}.'.format(result))
     return result
 
 
@@ -152,7 +155,7 @@ def get_path_to_dictionary():
         script_path = sys.argv[0]
         result = os.path.abspath(os.path.join(os.path.dirname(script_path), '../data', dictionary_file_name))
 
-    print('Path to dictionary is', result, '.')
+    write_log_to_file(get_work_log(), 'Path to dictionary is {0}.'.format(result))
     return result
 
 
@@ -162,7 +165,7 @@ def get_corpus_encoding():
     else:
         result = 'utf8'
 
-    print('Corpus encoding is', result, '.')
+    print('Corpus encoding is {0}.'.format(result))
     return result
 
 
@@ -191,12 +194,13 @@ def get_text_file_list(path_to_texts):
 
             total_text_file_count += 1
 
-    print('Number of files to process', str(len(result)), 'from total', str(total_text_file_count))
+    write_log_to_file(get_work_log(), 'Number of files to process {0} from total {1}.'.format(len(result),
+                                                                                              total_text_file_count))
     return result
 
 
 def collect_corpus_dictionary(corpus_directory, path_to_dictionary, encoding):
-    print('Collecting corpus dictionary...')
+    write_log_to_file(get_work_log(), 'Collecting corpus dictionary...')
 
     processed_files_count = 0
     text_file_list = get_text_file_list(corpus_directory)
@@ -221,7 +225,6 @@ def collect_corpus_dictionary(corpus_directory, path_to_dictionary, encoding):
 
 
 def file_to_bow(file_path, encoding):
-    # print('File', file_path, 'is processed.')
     text = read_file(file_path, encoding)
     words = do_preprocessing(text)
     vector = dictionary.doc2bow(words)

diff --git a/ArxivNavigator/topic-mining/collect_topics.py b/ArxivNavigator/topic-mining/collect_topics.py
@@ -9,7 +9,10 @@
 #   N4 - path to model.
 ########################################################################################################################
 
+
 from corpus_iterator import *
+from common.log import *
+
 
 logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
 
@@ -21,15 +24,16 @@ def get_path_to_model(model_file_name):
         script_path = sys.argv[0]
         path_to_model = os.path.abspath(os.path.join(os.path.dirname(script_path), '../data', model_file_name))
 
-    print('Path to model is', path_to_model, '.')
+    write_log_to_file(get_work_log(), 'Path to model is {0}.'.format(path_to_model))
     return path_to_model
 
 
 def read_model_from_file(file_path, lsi_model, num_topics):
-    print('Model is read from file', file_path)
+    write_log_to_file(get_work_log(), 'Model is read from file {0}.'.format(file_path))
     lsi_model.load(file_path)
-    print('======================================== Model topics ========================================')
-    print(lsi_model.print_topics(num_topics))
+    write_log_to_file(get_work_log(),
+                      '======================================== Model topics ========================================')
+    write_log_to_file(get_work_log(), lsi_model.print_topics(num_topics))
 
 
 def topic_items_to_js(topic_items):
@@ -65,11 +69,12 @@ def expression_to_array(topic_expression):
 
 
 def get_topics_from_model(model, num_topics):
-    print('======================================== Model topics ========================================')
+    write_log_to_file(get_work_log(),
+                      '======================================== Model topics ========================================')
 
     result = []
     for topic in model.print_topics(num_topics):
-        print(topic)
+        write_log_to_file(get_work_log(), 'Topic: {0}.'.format(topic))
 
         topic_name = str(topic[0])
         topic_expression = topic[1]
@@ -81,8 +86,9 @@ def get_topics_from_model(model, num_topics):
 
 
 def write_topics_to_js_file(file_path, topics):
-    print('======================================== Model topics ========================================')
-    print('Topics are written to JavaScript file', file_path)
+    write_log_to_file(get_work_log(),
+                      '======================================== Model topics ========================================')
+    write_log_to_file(get_work_log(), 'Topics are written to JavaScript file {0}.'.format(file_path))
 
     first_topic = True
     topics_js = 'var flare = {name: "main topics", children: ['
@@ -101,17 +107,18 @@ def write_topics_to_js_file(file_path, topics):
 
     topics_js += ']};'
 
-    print(topics_js)
+    write_log_to_file(get_work_log(), 'Topics: {0}.'.format(topics_js))
     text_file = open(file_path, "w", encoding='utf8')
     text_file.write(topics_js)
     text_file.close()
 
 
 def write_model_to_file(file_path, model, num_topics):
-    print('======================================== Model topics ========================================')
-    print(model.print_topics(num_topics))
-
-    print('Model is written to file', file_path)
+    write_log_to_file(get_work_log(),
+                      '======================================== Model topics '
+                      '===========================================')
+    write_log_to_file(get_work_log(), model.print_topics(num_topics))
+    write_log_to_file(get_work_log(), 'Model is written to file {0}.'.format(file_path))
     model.save(file_path)
 
 
@@ -126,7 +133,9 @@ def create_model(corpus_directory, corpus_encoding, num_topics):
     # lsi_model = models.LsiModel(tfidf_iterator, id2word=get_dictionary(), num_topics=num_topics)
     # return lsi_model
 
-    print('========================================= LDA ==========================================')
+    write_log_to_file(get_work_log(),
+                      '======================================== LDA '
+                      '===================================================')
     lda_model = models.LdaMulticore(corpus_iterator, id2word=get_dictionary(), num_topics=num_topics)
     '''
     Multicore algorithm is crashed on the test dataset.

diff --git a/ArxivNavigator/topic-mining/collect_topics_by_months.py b/ArxivNavigator/topic-mining/collect_topics_by_months.py
@@ -23,19 +23,18 @@ def get_temporary_directory():
     assert (len(sys.argv) > 2)
 
     result = sys.argv[2]
-    print('Temporary directory is', result)
+    print('Temporary directory is {0}.'.format(result))
     return result
 
 
 def remove_text_files_from_directory(directory_path):
-    print('Directory', directory_path, 'is cleaned.')
+    write_log_to_file(get_work_log(), 'Directory {0} is cleaned.'.format(directory_path))
 
     if not os.path.exists(directory_path):
         os.makedirs(directory_path, exist_ok=True)
 
     files_to_delete = get_text_file_list(directory_path)
     for file_path in files_to_delete:
-        # print('File', file_path, 'is removed.')
         os.remove(file_path)
 
 
@@ -48,7 +47,6 @@ def get_file_name_from_article_id(article_id):
 
 def copy_articles_into_directory(directory_path, article_ids):
     global copy_log
-    assert(copy_log is not None)
 
     corpus_directory = get_corpus_directory()
     copied_files_count = 0
@@ -59,13 +57,11 @@ def copy_articles_into_directory(directory_path, article_ids):
         src_path = os.path.join(corpus_directory, file_name)
         dst_path = os.path.join(directory_path, file_name)
         if os.path.exists(src_path):
-            text = 'File {0} is copied to {1}.'.format(src_path, directory_path)
-            copy_log.write_log_to_file(text)
+            write_log_to_file(copy_log, 'File {0} is copied to {1}.'.format(src_path, directory_path))
             shutil.copyfile(src_path, dst_path)
             copied_files_count += 1
         else:
-            text = 'File {0} does not exits. Article id. is {1}.'.format(src_path, article_id)
-            copy_log.write_log_to_file(text)
+            write_log_to_file(copy_log, 'File {0} does not exits. Article id. is {1}.'.format(src_path, article_id))
             not_existing_files_count += 1
 
     return copied_files_count, not_existing_files_count
@@ -79,15 +75,21 @@ def mine_topics_month_by_month():
     corpus_encoding = get_corpus_encoding()
     path_to_dictionary = os.path.join(temporary_directory, 'dictionary.txt')
     path_to_copy_log = os.path.join(temporary_directory, 'copy.log.txt')
+    path_to_work_log = os.path.join(temporary_directory, 'work.log.txt')
     path_to_topic_by_months_js = "../data/topic_by_months.js"
 
     copy_log = Log(path_to_copy_log)
+    init_work_log(path_to_work_log)
 
     clear_months()
     read_months_from_json('../data/months.json')
 
     for month_name in months.keys():
         month = months[month_name]
+        write_log_to_file(get_work_log(),
+                          '======================================== Month {0} ========================================'.
+                          format(month_name))
+
         article_ids = month.get_article_ids()
         if len(article_ids) == 0:
             continue
@@ -98,7 +100,8 @@ def mine_topics_month_by_month():
         if copied_files_count == 0:
             continue
 
-        print('Number of not existing files', str(not_existing_files_count), 'for month', month_name, '.')
+        write_log_to_file(get_work_log(), 'Number of not existing files {0} for month {1} .'.format(
+            not_existing_files_count, month_name))
 
         #  It's Ok to collect dictionary for a bunch of files for every month.
         collect_corpus_dictionary(corpus_directory, path_to_dictionary, corpus_encoding)