Skip to content

Commit

Permalink
Class has been added to collect articles by months. Time function has…
Browse files Browse the repository at this point in the history
… been moved into generic module.

#111
  • Loading branch information
dvmorozov committed Feb 15, 2023
1 parent 2294208 commit 098d524
Show file tree
Hide file tree
Showing 5 changed files with 68 additions and 7 deletions.
15 changes: 15 additions & 0 deletions ArxivNavigator/common/time.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
########################################################################################################################
# File "time.py"
# Copyright © Dmitry Morozov 2023
# Module contains generic functions to work with time intervals.
# If you want to use this file please contact me by dvmorozov@hotmail.com.
########################################################################################################################


import datetime


def get_years_range(first_year=1985):
    """Return the range of years from ``first_year`` through the current year.

    Args:
        first_year: First year included in the range. Defaults to 1985,
            the value that used to be hard-coded here.

    Returns:
        A ``range`` of integer years whose last element is the current
        calendar year according to the local system date.
    """
    # Read the year straight from the date object instead of formatting it
    # to a string and parsing that string back into an integer.
    current_year = datetime.date.today().year
    return range(first_year, current_year + 1)
6 changes: 4 additions & 2 deletions ArxivNavigator/meta2js/meta2js.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,17 +56,19 @@ def extract_topics_data():
date = datetime.strptime(version_date, '%a, %d %b %Y %H:%M:%S %Z')
version_dates.append(date)

last_version_date = max(version_dates)

for source_id in categories[0].split():
topic = add_unique_topic(source_id)
topic.add_article(Article(article["id"], article["title"], max(version_dates)))
topic.add_article(Article(article["id"], article["title"], last_version_date))

for target_id in categories[0].split():
if source_id != target_id:
add_unique_link(source_id, target_id)

processed_article_count += 1
if processed_article_count % 10000 == 0:
print ('Processed: ', str(processed_article_count))
print('Processed: ', str(processed_article_count))

# Reads the time when data has been collected.
# New parser should be created, otherwise another type of objects is not returned.
Expand Down
44 changes: 44 additions & 0 deletions ArxivNavigator/meta2js/month.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
########################################################################################################################
# File "month.py"
# Copyright © Dmitry Morozov 2022
# Class represents a single month and aggregates the identifiers of its articles.
# If you want to use this file please contact me by dvmorozov@hotmail.com.
########################################################################################################################


from common.time import *


# Registry of Month objects keyed by "<month name>_<year>"; filled by create_months().
months = {}
# Three-letter lowercase month abbreviations in calendar order.
month_names = [
    'jan', 'feb', 'mar', 'apr', 'may', 'jun',
    'jul', 'aug', 'sep', 'oct', 'nov', 'dec',
]


class Month(object):
    """One calendar month together with the identifiers of its articles."""

    def __init__(self, year, month):
        """Store the year and month designation and start with no articles."""
        self.year, self.month = year, month
        # Article identifiers in the order they were first added.
        self.article_ids = list()

    def add_article_id(self, article_id):
        """Append ``article_id`` unless it is already stored for this month."""
        if article_id in self.article_ids:
            return
        self.article_ids.append(article_id)


def create_months():
    """Rebuild the global ``months`` registry from scratch.

    The registry is emptied and then filled with one Month object for every
    combination of a year from get_years_range() and a name from
    ``month_names``. Each entry is keyed by "<month name>_<year>".
    """
    global months, month_names

    months.clear()
    months.update(
        (name + '_' + str(year), Month(year, name))
        for year in get_years_range()
        for name in month_names
    )


def get_month(year, month_number):
    """Return the Month registered for a year and a 1-based month number.

    Args:
        year: Year component of the registry key.
        month_number: Month number, from 1 to ``len(month_names)`` inclusive.

    Returns:
        The object stored in ``months`` under the key "<month name>_<year>".

    Raises:
        ValueError: If ``month_number`` is outside the valid range.
        KeyError: If ``months`` has no entry for the resulting key (for
            example when the registry has not been filled for that year).
    """
    # Validate explicitly rather than with assert: assertions are removed
    # when Python runs with -O, and a zero or negative number would then
    # silently pick a month counted from the end of month_names.
    if not 0 < month_number <= len(month_names):
        raise ValueError(
            'month_number must be between 1 and %d, got %r'
            % (len(month_names), month_number)
        )

    month_name = month_names[month_number - 1] + '_' + str(year)
    return months[month_name]
9 changes: 5 additions & 4 deletions ArxivNavigator/meta2js/topic.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
########################################################################################################################
# File "topic.py"
# Copyright © Dmitry Morozov 2022
# Class represents single category of articles extracted from metadata.
# If you want to use this file please contact me by dvmorozov@hotmail.com.
########################################################################################################################
import datetime


from article import *
from common.time import *


class Topic(object):
Expand All @@ -17,9 +20,7 @@ def __init__(self, topic_id):
self.max_last_articles_count = 10
self.articles_by_year = dict()
# Fills the dictionary with years.
current_date = datetime.datetime.now().date()
current_year = int(current_date.strftime("%Y"))
for year in range(1985, current_year + 1):
for year in get_years_range():
self.articles_by_year[year] = 0

def inc_articles_count(self):
Expand Down
1 change: 0 additions & 1 deletion ArxivNavigator/topic-mining/collect_topics.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
# Copyright © Dmitry Morozov 2022
# If you want to use this file please contact me by dvmorozov@hotmail.com.
# Script parameters:
# Script parameters:
# N1 - path to corpus directory,
# N2 - path to dictionary,
# N3 - corpus encoding,
Expand Down

0 comments on commit 098d524

Please sign in to comment.