In [1]:
import pandas as pd
import numpy as np
import datetime

dateparse = lambda x: datetime.datetime.strptime(x, '%Y-%m-%d')

pd.set_option('display.max_columns', None)
train = pd.read_csv('train.csv', parse_dates=['submitted_date'], date_parser=dateparse)

category_to_id = {cat: idx for idx, cat in enumerate(sorted(train['category'].unique()))}
train['id'] = train['category'].map(category_to_id)

train['year'] = train['submitted_date'].dt.isocalendar().year
train['week'] = train['submitted_date'].dt.isocalendar().week
min_year = train['year'].min()
train['week_id'] = (train['year'] - min_year) * 53 + train['week']

train = train.drop(columns=['category', 'year', 'week', 'submitted_date'])

tables_by_id = [group for _, group in train.groupby('id')]
for i, table in enumerate(tables_by_id):
    week_counts = table['week_id'].value_counts()
    full_weeks = week_counts[week_counts == 7].index
    tables_by_id[i] = table[table['week_id'].isin(full_weeks)].reset_index(drop=True)


  train = pd.read_csv('train.csv', parse_dates=['submitted_date'], date_parser=dateparse)


In [2]:
final_train = pd.DataFrame(columns=train.columns)

for i, table in enumerate(tables_by_id):
    table_grouped = table.groupby('week_id', as_index=False)['num_papers'].sum()
    table_grouped['id'] = i
    final_train = pd.concat([final_train, table_grouped], ignore_index=True)

final_train

Unnamed: 0,num_papers,id,week_id
0,145,0,54
1,160,0,55
2,130,0,56
3,130,0,57
4,179,0,58
...,...,...,...
176079,42,139,1380
176080,34,139,1381
176081,32,139,1382
176082,53,139,1383


In [3]:
final_train['year'] = (final_train['week_id'] - 1) // 53 + min_year
final_train['week'] = (final_train['week_id'] - 1) % 53 + 1
final_train['week_start'] = pd.to_datetime(final_train['year'].astype(str) + '-W' + final_train['week'].astype(str) + '-1', format='%G-W%V-%u')

final_train['week_start'] = pd.to_datetime(final_train['week_start'], errors='coerce')

final_train = final_train.drop(columns=['week_id'])
cols = ['week_start', 'id', 'num_papers']
final_train = final_train[cols]

final_train

Unnamed: 0,week_start,id,num_papers
0,2000-01-03,0,145
1,2000-01-10,0,160
2,2000-01-17,0,130
3,2000-01-24,0,130
4,2000-01-31,0,179
...,...,...,...
176079,2025-01-06,139,42
176080,2025-01-13,139,34
176081,2025-01-20,139,32
176082,2025-01-27,139,53


In [4]:
final_train

Unnamed: 0,week_start,id,num_papers
0,2000-01-03,0,145
1,2000-01-10,0,160
2,2000-01-17,0,130
3,2000-01-24,0,130
4,2000-01-31,0,179
...,...,...,...
176079,2025-01-06,139,42
176080,2025-01-13,139,34
176081,2025-01-20,139,32
176082,2025-01-27,139,53


In [8]:
from gluonts.mx.model.n_beats import NBEATSEstimator
from gluonts.dataset.common import ListDataset
from gluonts.mx.trainer._base import Trainer
import torch

# Prepare the dataset as before
final_train['week_start'] = pd.to_datetime(final_train['week_start'])
final_train['num_papers'] = final_train['num_papers'].astype(float)
final_train['id'] = final_train['id'].astype(str)

series_list = []
for cat_id in final_train['id'].unique():
    df_cat = final_train[final_train['id'] == cat_id].sort_values('week_start')
    series_list.append({
        "start": df_cat['week_start'].iloc[0],
        "target": df_cat['num_papers'].values,
        "item_id": cat_id
    })

train_ds = ListDataset(series_list, freq="W-MON")

trainer = Trainer(add_default_callbacks=True, callbacks=None, clip_gradient=10.0, ctx=None, epochs=25, hybridize=True, init='xavier', learning_rate=0.001, num_batches_per_epoch=50, weight_decay=1e-08)
estimator = NBEATSEstimator(
    freq="W-MON",
    prediction_length=8,
    context_length=30,
    trainer=trainer
)

predictor = estimator.train(train_ds)


100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:06<00:00,  7.37it/s, epoch=1/25, avg_epoch_loss=14.7]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:06<00:00,  8.31it/s, epoch=2/25, avg_epoch_loss=4.2]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:06<00:00,  8.26it/s, epoch=3/25, avg_epoch_loss=4.08]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:05<00:00,  8.54it/s, epoch=4/25, avg_epoch_loss=3.87]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:06<00:00,  8.08it/s, epoch=5/25, avg_epoch_loss=3.99]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:06<00:00,  7.37it/s, epoch=6/25, a

In [11]:
test = pd.read_csv('test.csv', date_parser=dateparse)

test

  test = pd.read_csv('test.csv', date_parser=dateparse)


Unnamed: 0,category,week_id,week_start,week_end,num_papers
0,astro-ph,1,2025-02-10,2025-02-16,0
1,astro-ph,2,2025-02-17,2025-02-23,0
2,astro-ph,3,2025-02-24,2025-03-02,0
3,astro-ph,4,2025-03-03,2025-03-09,0
4,astro-ph,5,2025-03-10,2025-03-16,0
...,...,...,...,...,...
1115,stat.TH - Statistics Theory,4,2025-03-03,2025-03-09,0
1116,stat.TH - Statistics Theory,5,2025-03-10,2025-03-16,0
1117,stat.TH - Statistics Theory,6,2025-03-17,2025-03-23,0
1118,stat.TH - Statistics Theory,7,2025-03-24,2025-03-30,0


In [12]:
test['id'] = test['category'].map(category_to_id)

test = test[['id', 'week_start', 'num_papers']]
test = pd.concat([final_train])

test

Unnamed: 0,week_start,id,num_papers
0,2000-01-03,0,145.0
1,2000-01-10,0,160.0
2,2000-01-17,0,130.0
3,2000-01-24,0,130.0
4,2000-01-31,0,179.0
...,...,...,...
176079,2025-01-06,139,42.0
176080,2025-01-13,139,34.0
176081,2025-01-20,139,32.0
176082,2025-01-27,139,53.0


In [13]:
test['id'] = test['id'].astype(str)
test['num_papers'] = test['num_papers'].astype(float)

test

Unnamed: 0,week_start,id,num_papers
0,2000-01-03,0,145.0
1,2000-01-10,0,160.0
2,2000-01-17,0,130.0
3,2000-01-24,0,130.0
4,2000-01-31,0,179.0
...,...,...,...
176079,2025-01-06,139,42.0
176080,2025-01-13,139,34.0
176081,2025-01-20,139,32.0
176082,2025-01-27,139,53.0


In [17]:
test_series_list = []
for cat_id in test['id'].unique():
    df_cat = test[test['id'] == cat_id].sort_values('week_start')
    test_series_list.append({
        "start": df_cat['week_start'].iloc[0],
        "target": df_cat['num_papers'].values,
        "item_id": cat_id
    })

test_ds = ListDataset(test_series_list, freq="W-MON")
predictions = list(predictor.predict(test_ds))

predictions

[gluonts.model.forecast.SampleForecast(info=None, item_id='0', samples=array([[352.1613 , 385.91064, 399.9316 , 371.01065, 346.82516, 396.63373,
         407.96255, 468.18472],
        [448.07166, 351.27167, 355.153  , 389.40616, 352.2076 , 397.70352,
         334.18137, 378.41354],
        [319.163  , 357.19965, 349.57562, 325.24713, 393.27823, 361.72668,
         306.87088, 377.57297],
        [288.52753, 322.03552, 335.30374, 331.26288, 385.50024, 353.45816,
         314.95935, 382.48285],
        [356.42587, 392.58258, 393.64252, 395.3383 , 382.88934, 378.97894,
         368.21545, 440.37064],
        [365.99686, 328.90845, 344.4605 , 395.6236 , 417.93243, 399.15857,
         411.14423, 350.01334],
        [399.6535 , 365.13983, 332.76926, 338.03183, 375.16016, 402.05777,
         397.1995 , 385.40955],
        [336.5319 , 346.527  , 415.40436, 378.0126 , 364.19904, 382.84955,
         407.2066 , 408.78156],
        [341.26715, 401.03415, 345.55313, 338.4965 , 416.28937, 394.90427,

In [19]:
import pandas as pd

rows = []
for i, pred in enumerate(predictions):
    item_id = pred.item_id
    start = pd.Period(test_series_list[i]['start'], freq="W-MON").start_time
    for j, value in enumerate(pred.mean):
        week_start = start + pd.Timedelta(weeks=j)
        rows.append({'item_id': item_id, 'week_start': week_start, 'num_papers': value})

answer = pd.DataFrame(rows)
answer

Unnamed: 0,item_id,week_start,num_papers
0,0,1999-12-28,363.876862
1,0,2000-01-04,365.522827
2,0,2000-01-11,375.998474
3,0,2000-01-18,366.788696
4,0,2000-01-25,377.252625
...,...,...,...
1115,139,2000-02-29,42.077160
1116,139,2000-03-07,41.054256
1117,139,2000-03-14,40.539783
1118,139,2000-03-21,40.219494


In [20]:
id_to_category = {idx: cat for cat, idx in category_to_id.items()}
answer['item_id'] = answer['item_id'].astype(int)
answer['id'] = answer['item_id'].map(id_to_category)

answer

Unnamed: 0,item_id,week_start,num_papers,id
0,0,1999-12-28,363.876862,astro-ph
1,0,2000-01-04,365.522827,astro-ph
2,0,2000-01-11,375.998474,astro-ph
3,0,2000-01-18,366.788696,astro-ph
4,0,2000-01-25,377.252625,astro-ph
...,...,...,...,...
1115,139,2000-02-29,42.077160,stat.TH - Statistics Theory
1116,139,2000-03-07,41.054256,stat.TH - Statistics Theory
1117,139,2000-03-14,40.539783,stat.TH - Statistics Theory
1118,139,2000-03-21,40.219494,stat.TH - Statistics Theory


In [21]:
counts = {}
def make_unique(name):
    cnt = counts.get(name, 0) + 1
    counts[name] = cnt
    return name if cnt == 0 else f"{name}__{cnt}"

answer['id'] = answer['id'].apply(make_unique)

answer

Unnamed: 0,item_id,week_start,num_papers,id
0,0,1999-12-28,363.876862,astro-ph__1
1,0,2000-01-04,365.522827,astro-ph__2
2,0,2000-01-11,375.998474,astro-ph__3
3,0,2000-01-18,366.788696,astro-ph__4
4,0,2000-01-25,377.252625,astro-ph__5
...,...,...,...,...
1115,139,2000-02-29,42.077160,stat.TH - Statistics Theory__4
1116,139,2000-03-07,41.054256,stat.TH - Statistics Theory__5
1117,139,2000-03-14,40.539783,stat.TH - Statistics Theory__6
1118,139,2000-03-21,40.219494,stat.TH - Statistics Theory__7


In [22]:
final_answer = pd.DataFrame()

In [23]:
final_answer[['id', 'num_papers']] = answer[['id', 'num_papers']]

final_answer

Unnamed: 0,id,num_papers
0,astro-ph__1,363.876862
1,astro-ph__2,365.522827
2,astro-ph__3,375.998474
3,astro-ph__4,366.788696
4,astro-ph__5,377.252625
...,...,...
1115,stat.TH - Statistics Theory__4,42.077160
1116,stat.TH - Statistics Theory__5,41.054256
1117,stat.TH - Statistics Theory__6,40.539783
1118,stat.TH - Statistics Theory__7,40.219494


In [None]:
final_answer.to_csv('submission.csv', index=False)