In [1]:
import pandas as pd
import numpy as np
import datetime

def dateparse(x):
    """Parse a 'YYYY-MM-DD' string into a ``datetime.datetime``.

    Kept as a module-level name so that any cell referring to ``dateparse``
    keeps working; the training load below no longer needs it.
    """
    return datetime.datetime.strptime(x, '%Y-%m-%d')


pd.set_option('display.max_columns', None)

# `date_parser` is deprecated in pandas 2.x; `date_format` applies the same
# explicit format without calling a Python function for every row.
train = pd.read_csv('train.csv', parse_dates=['submitted_date'], date_format='%Y-%m-%d')

# Stable integer id per category: position in the alphabetically sorted category list.
category_to_id = {cat: idx for idx, cat in enumerate(sorted(train['category'].unique()))}
train['id'] = train['category'].map(category_to_id)

# ISO year/week of every submission date (computed once, used for both columns).
iso_calendar = train['submitted_date'].dt.isocalendar()
train['year'] = iso_calendar.year
train['week'] = iso_calendar.week
min_year = train['year'].min()
# Running week number: each ISO year gets a fixed block of 53 slots so that
# (year, week) can be recovered later with integer division / modulo by 53.
train['week_id'] = (train['year'] - min_year) * 53 + train['week']

train = train.drop(columns=['category', 'year', 'week', 'submitted_date'])

# One table per category id, in ascending id order (groupby sorts its keys).
tables_by_id = [group for _, group in train.groupby('id')]
for i, table in enumerate(tables_by_id):
    # Keep only weeks that have exactly 7 rows for this category
    # (presumably one row per day, i.e. complete weeks -- confirm against the data).
    week_counts = table['week_id'].value_counts()
    full_weeks = week_counts[week_counts == 7].index
    tables_by_id[i] = table[table['week_id'].isin(full_weeks)].reset_index(drop=True)


  train = pd.read_csv('train.csv', parse_dates=['submitted_date'], date_parser=dateparse)


In [None]:
# Sum num_papers per week for every category table, then stack the results.
# The per-category frames are collected in a list and concatenated once:
# concatenating inside the loop copies the accumulated frame on every pass, and
# starting from an empty frame triggers pandas' empty-entry concat FutureWarning.
weekly_tables = []
for i, table in enumerate(tables_by_id):
    table_grouped = table.groupby('week_id', as_index=False)['num_papers'].sum()
    if table_grouped.empty:
        # A category with no rows contributes nothing.
        continue
    # The position in tables_by_id is used as the category id.
    table_grouped['id'] = i
    weekly_tables.append(table_grouped)

if weekly_tables:
    # Reindex to train's column layout so the frame has the same columns as before.
    final_train = pd.concat(weekly_tables, ignore_index=True).reindex(columns=train.columns)
else:
    final_train = pd.DataFrame(columns=train.columns)

final_train

  final_train = pd.concat([final_train, table_grouped], ignore_index=True)


Unnamed: 0,num_papers,id,week_id
0,145,0,54
1,160,0,55
2,130,0,56
3,130,0,57
4,179,0,58
...,...,...,...
176079,42,139,1380
176080,34,139,1381
176081,32,139,1382
176082,53,139,1383


In [None]:
# Invert the week_id encoding ((year - min_year) * 53 + week, week in 1..53)
# back into an ISO year and ISO week number.
iso_year = (final_train['week_id'] - 1) // 53 + min_year
iso_week = (final_train['week_id'] - 1) % 53 + 1

# Build "<ISO year>-W<ISO week>-1" and parse it with the ISO directives
# (%G = ISO year, %V = ISO week, %u = ISO weekday, 1 = Monday), which yields the
# Monday that starts each week. The result is already datetime64, so no second
# to_datetime pass is needed.
final_train['week_start'] = pd.to_datetime(
    iso_year.astype(str) + '-W' + iso_week.astype(str) + '-1',
    format='%G-W%V-%u',
)

# Keep only the modelling columns, in a fixed order (this also drops week_id).
cols = ['week_start', 'id', 'num_papers']
final_train = final_train[cols]

final_train

Unnamed: 0,week_start,id,num_papers
0,2000-01-03,0,145
1,2000-01-10,0,160
2,2000-01-17,0,130
3,2000-01-24,0,130
4,2000-01-31,0,179
...,...,...,...
176079,2025-01-06,139,42
176080,2025-01-13,139,34
176081,2025-01-20,139,32
176082,2025-01-27,139,53


In [None]:
# Display the weekly training table (week_start, id, num_papers) for inspection.
final_train

Unnamed: 0,week_start,id,num_papers
0,2000-01-03,0,145
1,2000-01-10,0,160
2,2000-01-17,0,130
3,2000-01-24,0,130
4,2000-01-31,0,179
...,...,...,...
176079,2025-01-06,139,42
176080,2025-01-13,139,34
176081,2025-01-20,139,32
176082,2025-01-27,139,53


In [None]:
import torch
from darts.models import TFTModel
from darts.utils.likelihood_models import QuantileRegression
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

from darts import TimeSeries
from darts.dataprocessing.transformers import Scaler

# Normalise dtypes before handing the frame to darts:
# datetime time axis, float target, string category id.
final_train['week_start'] = pd.to_datetime(final_train['week_start'])
final_train['num_papers'] = final_train['num_papers'].astype(float)
final_train['id'] = final_train['id'].astype(str)

# One weekly (Monday-anchored) TimeSeries per category id, in order of first
# appearance of the id in final_train. Missing weeks are inserted by darts.
series_list = [
    TimeSeries.from_dataframe(
        final_train[final_train['id'] == cat_id].sort_values('week_start'),
        time_col='week_start',
        value_cols='num_papers',
        fill_missing_dates=True,
        freq='W-MON',
    )
    for cat_id in final_train['id'].unique()
]

# Scale every series; the fitted scaler is reused later to invert the forecasts.
scaler = Scaler()
series_list = scaler.fit_transform(series_list)

# Trainer settings: use the GPU when torch can see one, and stop early once the
# training loss has not improved for 5 epochs.
accelerator = "gpu" if torch.cuda.is_available() else "cpu"
early_stopping = EarlyStopping(monitor="train_loss", patience=5, mode="min")
trainer_kwargs = {
    "accelerator": accelerator,
    "callbacks": [
        early_stopping,
    ]
}

# Probabilistic TFT: 30 input steps -> 8 output steps, trained on the
# 0.1 / 0.5 / 0.9 quantiles.
quantile_likelihood = QuantileRegression(quantiles=[0.1, 0.5, 0.9])
tft = TFTModel(
    input_chunk_length=30,
    output_chunk_length=8,
    hidden_size=16,
    lstm_layers=1,
    num_attention_heads=1,
    dropout=0.1,
    batch_size=64,
    n_epochs=25,
    likelihood=quantile_likelihood,
    random_state=42,
    add_relative_index=True,
    pl_trainer_kwargs=trainer_kwargs,
)

tft.fit(series_list, verbose=True)


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

   | Name                              | Type                             | Params | Mode 
------------------------------------------------------------------------------------------------
0  | train_metrics                     | MetricCollection                 | 0      | train
1  | val_metrics                       | MetricCollection                 | 0      | train
2  | input_embeddings                  | _MultiEmbedding                  | 0      | train
3  | static_covariates_vsn             | _VariableSelectionNetwork        | 0      | train
4  | encoder_vsn                       | _VariableSelectionNetwork        | 1.2 K  | train
5  | decoder_vsn                       | _VariableSelectionNetwork        | 528    | train
6  | static_context_grn                | _GatedResidualNetwork            | 1.1 K  | train
7  | static_context_hidden_encoder_grn | _GatedResidualNetwork 

Epoch 0: 100%|██████████| 2785/2785 [04:02<00:00, 11.51it/s, train_loss=0.0667]

`Trainer.fit` stopped: `max_epochs=1` reached.


Epoch 0: 100%|██████████| 2785/2785 [04:02<00:00, 11.51it/s, train_loss=0.0667]


TFTModel(output_chunk_shift=0, hidden_size=16, lstm_layers=1, num_attention_heads=1, full_attention=False, feed_forward=GatedResidualNetwork, dropout=0.1, hidden_continuous_size=8, categorical_embedding_sizes=None, add_relative_index=True, loss_fn=None, likelihood=QuantileRegression(quantiles=[0.1, 0.5, 0.9], prior_strength=1.0), norm_type=LayerNorm, use_static_covariates=True, input_chunk_length=30, output_chunk_length=8, batch_size=64, n_epochs=1, random_state=42, pl_trainer_kwargs={'accelerator': 'cpu', 'callbacks': [<pytorch_lightning.callbacks.early_stopping.EarlyStopping object at 0x0000021FB0ADE3B0>]})

In [29]:
# Load the test template. `date_parser` without `parse_dates` parsed nothing and
# only raised the pandas deprecation warning, so the date columns are now named
# explicitly and parsed with the file's YYYY-MM-DD format.
test = pd.read_csv(
    'test.csv',
    parse_dates=['week_start', 'week_end'],
    date_format='%Y-%m-%d',
)

test

  test = pd.read_csv('test.csv', date_parser=dateparse)


Unnamed: 0,category,week_id,week_start,week_end,num_papers
0,astro-ph,1,2025-02-10,2025-02-16,0
1,astro-ph,2,2025-02-17,2025-02-23,0
2,astro-ph,3,2025-02-24,2025-03-02,0
3,astro-ph,4,2025-03-03,2025-03-09,0
4,astro-ph,5,2025-03-10,2025-03-16,0
...,...,...,...,...,...
1115,stat.TH - Statistics Theory,4,2025-03-03,2025-03-09,0
1116,stat.TH - Statistics Theory,5,2025-03-10,2025-03-16,0
1117,stat.TH - Statistics Theory,6,2025-03-17,2025-03-23,0
1118,stat.TH - Statistics Theory,7,2025-03-24,2025-03-30,0


In [30]:
# Attach the numeric category id to the test rows and keep only the modelling columns.
test['id'] = test['category'].map(category_to_id)

test = test[['id', 'week_start', 'num_papers']]
# NOTE(review): concatenating a single frame rebinds `test` to a copy of
# `final_train`, so the frame prepared by the two statements above is discarded.
# Possibly `pd.concat([final_train, test])` was intended -- confirm.
test = pd.concat([final_train])

test

Unnamed: 0,week_start,id,num_papers
0,2000-01-03,0,145.0
1,2000-01-10,0,160.0
2,2000-01-17,0,130.0
3,2000-01-24,0,130.0
4,2000-01-31,0,179.0
...,...,...,...
176079,2025-01-06,139,42.0
176080,2025-01-13,139,34.0
176081,2025-01-20,139,32.0
176082,2025-01-27,139,53.0


In [31]:
# Cast the id to string and the target to float in a single astype call.
test = test.astype({'id': str, 'num_papers': float})

test

Unnamed: 0,week_start,id,num_papers
0,2000-01-03,0,145.0
1,2000-01-10,0,160.0
2,2000-01-17,0,130.0
3,2000-01-24,0,130.0
4,2000-01-31,0,179.0
...,...,...,...
176079,2025-01-06,139,42.0
176080,2025-01-13,139,34.0
176081,2025-01-20,139,32.0
176082,2025-01-27,139,53.0


In [37]:
# Forecast 8 steps past the end of every scaled training series, drawing
# 100 samples per step ...
scaled_forecasts = tft.predict(n=8, series=series_list, num_samples=100)
# ... then map the forecasts back to the original scale with the fitted scaler.
predictions = scaler.inverse_transform(scaled_forecasts)

predictions

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


Predicting DataLoader 0: 100%|██████████| 3/3 [00:06<00:00,  0.44it/s]


[<TimeSeries (DataArray) (week_start: 8, component: 1, sample: 100)> Size: 6kB
 array([[[282.34633852, 397.51146019, 269.52886195, 343.59351624,
          356.41892931, 364.4619454 , 285.86512326, 361.07271499,
          269.52886195, 352.26601195, 323.70556782, 352.6111447 ,
          281.5793904 , 275.20376096, 327.51148329, 269.52886195,
          329.5267022 , 397.51146019, 287.4628935 , 397.51146019,
          302.7454049 , 269.52886195, 335.68294006, 285.74651033,
          269.52886195, 360.96740574, 372.42636307, 313.22592271,
          289.97937589, 345.56437588, 296.66070467, 397.51146019,
          375.12167233, 307.87649137, 292.62154395, 356.86188086,
          389.62091439, 275.85060606, 278.41722738, 328.34948461,
          344.00001955, 397.51146019, 325.66889282, 353.43315274,
          397.51146019, 397.51146019, 396.47339894, 373.70863757,
          282.31964793, 290.81986806, 378.23503588, 389.55328973,
          269.52886195, 362.19492711, 328.1072754 , 397.5114601

In [50]:
# The forecasts come back in the order the series were built, i.e. the order of
# first appearance of each id in final_train. Pair each forecast with that real
# category id instead of its list position, so a category without a series
# cannot silently shift every later label.
series_ids = final_train['id'].unique()
assert len(series_ids) == len(predictions), (
    "number of forecasts does not match the number of category ids in final_train"
)

# Collect the per-category frames under their own name; `answer` is only ever
# bound to the final DataFrame.
answer_parts = []
for cat_id, pred in zip(series_ids, predictions):
    # Median (0.5 quantile) of the sampled forecast, as a flat frame with the
    # time index turned back into a `week_start` column.
    df_pred = pred.quantile_timeseries(0.5).to_dataframe().reset_index()
    df_pred['item_id'] = int(cat_id)  # ids are stored as strings in final_train
    answer_parts.append(df_pred[['item_id', 'week_start', 'num_papers_0.5']])

answer = pd.concat(answer_parts, ignore_index=True)
answer


component,item_id,week_start,num_papers_0.5
0,0,2025-02-10,334.277938
1,0,2025-02-17,350.860437
2,0,2025-02-24,343.839095
3,0,2025-03-03,350.019539
4,0,2025-03-10,338.422620
...,...,...,...
1115,139,2025-03-03,41.328199
1116,139,2025-03-10,41.176406
1117,139,2025-03-17,40.550143
1118,139,2025-03-24,41.637579


In [52]:
# Expose the median forecast under the plain target name.
answer = answer.rename(columns={'num_papers_0.5': 'num_papers'})

answer

component,item_id,week_start,num_papers
0,0,2025-02-10,334.277938
1,0,2025-02-17,350.860437
2,0,2025-02-24,343.839095
3,0,2025-03-03,350.019539
4,0,2025-03-10,338.422620
...,...,...,...
1115,139,2025-03-03,41.328199
1116,139,2025-03-10,41.176406
1117,139,2025-03-17,40.550143
1118,139,2025-03-24,41.637579


In [53]:
# Reverse lookup: numeric id -> category name.
id_to_category = dict(zip(category_to_id.values(), category_to_id.keys()))

# Make sure item_id is an integer, then translate it to the category name.
item_ids = answer['item_id'].astype(int)
answer['item_id'] = item_ids
answer['id'] = item_ids.map(id_to_category)

answer

component,item_id,week_start,num_papers,id
0,0,2025-02-10,334.277938,astro-ph
1,0,2025-02-17,350.860437,astro-ph
2,0,2025-02-24,343.839095,astro-ph
3,0,2025-03-03,350.019539,astro-ph
4,0,2025-03-10,338.422620,astro-ph
...,...,...,...,...
1115,139,2025-03-03,41.328199,stat.TH - Statistics Theory
1116,139,2025-03-10,41.176406,stat.TH - Statistics Theory
1117,139,2025-03-17,40.550143,stat.TH - Statistics Theory
1118,139,2025-03-24,41.637579,stat.TH - Statistics Theory


In [54]:
# Number of times each name has been passed to make_unique so far.
counts = {}


def make_unique(name):
    """Return ``name`` suffixed with its 1-based occurrence number.

    Every call increments the running count for ``name`` in the module-level
    ``counts`` dict, so repeated names yield ``name__1``, ``name__2``, ...

    The count is at least 1 on every call, so a suffix is always appended; the
    former ``cnt == 0`` branch that returned the bare name could never run and
    has been removed.

    Parameters
    ----------
    name : str
        The value to make unique.

    Returns
    -------
    str
        ``f"{name}__{k}"`` where ``k`` is how many times ``name`` has been seen,
        including this call.
    """
    cnt = counts.get(name, 0) + 1
    counts[name] = cnt
    return f"{name}__{cnt}"

# Suffix every category name with its running occurrence number, row by row.
answer['id'] = answer['id'].map(make_unique)

answer

component,item_id,week_start,num_papers,id
0,0,2025-02-10,334.277938,astro-ph__1
1,0,2025-02-17,350.860437,astro-ph__2
2,0,2025-02-24,343.839095,astro-ph__3
3,0,2025-03-03,350.019539,astro-ph__4
4,0,2025-03-10,338.422620,astro-ph__5
...,...,...,...,...
1115,139,2025-03-03,41.328199,stat.TH - Statistics Theory__4
1116,139,2025-03-10,41.176406,stat.TH - Statistics Theory__5
1117,139,2025-03-17,40.550143,stat.TH - Statistics Theory__6
1118,139,2025-03-24,41.637579,stat.TH - Statistics Theory__7


In [55]:
# Start from an empty frame; the submission columns are filled in by a later cell.
final_answer = pd.DataFrame()

In [56]:
# Copy only the two submission columns (row id and forecast) out of `answer`.
final_answer[['id', 'num_papers']] = answer[['id', 'num_papers']]

final_answer

Unnamed: 0,id,num_papers
0,astro-ph__1,334.277938
1,astro-ph__2,350.860437
2,astro-ph__3,343.839095
3,astro-ph__4,350.019539
4,astro-ph__5,338.422620
...,...,...
1115,stat.TH - Statistics Theory__4,41.328199
1116,stat.TH - Statistics Theory__5,41.176406
1117,stat.TH - Statistics Theory__6,40.550143
1118,stat.TH - Statistics Theory__7,41.637579


In [None]:
# Write the submission file without the DataFrame index column.
final_answer.to_csv('submission.csv', index=False)