Process the Microsoft News (MIND) dataset into a data stream

See

https://www.kaggle.com/datasets/arashnic/mind-news-dataset

https://github.com/microsoft/recommenders

!pip install recommenders

In [None]:
import os
import re
import pandas as pd

from datetime import timezone
from collections import defaultdict

from recommenders.datasets import mind as msdataloader
from recommenders.datasets.download_utils import unzip_file

In [None]:
# Directory where the MIND archives are downloaded and extracted.
DATASET_PATH = os.path.expanduser('./dataset_msnews')

DATASET_TYPE = 'large'  # small/large

# Create the directory when it is missing instead of falling back to `None`:
# later cells build paths with os.path.join(DATASET_PATH, ...), which raises
# TypeError when DATASET_PATH is None.
os.makedirs(DATASET_PATH, exist_ok=True)

train_zip, valid_zip = msdataloader.download_mind(size=DATASET_TYPE, dest_path=DATASET_PATH)

In [None]:
# Display the two values returned by download_mind for the train/valid archives.
train_zip, valid_zip

In [None]:
# Extract each archive into its own sub-directory of DATASET_PATH.
# NOTE(review): clean_zip_file=False presumably keeps the archives on disk — confirm.
for split_name, archive in (('train', train_zip), ('valid', valid_zip)):
    unzip_file(archive, os.path.join(DATASET_PATH, split_name), clean_zip_file=False)

In [None]:
# List the contents of the extracted train and valid directories.
!ls {os.path.join(DATASET_PATH, 'train')} {os.path.join(DATASET_PATH, 'valid')}

# Explore

In [None]:
# Load the training-split news file; all eight columns are named, but only the
# category and text columns are kept.
news_columns = ['newid', 'vertical', 'subvertical', 'title',
                'abstract', 'url', 'entities in title', 'entities in abstract']
news = pd.read_table(
    os.path.join(DATASET_PATH, 'train', 'news.tsv'),
    names=news_columns,
    usecols=['vertical', 'subvertical', 'title', 'abstract'],
)
news.head()

In [None]:
# Summary statistics of the loaded training news columns.
news.describe()

In [None]:
# Load the validation-split news file, keeping only the category and text columns.
news_valid_path = os.path.join(DATASET_PATH, 'valid', 'news.tsv')
news_valid = pd.read_table(
    news_valid_path,
    names=['newid', 'vertical', 'subvertical', 'title',
           'abstract', 'url', 'entities in title', 'entities in abstract'],
    usecols=['vertical', 'subvertical', 'title', 'abstract'],
)
news_valid.head()

In [None]:
# Summary statistics of the loaded validation news columns.
news_valid.describe()

## Behaviors

In [None]:
# Load the training-split impression log with explicit column names; column 2
# (impression_time) is parsed as datetime.
# `infer_datetime_format` is intentionally not passed: it is deprecated since
# pandas 2.0, where inferring one consistent format is already the default.
df_clicks = pd.read_table(os.path.join(DATASET_PATH, 'train', 'behaviors.tsv'),
                          parse_dates=[2],
                          names=['impressionid', 'userid', 'impression_time', 'user_click_history', 'news'])
df_clicks.head()

In [None]:
# Earliest and latest impression time in the training split.
df_clicks['impression_time'].min(), df_clicks['impression_time'].max()

In [None]:
# Summary statistics of the training impressions.
df_clicks.describe()

In [None]:
# Order the training impressions chronologically; sort_values returns a new
# frame, so the result is assigned back to the same name.
df_clicks = df_clicks.sort_values(by="impression_time")

df_clicks.head()

In [None]:
# Load the validation-split impression log with explicit column names; column 2
# (impression_time) is parsed as datetime.
# `infer_datetime_format` is intentionally not passed: it is deprecated since
# pandas 2.0, where inferring one consistent format is already the default.
df_clicks_valid = pd.read_table(os.path.join(DATASET_PATH, 'valid', 'behaviors.tsv'),
                                parse_dates=[2],
                                names=['impressionid', 'userid', 'impression_time', 'user_click_history', 'news'])
df_clicks_valid.head()

In [None]:
# Earliest and latest impression time in the validation split.
df_clicks_valid['impression_time'].min(), df_clicks_valid['impression_time'].max()

In [None]:
# Summary statistics of the validation impressions.
df_clicks_valid.describe()

In [None]:
# Inspect one raw cell (first row, column 2 = impression_time) to see how it was parsed.
df_clicks_valid.iloc[0, 2]

In [None]:
# Order the validation impressions chronologically; sort_values returns a new
# frame, so the result is assigned back to the same name.
df_clicks_valid = df_clicks_valid.sort_values(by="impression_time")

df_clicks_valid.head()

# Generate dataset

Output file columns (CSV without a header row):

    names=['Item','User','Rating','Timestamp']
    
File example (first lines of a file in the target format):

`head ratings_Video_Games.csv`

    AB9S9279OZ3QO,0078764343,5.0,1373155200
    A24SSUT5CSW8BH,0078764343,5.0,1377302400
    AK3V0HEBJMQ7J,0078764343,4.0,1372896000

In [None]:
def parse_news_column(news):
    """Extract the clicked news ids from one impression string.

    The input is matched against ``<news id>-<label>`` pairs where the label is
    ``0`` or ``1``, e.g. ``"N1-0 N2-1"``.

    Args:
        news: Raw value of the ``news`` column. Anything that is not a
            non-empty string (``None``, ``NaN``, ``""``) yields no clicks.

    Returns:
        list[str]: Ids whose label is ``1``, in order of appearance, with
        surrounding whitespace stripped.
    """
    # A missing cell arrives from pandas as float NaN, which is truthy and
    # would make re.findall raise TypeError, so check the type explicitly.
    if not isinstance(news, str) or not news:
        return []

    return [news_id.strip()
            for news_id, label in re.findall(r'([^\-]+)-([01])', news)
            if label == '1']

def convert_df(df, def_rating=5.0):
    """Flatten impression rows into ``[item, user, rating, timestamp]`` records.

    Columns are read by position: 1 = user id, 2 = impression time,
    4 = impression string handed to ``parse_news_column``. One record is
    produced per clicked news id. Users and items are renumbered with
    consecutive integers starting at 1, in order of first appearance among the
    rows that contain at least one click.

    Args:
        df: DataFrame laid out as described above; column 2 must hold
            datetime values.
        def_rating: Constant rating stored in every record.

    Returns:
        list[list]: ``[item_index, user_index, def_rating, unix_seconds]`` rows.
    """
    # Each unseen key is assigned the next free 1-based index on first lookup.
    user_index_of = defaultdict(lambda: 1 + len(user_index_of))
    item_index_of = defaultdict(lambda: 1 + len(item_index_of))

    result = []

    # itertuples yields plain tuples, so positional access is explicit; indexing
    # a label-indexed row Series with integers (as iterrows would require) is
    # deprecated in pandas 2.x and is also far slower.
    for row in df.itertuples(index=False, name=None):
        clicked = parse_news_column(row[4])
        if not clicked:
            continue

        user, impression_time = row[1], row[2]

        # Skip rows with a missing user or time; NaT cannot be converted to a
        # POSIX timestamp.
        if pd.isna(user) or pd.isna(impression_time) or not user:
            continue

        user_index = user_index_of[user]
        # The parsed time is stamped as UTC before conversion to epoch seconds.
        timestamp = int(impression_time.replace(tzinfo=timezone.utc).timestamp())

        result.extend([item_index_of[news_id], user_index, def_rating, timestamp]
                      for news_id in clicked)

    print(f'Rows {len(result)}, users {len(user_index_of)}, items {len(item_index_of)}')
    return result


In [None]:
# Merge the train and validation impressions into one chronologically ordered frame.
# sort_values returns a new frame rather than sorting in place, so the result
# must be assigned — otherwise `df` stays in concatenation order.
df = pd.concat((df_clicks, df_clicks_valid)).sort_values(by="impression_time")

df.head()

In [None]:
# Earliest and latest impression time across the combined frame.
df['impression_time'].min(), df['impression_time'].max()

In [None]:
# Convert the combined impressions into [item, user, rating, timestamp] records
# and show how many records were produced.
result = convert_df(df)
len(result)

In [None]:
# SAVE TO FILE

# Target file sits under DATASET_PATH, or in the working directory when that is unset.
fname = os.path.join(DATASET_PATH or '.', f'msnews_{DATASET_TYPE}.csv')

with open(fname, 'w') as fp:
    # One comma-separated record per line, no header row.
    fp.writelines(','.join(map(str, record)) + '\n' for record in result)
     