In [1]:
%load_ext autoreload

### Load data

In [43]:
import pandas as pd
# Read the full interaction table from disk and preview the first rows.
a = pd.read_feather(path="final_dataset.feather")
a.head(n=5)

Unnamed: 0,username,anime_id,score,timestamp,status
0,Cezar10,48661,10,1648065035,2
1,Cezar10,43608,10,1656365421,2
2,Cezar10,48583,10,1649023161,2
3,Gonzo2,241,6,1293559897,2
4,Gonzo2,650,8,1293560393,2


### Count rows belonging to "popular" items (more than 100 rows each)

In [3]:
# Number of rows per anime_id.
b = a['anime_id'].value_counts()
# Total number of rows that belong to items appearing more than 100 times.
b.loc[b.gt(100)].sum()

81253137

### Remove rare items

In [4]:
# Ids of items with more than 100 rows (b holds the per-item row counts).
popular = b[b > 100].index
# Keep only the rows whose item is in that set.
a = a[a['anime_id'].isin(popular)]
a.shape

(81253137, 5)

### Count and remove rare users

In [5]:
# Number of remaining rows per username.
b = a['username'].value_counts()
# Total number of rows that belong to users with more than 4 rows.
b.loc[b.gt(4)].sum()

81120187

In [6]:
# Usernames with more than 4 rows (b holds the per-user row counts).
active = b[b > 4].index
# Keep only the rows of those users.
a = a[a['username'].isin(active)]
a.shape

(81120187, 5)

### Map usernames to numbers

In [7]:
# username -> dense 0-based integer id, numbered in order of first appearance.
mapper = {name: idx for idx, name in enumerate(a['username'].unique())}


In [8]:
# Replace each username by its integer id.
# `a` was produced by boolean filtering, so assigning a column on it directly
# raises SettingWithCopyWarning; `assign` builds a new frame instead and keeps
# the column in its original position.
a = a.assign(username=a['username'].map(mapper))

In [9]:
# Preview the first five rows to confirm usernames are now integer ids.
a.iloc[:5]

Unnamed: 0,username,anime_id,score,timestamp,status
3,0,241,6,1293559897,2
4,0,650,8,1293560393,2
5,0,8074,9,1293560058,2
6,0,142,6,1294080780,2
7,0,252,10,1293559649,2


### Remove extra columns

In [10]:
# Keep the first four columns by position
# (per the preview above: username, anime_id, score, timestamp).
a = a.iloc[:, :4]
# Index the rows by the integer user id.
a = a.set_index('username')
# Store the remaining columns as 32-bit integers.
a = a.astype("int32")

In [11]:
# Row/column count after trimming to four columns and moving `username` into the index.
a.shape

(81120187, 3)

### Remap item IDs to contiguous indices and store the original → new mapping in `item_map.csv`

In [12]:
# Distinct original anime ids, in order of first appearance.
uniques = a['anime_id'].unique()
# original anime id -> dense 0-based training id
mapper = {item_id: train_id for train_id, item_id in enumerate(uniques)}

# Persist the lookup table so training ids can be translated back later.
c = pd.DataFrame(list(mapper.items()), columns=['item_id', 'train_id'])
c.to_csv("item_map.csv", index=False)

# Swap the original ids for their training ids in the interaction table.
a['anime_id'] = a['anime_id'].map(mapper)
c.shape

(10175, 2)

In [13]:
# Per-user median score, broadcast back onto every row of that user
# (`username` is the index level being grouped on).
# The "median" alias uses pandas' built-in grouped implementation instead of
# calling pd.Series.median in Python once per user.
e = a.groupby("username", sort=False)["score"].transform("median")

# Keep rows scored at or above the user's own median, and only for users
# whose median is strictly between 2 and 10.
a_small = a[(a['score'] >= e) & (e > 2) & (e < 10)]
a.shape, a_small.shape

((81120187, 3), (50909142, 3))

In [41]:
# Number of distinct users carrying a per-user median (missing labels are not counted).
e.index.nunique(dropna=True)

604420

### Split data

In [14]:
import numpy as np
# Draw 60,000 distinct hold-out users with a fixed seed so the split is the
# same on every Restart & Run All (the unseeded draw changed it each run).
rng = np.random.default_rng(42)
validation = rng.choice(a_small.index.unique().to_numpy(), size=60000, replace=False)

# First half of the sampled users -> test, second half -> validation;
# every user that was not sampled stays in train.
half = len(validation) // 2
test_df = a_small[a_small.index.isin(validation[:half])]
validation_df = a_small[a_small.index.isin(validation[half:])]
train = a_small[~a_small.index.isin(validation)]

# The three user sets are disjoint and exhaustive, so row counts must add up.
assert len(train) + len(validation_df) + len(test_df) == len(a_small)
train.shape, validation_df.shape, test_df.shape

((45358603, 3), (2764379, 3), (2786160, 3))

### Store and load data

In [15]:
# Write each split to its own feather file; reset_index turns the `username`
# index back into a regular column before writing.
for frame, path in (
    (train, "train_processed.feather"),
    (test_df, "test_processed.feather"),
    (validation_df, "valid_processed.feather"),
):
    frame.reset_index().to_feather(path)

In [45]:
# Read the training split back from the same path it was written to above
# ("train_processed.feather"); the previous "data/..." path only worked if
# the file had been moved there by hand.
p = pd.read_feather("train_processed.feather")
# Sanity check: score range and the largest remapped item id.
p['score'].min(), p['score'].max(), p['anime_id'].max()

(3, 10, 10174)

In [47]:
# Number of distinct users in the reloaded training split (missing values are not counted).
p['username'].nunique(dropna=True)

489803

In [58]:
import torch
# Scratch check of broadcasting: divide every row of a 2x3 tensor
# element-wise by the length-3 vector [2, 3, 4].
a = torch.tensor([[1, 0, 3],
                  [0, 5, 6]])
b = torch.arange(start=2, end=5)
torch.div(a, b)

tensor([[0.5000, 0.0000, 0.7500],
        [0.0000, 1.6667, 1.5000]])

In [56]:
# Scratch check of paired (row, col) index assignment.
c = torch.tensor([[1, 8, 3], [9, 5, 6]])
# The cell used to index with a 2-D `b` left over from an earlier kernel state;
# the `b` defined in the notebook is 1-D, so `b[:, 0]` fails on a fresh run.
# Define the coordinates here instead. The nonzero positions of this mask,
# (0,0), (0,2), (1,1), (1,2), reproduce the recorded output
# (presumably how `b` was originally built — TODO confirm).
idx = torch.tensor([[1, 0, 3], [0, 5, 6]]).nonzero()
c[idx[:, 0], idx[:, 1]] = 100
c

tensor([[100,   8, 100],
        [  9, 100, 100]])