# Import and setup

In [1]:
# Show the interpreter that is running this kernel. `!which python` only reports
# the first `python` on the subshell's PATH, which can differ from the kernel's
# interpreter and is not available on every platform.
import sys
sys.executable

/home/gkoren2/miniconda3/envs/difsr/bin/python


In [2]:
import os
import sys

# The repository root is taken to be the parent of the current working directory.
difsr_root = os.path.dirname(os.getcwd())
# Guard the insert so that re-running this cell does not stack duplicate entries
# on sys.path.
if difsr_root not in sys.path:
    sys.path.insert(1, difsr_root)
sys.path

['/home/gkoren2/code/git/gkoren2/DIF-SR/notebooks',
 '/home/gkoren2/code/git/gkoren2/DIF-SR',
 '/home/gkoren2/miniconda3/envs/difsr/lib/python39.zip',
 '/home/gkoren2/miniconda3/envs/difsr/lib/python3.9',
 '/home/gkoren2/miniconda3/envs/difsr/lib/python3.9/lib-dynload',
 '',
 '/home/gkoren2/miniconda3/envs/difsr/lib/python3.9/site-packages']

In [3]:
import os 
import pandas as pd
import numpy as np
from datetime import datetime
from recbole.config import Config
from recbole.data import create_dataset
from recbole.data.utils import get_dataloader
from recbole.utils import init_logger, init_seed, get_model, get_trainer, set_color
import matplotlib.pyplot as plt

In [4]:
# Absolute path of '../dataset', resolved against the current working directory.
dataset_root = os.path.abspath('../dataset')
# List the available dataset folders through dataset_root so the path is spelled only once.
os.listdir(dataset_root)

['Steam',
 'Amazon_Toys_and_Games',
 'Amazon_Sports_and_Outdoors',
 'yelp',
 'Amazon_Beauty']

In [5]:
from dataclasses import dataclass
from typing import Optional

@dataclass
class Arguments:
    """Settings used by the notebook cells to build a RecBole ``Config``.

    Attributes:
        model: Model name passed to ``Config(model=...)``.
        dataset: Dataset name passed to ``Config(dataset=...)``.
        config_files: Config file paths separated by single spaces (the cells
            split this string on ``' '``), or ``None`` for no config files.
    """
    model: str = 'SASRecD'
    dataset: str = 'Amazon_Beauty'
    config_files: Optional[str] = None





# Amazon Beauty

In [25]:
# Point dataset_path at the Amazon_Beauty folder and list the files it contains.
dataset_path=os.path.join(dataset_root,'Amazon_Beauty')
os.listdir(dataset_path)

['Amazon_Beauty.inter', 'Amazon_Beauty.item']

## Reading the dataset using recbole 

In [7]:
# Assemble the RecBole Config for Amazon_Beauty from its YAML file.
args = Arguments(
    dataset="Amazon_Beauty",
    config_files=os.path.join(difsr_root, 'configs/Amazon_Beauty_opt.yaml'),
)
# config_files may name several files separated by single spaces; None means no files.
if args.config_files:
    config_file_list = args.config_files.strip().split(' ')
else:
    config_file_list = None
config = Config(
    model=args.model,
    dataset=f'{args.dataset}',
    config_file_list=config_file_list,
)
# Prefix data_path with the repository root (os.path.join keeps an absolute data_path as-is).
config.final_config_dict['data_path'] = os.path.join(
    difsr_root, config.final_config_dict['data_path']
)
config.final_config_dict

{'gpu_id': 0,
 'use_gpu': True,
 'seed': 212,
 'state': 'INFO',
 'reproducibility': True,
 'data_path': '/home/guy/workspace/work/git/gkoren2/DIF-SR/dataset/Amazon_Beauty',
 'checkpoint_dir': 'saved',
 'show_progress': True,
 'save_dataset': False,
 'save_dataloaders': False,
 'epochs': 200,
 'train_batch_size': 1536,
 'learner': 'adam',
 'learning_rate': 0.0001,
 'neg_sampling': None,
 'eval_step': 2,
 'stopping_step': 10,
 'clip_grad_norm': None,
 'weight_decay': 0.0,
 'multi_gpus': False,
 'eval_args': {'split': {'LS': 'valid_and_test'},
  'group_by': 'user',
  'order': 'TO',
  'mode': 'full'},
 'repeatable': True,
 'metrics': ['Recall', 'NDCG'],
 'topk': [3, 5, 10, 20],
 'valid_metric': 'Recall@20',
 'valid_metric_bigger': True,
 'eval_batch_size': 128,
 'loss_decimal_place': 4,
 'metric_decimal_place': 4,
 'n_layers': 4,
 'n_heads': 8,
 'hidden_size': 256,
 'attribute_hidden_size': [64],
 'inner_size': 256,
 'hidden_dropout_prob': 0.5,
 'attn_dropout_prob': 0.3,
 'hidden_act': 'ge

In [8]:
# Build the RecBole dataset from the current `config`; the bare name displays its summary.
dataset = create_dataset(config)
dataset

[1;35mAmazon_Beauty[0m
[1;34mThe number of users[0m: 22364
[1;34mAverage actions of users[0m: 8.876358270357287
[1;34mThe number of items[0m: 12102
[1;34mAverage actions of items[0m: 16.403768283612923
[1;34mThe number of inters[0m: 198502
[1;34mThe sparsity of the dataset[0m: 99.92665707018277%
[1;34mRemain Fields[0m: ['user_id', 'item_id', 'rating', 'timestamp', 'title', 'sales_type', 'sales_rank', 'categories', 'price', 'brand']

In [9]:
# Peek at the first rows of the item-feature table held by the RecBole dataset.
dataset.item_feat.head()

Unnamed: 0,item_id,title,sales_type,sales_rank,categories,price,brand
0,0,0.0,0.0,277296.568007,[],17.005026,0.0
1,1,1.0,1.0,10486.0,"[1, 2, 3, 4, 5, 6]",5.04,1.0
2,2,2.0,1.0,52254.0,"[1, 7, 8, 9, 10, 11, 12, 5, 13]",19.99,2.0
3,3,3.0,1.0,78916.0,"[1, 14, 15, 16, 17, 18]",65.86,3.0
4,4,4.0,1.0,764.0,"[1, 14, 15, 16, 17, 19]",52.33,4.0


In [38]:
# Count the rows of the item-feature table per `brand` value.
dataset.item_feat['brand'].value_counts()

0.0       2099
40.0       291
124.0      203
10.0       191
41.0       159
          ... 
1048.0       1
1045.0       1
1043.0       1
1039.0       1
2076.0       1
Name: brand, Length: 2077, dtype: int64

In [37]:
# Distribution of the length of each `categories` entry across the item-feature rows.
dataset.item_feat['categories'].map(len).value_counts()



7     2178
6     2149
4     2120
5     1801
8     1295
9      876
11     681
10     610
12     144
14     109
3       82
2       29
13      27
0        1
Name: categories, dtype: int64

In [39]:
# Count the rows of the item-feature table per `sales_type` value.
dataset.item_feat['sales_type'].value_counts()

1.0     10169
2.0      1307
0.0       287
5.0        77
3.0        71
4.0        32
13.0       30
7.0        27
8.0        18
12.0       16
17.0       15
6.0        12
10.0       11
16.0       10
15.0        8
11.0        5
9.0         3
14.0        1
18.0        1
19.0        1
20.0        1
Name: sales_type, dtype: int64

In [27]:
# Count the rows of the item-feature table per `sales_rank` value.
dataset.item_feat['sales_rank'].value_counts()

277296.568007    287
2123.000000        3
120451.000000      3
14119.000000       2
21570.000000       2
                ... 
234978.000000      1
983834.000000      1
230650.000000      1
195874.000000      1
136798.000000      1
Name: sales_rank, Length: 11708, dtype: int64

In [40]:
# Count the rows of the item-feature table per `title` value.
dataset.item_feat['title'].value_counts()

0.0        8
11242.0    3
5300.0     2
10588.0    2
820.0      2
          ..
4042.0     1
4035.0     1
4045.0     1
4039.0     1
12066.0    1
Name: title, Length: 12067, dtype: int64

## Read item data

In [26]:
# Load the tab-separated Amazon_Beauty item file straight into pandas and preview it.
bdf = pd.read_csv(
    os.path.join(dataset_path, 'Amazon_Beauty.item'),
    sep='\t',
)
bdf.head()

Unnamed: 0,item_id:token,title:token,sales_type:token,sales_rank:float,categories:token_seq,price:float,brand:token
0,205616461,Bio-Active Anti-Aging Serum (Firming Ultra-Hyd...,Health & Personal Care,461765.0,"'Beauty', 'Skin Care', 'Face', 'Creams & Moist...",,
1,558925278,Eco Friendly Ecotools Quality Natural Bamboo C...,Beauty,402875.0,"'Beauty', 'Tools & Accessories', 'Makeup Brush...",,
2,733001998,Mastiha Body Lotion,Beauty,540255.0,"'Beauty', 'Skin Care', 'Body', 'Moisturizers',...",,
3,737104473,Hello Kitty Lustre Lipstick (See sellers comme...,Beauty,931125.0,"'Beauty', 'Makeup', 'Lips', 'Lipstick'",,
4,762451459,Stephanie Johnson Mermaid Round Snap Mirror,,,"'Beauty', 'Tools & Accessories', 'Mirrors', 'M...",19.98,


In [27]:
# Fraction of non-NaN values in each column of bdf.
bdf.notna().mean()


item_id:token           1.000000
title:token             0.998287
sales_type:token        0.979865
sales_rank:float        0.979865
categories:token_seq    1.000000
price:float             0.732743
brand:token             0.490112
dtype: float64

It looks like the only feature that exists for every item is categories (although some entries might still be an empty string — to be checked).

In [29]:
# Frequency of each sales_type:token value (value_counts skips NaN by default).
bdf['sales_type:token'].value_counts()

Beauty                       215131
Health & Personal Care        30479
Clothing                       2448
Home &amp; Kitchen             1421
Sports &amp; Outdoors           683
Toys & Games                    642
Music                           568
Shoes                           551
Jewelry                         534
Industrial & Scientific         399
Arts, Crafts & Sewing           272
Pet Supplies                    268
Kitchen & Dining                202
Electronics                     137
Patio, Lawn & Garden            100
Movies & TV                      52
Grocery & Gourmet Food           23
Baby                             17
Watches                          13
Automotive                        9
Home Improvement                  9
Camera &amp; Photo                8
Musical Instruments               6
Software                          5
Cell Phones & Accessories         3
Office Products                   2
Magazines                         1
Books                       

# Amazon Toys

In [30]:
# Point dataset_path at the Amazon_Toys_and_Games folder and list the files it contains.
dataset_path=os.path.join(dataset_root,'Amazon_Toys_and_Games')
os.listdir(dataset_path)

['Amazon_Toys_and_Games.inter', 'Amazon_Toys_and_Games.item']

## Reading the dataset using recbole 

In [8]:
# Assemble the RecBole Config for Amazon_Toys_and_Games from its YAML file.
args = Arguments(
    dataset="Amazon_Toys_and_Games",
    config_files=os.path.join(difsr_root, 'configs/orig/Amazon_Toys_and_Games.yaml'),
)
# config_files may name several files separated by single spaces; None means no files.
if args.config_files:
    config_file_list = args.config_files.strip().split(' ')
else:
    config_file_list = None
config = Config(
    model=args.model,
    dataset=f'{args.dataset}',
    config_file_list=config_file_list,
)
# Prefix data_path with the repository root (os.path.join keeps an absolute data_path as-is).
config.final_config_dict['data_path'] = os.path.join(
    difsr_root, config.final_config_dict['data_path']
)
config.final_config_dict

{'gpu_id': 0,
 'use_gpu': True,
 'seed': 212,
 'state': 'INFO',
 'reproducibility': True,
 'data_path': '/home/gkoren2/code/git/gkoren2/DIF-SR/dataset/Amazon_Toys_and_Games',
 'checkpoint_dir': 'saved',
 'show_progress': True,
 'save_dataset': False,
 'save_dataloaders': False,
 'epochs': 200,
 'train_batch_size': 2048,
 'learner': 'adam',
 'learning_rate': 0.0001,
 'neg_sampling': None,
 'eval_step': 2,
 'stopping_step': 10,
 'clip_grad_norm': None,
 'weight_decay': 0.0,
 'multi_gpus': False,
 'eval_args': {'split': {'LS': 'valid_and_test'},
  'group_by': 'user',
  'order': 'TO',
  'mode': 'full'},
 'repeatable': True,
 'metrics': ['Recall', 'NDCG'],
 'topk': [3, 5, 10, 20],
 'valid_metric': 'Recall@20',
 'valid_metric_bigger': True,
 'eval_batch_size': 256,
 'loss_decimal_place': 4,
 'metric_decimal_place': 4,
 'n_layers': 3,
 'n_heads': 4,
 'hidden_size': 256,
 'attribute_hidden_size': [64],
 'inner_size': 256,
 'hidden_dropout_prob': 0.5,
 'attn_dropout_prob': 0.3,
 'hidden_act': '

In [11]:
# Build the RecBole dataset from the current `config`; the bare name displays its summary.
dataset = create_dataset(config)
dataset

[1;35mAmazon_Toys_and_Games[0m
[1;34mThe number of users[0m: 19413
[1;34mAverage actions of users[0m: 8.633680197815783
[1;34mThe number of items[0m: 11925
[1;34mAverage actions of items[0m: 14.055434417980543
[1;34mThe number of inters[0m: 167597
[1;34mThe sparsity of the dataset[0m: 99.92760389550713%
[1;34mRemain Fields[0m: ['user_id', 'item_id', 'rating', 'timestamp', 'title', 'price', 'sales_type', 'sales_rank', 'brand', 'categories']

In [12]:
# Peek at the first rows of the item-feature table held by the RecBole dataset.
dataset.item_feat.head()

Unnamed: 0,item_id,title,price,sales_type,sales_rank,brand,categories
0,0,0.0,30.077028,0.0,369185.891589,0.0,[]
1,1,1.0,16.19,0.0,369185.891589,1.0,"[1, 2, 3, 4, 5, 6, 7, 2, 8, 5, 9, 2, 10, 11]"
2,2,2.0,1.44,1.0,1009.0,2.0,"[1, 2, 3, 12, 2, 13, 14, 2, 15, 16, 14, 2, 17,..."
3,3,3.0,11.13,1.0,1462.0,3.0,"[1, 2, 3, 19, 2, 20, 21, 22, 23, 2, 24]"
4,4,4.0,16.99,2.0,27977.0,0.0,"[1, 2, 3, 25, 26, 27, 28, 29, 2, 30, 31]"


In [15]:
# Count the rows of the item-feature table per `brand` value.
dataset.item_feat['brand'].value_counts()

0.0       1726
98.0       663
32.0       640
108.0      561
23.0       555
          ... 
676.0        1
677.0        1
679.0        1
680.0        1
1312.0       1
Name: brand, Length: 1313, dtype: int64

## Read item data

In [31]:
# Load the tab-separated Amazon_Toys_and_Games item file straight into pandas and preview it.
tdf = pd.read_csv(
    os.path.join(dataset_path, 'Amazon_Toys_and_Games.item'),
    sep='\t',
)
tdf.head()

Unnamed: 0,item_id:token,title:token,price:float,sales_type:token,sales_rank:float,brand:token,categories:token_seq
0,191639,Dr. Suess 19163 Dr. Seuss Puzzle 3 Pack Bundle,37.12,Toys & Games,612379.0,Dr. Seuss,"'Toys & Games', 'Puzzles', 'Jigsaw Puzzles'"
1,5069491,Nursery Rhymes Felt Book,,Toys & Games,576683.0,,'Toys & Games'
2,76561046,Fraction Decimal Percent Card Deck,,Toys & Games,564211.0,,"'Toys & Games', 'Flash Cards', 'Learning & Edu..."
3,131358936,,36.22,Software,8080.0,,"'Toys & Games', 'Mathematics & Counting', 'Lea..."
4,133642984,Algebra 2 California Teacher Center,731.93,Toys & Games,1150291.0,Prentice Hall,"'Toys & Games', 'Mathematics & Counting', 'Lea..."


In [32]:
# Fraction of non-NaN values in each column of tdf.
tdf.notna().mean()

item_id:token           1.000000
title:token             0.997328
price:float             0.722140
sales_type:token        0.973571
sales_rank:float        0.973571
brand:token             0.543758
categories:token_seq    1.000000
dtype: float64

In [33]:
# Frequency of each sales_type:token value (value_counts skips NaN by default).
tdf['sales_type:token'].value_counts()

Toys & Games                 306930
Sports &amp; Outdoors          6434
Home &amp; Kitchen             2973
Clothing                       1689
Arts, Crafts & Sewing          1677
Industrial & Scientific        1561
Kitchen & Dining               1452
Patio, Lawn & Garden            948
Health & Personal Care          931
Musical Instruments             639
Video Games                     314
Jewelry                         289
Electronics                     258
Pet Supplies                    253
Beauty                          208
Software                        198
Camera &amp; Photo              180
Shoes                            92
Cell Phones & Accessories        49
Watches                          39
Office Products                  27
Baby                             17
Computers & Accessories          15
Home Improvement                  9
Automotive                        4
Movies & TV                       2
Grocery & Gourmet Food            1
Books                       

# Amazon Sports

In [34]:
# Point dataset_path at the Amazon_Sports_and_Outdoors folder and list the files it contains.
dataset_path=os.path.join(dataset_root,'Amazon_Sports_and_Outdoors')
os.listdir(dataset_path)

['Amazon_Sports_and_Outdoors.item', 'Amazon_Sports_and_Outdoors.inter']

## Reading the dataset using recbole

In [18]:
# Assemble the RecBole Config for Amazon_Sports_and_Outdoors from its YAML file.
args = Arguments(
    dataset="Amazon_Sports_and_Outdoors",
    config_files=os.path.join(difsr_root, 'configs/orig/Amazon_Sports_and_Outdoors.yaml'),
)
# config_files may name several files separated by single spaces; None means no files.
if args.config_files:
    config_file_list = args.config_files.strip().split(' ')
else:
    config_file_list = None
config = Config(
    model=args.model,
    dataset=f'{args.dataset}',
    config_file_list=config_file_list,
)
# Prefix data_path with the repository root (os.path.join keeps an absolute data_path as-is).
config.final_config_dict['data_path'] = os.path.join(
    difsr_root, config.final_config_dict['data_path']
)
config.final_config_dict

{'gpu_id': 0,
 'use_gpu': True,
 'seed': 212,
 'state': 'INFO',
 'reproducibility': True,
 'data_path': '/home/gkoren2/code/git/gkoren2/DIF-SR/dataset/Amazon_Sports_and_Outdoors',
 'checkpoint_dir': 'saved',
 'show_progress': True,
 'save_dataset': False,
 'save_dataloaders': False,
 'epochs': 200,
 'train_batch_size': 2048,
 'learner': 'adam',
 'learning_rate': 0.0001,
 'neg_sampling': None,
 'eval_step': 2,
 'stopping_step': 10,
 'clip_grad_norm': None,
 'weight_decay': 0.0,
 'multi_gpus': False,
 'eval_args': {'split': {'LS': 'valid_and_test'},
  'group_by': 'user',
  'order': 'TO',
  'mode': 'full'},
 'repeatable': True,
 'metrics': ['Recall', 'NDCG'],
 'topk': [3, 5, 10, 20],
 'valid_metric': 'Recall@20',
 'valid_metric_bigger': True,
 'eval_batch_size': 256,
 'loss_decimal_place': 4,
 'metric_decimal_place': 4,
 'n_layers': 3,
 'n_heads': 8,
 'hidden_size': 256,
 'attribute_hidden_size': [64],
 'inner_size': 256,
 'hidden_dropout_prob': 0.5,
 'attn_dropout_prob': 0.3,
 'hidden_ac

In [None]:
# Build the RecBole dataset from the current `config`; the bare name displays its summary.
dataset = create_dataset(config)
dataset

In [None]:
# Peek at the first rows of the item-feature table held by the RecBole dataset.
dataset.item_feat.head()

## Read item data

In [35]:
# Load the tab-separated Amazon_Sports_and_Outdoors item file straight into pandas and preview it.
sdf = pd.read_csv(
    os.path.join(dataset_path, 'Amazon_Sports_and_Outdoors.item'),
    sep='\t',
)
sdf.head()

Unnamed: 0,item_id:token,title:token,price:float,brand:token,categories:token_seq,sales_type:token,sales_rank:float
0,32069,Adult Ballet Tutu Cheetah Pink,7.89,BubuBibi,"'Sports & Outdoors', 'Skirts', 'Clothing', 'Gi...",,
1,31909,Girls Ballet Tutu Neon Pink,7.0,Unknown,"'Other Sports', 'Dance', 'Sports & Outdoors'",Toys & Games,201847.0
2,32034,Adult Ballet Tutu Yellow,7.87,BubuBibi,"'Sports & Outdoors', 'Skirts', 'Clothing', 'Gi...",,
3,31852,Girls Ballet Tutu Zebra Hot Pink,3.17,Coxlures,"'Other Sports', 'Dance', 'Sports & Outdoors'",Toys & Games,211836.0
4,32050,Adult Ballet Tutu Purple,12.85,BubuBibi,"'Sports & Outdoors', 'Skirts', 'Clothing', 'Gi...",,


In [36]:
# Fraction of non-NaN values in each column of sdf.
sdf.notna().mean()

item_id:token           1.000000
title:token             0.995675
price:float             0.540762
brand:token             0.283859
categories:token_seq    1.000000
sales_type:token        0.913271
sales_rank:float        0.913271
dtype: float64

In [37]:
# Frequency of each sales_type:token value (value_counts skips NaN by default).
sdf['sales_type:token'].value_counts()

Sports &amp; Outdoors        370419
Clothing                      78003
Watches                       11104
Toys & Games                   4167
Shoes                          3550
Home &amp; Kitchen             2889
Patio, Lawn & Garden           2728
Health & Personal Care         2237
Kitchen & Dining               1800
Jewelry                        1747
Pet Supplies                   1566
Industrial & Scientific        1222
Cell Phones & Accessories       829
Automotive                      702
Electronics                     665
Music                           604
Video Games                     527
Camera &amp; Photo              456
Beauty                          335
Arts, Crafts & Sewing           215
Musical Instruments              81
Home Improvement                 74
Software                         46
Movies & TV                      33
Baby                             16
Computers & Accessories          12
Office Products                   6
Grocery & Gourmet Food      

# Yelp

In [10]:
# Point dataset_path at the yelp folder and list the files it contains.
dataset_path=os.path.join(dataset_root,'yelp')
os.listdir(dataset_path)

['yelp.user', 'yelp.inter', 'yelp.item', 'README.md']

## Reading the dataset using recbole

In [None]:
# Assemble the RecBole Config for yelp from its YAML file.
args = Arguments(
    dataset="yelp",
    config_files=os.path.join(difsr_root, 'configs/orig/yelp.yaml'),
)
# config_files may name several files separated by single spaces; None means no files.
if args.config_files:
    config_file_list = args.config_files.strip().split(' ')
else:
    config_file_list = None
config = Config(
    model=args.model,
    dataset=f'{args.dataset}',
    config_file_list=config_file_list,
)
# Prefix data_path with the repository root (os.path.join keeps an absolute data_path as-is).
config.final_config_dict['data_path'] = os.path.join(
    difsr_root, config.final_config_dict['data_path']
)
config.final_config_dict

In [None]:
# Build the RecBole dataset from the current `config`; the bare name displays its summary.
dataset = create_dataset(config)
dataset

In [None]:
# Peek at the first rows of the item-feature table held by the RecBole dataset.
dataset.item_feat.head()

## Read item data

In [11]:
# Load the tab-separated yelp item file straight into pandas and preview it.
ydf = pd.read_csv(
    os.path.join(dataset_path, 'yelp.item'),
    sep='\t',
)
ydf.head()

Unnamed: 0,business_id:token,item_name:token_seq,address:token_seq,city:token_seq,state:token,postal_code:token,latitude:float,longitude:float,item_stars:float,item_review_count:float,is_open:float,categories:token_seq
0,f9NumwFMBDn751xgFiRbNA,The Range At Lake Norman,10913 Bailey Rd,Cornelius,NC,28031,35.462724,-80.852612,3.5,36,1,"Active Life, Gun/Rifle Ranges, Guns & Ammo, Sh..."
1,Yzvjg0SayhoZgCljUJRF9Q,"Carlos Santo, NMD","8880 E Via Linda, Ste 107",Scottsdale,AZ,85258,33.569404,-111.890264,5.0,4,1,"Health & Medical, Fitness & Instruction, Yoga,..."
2,XNoUzKckATkOD1hP6vghZg,Felinus,3554 Rue Notre-Dame O,Montreal,QC,H4C 1P4,45.479984,-73.58007,5.0,5,1,"Pets, Pet Services, Pet Groomers"
3,6OAZjbxqM5ol29BuHsil3w,Nevada House of Hose,1015 Sharp Cir,North Las Vegas,NV,89030,36.219728,-115.127725,2.5,3,0,"Hardware Stores, Home Services, Building Suppl..."
4,51M2Kk903DFYI6gnB5I6SQ,USE MY GUY SERVICES LLC,4827 E Downing Cir,Mesa,AZ,85205,33.428065,-111.726648,4.5,26,1,"Home Services, Plumbing, Electricians, Handyma..."


In [13]:
# Fraction of non-NaN values in each column of ydf.
ydf.notna().mean()

business_id:token          1.000000
item_name:token_seq        0.999995
address:token_seq          0.958552
city:token_seq             0.999990
state:token                1.000000
postal_code:token          0.997569
latitude:float             1.000000
longitude:float            1.000000
item_stars:float           1.000000
item_review_count:float    1.000000
is_open:float              1.000000
categories:token_seq       1.000000
dtype: float64

# Steam

In [7]:
# Point dataset_path at the Steam folder and list the files it contains.
dataset_path=os.path.join(dataset_root,'Steam')
os.listdir(dataset_path)

['steam.inter', 'steam.item']

## Reading the dataset using recbole

In [None]:
# Assemble the RecBole Config for Steam from its YAML file.
# NOTE(review): the Steam folder listing shows lowercase 'steam.inter'/'steam.item'
# while the dataset name here is "Steam" — confirm RecBole resolves the file names
# on a case-sensitive filesystem.
args = Arguments(
    dataset="Steam",
    config_files=os.path.join(difsr_root, 'configs/Steam.yaml'),
)
# config_files may name several files separated by single spaces; None means no files.
if args.config_files:
    config_file_list = args.config_files.strip().split(' ')
else:
    config_file_list = None
config = Config(
    model=args.model,
    dataset=f'{args.dataset}',
    config_file_list=config_file_list,
)
# Prefix data_path with the repository root (os.path.join keeps an absolute data_path as-is).
config.final_config_dict['data_path'] = os.path.join(
    difsr_root, config.final_config_dict['data_path']
)
config.final_config_dict

In [None]:
# Build the RecBole dataset from the current `config`; the bare name displays its summary.
dataset = create_dataset(config)
dataset

In [None]:
# Peek at the first rows of the item-feature table held by the RecBole dataset.
dataset.item_feat.head()

## Read item data

In [8]:
# Load the tab-separated Steam item file into pandas and preview it.
# NOTE: `sdf` is the same name used for the Amazon Sports item frame earlier in this
# notebook, so this assignment replaces that frame; which one `sdf` refers to in
# other cells depends on execution order.
sdf=pd.read_csv(os.path.join(dataset_path,'steam.item'),sep='\t')
sdf.head()

Unnamed: 0,app_name:token,developer:token,early_access:token,genres:token_seq,id:token,metascore:float,price:float,publisher:token,timestamp:float,sentiment:token,specs:token_seq,tags:token_seq,title:token
0,Ironbound,Secret Level SRL,False,"[Free to Play, Indie, RPG, Strategy]",643980.0,,Free To Play,"Making Fun, Inc.",1515024000.0,Mostly Positive,"[Single-player, Multi-player, Online Multi-Pla...","[Free to Play, Strategy, Indie, RPG, Card Game...",Ironbound
1,Real Pool 3D - Poolians,Poolians.com,False,"[Casual, Free to Play, Indie, Simulation, Sports]",670290.0,,Free to Play,Poolians.com,1500854000.0,Mostly Positive,"[Single-player, Multi-player, Online Multi-Pla...","[Free to Play, Simulation, Sports, Casual, Ind...",Real Pool 3D - Poolians
2,\u5f39\u70b8\u4eba2222,\u5f7c\u5cb8\u9886\u57df,False,"[Action, Adventure, Casual]",767400.0,,0.99,\u5f7c\u5cb8\u9886\u57df,1512605000.0,,[Single-player],"[Action, Adventure, Casual]",\u5f39\u70b8\u4eba2222
3,Log Challenge,,False,,773570.0,,2.99,,,,"[Single-player, Full controller support, HTC V...","[Action, Indie, Casual, Sports]",
4,Battle Royale Trainer,Trickjump Games Ltd,False,"[Action, Adventure, Simulation]",772540.0,,3.99,Trickjump Games Ltd,1515024000.0,Mixed,"[Single-player, Steam Achievements]","[Action, Adventure, Simulation, FPS, Shooter, ...",Battle Royale Trainer


In [9]:
# Fraction of non-NaN values in each column of sdf.
sdf.notna().mean()

app_name:token        0.999938
developer:token       0.897336
early_access:token    1.000000
genres:token_seq      0.897834
id:token              0.999938
metascore:float       0.081129
price:float           0.957148
publisher:token       0.749113
timestamp:float       0.926806
sentiment:token       0.776530
specs:token_seq       0.979150
tags:token_seq        0.994927
title:token           0.936205
dtype: float64