In [53]:
import pandas as pd
from typing import Callable, List, Optional

import os.path as osp
import os

import torch

from torch_geometric.data import HeteroData, InMemoryDataset, download_url

In [64]:
class Gowalla(InMemoryDataset):
    url = 'https://snap.stanford.edu/data/loc-gowalla_totalCheckins.txt.gz'
  

    def __init__(
        self,
        root: str,
        transform: Optional[Callable] = None,
        pre_transform: Optional[Callable] = None,
        force_reload: bool = False,
    ) -> None:
        super().__init__(root, transform, pre_transform,
                         force_reload=force_reload)
        self.load(self.processed_paths[0], data_cls=HeteroData)
    @property
    def raw_file_names(self) -> List[str]:
        return ['loc-gowalla_totalCheckins.txt', 'filtered_total.txt', 'user_id_map.txt', 'item_id_map.txt']

    @property
    def processed_file_names(self) -> str:
        return 'data.pt'
    
    @property
    def ziped_file_name(self) -> str:
        return 'loc-gowalla_totalCheckins.txt.gz'

    def download(self) -> None:
        import gzip

        if not osp.isfile(osp.join(self.root, 'data/Gowalla/loc-gowalla_totalCheckins.txt.gz')):
            download_url(f'{self.url}', self.root)
        if not osp.isfile(osp.join(self.root,'data/Gowalla/raw/loc-gowalla_totalCheckins.txt')): 
            os.system(f'gzip -d {osp.join(self.root,"/data/Gowalla/loc-gowalla_totalCheckins.txt.gz")}')
            os.system(f'mv {osp.join(self.root,"/data/Gowalla/loc-gowalla_totalCheckins.txt")} {osp.join(self.root,"/data/Gowalla/loc-gowalla_totalCheckins.txt.gz")}')
    
    def filter(self, df, threshold):
            df = pd.read_csv(osp.join(self.raw_dir, self.raw_file_names[0]),sep = '\t', names = ['user', 'date', 'long', 'lat', 'item'])
            filtered_df = self.filtering(df, threshold)
            processed_df, user_id, item_id  = self.refactoring_from_0(filtered_df)
            processed_df.to_csv(osp.join(self.raw_dir, self.raw_file_names[1]),sep=" ", index=False, header=None)
            user_id.to_csv(osp.join(self.raw_dir, self.raw_file_names[2]),sep=" ", index=False, header=None)
            item_id.to_csv(osp.join(self.raw_dir, self.raw_file_names[3]),sep=" ", index=False, header=None)

    def refactoring_from_0(self, df):
        out_df = pd.DataFrame() 
        
        original_uid = np.sort(df['user'].unique())
        original_iid = np.sort(df['item'].unique())

        u_range = range(len(original_uid))
        i_range = range(len(original_iid))

        uid_mapping = { o_id: n_id for o_id, n_id in zip(original_uid, u_range)} # 원래 유저 아이디 (중간중간 비어있음) : 순서대로 유저 아이디
        iid_mapping = { o_id: n_id for o_id, n_id in zip(original_iid,i_range)} # 원래 아이템 아이디 : 순서대로 아이템 아이디

        uid_map = pd.DataFrame({'o_id' : list(uid_mapping.keys()), 'n_id' : list(uid_mapping.values())})

        iid_map = pd.DataFrame({'o_id' : list(iid_mapping.keys()),'n_id':list(iid_mapping.values())})


        out_df['user'] = df['user'].map(uid_mapping)
        out_df['item'] = df['item'].map(iid_mapping)
        out_df['time'] = df['time']
        return out_df, uid_map, iid_map

    def filtering(self, df, threshold) :
        fdf = df
        while fdf.user.value_counts().min() < threshold or fdf.item.value_counts().min() < threshold:
            df_item = fdf.groupby('item').count()
            df_item = df_item[df_item.user < threshold]
            li = df_item.index.to_list()
            fdf = fdf.drop(fdf.loc[fdf.item.isin(li)].index)
            # print_info(fdf)
            df_usr = fdf.groupby('user').count()
            df_usr = df_usr[df_usr.item < threshold]
            li = df_usr.index.to_list()
            fdf = fdf.drop(fdf.loc[fdf.user.isin(li)].index)
            # print_info(fdf)
            # print(f"Total Edges : {len(fdf)}\nTotal User : {len(fdf['user'].unique())}\nTotal item : {len(fdf['item'].unique())} \
            #             \nMin Interaction Per user : {fdf.user.value_counts().min()} \
            #             \nMax Interaction Per user : {fdf.user.value_counts().max()} \
            #             \nAvg Interaction Per user : {fdf.user.value_counts().mean()}\
            #             \nMin Interaction Per item : {fdf.item.value_counts().min()} \
            #             \nMax Interaction Per item : {fdf.item.value_counts().max()} \
            #             \nAvg Interaction Per item : {fdf.item.value_counts().mean()}")
        
        fdf = fdf.reset_index().drop(columns = ['index'])
        return fdf

    def process(self) -> None:
        data = HeteroData()

        # Process number of nodes for each node type:
        node_types = ['user', 'item']

        
        if osp.isfile({osp.join(self.root,f"/data/Gowalla/raw/{self.raw_file_name[1]}")}) \
            and osp.isfile({osp.join(self.root,f"/data/Gowalla/raw/{self.raw_file_name[2]}")}) \
                and osp.isfile({osp.join(self.root,f"/data/Gowalla/raw/{self.raw_file_name[3]}")}):
            df = read_csv(osp.join(self.raw_dir, self.raw_file_names[0]), names = ['user', 'item', 'time'])
        else:
            df = read_csv(osp.join(self.raw_dir, self.raw_file_names[0]), names = ['user', 'item', 'time'])
            filter(df, threshold)
            df = read_csv(osp.join(self.raw_dir, self.raw_file_names[1]), names = ['user', 'item', 'time'])
    
        for node_type in node_types :
            data[node_type].num_nodes = len(df[node_type].unique())

        # Process edge information for training and testing:
        attr_names = ['edge_index', 'edge_label_index']

        for path, attr_name in zip(self.raw_paths[2:], attr_names):
            rows, cols = [], []
            with open(path) as f:
                lines = f.readlines()
            for line in lines:
                indices = line.strip().split(' ')
                for dst in indices[1:]:
                    rows.append(int(indices[0]))
                    cols.append(int(dst))
            index = torch.tensor([rows, cols])

            data['user', 'rates', 'book'][attr_name] = index
            if attr_name == 'edge_index':
                data['book', 'rated_by', 'user'][attr_name] = index.flip([0])

        if self.pre_transform is not None:
            data = self.pre_transform(data)

        self.save([data], self.processed_paths[0])


In [65]:
path = osp.join('./', 'data', 'Gowalla')
dataset = Gowalla(path, filter = 10)

AttributeError: 'Gowalla' object has no attribute 'filter'

In [44]:
dataset[0]

HeteroData(
  user={ num_nodes=107092 },
  item={ num_nodes=1280969 }
)

In [45]:
import torch

In [46]:
ss = torch.load('/home/jiwon/Jiwon_Rsch/mount/pygbased/data/Gowalla/processed/data.pt')

  ss =torch.load('/home/jiwon/Jiwon_Rsch/mount/pygbased/data/Gowalla/processed/data.pt')


In [47]:
ss

({'_global_store': {},
  'user': {'num_nodes': 107092},
  'item': {'num_nodes': 1280969}},
 None,
 torch_geometric.data.hetero_data.HeteroData)

In [49]:
dataset[0]

HeteroData(
  user={ num_nodes=107092 },
  item={ num_nodes=1280969 }
)