# Document Overview
**Purpose:**
1. Generate training and testing sequences
2. Generate negative samples for each sequence
3. Generate category distribution matrix for each collective POI
4. Generate index map for reindexed POIs and Users

**Input file:** 
1. Original Data: 'data_CHA.csv'
2. Helper Function Library: 'Helper_Functions.py'

**Output file:** 
1. Sample sets consisting of POI, distance, time, type, category and negative sequences
2. POI, user, category id mapping from old to new
3. Collective POI's category distribution dictionary
4. POI distance matrix 
5. Other parameters: POI max_distance and max_sequence_length

**Creation Date:** 4th Nov 2019

In [1]:
# dependencies
import numpy as np
import pandas as pd
import os
# import argparse

In [2]:
# import helper functions

import Helper_Functions as Helper

### Adjustable parameters:

1. **small_sample** *(boolean)*: Whether to use a small sample (the first 20000 rows of the input file) for testing
2. **augment_sample** *(boolean)*: Whether to perform sample augmentation
3. **pad_data** *(boolean)*: Whether to perform padding on data sequence

4. **min_seq_len** *(int)*: Minimum No. POIs for a valid sequence
5. **min_seq_num** *(int)*: Minimum No. valid sequences for a valid user
6. **neg_sample_num** *(int)*: Number of negative samples for each POI

In [3]:
# setup parameters (for ipython execution)

small_sample = False  # True: work on a truncated copy of the input for quick test runs
augment_sample = True  # True: augment the visit sequences via Helper.aug_sequence
pad_data = False  # True: pad the visit sequences via Helper.pad_sequence

min_seq_len = 2  # minimum number of POIs for a sequence to count as valid
min_seq_num = 2  # minimum number of valid sequences for a user to be kept
neg_sample_num = 5  # number of negative samples to generate

## 1. Import data

In [4]:
# Load the raw visit table. In small-sample mode only the first 20000 rows are
# parsed (nrows) instead of reading the whole file and discarding the rest;
# nrows=None reads everything.
# NOTE(review): column dtypes are now inferred from the rows actually read, so
# a column that only contains missing values beyond row 20000 may come back as
# int instead of float in small-sample mode.
data = pd.read_csv('./data_CHA.csv', nrows=20000 if small_sample else None)

In [5]:
# preview the first rows of the loaded table
data.head()

Unnamed: 0.1,Unnamed: 0,Unnamed,Location_id,POI_id,POI_Type,Org_id,User_id,TimeStamp,Latitude,Longitude,...,Unnamed: 0_y,POI_id_y,POI_Type_y,Org_id_y,Latitude_y,Longitude_y,Category_2_y,yelp_ID_y,name_y,stars
0,0,0,0,0,Independent,4e91b6459a52db7dbabab2ea,9,Sun Sep 09 23:20:38 +0000 2012,35.227758,-80.853371,...,0,0,Independent,4e91b6459a52db7dbabab2ea,35.227758,-80.853371,Stadium,Z5BHNFjt7ZmAse2kKlw2uA,Bank of America Stadium,4.0
1,1,181,2,2,Independent,4bddc45c6198c9b6859711ff,9,Sun Sep 09 23:20:56 +0000 2012,35.228426,-80.854568,...,1,2,Independent,4bddc45c6198c9b6859711ff,35.228426,-80.854568,Pub,sXj4ZFR9SmftEDXOzmZscw,Hartigan's Irish Pub,3.0
2,2,407,5,5,Independent,4b5a16e9f964a5207cac28e3,60,Fri Jun 08 15:58:58 +0000 2012,35.224868,-80.955935,...,2,5,Independent,4b5a16e9f964a5207cac28e3,35.224868,-80.955935,Scenic Lookout,WiWCtinRtykBC6DCg6w3Fg,Charlotte Douglas Airport Overlook,4.5
3,3,199,4,4,Independent,4b155049f964a52045b023e3,60,Fri Jun 08 15:59:30 +0000 2012,35.224912,-80.839655,...,3,4,Independent,4b155049f964a52045b023e3,35.224912,-80.839655,Basketball Stadium,sYJaNxG8KLus6U7TJ2JHWA,Charlotte Hornets,4.0
4,4,9,1,1,Independent,4c9f60f246978cfa453eaa7f,63,Thu Apr 04 23:07:45 +0000 2013,35.225884,-80.852895,...,4,1,Independent,4c9f60f246978cfa453eaa7f,35.225884,-80.852895,Football Stadium,Z5BHNFjt7ZmAse2kKlw2uA,Bank of America Stadium,4.0


In [6]:
# list the available column names
data.columns

Index(['Unnamed: 0', 'Unnamed', 'Location_id', 'POI_id', 'POI_Type', 'Org_id',
       'User_id', 'TimeStamp', 'Latitude', 'Longitude', 'Category_2',
       'yelp_ID', 'name_x', 'Time', 'date', 'Local_sg_time', 'L2_id',
       'Unnamed: 0_y', 'POI_id_y', 'POI_Type_y', 'Org_id_y', 'Latitude_y',
       'Longitude_y', 'Category_2_y', 'yelp_ID_y', 'name_y', 'stars'],
      dtype='object')

## 2. Generate Visit Sequence 
Generate valid index sequences for each valid user

In [7]:
# form visit sequences
#
# Helper.generate_sequence returns, in order: the per-user visit sequences, the
# maximum sequence length, the collection of valid visits and the user re-index
# mapping. min_seq_len / min_seq_num are the filtering thresholds.

visit_sequences, max_seq_len, valid_visits, user_reIndex_mapping = Helper.generate_sequence(data, min_seq_len, min_seq_num)

# Stop here if filtering left nothing to work with. An explicit raise is used
# instead of `assert` so the check is not stripped when Python runs with -O.
if not visit_sequences:
    raise ValueError('no qualified sequence after filtering!')



In [8]:
# show one entry of the dictionary together with its size
Helper.peep_dictionary(visit_sequences)

617  :
 [list([118, 119]) list([129, 130, 131]) list([140, 141, 142, 143])
 list([176, 177]) list([192, 193]) list([153, 154]) list([111, 112])
 list([88, 89]) list([100, 101]) list([158, 159, 160, 161])
 list([184, 185]) list([202, 203]) list([216, 217]) list([211, 212])]
dictionary size:  131


In [9]:
max_seq_len  # maximum sequence length reported by Helper.generate_sequence

11

In [10]:
len(valid_visits)  # number of visits kept after filtering

1606

In [11]:
# presumably position = new user id, value = original User_id -- confirm against Helper.generate_sequence
user_reIndex_mapping

array([   617,    730,    737,   1112,   2614,   3276,   3387,   4171,
         4270,   5112,   7338,   7636,   8130,   8199,   8664,   8826,
         9030,  10405,  10825,  12962,  13021,  13222,  13543,  13683,
        14070,  15243,  16408,  17298,  18158,  18443,  24083,  24740,
        26071,  26285,  26457,  27740,  27748,  28688,  28698,  28863,
        35778,  37782,  38545,  40245,  40927,  41485,  43919,  44624,
        46465,  46901,  47373,  50816,  50880,  51047,  51190,  51769,
        53862,  55353,  55596,  56450,  59334,  61460,  63459,  63988,
        66874,  67823,  69479,  70502,  70832,  72268,  74760,  80341,
        83543,  84975,  85615,  86543,  87742,  89173,  91174,  92758,
        93492,  95685,  97327,  99033,  99718, 100128, 100489, 100649,
       102816, 106173, 109188, 113244, 113864, 122982, 124042, 125054,
       128376, 133043, 134206, 135792, 135945, 136377, 139990, 142717,
       143814, 153585, 156570, 159219, 165372, 166260, 167750, 172283,
      

In [12]:
# augment sequences (optional)

if augment_sample:
    # aug_sequence returns the augmented visit sequences together with
    # ground_truth_dict, which holds the target visits for every sequence.
    visit_sequences, ground_truth_dict = Helper.aug_sequence(visit_sequences, min_len=3)
else:
    # ground_truth_dict is produced only by Helper.aug_sequence, yet the
    # ground-truth cells further down use it unconditionally. Fail here with a
    # clear message rather than with a NameError several cells later.
    raise NotImplementedError(
        'augment_sample=False is not supported: ground_truth_dict is only '
        'produced by Helper.aug_sequence and is required by later steps'
    )

In [13]:
# inspect the visit sequences after the augmentation step
Helper.peep_dictionary(visit_sequences)

617  :
 [list([118, 119]) list([129, 130, 131]) list([140, 141, 142])
 list([140, 141, 142, 143]) list([176, 177]) list([192, 193])
 list([153, 154]) list([111, 112]) list([88, 89]) list([100, 101])
 list([158, 159, 160]) list([158, 159, 160, 161]) list([184, 185])
 list([202, 203]) list([216, 217]) list([211, 212])]
dictionary size:  131


In [14]:
# inspect the ground-truth visits that go with each sequence
Helper.peep_dictionary(ground_truth_dict)

617  :
 [list([119]) list([131]) list([142, 143]) list([143]) list([177])
 list([193]) list([154]) list([112]) list([89]) list([101])
 list([160, 161]) list([161]) list([185]) list([203]) list([217])
 list([212])]
dictionary size:  131


In [15]:
# pad sequences (optional)

if pad_data:
    # max_seq_len is the value returned by Helper.generate_sequence
    visit_sequences = Helper.pad_sequence(visit_sequences, max_seq_len)

In [16]:
# inspect the visit sequences after the (optional) padding step
Helper.peep_dictionary(visit_sequences)

617  :
 [list([118, 119]) list([129, 130, 131]) list([140, 141, 142])
 list([140, 141, 142, 143]) list([176, 177]) list([192, 193])
 list([153, 154]) list([111, 112]) list([88, 89]) list([100, 101])
 list([158, 159, 160]) list([158, 159, 160, 161]) list([184, 185])
 list([202, 203]) list([216, 217]) list([211, 212])]
dictionary size:  131


## 3. Prepare Input Sequences
Five input sequences, each parallel to the Visit Sequence, are prepared:
1. POI sequence
2. Distance sequence
3. Time sequence
4. Type sequence
5. Category sequence

In [17]:
# generate POI sequence
# Returns the POI sequences (one list of sequences per user) and the POI
# re-index mapping, which is reused by later steps.

POI_sequences, POI_reIndex_mapping = Helper.generate_POI_sequences(data, visit_sequences)

In [18]:
POI_sequences[0] # POI_sequence for first user

[[543, 21],
 [23, 566, 17],
 [18, 566, 17],
 [18, 566, 17, 27],
 [30, 25],
 [19, 18],
 [30, 18],
 [31, 542],
 [28, 18],
 [23, 18],
 [26, 29, 18],
 [26, 29, 18, 20],
 [543, 18],
 [559, 18],
 [15, 16],
 [18, 542]]

In [19]:
# presumably position = new POI id, value = original POI_id -- confirm against Helper.generate_POI_sequences
POI_reIndex_mapping

array([   1,    4,    5,   13,   17,   19,   21,   25,   26,   31,   40,
         47,   48,   54,   55,   56,   61,   62,   64,   72,   74,   76,
         80,   81,   93,   94,   96,   98,  100,  102,  105,  108,  125,
        127,  128,  129,  131,  134,  135,  137,  138,  139,  145,  147,
        150,  151,  152,  153,  154,  155,  156,  158,  159,  160,  161,
        163,  166,  171,  172,  183,  186,  192,  196,  197,  198,  199,
        201,  205,  206,  208,  212,  215,  218,  219,  224,  225,  229,
        231,  235,  238,  240,  241,  242,  243,  245,  248,  250,  251,
        252,  266,  267,  271,  274,  283,  288,  291,  295,  299,  300,
        302,  308,  311,  317,  319,  327,  328,  329,  330,  341,  345,
        346,  359,  376,  384,  392,  396,  397,  408,  412,  414,  418,
        419,  420,  423,  424,  425,  427,  429,  430,  431,  432,  434,
        439,  440,  441,  442,  446,  450,  451,  462,  464,  467,  469,
        472,  475,  476,  478,  483,  485,  486,  4

In [20]:
# generate distance sequence
# Also returns max_dist, which is saved later for the distance embedding.

dist_sequences, max_dist = Helper.generate_dist_sequences(data, visit_sequences)

In [21]:
dist_sequences[0] # dist_sequence for first user # can perform analysis

[[0, 5],
 [0, 1, 1],
 [0, 2, 1],
 [0, 2, 1, 5],
 [0, 5],
 [0, 2],
 [0, 1],
 [0, 7],
 [0, 7],
 [0, 1],
 [0, 5, 2],
 [0, 5, 2, 1],
 [0, 6],
 [0, 8],
 [0, 4],
 [0, 4]]

In [22]:
max_dist # maximum distance between two consecutive visits 

31

In [23]:
# generate time sequence
# NOTE(review): the sample values (0-21) look like hour-of-day buckets -- confirm in Helper.

time_sequences = Helper.generate_time_sequences(data, visit_sequences)

In [24]:
time_sequences[0] # time_sequence for first user

[[18, 19],
 [16, 17, 17],
 [14, 14, 14],
 [14, 14, 14, 14],
 [17, 18],
 [17, 17],
 [17, 17],
 [0, 18],
 [14, 15],
 [17, 18],
 [1, 17, 18],
 [1, 17, 18, 18],
 [20, 21],
 [0, 20],
 [19, 19],
 [18, 19]]

In [25]:
# generate Type sequence
# NOTE(review): values are 0/1 flags; presumably 1 marks a collective POI -- confirm in Helper.

type_sequences = Helper.generate_type_sequence(data, visit_sequences)

In [26]:
type_sequences[0] # type_sequence for first user

[[1, 0],
 [0, 1, 0],
 [0, 1, 0],
 [0, 1, 0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 1],
 [0, 0],
 [0, 0],
 [0, 0, 0],
 [0, 0, 0, 0],
 [1, 0],
 [1, 0],
 [0, 0],
 [0, 1]]

In [27]:
# generate category sequence
# Returns the category sequences and the category re-index mapping, which is
# reused by later steps.

cat_sequences, cat_reIndex_mapping = Helper.generate_cat_sequences(data, visit_sequences)

In [28]:
cat_sequences[0] # cat_sequence for first user

[[27, 24],
 [38, 44, 36],
 [30, 44, 36],
 [30, 44, 36, 46],
 [31, 34],
 [32, 30],
 [31, 30],
 [40, 36],
 [13, 30],
 [38, 30],
 [45, 34, 30],
 [45, 34, 30, 24],
 [27, 30],
 [50, 30],
 [44, 30],
 [30, 36]]

In [29]:
# presumably position = new category id, value = original category id -- confirm against Helper.generate_cat_sequences
cat_reIndex_mapping 

array([  1,   2,   3,   4,   5,   6,   8,   9,  10,  12,  13,  14,  15,
        16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,
        29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,
        42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,
        55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  68,  69,
        70,  71,  72,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,
        84,  85,  86,  87,  88,  89,  90,  92,  93,  94,  95,  98,  99,
       100, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 114, 116,
       117, 118, 120, 121, 122, 123, 124, 125, 126, 127, 128, 130, 131,
       132, 133, 134, 135, 136, 137, 138, 140, 142, 143, 144, 146, 147,
       148, 150, 151, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163,
       164, 166, 167, 168, 170, 174, 175, 176, 178, 179, 182, 183, 184,
       185, 188, 190, 193, 194, 197, 198, 200, 212, 218, 219, 230, 232],
      dtype=int64)

In [30]:
# generate ground truth for each sequence
# Built from ground_truth_dict (produced by the augmentation step) and the POI
# re-index mapping; presumably the result is expressed in new POI ids -- confirm in Helper.

ground_truth_sequences = Helper.generate_ground_truth_sequences(data, ground_truth_dict, POI_reIndex_mapping)

In [31]:
ground_truth_sequences[0]  # ground truth for each sequence of the first user

[[21],
 [17],
 [17, 27],
 [27],
 [25],
 [18],
 [18],
 [542],
 [18],
 [18],
 [18, 20],
 [20],
 [18],
 [18],
 [16],
 [542]]

In [32]:
# generate specific poi ground truth for each sequence
# No re-index mapping is passed here, so the ids are presumably original
# (not re-indexed) ones -- confirm in Helper.

specific_poi_sequences = Helper.generate_specific_poi_sequences(data, ground_truth_dict)

In [33]:
specific_poi_sequences[0]  # specific-POI ground truth for each sequence of the first user

[[76],
 [62],
 [62, 98],
 [98],
 [94],
 [64],
 [64],
 [66],
 [64],
 [64],
 [64, 74],
 [74],
 [64],
 [64],
 [61],
 [66]]

## 4. Extra Data Preparation

### Collective POI's category distribution

For each collective POI, count the number of stores that belong to each of its categories.
The distribution is recorded in a 2-layer dictionary of form:

{ POI_id (new id) : { category_id (new id): store count (int)} }

In [34]:
# generate collective POI's category distribution
# Result has the form {POI_id (new id): {category_id (new id): store count}}.

poi_cat_distrib = Helper.generate_cat_distrib(data, valid_visits, POI_reIndex_mapping, cat_reIndex_mapping)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  collective_POI_visit['L2_id'] = collective_POI_visit['L2_id'].apply(lambda x: _old_id_to_new(cat_reIndex_mapping, x))


In [35]:
# show one entry of the category distribution together with the dictionary size
Helper.peep_dictionary(poi_cat_distrib)

542  :
 Counter({36: 2, 37: 1})
dictionary size:  68


In [36]:
# display the full category distribution dictionary
poi_cat_distrib

{542: Counter({36: 2, 37: 1}),
 543: Counter({27: 2, 45: 1}),
 566: Counter({44: 2, 33: 2, 93: 1, 24: 1, 18: 2}),
 559: Counter({50: 1, 34: 2, 32: 5, 62: 2, 37: 1}),
 573: Counter({54: 1, 42: 2, 17: 1, 8: 1, 34: 1, 37: 1, 56: 1}),
 526: Counter({56: 1, 27: 6, 113: 1, 44: 2, 17: 2}),
 506: Counter({15: 25, 149: 2}),
 525: Counter({57: 11, 59: 1, 161: 1}),
 533: Counter({0: 1}),
 554: Counter({27: 3,
          44: 2,
          113: 1,
          20: 2,
          42: 1,
          80: 1,
          41: 1,
          32: 1,
          137: 1}),
 515: Counter({63: 9}),
 568: Counter({49: 3, 34: 1, 42: 1, 74: 3, 99: 2, 45: 3, 27: 1}),
 564: Counter({13: 1, 49: 1}),
 527: Counter({32: 1, 76: 5, 72: 2}),
 561: Counter({74: 2, 151: 1, 148: 2, 47: 5, 59: 2, 17: 1}),
 512: Counter({74: 1, 47: 3}),
 539: Counter({81: 2, 83: 1, 100: 1, 45: 1, 34: 1}),
 562: Counter({47: 2, 51: 2, 74: 1, 92: 3}),
 569: Counter({74: 1, 34: 1, 87: 3, 53: 1, 43: 1, 35: 1}),
 570: Counter({12: 4,
          34: 8,
          8

In [37]:
# keep only the rows whose index label is one of the valid visits
valid_visit_data = data.loc[data.index.isin(valid_visits)]

### Negative Samples for Each Sequence

For each sequence of each user, generate `neg_sample_num` negative POIs

Negative POIs satisfy the following criteria:

1. The POI does not appear in the true sequence 

2. The distance between:
    *a) negative POI and true destination* and 
    *b) true second last POI and true destination*
   should be as close as possible
   
The output neg_sequences should be a 3d array of shape [user, seq, neg_sample]

In [40]:
# store distance between each valid POI (time consuming)
# NOTE(review): the displayed result only has non-zero entries below the
# diagonal, so it looks like each pair is stored once -- confirm in Helper
# before indexing dist_mat[i, j] with i < j.

dist_mat = Helper.generate_POI_dist_mat(data, POI_reIndex_mapping)



In [41]:
# display the pairwise POI distance matrix
dist_mat

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 1.20748832,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 9.36021367, 10.56223784,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [16.63790537, 15.65527477, 24.54961891, ...,  0.        ,
         0.        ,  0.        ],
       [15.56464368, 14.99844888, 21.19168798, ...,  7.25114969,
         0.        ,  0.        ],
       [ 8.45148939,  9.63998342,  1.20321786, ..., 23.37682314,
        19.99099261,  0.        ]])

In [45]:
# generate negative samples 
# Produces neg_sample_num negatives per sequence, organised as [user, seq, neg_sample].
# NOTE(review): each negative is shown as a 3-element list; the meaning of the
# three fields is defined in Helper.generate_neg_sequences -- confirm there.

neg_sequences = Helper.generate_neg_sequences(POI_sequences, dist_mat, neg_sample_num, data, POI_reIndex_mapping, cat_reIndex_mapping)

In [46]:
neg_sequences[0] # negative samples for each sequence of 1st user

[[[286, 31, 0], [390, 94, 0], [389, 129, 0], [388, 52, 0], [387, 123, 0]],
 [[286, 31, 0], [390, 94, 0], [389, 129, 0], [388, 52, 0], [387, 123, 0]],
 [[286, 31, 0], [390, 94, 0], [389, 129, 0], [388, 52, 0], [387, 123, 0]],
 [[286, 31, 0], [390, 94, 0], [389, 129, 0], [388, 52, 0], [387, 123, 0]],
 [[286, 31, 0], [391, 115, 0], [390, 94, 0], [389, 129, 0], [388, 52, 0]],
 [[286, 31, 0], [391, 115, 0], [390, 94, 0], [389, 129, 0], [388, 52, 0]],
 [[286, 31, 0], [391, 115, 0], [390, 94, 0], [389, 129, 0], [388, 52, 0]],
 [[573, -1, 1], [558, -1, 1], [557, -1, 1], [555, -1, 1], [554, -1, 1]],
 [[286, 31, 0], [391, 115, 0], [390, 94, 0], [389, 129, 0], [388, 52, 0]],
 [[286, 31, 0], [391, 115, 0], [390, 94, 0], [389, 129, 0], [388, 52, 0]],
 [[286, 31, 0], [391, 115, 0], [390, 94, 0], [389, 129, 0], [388, 52, 0]],
 [[286, 31, 0], [391, 115, 0], [390, 94, 0], [389, 129, 0], [388, 52, 0]],
 [[286, 31, 0], [390, 94, 0], [389, 129, 0], [388, 52, 0], [387, 123, 0]],
 [[286, 31, 0], [390, 94, 0

In [47]:
# display the category re-index mapping again for reference
cat_reIndex_mapping

array([  1,   2,   3,   4,   5,   6,   8,   9,  10,  12,  13,  14,  15,
        16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,
        29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,
        42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,
        55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  68,  69,
        70,  71,  72,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,
        84,  85,  86,  87,  88,  89,  90,  92,  93,  94,  95,  98,  99,
       100, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 114, 116,
       117, 118, 120, 121, 122, 123, 124, 125, 126, 127, 128, 130, 131,
       132, 133, 134, 135, 136, 137, 138, 140, 142, 143, 144, 146, 147,
       148, 150, 151, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163,
       164, 166, 167, 168, 170, 174, 175, 176, 178, 179, 182, 183, 184,
       185, 188, 190, 193, 194, 197, 198, 200, 212, 218, 219, 230, 232],
      dtype=int64)

In [67]:
# generate poi_cat_specific_poi_dict mapping
# For every (POI_id, L2_id) pair, collect the unique Location_id values as a
# plain list. The result is a Series indexed by (POI_id, L2_id).

grouped = data.groupby(['POI_id', 'L2_id'])['Location_id'].unique().apply(list)

# display the grouped Series
grouped

POI_id  L2_id
0       0                                       [0]
1       4                                       [1]
2       1                                       [2]
3       173                                     [3]
4       3                                       [4]
5       2                                       [5]
6       17                                      [6]
7       11                                      [7]
10      7                                      [10]
11      142                                    [11]
13      5                                      [13]
16      13                                     [16]
17      12                                     [17]
19      15                                     [19]
21      14                                     [21]
22      10                                     [22]
23      18                                     [23]
24      16                                     [24]
25      19                                     [25

In [74]:
# generate poi_cat_specific_poi_dict
#
# Two-level lookup of the form {POI_id: {L2_id: [Location_id, ...]}}.
# `grouped` is indexed by (POI_id, L2_id) pairs and each value is the list of
# unique Location_ids for that pair, so every entry can be filed directly under
# its POI. setdefault creates the inner dict the first time a POI is seen, so
# there is no need to track the previous POI or to rely on rows of the same POI
# being adjacent, and an empty `grouped` yields an empty dict instead of
# failing on grouped.index[0].

poi_cat_specific_poi_dict = {}

for (poi_id, cat_id), location_ids in grouped.items():
    poi_cat_specific_poi_dict.setdefault(poi_id, {})[cat_id] = location_ids

(317, 25)


In [75]:
poi_cat_specific_poi_dict[317]  # spot-check the entry for POI_id 317

{25: [317]}

## 5. Form Sample Sets

Concatenate the prepared sequences to form a sample, which is a tuple consisting of: (POI_seq, dist_seq, time_seq, type_seq, cat_seq, ground_truth_seq, specific_poi_seq, neg_sample)

Organise samples according to users in a dictionary of form:

{ User_id (new id) : sample sets } 

In [52]:
# form sample set for each user
# Combines the eight parallel per-user sequence collections into one
# dictionary keyed by user.

sample_sets = Helper.form_sample_sets(POI_sequences, dist_sequences, time_sequences, type_sequences, cat_sequences, ground_truth_sequences, specific_poi_sequences, neg_sequences)

Total user: 131 -- Total sample: 753


In [53]:
# show one user's samples together with the dictionary size
Helper.peep_dictionary(sample_sets)

0  :
 [([543, 21], [0, 5], [18, 19], [1, 0], [27, 24], [21], [76], [[286, 31, 0], [390, 94, 0], [389, 129, 0], [388, 52, 0], [387, 123, 0]]), ([23, 566, 17], [0, 1, 1], [16, 17, 17], [0, 1, 0], [38, 44, 36], [17], [62], [[286, 31, 0], [390, 94, 0], [389, 129, 0], [388, 52, 0], [387, 123, 0]]), ([18, 566, 17], [0, 2, 1], [14, 14, 14], [0, 1, 0], [30, 44, 36], [17, 27], [62, 98], [[286, 31, 0], [390, 94, 0], [389, 129, 0], [388, 52, 0], [387, 123, 0]]), ([18, 566, 17, 27], [0, 2, 1, 5], [14, 14, 14, 14], [0, 1, 0, 0], [30, 44, 36, 46], [27], [98], [[286, 31, 0], [390, 94, 0], [389, 129, 0], [388, 52, 0], [387, 123, 0]]), ([30, 25], [0, 5], [17, 18], [0, 0], [31, 34], [25], [94], [[286, 31, 0], [391, 115, 0], [390, 94, 0], [389, 129, 0], [388, 52, 0]]), ([19, 18], [0, 2], [17, 17], [0, 0], [32, 30], [18], [64], [[286, 31, 0], [391, 115, 0], [390, 94, 0], [389, 129, 0], [388, 52, 0]]), ([30, 18], [0, 1], [17, 17], [0, 0], [31, 30], [18], [64], [[286, 31, 0], [391, 115, 0], [390, 94, 0], [3

In [54]:
# display the full sample-set dictionary
sample_sets

{0: [([543, 21],
   [0, 5],
   [18, 19],
   [1, 0],
   [27, 24],
   [21],
   [76],
   [[286, 31, 0], [390, 94, 0], [389, 129, 0], [388, 52, 0], [387, 123, 0]]),
  ([23, 566, 17],
   [0, 1, 1],
   [16, 17, 17],
   [0, 1, 0],
   [38, 44, 36],
   [17],
   [62],
   [[286, 31, 0], [390, 94, 0], [389, 129, 0], [388, 52, 0], [387, 123, 0]]),
  ([18, 566, 17],
   [0, 2, 1],
   [14, 14, 14],
   [0, 1, 0],
   [30, 44, 36],
   [17, 27],
   [62, 98],
   [[286, 31, 0], [390, 94, 0], [389, 129, 0], [388, 52, 0], [387, 123, 0]]),
  ([18, 566, 17, 27],
   [0, 2, 1, 5],
   [14, 14, 14, 14],
   [0, 1, 0, 0],
   [30, 44, 36, 46],
   [27],
   [98],
   [[286, 31, 0], [390, 94, 0], [389, 129, 0], [388, 52, 0], [387, 123, 0]]),
  ([30, 25],
   [0, 5],
   [17, 18],
   [0, 0],
   [31, 34],
   [25],
   [94],
   [[286, 31, 0], [391, 115, 0], [390, 94, 0], [389, 129, 0], [388, 52, 0]]),
  ([19, 18],
   [0, 2],
   [17, 17],
   [0, 0],
   [32, 30],
   [18],
   [64],
   [[286, 31, 0], [391, 115, 0], [390, 94, 0], [3

## 6. Output Files

In [55]:
# set output directory

# Small-sample (test) runs get their own folder, separate from the one used by
# full runs.
dir = './test_np_save_CHA/' if small_sample else './np_save_CHA/'

In [56]:
# create directory if not exists

# exist_ok=True makes this a single call with no gap between checking for the
# directory and creating it; an already existing directory is left untouched.
os.makedirs(dir, exist_ok=True)

In [57]:
# save concatenated samples
# The per-user sample dictionary is written to <dir>/sample_sets.pkl.

Helper.save_dict(sample_sets, dir + 'sample_sets.pkl')

In [58]:
# save id mappings
# One .npy file per re-index mapping (POI, user, category).

np.save(dir + 'POI_reIndex_mapping.npy', POI_reIndex_mapping)
np.save(dir + 'user_reIndex_mapping.npy', user_reIndex_mapping)
np.save(dir + 'cat_reIndex_mapping.npy', cat_reIndex_mapping)

In [59]:
# save collective POI's category distribution dictionary
# Also saves the {POI_id: {L2_id: [Location_id, ...]}} lookup.

Helper.save_dict(poi_cat_distrib, dir + 'poi_cat_distrib.pkl')
Helper.save_dict(poi_cat_specific_poi_dict, dir + 'poi_cat_specific_poi_dict.pkl')

In [60]:
# save POI distance matrix 
# Written to <dir>/dist_mat.npy.

np.save(dir + 'dist_mat.npy', dist_mat)

In [61]:
# save other relevant parameters
# Each scalar is stored in its own .npy file.

np.save(dir + 'max_dist.npy', max_dist) # max distance (for distance embedding)
np.save(dir + 'max_seq_len.npy', max_seq_len) # max sequence length (for input size)
np.save(dir + 'neg_sample_num.npy', neg_sample_num) # number of negative samples (for negative input size)