# Document Overview
**Purpose:**
1. Generate training and testing sequences
2. Generate negative samples for each sequence
3. Generate category distribution matrix for each collective POI
4. Generate index map for reindexed POIs and Users

**Input file:** 
1. Original Data: 'data_PHO.csv'
2. Helper Function Library: 'Helper_Functions.py'

**Output file:** 
1. Sample sets consisting of POI, distance, time, type, category, and negative sequences
2. POI, user, category id mapping from old to new
3. Collective POI's category distribution dictionary
4. POI distance matrix 
5. Other parameters: POI max_distance and max_sequence_length

**Creation Date:** 4th Nov 2019

In [1]:
# dependencies
import numpy as np
import pandas as pd
import os
# import argparse

In [2]:
# import helper functions

import Helper_Functions as Helper

### Adjustable parameters:

1. **small_sample** *(boolean)*: Whether to use a small sample (the first 20,000 rows of the data) for testing
2. **augment_sample** *(boolean)*: Whether to perform sample augmentation
3. **pad_data** *(boolean)*: Whether to perform padding on data sequence

4. **min_seq_len** *(int)*: Minimum No. POIs for a valid sequence
5. **min_seq_num** *(int)*: Minimum No. valid sequences for a valid user
6. **neg_sample_num** *(int)*: Number of negative samples for each POI

In [3]:
# setup parameters (for ipython execution)

small_sample = False  # True: keep only the first 20000 rows of the CSV and write to the test output directory
augment_sample = True  # True: run Helper.aug_sequence, which also produces ground_truth_dict
pad_data = False  # True: run Helper.pad_sequence with max_seq_len

min_seq_len = 2  # passed to Helper.generate_sequence
min_seq_num = 2  # passed to Helper.generate_sequence
neg_sample_num = 5  # passed to Helper.generate_neg_sequences and saved as neg_sample_num.npy

## 1. Import data

In [4]:
# Read the whole check-in table once; for quick test runs keep only the
# leading 20000 rows.
data = pd.read_csv('./data_PHO.csv')
if small_sample:
    data = data[:20000]

In [5]:
data.head()

Unnamed: 0.1,Unnamed: 0,Unnamed,Location_id,POI_id,POI_Type,Org_id,User_id,TimeStamp,Zone,Latitude,...,Unnamed: 0_y,POI_id_y,POI_Type_y,Latitude_y,Longitude_y,Category_2_y,Org_id_y,yelp_ID_y,name_y,stars_y
0,0,0,557,557,Independent,4b196b3af964a52000dd23e3,92745,Tue Apr 03 18:09:08 +0000 2012,-420,33.479985,...,0,557,Independent,33.479985,-112.077729,Sandwich Place,4b196b3af964a52000dd23e3,yUh85ZlAq_zZ9-rRlvCgcA,Sacks Sandwiches - Phoenix,4.0
1,1,1,3321,3321,Independent,4a901a25f964a520021620e3,65794,Tue Apr 03 18:27:40 +0000 2012,-420,33.61045,...,1,3321,Independent,33.61045,-112.147466,Coffee Shop,4a901a25f964a520021620e3,KY4Mg8wSgDg4UpHRB_7JFg,Starbucks,3.0
2,2,2,3321,3321,Independent,4a901a25f964a520021620e3,18884,Tue Apr 03 20:26:50 +0000 2012,-420,33.61045,...,1,3321,Independent,33.61045,-112.147466,Coffee Shop,4a901a25f964a520021620e3,KY4Mg8wSgDg4UpHRB_7JFg,Starbucks,3.0
3,3,3,3321,3321,Independent,4a901a25f964a520021620e3,65794,Wed Apr 04 17:24:19 +0000 2012,-420,33.61045,...,1,3321,Independent,33.61045,-112.147466,Coffee Shop,4a901a25f964a520021620e3,KY4Mg8wSgDg4UpHRB_7JFg,Starbucks,3.0
4,4,4,3321,3321,Independent,4a901a25f964a520021620e3,18884,Sat Apr 07 19:47:10 +0000 2012,-420,33.61045,...,1,3321,Independent,33.61045,-112.147466,Coffee Shop,4a901a25f964a520021620e3,KY4Mg8wSgDg4UpHRB_7JFg,Starbucks,3.0


In [6]:
data.columns

Index(['Unnamed: 0', 'Unnamed', 'Location_id', 'POI_id', 'POI_Type', 'Org_id',
       'User_id', 'TimeStamp', 'Zone', 'Latitude', 'Longitude', 'Category_2',
       'yelp_ID', 'name', 'stars', 'Time', 'date', 'Local_sg_time', 'L2_id',
       'Unnamed: 0_y', 'POI_id_y', 'POI_Type_y', 'Latitude_y', 'Longitude_y',
       'Category_2_y', 'Org_id_y', 'yelp_ID_y', 'name_y', 'stars_y'],
      dtype='object')

## 2. Generate Visit Sequence 
Generate valid index sequences for each valid user

In [7]:
# Build the visit sequences from the raw data; min_seq_len and min_seq_num
# are the validity thresholds described in the parameter section.
(visit_sequences,
 max_seq_len,
 valid_visits,
 user_reIndex_mapping) = Helper.generate_sequence(data, min_seq_len, min_seq_num)

# stop right away if the filtering left nothing to work with
assert visit_sequences, 'no qualified sequence after filtering!'

[                                                                        ]   0%

   Unnamed: 0  Unnamed  Location_id  POI_id     POI_Type  \
0           0        0          557     557  Independent   

                     Org_id  User_id                       TimeStamp  Zone  \
0  4b196b3af964a52000dd23e3    92745  Tue Apr 03 18:09:08 +0000 2012  -420   

    Latitude  ...  Unnamed: 0_y POI_id_y   POI_Type_y Latitude_y  Longitude_y  \
0  33.479985  ...             0      557  Independent  33.479985  -112.077729   

     Category_2_y                  Org_id_y               yelp_ID_y  \
0  Sandwich Place  4b196b3af964a52000dd23e3  yUh85ZlAq_zZ9-rRlvCgcA   

                       name_y  stars_y  
0  Sacks Sandwiches - Phoenix      4.0  

[1 rows x 29 columns]
     Unnamed: 0  Unnamed  Location_id  POI_id     POI_Type  \
129         129      129          417     417  Independent   
202         202      202          561     561  Independent   

                       Org_id  User_id                       TimeStamp  Zone  \
129  4aaebc5ff964a520146320e3    92745  Wed 

KeyError: 201

In [None]:
Helper.peep_dictionary(visit_sequences)

In [None]:
max_seq_len 

In [None]:
len(valid_visits)

In [None]:
user_reIndex_mapping

In [None]:
# augment sequences (optional)
# NOTE(review): ground_truth_dict is only assigned inside this branch, while later
# cells pass it to Helper.peep_dictionary, Helper.generate_ground_truth_sequences and
# Helper.generate_specific_poi_sequences unconditionally. With augment_sample = False
# those cells would raise NameError -- confirm whether augmentation is really optional.

if augment_sample:
#     visit_sequences = Helper.aug_sequence(visit_sequences, min_len=3)
    # aug_sequence replaces visit_sequences and additionally returns ground_truth_dict
    visit_sequences, ground_truth_dict = Helper.aug_sequence(visit_sequences, min_len=3)

In [None]:
Helper.peep_dictionary(visit_sequences)

In [None]:
Helper.peep_dictionary(ground_truth_dict)

In [None]:
# pad sequences (optional)
# Disabled by default (pad_data = False); when enabled, visit_sequences is
# replaced by the output of Helper.pad_sequence, called with max_seq_len.

if pad_data:
    
    visit_sequences = Helper.pad_sequence(visit_sequences, max_seq_len)

In [None]:
Helper.peep_dictionary(visit_sequences)

## 3. Prepare Input Sequences
Five input sequences, parallel to the Visit Sequence, are prepared:
1. POI sequence
2. Distance sequence
3. Time sequence
4. Type sequence
5. Category sequence

In [None]:
# generate POI sequence

POI_sequences, POI_reIndex_mapping = Helper.generate_POI_sequences(data, visit_sequences)

In [None]:
POI_sequences[0] # POI_sequence for first user

In [None]:
POI_reIndex_mapping

In [None]:
# generate distance sequence

dist_sequences, max_dist = Helper.generate_dist_sequences(data, visit_sequences)

In [None]:
dist_sequences[0] # dist_sequence for first user # can perform analysis

In [None]:
max_dist # maximum distance between two consecutive visits 

In [None]:
# generate time sequence

time_sequences = Helper.generate_time_sequences(data, visit_sequences)

In [None]:
time_sequences[0] # time_sequence for first user

In [None]:
# generate Type sequence

type_sequences = Helper.generate_type_sequence(data, visit_sequences)

In [None]:
type_sequences[0] # type_sequence for first user

In [None]:
# generate category sequence

cat_sequences, cat_reIndex_mapping = Helper.generate_cat_sequences(data, visit_sequences)

In [None]:
cat_sequences[0] # cat_sequence for first user

In [None]:
cat_reIndex_mapping 

In [None]:
# generate ground truth for each sequence

ground_truth_sequences = Helper.generate_ground_truth_sequences(data, ground_truth_dict, POI_reIndex_mapping)

In [None]:
ground_truth_sequences[0]

In [None]:
# generate specific poi ground truth for each sequence

specific_poi_sequences = Helper.generate_specific_poi_sequences(data, ground_truth_dict)

In [None]:
specific_poi_sequences[0]

## 4. Extra Data Preparation

### Collective POI's category distribution

For each collective POI, count the number of stores belonging to each of its categories.
The distribution is recorded in a 2-layer dictionary of form:

{ POI_id (new id) : { category_id (new id): store count (int)} }

In [None]:
# generate collective POI's category distribution

poi_cat_distrib = Helper.generate_cat_distrib(data, valid_visits, POI_reIndex_mapping, cat_reIndex_mapping)

In [None]:
Helper.peep_dictionary(poi_cat_distrib)

In [None]:
poi_cat_distrib

In [None]:
# keep only the rows of data whose index label appears in valid_visits
# NOTE(review): valid_visit_data is not referenced again in the visible cells -- confirm it is still needed
valid_visit_data = data[data.index.isin(valid_visits)]

### Negative Samples for Each Sequence

For each sequence of each user, generate `neg_sample_num` negative POIs

Negative POIs satisfy the following criteria:

1. The POI does not appear in the true sequence 

2. The distance between:
    *a) negative POI and true destination* and 
    *b) true second last POI and true destination*
   should be as close as possible
   
The output neg_sequences should be a 3d array of shape [user, seq, neg_sample]

In [None]:
# store distance between each valid POI (time consuming)
    
dist_mat = Helper.generate_POI_dist_mat(data, POI_reIndex_mapping)

In [None]:
dist_mat

In [None]:
# generate negative samples 

neg_sequences = Helper.generate_neg_sequences(POI_sequences, dist_mat, neg_sample_num, data, POI_reIndex_mapping, cat_reIndex_mapping)

In [None]:
neg_sequences[0] # negative samples for each sequence of 1st user

In [None]:
cat_reIndex_mapping

In [None]:
# group the rows by (POI_id, L2_id) and collect the unique Location_id values of
# each group as a list; this Series is the input for poi_cat_specific_poi_dict

grouped = data.groupby(['POI_id', 'L2_id'])['Location_id'].unique().apply(list)

grouped

In [None]:
# generate poi_cat_specific_poi_dict
#
# Nested mapping of the form { POI_id : { L2_id : [Location_id, ...] } }, built
# from the (POI_id, L2_id)-indexed Series `grouped`.
#
# setdefault creates the inner dictionary the first time a POI_id is seen, so the
# result does not depend on all rows of one POI being adjacent in the index, and
# an empty `grouped` simply yields an empty dictionary instead of an IndexError.

poi_cat_specific_poi_dict = {}

for (poi_id, cat_id), location_ids in grouped.items():
    poi_cat_specific_poi_dict.setdefault(poi_id, {})[cat_id] = location_ids

In [None]:
poi_cat_specific_poi_dict[317]

## 5. Form Sample Sets

Concatenate the prepared sequences to form a sample, which is a tuple consisting of: (POI_seq, dist_seq, time_seq, type_seq, cat_seq, neg_sample)

Organise samples according to users in a dictionary of form:

{ User_id (new id) : sample sets } 

In [None]:
# form sample set for each user

sample_sets = Helper.form_sample_sets(POI_sequences, dist_sequences, time_sequences, type_sequences, cat_sequences, ground_truth_sequences, specific_poi_sequences, neg_sequences)

In [None]:
Helper.peep_dictionary(sample_sets)

In [None]:
sample_sets

## 6. Output Files

In [None]:
# set output directory
# small-sample test runs get a separate folder from full runs
# NOTE: the name `dir` shadows the Python builtin; it is kept because the
# directory-creation and save cells refer to it

dir = './test_np_save_PHO/' if small_sample else './np_save_PHO/'

In [None]:
# create directory if not exists
# exist_ok=True makes the call a no-op when the folder is already there, which
# avoids the check-then-create race of a separate os.path.exists test

os.makedirs(dir, exist_ok=True)

In [None]:
# save concatenated samples

Helper.save_dict(sample_sets, dir + 'sample_sets.pkl')

In [None]:
# save id mappings
# NOTE(review): if these mappings are plain Python dicts (presumably -- verify in
# Helper_Functions), np.save stores them as pickled object arrays, so readers need
# np.load(path, allow_pickle=True).item() to get the dictionary back.

np.save(dir + 'POI_reIndex_mapping.npy', POI_reIndex_mapping)
np.save(dir + 'user_reIndex_mapping.npy', user_reIndex_mapping)
np.save(dir + 'cat_reIndex_mapping.npy', cat_reIndex_mapping)

In [None]:
# save collective POI's category distribution dictionary

Helper.save_dict(poi_cat_distrib, dir + 'poi_cat_distrib.pkl')
Helper.save_dict(poi_cat_specific_poi_dict, dir + 'poi_cat_specific_poi_dict.pkl')

In [None]:
# save POI distance matrix 

np.save(dir + 'dist_mat.npy', dist_mat)

In [None]:
# save other relevant parameters

np.save(dir + 'max_dist.npy', max_dist) # max distance (for distance embedding)
np.save(dir + 'max_seq_len.npy', max_seq_len) # max sequence length (for input size)
np.save(dir + 'neg_sample_num.npy', neg_sample_num) # number of negative samples (for negative input size)