# Purpose Inference Testing for POI enhancement 
<hr> 

This notebook tests the impact of enhanced POI data on trip purpose inference. The enhanced POI data and the original POI data are each used separately as input to the inference so that the two sets of results can be compared. 

The Bayesian purpose inference model proposed by <b> Gong et al. (2015) </b> and improved by <b> Dhananjaya et al. (2021) </b> is used for this testing. 

https://www.tandfonline.com/doi/abs/10.1080/15230406.2015.1014424 <br> 
https://ieeexplore.ieee.org/abstract/document/9655943

In [1]:
# Make the project folder the working directory so that the relative
# paths used later in the notebook resolve against it.
import os

project_dir = r'C:\\Users\\HP\\Desktop/Projects/Trip purpose/POI enhancement/Purpose Inference'
os.chdir(project_dir)
os.getcwd()

'C:\\Users\\HP\\Desktop\\Projects\\Trip purpose\\POI enhancement\\Purpose Inference'

## Package Import  

In [2]:
# general packages 
import numpy as np 
import pandas as pd 
import math 
from collections import Counter

In [3]:
# geo packages 
import geopandas as gpd 
from shapely.geometry import Point
import shapely.speedups 
shapely.speedups.enable()
from shapely.geometry import Polygon

In [4]:
# visualization packages 
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt

In [5]:
# ML packages
from sklearn.cluster import DBSCAN

In [6]:
# Silence pandas' chained-assignment check (its default setting is 'warn').
pd.set_option('mode.chained_assignment', None)

In [27]:
# IPython config magic: sets Completer.use_jedi to False, i.e. turns off
# jedi-based tab completion for this kernel session.
%config Completer.use_jedi = False

## Data Import 

### Places (POI) data

In [1]:
# Prepare the places (POI) dataframe before processing.
# The path is a raw string: the previous non-raw literal depended on invalid
# escape sequences such as '\D' and '\p' being passed through unchanged, which
# Python reports as a warning and intends to reject in the future.
places_path = r'D:\MSC\Data\places\Cleaned_Places\thi_jpura_dehi_cleaned_places_predicted_transformed.csv'
places = pd.read_csv(places_path, index_col = 0)

# Drop gas stations with a boolean mask (row-wise, so it does not depend on the
# index labels being unique) and renumber the remaining rows from 0.
places = places[places['type_1'] != 'gas_station'].reset_index(drop = True)

# Keep only the columns used downstream, including the category/purpose and
# the 'Type' flag that the raw-POI filter relies on.
places = places[['clean_name','lat','lng','type_1','no_of_ratings','avg_rating','purpose','Type']]

NameError: name 'pd' is not defined

In [8]:
# places gpd creation
# Build the point geometries in one vectorised call (x = longitude, y = latitude)
# instead of constructing one shapely Point per row in a Python loop.
geometry = gpd.points_from_xy(places.lng, places.lat)

# The CRS is supplied once at construction; re-assigning the same CRS
# afterwards was redundant.
placesgpd = gpd.GeoDataFrame(places, crs="EPSG:4326", geometry=geometry)

In [9]:
# (rows, columns) of the full POI GeoDataFrame, before any filtering on 'Type'
placesgpd.shape

(44316, 9)

#### Raw POI data 

Note that, because the enhancing methods impute new labels, the feature 'Type' records whether a POI label is an original one ('Labelled') or a predicted one ('predicted'). Hence, the raw data can be obtained by filtering as follows.

In [10]:
# keep only the POIs whose 'Type' value is 'Labelled'
raw_placesgpd = placesgpd.loc[placesgpd['Type'].eq('Labelled')]

In [11]:
# (rows, columns) of the subset kept by the Type == 'Labelled' filter
raw_placesgpd.shape

(25077, 9)

### Taxi trips 

In [12]:
# Load the taxi trip records.
# The path is a raw string: the previous non-raw literal depended on invalid
# escape sequences ('\D', '\p') being passed through unchanged.
# NOTE(review): parse_dates selects columns by position (11, 12, 13); presumably
# these are the three timestamp columns kept below - confirm against the CSV header.
taxitrips_path = r'D:\MSC\Data\pickme_data\nov\total\thi_jupra_dehi_data_nov_45.csv'
taxitrips = pd.read_csv(taxitrips_path, index_col = 0, parse_dates = [11,12,13])
taxitrips = taxitrips[['trip_id', 'passenger_id', 'pickup_lat', 'pickup_long', 'dropoff_lat', 'dropoff_long', 'actual_pickup_time','pickup_time','drop_time']]

# optimize the data types
# A single astype call returns a new frame, so nothing is written into a
# column slice of the frame that was read from disk.
taxitrips = taxitrips.astype({
    'trip_id': np.int32,
    'pickup_lat': np.float32,
    'pickup_long': np.float32,
    'dropoff_lat': np.float32,
    'dropoff_long': np.float32,
})

### Inputs for bayesian inference model 

In [28]:
# huanggrid - measures the temporal impact for the purposes
# Both paths are raw strings: the previous non-raw literals depended on invalid
# escape sequences ('\D', '\i', '\H', '\p', '\c') being passed through unchanged.
huanggrid_path = r'D:\MSC\Data\inputs\HuangGrid.csv'
huanggrid = pd.read_csv(huanggrid_path, index_col = 0, parse_dates = [0])
# strip spaces from the column names and from the values of the 'Day' column
huanggrid.columns = huanggrid.columns.str.replace(' ', '')
huanggrid['Day'] = huanggrid['Day'].str.replace(' ', '')

# time df - define the opening and closing times of POIs
time_df_path = r'D:\MSC\Data\places\categories\category_type_times.csv'
time_df = pd.read_csv(time_df_path, index_col=0)
# strip spaces from the column names
time_df.columns = time_df.columns.str.replace(' ', '')

## Inference Module import 

In [29]:
# import the required functions for inference from the custom modules 
from Functions import baysian_inference
from Functions import candidate_poi_selection
from Functions import linear_distance

## Processing 

In [15]:
# NOTE: plain assignment - this binds a second name to the same DataFrame object
# (no copy is made), so a column added through this name is also visible via `taxitrips`.
taxitrips_to_baysean_inference  = taxitrips

### Processing for Raw POIs

In [17]:
# Infer a trip purpose for every trip, using only the raw (labelled) POIs.
# Results are collected in a plain list and converted once after the loop:
# calling np.append inside the loop re-copies the whole array on every
# iteration, which makes the loop quadratic in the number of trips.
inferred_purposes = []

for row in taxitrips_to_baysean_inference.itertuples():

    # candidate poi selection around the drop-off location and drop time
    candidate_pois = candidate_poi_selection.candidate_poi_selection(raw_placesgpd, row.dropoff_lat, row.dropoff_long, row.drop_time, time_df, walking_radius = 100)

    # baysean inference
    # NOTE(review): presumably returns one purpose label per trip - confirm in Functions.baysian_inference
    Trip_purpose = baysian_inference.baysean_inference_ln(candidate_pois, row.dropoff_lat, row.dropoff_long, row.drop_time, huanggrid)

    # collect the result for this trip
    inferred_purposes.append(Trip_purpose)

# One np.append call, seeded with the same empty array as before, keeps the
# flattening and dtype promotion of the per-iteration version.
trip_purposes = np.append(np.array([]), inferred_purposes)


In [None]:
# store the inferred purposes as the 'Trip purpose' column;
# trip_purposes must hold exactly one entry per trip row for this assignment to succeed
taxitrips_to_baysean_inference['Trip purpose'] = trip_purposes

In [167]:
# save the trips with their inferred purposes as CSV
# (relative path, so the file is written to the current working directory)
taxitrips_to_baysean_inference.to_csv("Raw_POI_results.csv")

In [168]:
# save the same frame in feather format
# (relative path, so the file is written to the current working directory)
taxitrips_to_baysean_inference.to_feather("Raw_POI_results.feather")

### Processing for enhanced POIs.

In [17]:
# Infer a trip purpose for every trip, using the full (enhanced) POI set.
# Results are collected in a plain list and converted once after the loop:
# calling np.append inside the loop re-copies the whole array on every
# iteration, which makes the loop quadratic in the number of trips.
inferred_purposes = []

for row in taxitrips_to_baysean_inference.itertuples():

    # candidate poi selection around the drop-off location and drop time
    candidate_pois = candidate_poi_selection.candidate_poi_selection(placesgpd, row.dropoff_lat, row.dropoff_long, row.drop_time, time_df, walking_radius = 100)

    # baysean inference
    # NOTE(review): presumably returns one purpose label per trip - confirm in Functions.baysian_inference
    Trip_purpose = baysian_inference.baysean_inference_ln(candidate_pois, row.dropoff_lat, row.dropoff_long, row.drop_time, huanggrid)

    # collect the result for this trip
    inferred_purposes.append(Trip_purpose)

# One np.append call, seeded with the same empty array as before, keeps the
# flattening and dtype promotion of the per-iteration version.
trip_purposes = np.append(np.array([]), inferred_purposes)


In [None]:
# store the inferred purposes as the 'Trip purpose' column; if that column already
# exists on this frame it is overwritten.
# trip_purposes must hold exactly one entry per trip row for this assignment to succeed
taxitrips_to_baysean_inference['Trip purpose'] = trip_purposes

In [167]:
# save the trips with their inferred purposes as CSV
# (relative path, so the file is written to the current working directory)
taxitrips_to_baysean_inference.to_csv("Enhanced_POI_results.csv")

In [168]:
# save the same frame in feather format
# (relative path, so the file is written to the current working directory)
taxitrips_to_baysean_inference.to_feather("Enhanced_POI_results.feather")

## EDA

In [335]:
# Reload the two result sets saved by the inference runs.
# The enhanced run writes "Enhanced_POI_results.feather"; this cell previously read
# "Enhanced_results.feather", a file the notebook never creates, so a fresh
# run would fail here or silently pick up a stale file.
enhanced_data = pd.read_feather(r"C:\Users\HP\Desktop\Projects\Trip purpose\POI enhancement\Purpose Inference\Enhanced_POI_results.feather")
raw_data =  pd.read_feather(r"C:\Users\HP\Desktop\Projects\Trip purpose\POI enhancement\Purpose Inference\Raw_POI_results.feather")

In [188]:
# number of trips per inferred purpose (enhanced POI results), as a table
enhanced_data_value_counts = (
    enhanced_data['Trip purpose']
    .value_counts()
    .reset_index()
)
enhanced_data_value_counts

Unnamed: 0,index,Trip purpose
0,shopping,33087
1,personal,26971
2,dining,13285
3,medical,11665
4,education,7102
5,home,5571
6,transit,4595
7,recreational,3428
8,,1219
9,multiple,694


In [187]:
# number of trips per inferred purpose (raw POI results), as a table
raw_data_value_counts = (
    raw_data['Trip purpose']
    .value_counts()
    .reset_index()
)
raw_data_value_counts

Unnamed: 0,index,Trip purpose
0,personal,29674
1,shopping,28383
2,dining,17343
3,medical,11238
4,home,6902
5,education,6196
6,transit,4007
7,,2013
8,recreational,1493
9,multiple,368


In [336]:
# add the weekday name of each drop-off time to both result sets
for results_frame in (enhanced_data, raw_data):
    results_frame['Day'] = results_frame['drop_time'].dt.day_name()

In [340]:
# number of trips per inferred purpose in the raw POI results
raw_data['Trip purpose'].value_counts()

personal        26749
shopping        25463
dining          15615
NA              12553
medical         10129
home             6229
education        5586
transit          3636
recreational     1329
multiple          328
Name: Trip purpose, dtype: int64

### Results for enhanced POI data 

In [None]:
# stacked bar chart: trips per weekday, stacked by inferred purpose (enhanced POI results)

x_values = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']
width = 0.35       # the width of the bars: can also be len(x) sequence

fig, ax = plt.subplots()

# Purposes in order of first appearance. Missing values are dropped because
# groupby leaves them out, so they have no counts to plot.
purposes = enhanced_data['Trip purpose'].dropna().unique()

# purpose x weekday count table; reindexing guarantees every weekday column
# exists (filled with 0), so a purpose with no trips on some day no longer
# raises a KeyError.
day_counts = (
    enhanced_data.groupby(['Trip purpose', 'Day'])
    .size()
    .unstack(fill_value = 0)
    .reindex(index = purposes, columns = x_values, fill_value = 0)
)

# Per-purpose weekday counts. The previous locals().update(...) call, which
# injected one variable per purpose into the namespace, is removed: nothing
# read those variables.
y_values_dict = {purpose: day_counts.loc[purpose].tolist() for purpose in purposes}

# running height of each weekday's stack
bottom_raw = np.zeros(len(x_values), dtype = int)

for key, y_values in y_values_dict.items():

    ax.bar(x_values, y_values, width, label = key, bottom = bottom_raw)

    bottom_raw = bottom_raw + np.array(y_values)

ax.set_ylabel('Trip Count')
ax.legend(loc=0, bbox_to_anchor=(1, 1))
# rotate the existing weekday tick labels instead of overwriting them with set_xticklabels
ax.tick_params(axis = 'x', labelrotation = 20)

plt.savefig(r"C:\Users\HP\Desktop\Projects\Trip purpose\POI enhancement\Purpose Inference\enhanced_results.jpg", dpi = 300, bbox_inches = 'tight')

### Results for original POI data

In [None]:
# stacked bar chart: trips per weekday, stacked by inferred purpose (raw POI results)

x_values = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']
width = 0.35       # the width of the bars: can also be len(x) sequence

fig, ax = plt.subplots()

# Purposes are taken from raw_data itself. The previous version took them from
# enhanced_data, so a purpose missing from the raw results raised a KeyError
# and a purpose present only in the raw results was silently left out.
# Missing values are dropped because groupby leaves them out.
purposes = raw_data['Trip purpose'].dropna().unique()

# purpose x weekday count table; reindexing guarantees every weekday column
# exists (filled with 0), so a purpose with no trips on some day no longer
# raises a KeyError.
day_counts = (
    raw_data.groupby(['Trip purpose', 'Day'])
    .size()
    .unstack(fill_value = 0)
    .reindex(index = purposes, columns = x_values, fill_value = 0)
)

# Per-purpose weekday counts. The previous locals().update(...) call, which
# injected one variable per purpose into the namespace, is removed: nothing
# read those variables.
y_values_dict = {purpose: day_counts.loc[purpose].tolist() for purpose in purposes}

# running height of each weekday's stack
bottom_raw = np.zeros(len(x_values), dtype = int)

for key, y_values in y_values_dict.items():

    ax.bar(x_values, y_values, width, label = key, bottom = bottom_raw)

    bottom_raw = bottom_raw + np.array(y_values)

ax.set_ylabel('Trip Count')
ax.legend(loc=0, bbox_to_anchor=(1, 1))
# rotate the existing weekday tick labels instead of overwriting them with set_xticklabels
ax.tick_params(axis = 'x', labelrotation = 20)

plt.savefig(r"C:\Users\HP\Desktop\Projects\Trip purpose\POI enhancement\Purpose Inference\raw_results.jpg", dpi = 300, bbox_inches = 'tight')