# Erwthma A2 (Multiprocessing)

Import some modules

In [1]:
import pandas as pd
import numpy as np
import gmplot
import time

from ast import literal_eval
from Queue import PriorityQueue
from haversine import haversine
from multiprocessing import Pool,Queue
from utils import create_dir,pretty_print,remove_time,lcs

- Open train_set.csv
- Open test_set_a2.csv

In [2]:
# Training trips: rows are indexed by tripId and the Trajectory column is
# parsed from its textual form into Python objects with literal_eval.
trainSet = pd.read_csv(
    '../datasets/train_set.csv',
    index_col='tripId',
    converters={"Trajectory": literal_eval}
)

# Query trips for question A2: tab-separated file, same Trajectory parsing,
# default integer index.
testSet_a2 = pd.read_csv(
    '../datasets/test_set_a2.csv',
    converters={"Trajectory": literal_eval},
    sep='\t'
)


- We remove the timestamps because we do not need them.
- We also reverse the latitude-longitude pairs, because they were given in the wrong order.

In [3]:
# Run remove_time over every row of both frames. The frames returned by
# apply() are discarded, so this step only has an effect if remove_time
# updates each row's data in place (NOTE(review): confirm in utils.remove_time).
for frame in (trainSet, testSet_a2):
    frame.apply(func=remove_time, axis=1, reduce=False)

- Create output directories

In [4]:
# One output directory per query, numbered from 1 (query_1, query_2, ...).
query_dir_prefix = '../html_maps/erwthma_a2/query_'
for q in range(len(testSet_a2)):
    query_dir = query_dir_prefix + str(q+1)
    create_dir(query_dir)

- Split trainSet into chunks 

**Note:**
Set the number of parallel processes according to your CPU/RAM specs

In [5]:
# Number of worker processes (tune to the machine's CPU/RAM).
procs = 6

# Nominal number of rows per chunk. Kept for reference only: the actual chunk
# boundaries are computed by np.array_split below.
chunk_size = len(trainSet) // procs

# Split the row positions 0..len(trainSet)-1 into `procs` contiguous runs whose
# lengths differ by at most one, then slice the frame by position.
# array_split copes with a row count that is not a multiple of `procs` (no
# extra short chunk) and with fewer rows than `procs` (the empty runs are
# dropped), so `splits` always holds at most `procs` non-empty DataFrames.
splits = [
    trainSet.iloc[positions]
    for positions in np.array_split(np.arange(len(trainSet)), procs)
    if len(positions)
]

- This is the function that each process will run

**Note:** Our LCS implementation is based on [Wikipedia's LCS DP algorithm](https://en.wikipedia.org/wiki/Longest_common_subsequence_problem#Code_for_the_dynamic_programming_solution "LCS Wiki")

In [6]:
def process_function(train_split):
    """
    Worker task: compare every trip of one training chunk with the current query.

    For each row of `train_split`, lcs(query, trajectory) is evaluated and the
    tuple (length, journeyPatternId, LCS, trajectory) is put on `distances`.
    `query` and `distances` are read from module globals; `distances` is the
    queue shared among the processes. Nothing is returned.
    """

    for row in train_split.itertuples():
        # Row layout: (index, journeyPatternId, trajectory); the index is unused.
        _, pattern_id, trajectory = row
        match_len, common = lcs(query, trajectory)
        distances.put((match_len, pattern_id, common, trajectory))

*For each query in test_set_a2.csv:* 


- Create a pool of processes | Use a *process-safe queue* for storing each lcs
- Find 5 NN
- Print stats
- Plot query map
- Plot neighbor's whole route with green color
- Plot LCS with red color

**Notes:** 
- The `if __name__ == '__main__':` guard is necessary on Windows
- `pool.map()` blocks until all tasks are finished

In [7]:
def _plot_route(plotter, points, color):
    """
    Draw `points` as a polyline of the given color on a gmplot plotter.

    Each point is indexed as point[0] -> latitude and point[1] -> longitude.
    """
    lats = [point[0] for point in points]
    lons = [point[1] for point in points]
    plotter.plot(lats,lons,color=color,edge_width=5)


if __name__ == '__main__':
    S = time.time()

    # process_function puts exactly one tuple on the queue per training row,
    # so the number of results to collect for each query is known up front.
    expected_results = sum(len(split) for split in splits)

    for i,query in testSet_a2.itertuples():

        start_time = time.time()

        # `query` (loop variable) and `distances` are module globals read by
        # process_function, so both must be bound before the Pool for this
        # query is created.
        # NOTE(review): this relies on the workers seeing the parent's globals
        # (fork start method) - confirm for the target platform.
        distances = Queue()

        pool = Pool(processes=procs)
        try:
            # map() blocks until every chunk has been processed.
            pool.map(process_function,splits)

            # Drain the queue by count rather than by qsize(): qsize() is only
            # approximate and is not implemented on every platform. get()
            # blocks until each item has actually arrived. The queue must be
            # emptied before the workers are joined.
            matches = [distances.get() for _ in range(expected_results)]
        finally:
            # Always release the worker processes, even if map() raised.
            pool.terminate()
            pool.join()

        # Five nearest neighbours = five largest (length, ...) tuples.
        longest = sorted(matches,reverse=True)[:5]
        pretty_print(i,time.time() - start_time,longest,flag=2)

        # Plot the query
        querymap = gmplot.GoogleMapPlotter(53.3498,-6.2603, 14)
        _plot_route(querymap,query,'green')
        querymap.draw('../html_maps/erwthma_a2/query_' + str(i+1) + '/query.html')

        for idx,route in enumerate(longest):
            LCS              = route[2]
            whole_route      = route[3]

            gmap = gmplot.GoogleMapPlotter(53.3498,-6.2603, 14)

            # Plot neighbor's whole route
            _plot_route(gmap,whole_route,'green')

            # Plot LCS on top of neighbor's whole route
            _plot_route(gmap,LCS,'red')
            gmap.draw('../html_maps/erwthma_a2/query_' + str(i+1) + '/neighbour' + str(idx+1) + '.html')

    # Parenthesised single-argument form prints the same text under Python 2.
    print("Total time: %.2f sec"%(round(time.time()  - S,2)))


Query 1
Time =  106.76sec
Neighbour[1]
JP_ID           : 040D1002
Matching Points : 78 

Neighbour[2]
JP_ID           : 040D1002
Matching Points : 78 

Neighbour[3]
JP_ID           : 040D1002
Matching Points : 76 

Neighbour[4]
JP_ID           : 040D1002
Matching Points : 76 

Neighbour[5]
JP_ID           : 040D1002
Matching Points : 75 

Query 2
Time =  108.83sec
Neighbour[1]
JP_ID           : 040D1002
Matching Points : 82 

Neighbour[2]
JP_ID           : 040D1002
Matching Points : 78 

Neighbour[3]
JP_ID           : 040D1002
Matching Points : 75 

Neighbour[4]
JP_ID           : 040D1002
Matching Points : 74 

Neighbour[5]
JP_ID           : 040D1002
Matching Points : 73 

Query 3
Time =  55.65sec
Neighbour[1]
JP_ID           : 079A0001
Matching Points : 40 

Neighbour[2]
JP_ID           : 067X0001
Matching Points : 40 

Neighbour[3]
JP_ID           : 066X0004
Matching Points : 40 

Neighbour[4]
JP_ID           : 01451008
Matching Points : 40 

Neighbour[5]
JP_ID           : 01451001
M