# Part 2 (Multiprocessing)

Import some modules

In [1]:
import pandas as pd
import numpy as np
import gmplot
import random
import time

from Queue import PriorityQueue
from fastdtw import fastdtw,dtw
from haversine import haversine
from ast import literal_eval
from utils import create_dir,pretty_print,remove_time,mydtw
from multiprocessing import Pool,Queue

- Open train_set.csv
- Open test_set_a1.csv

In [2]:
# Both files store the Trajectory column as a Python-literal string;
# literal_eval turns each cell back into a Python object while reading.
trajectory_converters = {"Trajectory": literal_eval}

# Training trips, indexed by their tripId column.
trainSet = pd.read_csv(
    '../datasets/train_set.csv',
    index_col='tripId',
    converters=trajectory_converters,
)

# Query trips; this file is tab-separated.
testSet_a1 = pd.read_csv(
    '../datasets/test_set_a1.csv',
    converters=trajectory_converters,
    sep='\t',
)

- We remove the timestamps because we do not need them.
- Also, we reverse the latitude-longitude pairs, because they were given in the wrong order.

In [3]:
# Run remove_time over every row of both frames. The values returned by
# apply() are discarded, so this cell relies on remove_time's effect on the
# rows themselves -- presumably it edits each Trajectory in place (the helper
# lives in utils and is not visible here; confirm).
for frame in (trainSet, testSet_a1):
    frame.apply(remove_time, axis=1, reduce=False)

- Create output directories

In [4]:
# One output folder per query, numbered from 1 (query_1 ... query_N).
query_count = len(testSet_a1)
for query_number in range(1, query_count + 1):
    create_dir('../html_maps/erwthma_a1/query_' + str(query_number))

- Split trainSet into chunks 

**Note:**
Set the number of parallel processes according to your CPU/RAM specs

In [5]:
# Number of worker processes; tune to the available CPU/RAM.
procs = 5

# Rows per chunk, rounded UP so that at most `procs` chunks are produced.
# Plain floor division yields an extra, tiny remainder chunk whenever the row
# count is not a multiple of `procs`, and a chunk size of 0 (division by zero
# in the grouping key below) when there are fewer rows than processes.
chunk_size = max(1, -(-len(trainSet) // procs))

# Group consecutive rows by "row position // chunk_size": rows 0..chunk_size-1
# form the first chunk, the next chunk_size rows the second, and so on.
splits = [df for g, df in trainSet.groupby(np.arange(len(trainSet)) // chunk_size)]

- This is the function that each process will run

**Notes:**   
- We used the [haversine module](https://github.com/mapado/haversine "haversine github") to compute the haversine distance.
- The [fastdtw module](https://github.com/slaypni/fastdtw "fastdtw github") also provides a classic \[optimal\] DTW implementation, called "dtw". It is slower than our own implementation due to some array preprocessing, but both give exactly the same results.

In [6]:
def process_function(train_split):
    """
    Worker task: score every trip of one training chunk against the query.

    train_split : DataFrame chunk whose itertuples() rows unpack into
                  (tripId, journeyPatternId, trajectory).

    For each row the DTW distance between the global `query` and the row's
    trajectory is computed with haversine as the point-to-point metric, and
    the tuple (distance, journeyPatternId, trajectory) is put on the global
    `distances` queue, which is shared among the processes.
    Nothing is returned.
    """
    for row in train_split.itertuples():
        _, pattern_id, trajectory = row

        # Alternatives with the same call shape:
        #   mydtw(query, trajectory, dist=haversine)
        #   dtw(query, trajectory, dist=haversine)
        dtw_result = fastdtw(query, trajectory, dist=haversine)

        # Element 0 of the result is the distance.
        distances.put((dtw_result[0], pattern_id, trajectory))


*For each query in test_set_a1.csv:*   

- Create a pool of processes | Use a *process-safe queue* for storing each calculated distance
- Find the 5 nearest neighbours using [fastdtw](https://github.com/slaypni/fastdtw "fastdtw github") or mydtw (i.e., the slow one, based on [Wikipedia's pseudocode](https://en.wikipedia.org/wiki/Dynamic_time_warping#Implementation "Wikipedia DTW"))
- Print stats
- Plot query map
- Plot its 5 nearest neighbors' maps

**Notes:** 
- The `if __name__ == '__main__':` guard is necessary on Windows
- map() blocks until all tasks are finished

In [7]:
def _draw_route(points, html_path):
    """
    Plot one trajectory as a green polyline and write the map to html_path.

    points    : sequence of pairs; element 0 of each pair is handed to gmplot
                as the latitude and element 1 as the longitude.
    html_path : path of the HTML file to generate.
    """
    lats = [point[0] for point in points]
    lons = [point[1] for point in points]
    # Every map uses the same fixed centre and zoom level.
    route_map = gmplot.GoogleMapPlotter(53.3498,-6.2603, 14)
    route_map.plot(lats,lons,color='green',edge_width=5)
    route_map.draw(html_path)


if __name__ == '__main__':
    S = time.time()

    # process_function puts exactly one tuple on the queue per training row,
    # so the number of results to collect is known up front. Reading that many
    # items avoids Queue.qsize(), which is documented as approximate and is
    # not implemented on some platforms (e.g. macOS).
    expected_results = sum(len(split) for split in splits)

    for i,query in testSet_a1.itertuples():

        start_time = time.time()
        distances  = Queue()

        # The pool is created only after `query` and `distances` are bound:
        # process_function reads both as globals inside the workers.
        pool = Pool(processes=procs)
        try:
            # map() blocks until every chunk has been processed.
            pool.map(process_function,splits)
            pool.close()
            # Drain the queue BEFORE joining the workers: a process that still
            # has queued items buffered cannot exit until they are consumed.
            nearest = sorted(distances.get() for _ in range(expected_results))[:5]
            elapsed = time.time() - start_time
        except BaseException:
            # Do not leave workers running if a chunk failed or the run was
            # interrupted.
            pool.terminate()
            raise
        finally:
            # Reap the workers so they do not accumulate across queries.
            pool.join()

        pretty_print(i,elapsed,nearest)

        query_dir = '../html_maps/erwthma_a1/query_' + str(i+1)

        # Print query
        _draw_route(query, query_dir + '/query.html')

        # Print neighbors (element 2 of each result tuple is the trajectory)
        for idx,n in enumerate(nearest):
            _draw_route(n[2], query_dir + '/neighbour' + str(idx+1) + '.html')

    # Parenthesised single-argument form works on both Python 2 and Python 3.
    print("Total time: %.2f sec"%(round(time.time()  - S,2)))


Query 1
Time =  55.32sec
Neighbour[1]
JP_ID : 01501001
DTW   : 0.0 km 

Neighbour[2]
JP_ID : 01501001
DTW   : 3.5 km 

Neighbour[3]
JP_ID : 01501001
DTW   : 3.8 km 

Neighbour[4]
JP_ID : 01501001
DTW   : 4.0 km 

Neighbour[5]
JP_ID : 01501001
DTW   : 4.1 km 

Query 2
Time =  33.03sec
Neighbour[1]
JP_ID : 01200001
DTW   : 0.0 km 

Neighbour[2]
JP_ID : 01200001
DTW   : 2.8 km 

Neighbour[3]
JP_ID : 01200001
DTW   : 3.4 km 

Neighbour[4]
JP_ID : 01200001
DTW   : 3.4 km 

Neighbour[5]
JP_ID : 01200001
DTW   : 3.5 km 

Query 3
Time =  45.12sec
Neighbour[1]
JP_ID : 00791001
DTW   : 0.0 km 

Neighbour[2]
JP_ID : 00791001
DTW   : 4.7 km 

Neighbour[3]
JP_ID : 00791001
DTW   : 4.8 km 

Neighbour[4]
JP_ID : 00791001
DTW   : 4.8 km 

Neighbour[5]
JP_ID : 00791001
DTW   : 4.9 km 

Query 4
Time =  37.77sec
Neighbour[1]
JP_ID : 00010002
DTW   : 0.0 km 

Neighbour[2]
JP_ID : 00010002
DTW   : 2.5 km 

Neighbour[3]
JP_ID : 00010002
DTW   : 2.8 km 

Neighbour[4]
JP_ID : 00010002
DTW   : 3.2 km 

Neighbo