# Part 4

Import some modules

In [1]:
import pandas as pd
import numpy as np
import time

from Queue import PriorityQueue
from fastdtw import fastdtw,dtw
from haversine import haversine
from ast import literal_eval
from utils import create_csv,remove_time,majority_voting,mydtw
from sklearn.model_selection import cross_validate,cross_val_score
from sklearn.base import BaseEstimator, ClassifierMixin

- Open train_set.csv
- Open test_set_a2.csv

In [2]:
# Training trips: indexed by tripId; the Trajectory column is stored as a
# Python literal and parsed back into a Python object with literal_eval.
trainSet = pd.read_csv(
    '../datasets/train_set.csv',
    index_col='tripId',
    converters={"Trajectory": literal_eval},
)

# Query trips: tab-separated file, same Trajectory parsing, default row index.
testSet_a2 = pd.read_csv(
    '../datasets/test_set_a2.csv',
    converters={"Trajectory": literal_eval},
    sep='\t',
)

- We remove the timestamps because we do not need them.
- We also reverse the latitude-longitude pairs, because they were given in the wrong order.

In [3]:
# remove_time is called on every row (axis=1) of both frames. The value that
# apply returns is discarded, so this step relies on remove_time changing the
# row data in place -- TODO confirm against utils.remove_time.
# NOTE(review): `reduce` is a legacy DataFrame.apply argument; newer pandas
# releases no longer accept it.
for frame in (trainSet, testSet_a2):
    frame.apply(func=remove_time, axis=1, reduce=False)

- This is our own KNN Classifier
- It is a subclass of [BaseEstimator](http://scikit-learn.org/stable/modules/generated/sklearn.base.BaseEstimator.html) and [ClassifierMixin](http://scikit-learn.org/stable/modules/generated/sklearn.base.ClassifierMixin.html)

In [4]:
class NeighborsClassifier(BaseEstimator,ClassifierMixin):
    """k-nearest-neighbours classifier with a pluggable sequence metric.

    Every query is compared against every stored training sample with
    ``metric``; the labels of the ``n_neighbors`` closest samples are then
    combined with ``majority_voting``.

    Parameters
    ----------
    n_neighbors : int, default 5
        Number of nearest training samples that take part in the vote.
    metric : callable, default fastdtw
        Called as ``metric(query, train, dist=haversine)``; the first element
        of its return value is used as the distance.
    """

    def __init__(self, n_neighbors=5, metric=fastdtw):
        if n_neighbors < 1:
            raise ValueError("Expected n_neighbors > 0. Got %d"%(n_neighbors))

        self.n_neighbors = n_neighbors
        self.metric = metric

    def fit(self, trainSet, labels):
        """Store the training samples and their labels.

        Parameters
        ----------
        trainSet : sized iterable
            Training samples, iterated in order by ``predict``.
        labels : sized iterable
            One label per training sample, in the same order as ``trainSet``.

        Returns
        -------
        self
            Returned so calls can be chained, as scikit-learn estimators do.

        Raises
        ------
        ValueError
            If ``trainSet`` and ``labels`` differ in length.
        """
        len1, len2 = len(trainSet), len(labels)
        if len1 != len2:
            raise ValueError("Found input variables with inconsistent numbers of samples: [%d,%d]"%(len1,len2))

        self.trainSet = trainSet
        self.labels = labels
        return self

    def predict(self, X):
        """Predict a label for every query in ``X``.

        Parameters
        ----------
        X : iterable
            Query samples.

        Returns
        -------
        list
            One ``majority_voting`` result per query, in input order.
        """
        predicted = []

        for query in X:
            # Pair samples and labels positionally; indexing labels with an
            # integer would be label-based for a pandas Series.
            distances = [
                (self.metric(query, train, dist=haversine)[0], jp_id)
                for train, jp_id in zip(self.trainSet, self.labels)
            ]

            # Ascending by distance (ties broken by label, as tuples compare).
            distances.sort()

            # Keep the labels of the k nearest neighbours. Slicing simply
            # yields fewer entries when there are fewer than k samples.
            nearest = [jp_id for _, jp_id in distances[:self.n_neighbors]]

            # Apply majority voting and record the winning label.
            predicted.append(majority_voting(nearest))

        return predicted

In [None]:
# Classifier under evaluation, using fastdtw as the trajectory metric
clf = NeighborsClassifier(metric=fastdtw)

# Training data: the journey-pattern labels and the matching trajectories
y = list(trainSet['journeyPatternId'])
X = trainSet['Trajectory']

# Queries to classify, identified by the row index of the test frame
tripIdList = testSet_a2.index.values
queries = testSet_a2['Trajectory']

- Cross Validation

**Note:**
`n_jobs` is set to -1 so that cross-validation uses all available CPU cores.

In [None]:
# Cross-validate on the leading n_samples training trips only.
n_samples = 1500   # size of the training subset used for cross validation
cv = 10            # number of folds

X_cv, y_cv = X[:n_samples], y[:n_samples]

start = time.time()
percent = len(X_cv) / float(len(trainSet)) * 100
scores = cross_validate(clf, X_cv, y_cv, cv=cv, return_train_score=False, scoring='accuracy', n_jobs=-1)
elapsed = round(time.time() - start, 2)
test_scores = scores['test_score']

# Single-argument print(...) produces the same output as the print statement.
print('Cross Validation (folds = %d, %.2f%% of trainSet)' % (cv, percent))
print('=============================================')
print('Time    : %s sec' % elapsed)
print("Accuracy: %0.2f (+/- %0.2f)\n" % (test_scores.mean(), test_scores.std() * 2))



- Fit our classifier
- Make predictions for the given queries
- Output results to testSet_JourneyPatternIDs.csv

In [None]:
# Train on the full training set and classify every query trajectory
clf.fit(X, y)
predictions = clf.predict(queries)

# Hand trip ids and predictions to create_csv (presumably writes the results
# file read below -- confirm against utils.create_csv), then display it.
create_csv(tripIdList, predictions)
results_path = '../testSet_JourneyPatternIDs.csv'
pd.read_csv(results_path, sep='\t')