### Data caveats

1. This is based on data collected on the UC Berkeley campus in Spring 2014. Unfortunately, due to the sensitive nature of the data collected (location data), the dataset cannot be shared, but the corresponding author can run queries against it and return the aggregate results upon request.
2. Since the initial results were not run with a fixed random seed, the results in this notebook differ slightly from the published results, but they are generally consistent. Any subsequent runs should produce identical results, since this notebook now sets a random seed.

In [None]:
import datetime

We collected some more training data after this, but it was not included in these results.
So let us read the data from the backup database to ensure consistency with the published results.

In [None]:
import pymongo

In [None]:
# Handle to the Stage_Sections collection in the Backup_database of the MongoDB server on localhost.
# Every query in this notebook goes through this collection.
Sections = pymongo.MongoClient('localhost').Backup_database.Stage_Sections

In [None]:
# Number of sections of type 'move', regardless of whether their mode was confirmed.
Sections.find({'type': 'move'}).count()

In [None]:
import numpy as np
import scipy as sp
# Seed numpy's global generator so that the random fold shuffles further down
# (np.random.permutation) are repeatable from one run of the notebook to the next.
np.random.seed(61297777)

In [None]:
# Cursor over 'move' sections whose confirmed_mode is not the empty string.
confirmedSections = Sections.find({"$and": [{'type': 'move'}, {'confirmed_mode': {'$ne': ''}}]})

In [None]:
# Cursor over 'move' sections whose confirmed_mode is 1
# (presumably the walking mode id - confirm against modes.json).
walkSections = Sections.find({"$and": [{'type': 'move'}, {'confirmed_mode': 1}]})

In [None]:
import json

In [None]:
# Load the mode definitions; each entry is read below through its 'mode_id' and 'mode_name' keys.
# The context manager closes the file handle as soon as parsing is done.
with open("modes.json") as modeFile:
    modeList = json.load(modeFile)
print(modeList)

In [None]:
# Total number of confirmed 'move' sections, followed by the count for each mode in modeList.
print(Sections.find({"$and": [{'type': 'move'}, {'confirmed_mode': {'$ne': ''}}]}).count())
modeNameList = []
modeCountList = []
for mode in modeList:
    modeCount = Sections.find({"$and": [{'type': 'move'}, {'confirmed_mode': mode['mode_id']}]}).count()
    print("%s, %s" % (mode['mode_name'], modeCount))
    # Only modes with at least one confirmed section are kept for the chart below.
    if modeCount > 0:
        modeNameList.append(mode['mode_name'])
        modeCountList.append(modeCount)

In [None]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import matplotlib as mpl

%matplotlib inline
%config InlineBackend.figure_format='png'

In [None]:
import displayHelpers

In [None]:
# Bar chart of confirmed sections per mode.
# cleanModeNames is handed to the helper as cleanNameDict (presumably to shorten the category labels).
cleanModeNames = {"walking":"walk", "running":"run", "cycling":"cycle"}
(fig, ax) = displayHelpers.showCategoryChart(modeNameList, [modeCountList], ['Confirmed'], ['r'],
                                             "Number of trip sections", "Trip sections by mode", 
                                             cleanNameDict = cleanModeNames, figsize=(12,5))
# Fixed upper limit for the y axis.
ax.set_ylim(top=4000)
plt.show()

In [None]:
# Start datetimes (only that field is projected) of all confirmed 'move' sections, oldest first.
startTimes = Sections.find({"$and": [{'type': 'move'}, {'confirmed_mode': {'$ne': ''}}]}, {'section_start_datetime': 1, '_id': 0}).sort('section_start_datetime', pymongo.ASCENDING)

In [None]:
# The cursor is sorted ascending, so its first and last entries bound the data collection period.
nSections = startTimes.count()
print("Starting from %s" % startTimes[0])
print("Ending at %s" % startTimes[nSections-1])
startTime = startTimes[0]['section_start_datetime']
endTime = startTimes[nSections-1]['section_start_datetime']

In [None]:
from datetime import datetime, timedelta

In [None]:
# Count confirmed and total 'move' sections in consecutive 30-day windows from startTime to endTime.
# The last window is clipped to endTime and is therefore usually shorter than 30 days.
timeNameList = []
timeCountList = []
timeTotalList = []
currTime = startTime
while currTime < endTime:
    currEndTime = currTime + timedelta(days=30)
    if currEndTime > endTime:
        currEndTime = endTime
    # NOTE(review): both bounds are inclusive, so a section starting exactly on a window
    # boundary is counted in two adjacent windows.
    currTimeSearch = {'section_start_datetime': {"$gte": currTime, "$lte": currEndTime}}
    nTrips = Sections.find({"$and": [{'type': 'move'}, {'confirmed_mode': {'$ne': ''}}, currTimeSearch]}, {'section_start_datetime': 1, '_id': 0}).count()
    nTotalTrips = Sections.find({"$and": [{'type': 'move'}, currTimeSearch]}, {'section_start_datetime': 1, '_id': 0}).count()
    print("%s - %s = %s, %s" % (currTime.strftime("%m-%d"), currEndTime, nTrips, nTotalTrips))
    timeNameList.append("%s to %s" % (currTime.strftime("%b-%d"), currEndTime.strftime("%b-%d")))
    timeCountList.append(nTrips)
    timeTotalList.append(nTotalTrips)
    currTime = currEndTime

In [None]:
# Confirmed vs. total sections per 30-day window.
# The [:-1] slices drop the final window, which the loop above clips to endTime.
(fig, ax) = displayHelpers.showCategoryChart(timeNameList[:-1], [timeCountList[:-1], timeTotalList[:-1]],
                                             ["Confirmed", "Total"], ['r', 'b'], 
                                             "Number of trip sections", "Trip sections by start time", figsize=(12,5))
# ax.set_ylim(top=5500)
plt.show()

In [None]:
# Cumulative counts: for each weekly checkpoint starting on 2014-05-06, count the confirmed
# and the total 'move' sections that started on or before that checkpoint.
# Note that this overwrites the per-window lists built for the previous chart.
timeNameList = []
timeCountList = []
timeTotalList = []
currTime = datetime(year=2014,month=5,day=6)
while currTime < endTime:
    currTimeSearch = {'section_start_datetime': {"$lte": currTime}}
    nTrips = Sections.find({"$and": [{'type': 'move'}, {'confirmed_mode': {'$ne': ''}}, currTimeSearch]}, {'section_start_datetime': 1, '_id': 0}).count()
    # Restrict the total to 'move' sections as well, so that it is comparable with the confirmed
    # count and with the "total" counts computed elsewhere in this notebook.
    nTotalTrips = Sections.find({"$and": [{'type': 'move'}, currTimeSearch]}, {'section_start_datetime': 1, '_id': 0}).count()
    print("%s = %s, %s" % (currTime.strftime("%m-%d"), nTrips, nTotalTrips))
    timeNameList.append("%s" % (currTime.strftime("%b-%d")))
    timeCountList.append(nTrips)
    timeTotalList.append(nTotalTrips)
    currTime = currTime + timedelta(days=7)

In [None]:
# Per-user totals: for every user with at least one confirmed 'move' section, count all of
# their 'move' sections and the confirmed subset.
idList = Sections.find({"$and": [{'type': 'move'}, {'confirmed_mode': {'$ne': ''}}]}).distinct('user_id')
idNameList = []
idCountList = []
confirmedCountList = []
# NOTE: the loop variable `id` shadows the Python builtin of the same name for the rest of the notebook.
for i, id in enumerate(idList):
    count = Sections.find({"$and": [{'type': 'move'}, {'user_id': id}]}).count()
    confirmedCount = Sections.find({"$and": [{'type': 'move'}, {'confirmed_mode': {'$ne': ''}}, {'user_id': id}]}).count()
    print(id, count, confirmedCount)
    # Users are labelled by their position in idList rather than by their user id.
    idNameList.append("%s" % i)
    idCountList.append(count)
    confirmedCountList.append(confirmedCount)

In [None]:
# Confirmed vs. total sections per user.
fig, axes = displayHelpers.showCategoryChart(idNameList, [confirmedCountList, idCountList],
                                             ["Confirmed Sections", "Total Sections"],
                                             ['r', 'b'], "Number of trip sections", "Trip sections by user", figsize=(12,5))
# Temporarily raise the global font size while the tick labels are replaced; it is restored below.
oldSize = mpl.rcParams['font.size']
mpl.rcParams['font.size'] = 16
# Label only every fifth user.
labels = range(0, len(idNameList), 5)
axes.set_xticks(labels)
axes.set_xticklabels(np.array(idNameList)[labels])
fig.show()
mpl.rcParams['font.size'] = oldSize

In [None]:
# Display the current matplotlib rc settings.
mpl.rcParams

In [None]:
from featurecalc import calDistance, calSpeed, calHeading, calAvgSpeed, calSpeeds, calAccels, getIthMaxSpeed, getIthMaxAccel, calHCR,\
calSR, calVCR, mode_cluster, mode_start_end_coverage

In [None]:
def getSpeedsForMode(modeId):
    """Collect the speed series of every 'move' section confirmed as `modeId`.

    Args:
        modeId: value matched against the 'confirmed_mode' field of each section.

    Returns:
        A list with one entry per matching section for which calSpeeds() returned
        something other than None; each entry is that return value, unmodified.
    """
    modeSectionCursor = Sections.find({"$and": [{'type': 'move'}, {'confirmed_mode': modeId}]})
    speedList = []
    for section in modeSectionCursor:
        speeds = calSpeeds(section)
        # Identity test on purpose: `speeds != None` on a numpy array is evaluated element-wise,
        # and using that result as a condition raises ValueError for multi-element arrays.
        if speeds is not None:
            speedList.append(speeds)
    return speedList

## Feature matrix construction

In [None]:
# Column layout of the feature matrix built by generateFeatureMatrixAndResultVector:
# 0. distance
# 1. duration (seconds between section start and end)
# 2. first filter mode (the section's 'mode' field; 0 when it cannot be stored as a number)
# 3. sectionId
# 4. avg speed (calAvgSpeed)
# 5. speed EV (mean of the calSpeeds series)
# 6. speed variance (NOTE: stored as the standard deviation, np.std, of the calSpeeds series)
# 7. max speed
# 8. max accel
# 9. isCommute
# 10. heading change rate (calHCR)
# 11. stop rate (calSR)
# 12. velocity change rate (calVCR)
# 13. start lat
# 14. start lng
# 15. stop lat
# 16. stop lng
# 17. start hour
# 18. end hour
# 19. both start and end close to bus stop
# 20. both start and end close to train station
# 21. both start and end close to airport
featureLabels = ["distance", "duration", "first filter mode", "sectionId", "avg speed",
                 "speed EV", "speed variance", "max speed", "max accel", "isCommute",
                 "heading change rate", "stop rate", "velocity change rate", "start lat", "start lng",
                 "stop lat", "stop lng", "start hour", "end hour", "close to bus stop", "close to train stop",
                 "close to airport"]
# One cluster per mode id: 5, 6 and 9 are the ids reported as bus, train and air in the
# validation code further down. The second argument is the same value that is later passed to
# mode_start_end_coverage (presumably a distance threshold - TODO confirm in featurecalc).
bus_cluster=mode_cluster(5,105,1)
train_cluster=mode_cluster(6,600,1)
air_cluster=mode_cluster(9,600,1)
def generateFeatureMatrixAndResultVector(sectionQuery):
    """Build the feature matrix and label vector for the sections matching `sectionQuery`.

    Args:
        sectionQuery: MongoDB query document passed unchanged to Sections.find().

    Returns:
        (featureMatrix, resultVector) where featureMatrix has one row per matching section
        and one column per entry of featureLabels, and resultVector[i] is the
        'confirmed_mode' of the section in row i. Features that cannot be computed for a
        section are left at 0.
    """
    confirmedSections = Sections.find(sectionQuery)
    # Ask the server for the count once and reuse it for both allocations.
    nSections = confirmedSections.count()
    featureMatrix = np.zeros([nSections, len(featureLabels)])
    resultVector = np.zeros(nSections)
    for (i, section) in enumerate(confirmedSections):
        featureMatrix[i, 0] = section['distance']
        featureMatrix[i, 1] = (section['section_end_datetime'] - section['section_start_datetime']).total_seconds()

        # Deal with unknown modes like "airplane": a value that cannot be stored in the
        # float matrix raises ValueError and is recorded as 0.
        try:
            featureMatrix[i, 2] = section['mode']
        except ValueError:
            featureMatrix[i, 2] = 0

        featureMatrix[i, 3] = section['section_id']
        featureMatrix[i, 4] = calAvgSpeed(section)

        # `is not None` rather than `!= None`: the latter is element-wise on numpy arrays
        # and cannot be used as a condition for arrays with more than one element.
        speeds = calSpeeds(section)
        if speeds is not None:
            featureMatrix[i, 5] = np.mean(speeds)
            featureMatrix[i, 6] = np.std(speeds)
            featureMatrix[i, 7] = np.max(speeds)

        accels = calAccels(section)
        if accels is not None and len(accels) > 0:
            featureMatrix[i, 8] = np.max(accels)

        featureMatrix[i, 9] = section.get('commute') in ('to', 'from')
        featureMatrix[i, 10] = calHCR(section)
        featureMatrix[i, 11] = calSR(section)
        featureMatrix[i, 12] = calVCR(section)

        # NOTE(review): coordinates[0] is stored under the "lat" label and coordinates[1]
        # under "lng"; confirm the coordinate order used by the stored points.
        if section['section_start_point'] is not None:
            startCoords = section['section_start_point']['coordinates']
            featureMatrix[i, 13] = startCoords[0]
            featureMatrix[i, 14] = startCoords[1]

        if section['section_end_point'] is not None:
            endCoords = section['section_end_point']['coordinates']
            featureMatrix[i, 15] = endCoords[0]
            featureMatrix[i, 16] = endCoords[1]

        featureMatrix[i, 17] = section['section_start_datetime'].time().hour
        featureMatrix[i, 18] = section['section_end_datetime'].time().hour

        featureMatrix[i, 19] = mode_start_end_coverage(section,bus_cluster,105)
        featureMatrix[i, 20] = mode_start_end_coverage(section,train_cluster,600)
        featureMatrix[i, 21] = mode_start_end_coverage(section,air_cluster,600)
        resultVector[i] = section['confirmed_mode']
    return (featureMatrix, resultVector)

In [None]:
# Features and labels for every confirmed 'move' section.
(featureMatrix, resultVector) = generateFeatureMatrixAndResultVector({"$and": [{'type': 'move'}, {'confirmed_mode': {'$ne': ''}}]})


In [None]:
# Sanity check: maxima of columns 10 (heading change rate), 20 (close to train stop) and
# 12 (velocity change rate), followed by the array shapes.
# NOTE(review): column 20 sits between two of the three rate features (10-12); column 11
# (stop rate) may have been intended - confirm.
print(np.max(featureMatrix[:,10]))
print(np.max(featureMatrix[:,20]))
print(np.max(featureMatrix[:,12]))
print(featureMatrix.shape, resultVector.shape)

In [None]:
# Boolean masks over resultVector, one per confirmed-mode id of interest.
runIndices = resultVector == 2
transportIndices = resultVector == 4
mixedIndices = resultVector == 8
airIndices = resultVector == 9
unknownIndices = resultVector == 0
# Rows to keep: everything except run, transport and mixed.
# The air and unknown masks are only printed; they are not used for filtering.
strippedIndices = np.logical_not(runIndices | transportIndices | mixedIndices)
print("runIndices = %s" % (np.nonzero(runIndices)))
print("transportIndices = %s" % (np.nonzero(transportIndices)))
print("mixedIndices = %s" % (np.nonzero(mixedIndices)))
print("airIndices = %s" % (np.nonzero(airIndices)))
print("unknownIndices = %s" % (np.nonzero(unknownIndices)))
print("strippedIndices count = %s" % (np.count_nonzero(strippedIndices)))

Now, we filter out "mixed" and "running", since there are few instances of them and we don't intend to predict them initially. We also filter out any "transport" since it should never be in the confirmed set, and we don't want to deal with it if it is.

In [None]:
# Keep only the rows selected by strippedIndices, in both the features and the labels.
strippedFeatureMatrix = featureMatrix[strippedIndices]
strippedResultVector = resultVector[strippedIndices]

First, we visualize the distribution of some of the features. This is so that we can compare our dataset to Zheng et al 2010.

In [None]:
def plotFeatureVector(featureMatrix, resultVector, featureIndex, modeList):
    """Draw a normalized, per-mode histogram of a single feature column.

    Args:
        featureMatrix: 2-D array of features, one row per section.
        resultVector: confirmed mode id for each row of featureMatrix.
        featureIndex: column of featureMatrix to plot; also selects the x-axis label
            from featureLabels.
        modeList: iterable of dicts with 'mode_id' and 'mode_name' keys; modes that do
            not occur in resultVector are left out of the plot.
    """
    figure, histAxes = plt.subplots(1,1)

    # Keep only the modes that actually occur, preserving the order of modeList
    presentModes = [mode for mode in modeList
                    if np.count_nonzero(resultVector == mode['mode_id']) != 0]
    seriesNames = [mode['mode_name'] for mode in presentModes]
    seriesValues = [featureMatrix[resultVector == mode['mode_id'], featureIndex]
                    for mode in presentModes]

    histAxes.hist(seriesValues, normed=True, histtype="bar", label=seriesNames)
    histAxes.set_ylabel("number of segments")
    histAxes.set_xlabel(featureLabels[featureIndex])
    plt.legend()

In [None]:
# One per-mode histogram for every feature column.
for col in range(0, len(featureLabels)):
    plotFeatureVector(strippedFeatureMatrix, strippedResultVector, col, modeList)

The air modes are such outliers that we are unable to see the variation in the other modes. Let's strip out the outliers and focus on lower speed trips.

In [None]:
# Re-plot the average speed (column 4) using only the rows whose average speed is below 50.
speedNormalEntries = strippedFeatureMatrix[:,4] < 50
plotFeatureVector(strippedFeatureMatrix[speedNormalEntries], strippedResultVector[speedNormalEntries], 4, modeList)

Using the graphs above, we can estimate the separability of our input. Clearly, there is some separability - the car and train trips that are at 20-30+ are clearly separable from the walk/bike trips that are at lower speeds. But are they separable from each other? And at least eyeballing the data, it looks like at least 75% of car trips are actually not that fast - the mean EV is < 10mph. Even with max speed, at least 25% of car trips appear to have a max speed ~ 10 mph. Max accel doesn't seem to have as much predictive power as one might hope - most max accel clusters at less than 5. It would be nice to visualize the clusters in this data, but I'm just going to start trying decision trees and SVMs on this data now.

We used to strip out outliers here, but the outliers actually correspond to plane trips, so we want to retain them. The rest of the code assumes that we stripped outliers to get "cleaned" trips, so we just reassign the values here instead of changing all the code. We can restore outlier detection at that point.

In [None]:
# No outlier removal is performed: the "cleaned" names are plain aliases of the stripped arrays.
cleanedFeatureMatrix = strippedFeatureMatrix
cleanedResultVector = strippedResultVector

## Feature Indices

In [None]:
# Column groups of the feature matrix, each expressed as a list of column indices.
genericFeatureIndices = list(range(10))
AdvancedFeatureIndices = list(range(10, 13))
LocationFeatureIndices = list(range(13, 17))
TimeFeatureIndices = list(range(17, 19))
BusTrainFeatureIndices = list(range(19, 22))
# Show each group on its own line
for featureGroup in (genericFeatureIndices, AdvancedFeatureIndices, LocationFeatureIndices,
                     TimeFeatureIndices, BusTrainFeatureIndices):
    print(featureGroup)

## Generic model, generic features

In [None]:
# Feature matrix restricted to the generic columns (0-9).
genericCleanedFM = cleanedFeatureMatrix[:,genericFeatureIndices]
print(genericCleanedFM.shape)

In [None]:
from sklearn import cross_validation
from sklearn import svm

In [None]:
# 5-fold cross-validation scores of a linear SVM on the generic features.
svmClf = svm.LinearSVC()
svmScores = cross_validation.cross_val_score(svmClf, genericCleanedFM, cleanedResultVector, cv=5)

In [None]:
# Per-fold scores and their mean.
print(svmScores)
print(svmScores.mean())

Using svm.SVC() takes significantly longer (hours instead of seconds) but generates higher accuracy. The accuracy is still lower than the random forest, though.

In [None]:
from sklearn import ensemble

In [None]:
# 5-fold cross-validation scores of a random forest on the generic features.
forestClf = ensemble.RandomForestClassifier()
forestScores = cross_validation.cross_val_score(forestClf, genericCleanedFM, cleanedResultVector, cv=5)

In [None]:
# Per-fold scores and their mean.
print(forestScores)
print(forestScores.mean())

These results look pretty good, and pretty much parallel what the Zheng paper got, even with just the basic features. We get 82% average accuracy for a linear SVM and 86% average accuracy for a random forest. But the 82% and 86% values are for cross validation, where we have a known value that we can validate against.

But what we really want to do is to decide, while looking at a section that we have no ground truth on, whether we want the user to classify it or not. And then we want to see, for the high confidence predictions that we will not prompt the user for, how accurate our classification really is.

In order to do this, we get the probabilities for each prediction in addition to the prediction itself. We can then test the accuracy of the high confidence predictions and compare it to the accuracy of all predictions.

To recap, we now return three metrics:

- The fraction of entries that would be autoclassified given a particular target confidence threshold
- The accuracy of the entries that would be autoclassified
- The accuracy of all entries, including ones that had low confidence

In [None]:
# Generate folds of indices
def generateFoldArrays(nIndices, nFolds):
    """Split a random permutation of range(nIndices) into nFolds train/test index pairs.

    The permutation is drawn from numpy's global random generator and cut into nFolds
    nearly equal parts. Each part serves once as the test indices, with the remaining
    parts, concatenated in their original order, as the train indices.

    Returns:
        A list of nFolds tuples (trainIndices, testIndices) of index arrays.
    """
    shuffledParts = np.array_split(np.random.permutation(nIndices), nFolds)
    return [
        (np.concatenate(shuffledParts[:heldOutNo] + shuffledParts[heldOutNo + 1:]), heldOutPart)
        for (heldOutNo, heldOutPart) in enumerate(shuffledParts)
    ]

def kFoldValidationWithProb(algo, X, y, nFolds, prob_threshold):
    """Run k-fold validation and score the high-confidence predictions separately.

    For each fold the estimator is re-fit on the training rows. A test row counts as
    "high confidence" (i.e. it would be auto-classified) when its largest per-class
    score exceeds prob_threshold.

    Args:
        algo: estimator exposing fit, score and either predict_proba or decision_function.
        X: 2-D feature array.
        y: label vector with one entry per row of X.
        nFolds: number of folds.
        prob_threshold: threshold applied to the largest per-class score of each test row.

    Returns:
        A tuple of four arrays with one entry (or row) per fold:
        - fraction of test rows that were high confidence,
        - that fraction per confirmed mode, columns ordered walk, bike, bus, train, car, air,
        - score on the high-confidence rows only (nan for a fold without any),
        - score on all test rows.
    """
    # Confirmed-mode ids reported per fold, in column order: walk, bike, bus, train, car, air
    reportedModes = (1, 3, 5, 6, 7, 9)
    foldArrays = generateFoldArrays(len(y), nFolds)

    scores = []
    highConfidenceScores = []
    percentAutoClassified = []
    percentAutoClassifiedByMode = []
    for (trainIndices, testIndices) in foldArrays:
        model = algo.fit(X[trainIndices], y[trainIndices])
        testX = X[testIndices]
        testy = y[testIndices]

        # Prefer real probabilities, which are directly comparable to prob_threshold.
        # Estimators without predict_proba (e.g. LinearSVC) fall back to decision_function,
        # whose values are margins rather than probabilities.
        if hasattr(model, "predict_proba"):
            predictedYProb = model.predict_proba(testX)
        else:
            predictedYProb = model.decision_function(testX)

        # Take the max confidence across the classes (axis 1) for every test row
        highConfidencePredictions = np.max(predictedYProb, 1) > prob_threshold
        nHighConfidence = np.count_nonzero(highConfidencePredictions)
        print("Found %s high confidence predictions out of %s" % (nHighConfidence,
                                                                  len(testIndices)))

        def fractionAutoClassified(modeId):
            # Share of the test rows confirmed as modeId that were high confidence;
            # 0 when the mode does not occur in this fold.
            nOfMode = np.count_nonzero(testy == modeId)
            if nOfMode == 0:
                return 0
            return float(np.count_nonzero(testy[highConfidencePredictions] == modeId))/nOfMode

        percentAutoClassified.append(float(nHighConfidence)/len(testIndices))
        percentAutoClassifiedByMode.append([fractionAutoClassified(modeId) for modeId in reportedModes])

        # Two scores per fold: first on only the high confidence predictions...
        if nHighConfidence > 0:
            highConfidenceScores.append(model.score(testX[highConfidencePredictions],
                                                    testy[highConfidencePredictions]))
        else:
            # Nothing cleared the threshold, so the score is undefined; scoring an empty
            # selection would raise instead.
            highConfidenceScores.append(np.nan)

        # ...and then on every test row
        scores.append(model.score(testX, testy))

    print("for prob %s, percentage auto classified %s" % (prob_threshold, np.array(percentAutoClassified).mean()))
    print("for prob %s, scoring only on high confidence predictions %s" % (prob_threshold, np.array(highConfidenceScores).mean()))
    print("for prob %s, scoring on all predictions %s" % (prob_threshold, np.array(scores).mean()))

    return (np.array(percentAutoClassified), np.array(percentAutoClassifiedByMode), np.array(highConfidenceScores), np.array(scores))

In [None]:
def exploreKFoldValidationSpace(algo, X, y, nFolds):
    """Plot the k-fold validation metrics for thresholds 0.90, 0.95 and 0.99.

    kFoldValidationWithProb is run once per threshold, in ascending order. The fold means
    of the auto-classified fraction (overall and per mode), of the high-confidence score
    and of the overall score are printed and drawn as lines against the threshold.

    Args:
        algo: estimator handed to kFoldValidationWithProb.
        X: 2-D feature array.
        y: label vector with one entry per row of X.
        nFolds: number of folds used for each threshold.
    """
    probs = [0.90, 0.95, 0.99]
    # One (pac, pacm, hcs, s) tuple per threshold
    results = [kFoldValidationWithProb(algo, X, y, nFolds, prob) for prob in probs]

    pacs = [result[0].mean() for result in results]
    hcs = [result[2].mean() for result in results]
    ss = [result[3].mean() for result in results]

    def meanForModeColumn(col):
        # Fold mean of one per-mode column, for every threshold
        return [result[1][:, col].mean() for result in results]

    # Column order of the per-mode array: walk, bike, bus, train, car, air
    pacmWalk = meanForModeColumn(0)
    pacmBike = meanForModeColumn(1)
    pacmBus = meanForModeColumn(2)
    pacmTrain = meanForModeColumn(3)
    pacmCar = meanForModeColumn(4)
    # Air is column 5 at every threshold
    pacmAir = meanForModeColumn(5)

    fig, axes = plt.subplots(1, 1, figsize=(15, 10))
    print(pacs)
    axes.set_yticks(np.arange(0,1,0.1))
    axes.plot(probs, pacs, label="percentage auto classified")

    print(pacmWalk)
    axes.plot(probs, pacmWalk, linewidth = 5, label="percent walk auto classified")
    print(pacmBike)
    axes.plot(probs, pacmBike, label="percent bike auto classified")
    print(pacmBus)
    axes.plot(probs, pacmBus, linewidth=5, label="percent bus auto classified")
    print(pacmTrain)
    axes.plot(probs, pacmTrain, label="percent train auto classified")
    print(pacmCar)
    axes.plot(probs, pacmCar, linewidth=5, label="percent car auto classified")
    print(pacmAir)
    axes.plot(probs, pacmAir, linewidth=5, label = "percent air auto classified")

    print(hcs)
    axes.plot(probs, hcs, label="accuracy of high confidence samples")
    print(ss)
    axes.plot(probs, ss, linewidth = 5, label="accuracy of all samples")
    plt.legend(loc='best')

In [None]:
# Fresh random forest with default settings, validated on the generic features with 5 folds.
forestClf = ensemble.RandomForestClassifier()
exploreKFoldValidationSpace(forestClf, genericCleanedFM, cleanedResultVector, 5)

The results of these three metrics for confidence thresholds of 90%, 95% and 99% are shown above, and they are all largely similar. The accuracy of the high confidence predictions is, as expected, really high at 97 - 98%. However, we were only able to auto-classify ~ 50% of the sections. Now, let's retry using the linear SVM above.

In [None]:
# Same exploration with a linear SVM on the generic features.
svmClf = svm.LinearSVC()
exploreKFoldValidationSpace(svmClf, genericCleanedFM, cleanedResultVector, 5)

We see that the SVM is able to classify more trips than the random forest, but at the cost of unacceptably lower performance on the high confidence predictions. It is hard to understand the results with line plots, so let's switch to bar graphs instead.

In [None]:
def exploreKFoldValidationSpaceBarGraph(algo, X, y, nFolds):
    """Show the k-fold validation metrics for thresholds 0.90, 0.95 and 0.99 as a bar chart.

    For each threshold three bars are drawn, all in percent: the share of high-confidence
    predictions, the score on those predictions, and the score on all predictions.

    Args:
        algo: estimator handed to kFoldValidationWithProb.
        X: 2-D feature array.
        y: label vector with one entry per row of X.
        nFolds: number of folds used for each threshold.
    """
    (pac0, pacm0, hcs0, s0) = kFoldValidationWithProb(algo, X, y, nFolds, 0.90)
    (pac5, pacm5, hcs5, s5) = kFoldValidationWithProb(algo, X, y, nFolds, 0.95)
    (pac9, pacm9, hcs9, s9) = kFoldValidationWithProb(algo, X, y, nFolds, 0.99)

    # Fold means converted to percentages
    pacs = [pac0.mean() * 100, pac5.mean() * 100, pac9.mean() * 100]
    hcs = [hcs0.mean() * 100, hcs5.mean() * 100, hcs9.mean() * 100]
    ss = [s0.mean() * 100, s5.mean() * 100, s9.mean() * 100]

    # Use a larger font for this chart only; the previous size is put back afterwards so
    # that later figures in the notebook are not affected.
    oldSize = mpl.rcParams['font.size']
    mpl.rcParams['font.size'] = 16
    try:
        fig, axes = displayHelpers.showCategoryChart(["90%", "95%", "99%"], [pacs, hcs, ss],
                                                     ["% high confidence", "high confidence accuracy", "overall accuracy"],
                                                     ['r', 'g', 'b'], "Percent", "High confidence predictions",
                                                     width=0.15, figsize=(6,5))
        axes.set_yticks(range(0, 100, 10))
        # Reference lines at 50% and 90%
        axes.axhline(50, label = "50%")
        axes.axhline(90, label = "90%")
        axes.set_ylim(top=100)
        # Render while the larger font is still active. plt.show() also works with the
        # inline backend, where Figure.show() only emits a warning.
        plt.show()
    finally:
        mpl.rcParams['font.size'] = oldSize

In [None]:
# Random forest on the full feature matrix (all columns), shown as a bar chart.
forestClf = ensemble.RandomForestClassifier()
exploreKFoldValidationSpaceBarGraph(forestClf, cleanedFeatureMatrix, cleanedResultVector, 5)

In [None]:
# Linear SVM on the full feature matrix (all columns), shown as a bar chart.
svmClf = svm.LinearSVC()
exploreKFoldValidationSpaceBarGraph(svmClf, cleanedFeatureMatrix, cleanedResultVector, 5)

We now look at the parameters and the feature importances of the random forest so that we can better understand what it is doing.

In [None]:
# Hyper-parameters of the random forest that was last assigned to forestClf.
forestClf.get_params()

In [None]:
# Importance of each feature in the most recent fit of forestClf. The labels only line up
# if that fit used the full feature matrix, whose columns follow featureLabels.
for (i, importance) in enumerate(forestClf.feature_importances_):
    print(featureLabels[i], importance)

So the highest importance features are:

- first filter mode (moves mode)
- speed EV
- avg speed
- distance

Now, let's try another non-parametric method like nearest neighbor

In [None]:
from sklearn import neighbors

In [None]:
# Nearest-neighbour classifier with default settings.
knnClf = neighbors.KNeighborsClassifier()

In [None]:
# Nearest neighbour on the full feature matrix with 5 folds.
exploreKFoldValidationSpace(knnClf, cleanedFeatureMatrix, cleanedResultVector, 5)

knn does almost the same as decision tree, except that the accuracy of the high confidence predictions is a bit lower.
I think that the percentages are around the same as well. Basically, we can classify walk pretty well and the others pretty poorly.
So I am not sure what we are adding here over moves :)

I'm surprised at the low prediction rate for cycling. Moves seems to get that pretty accurately for me.

I'm now going to plot this data and see what it looks like.

## Advanced features added

In [None]:
# Columns 0-18: generic, advanced (10-12), location (13-16) and time (17-18) features;
# the "close to ..." columns 19-21 are left out.
Advanced_indices=[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18]
print(Advanced_indices)
forestClf = ensemble.RandomForestClassifier()
exploreKFoldValidationSpace(forestClf, cleanedFeatureMatrix[:,Advanced_indices], cleanedResultVector, 5)

## Spatial knowledge added

In [None]:
# Generic (0-9), location (13-16) and time (17-18) columns plus the bus (19) and train (20)
# proximity columns; the advanced columns 10-12 are left out.
# NOTE(review): column 21 (close to airport) is not included - confirm this is intentional.
Spatial_indices=[0,1,2,3,4,5,6,7,8,9,13,14,15,16,17,18,19,20]
print(Spatial_indices)
forestClf = ensemble.RandomForestClassifier()
exploreKFoldValidationSpace(forestClf, cleanedFeatureMatrix[:,Spatial_indices], cleanedResultVector, 5)

## Location and time features added

In [None]:
# Random forest on every column of the feature matrix.
forestClf = ensemble.RandomForestClassifier()
exploreKFoldValidationSpace(forestClf, cleanedFeatureMatrix, cleanedResultVector, 5)

In [None]:
# Nearest neighbour on every column of the feature matrix.
knnClf = neighbors.KNeighborsClassifier()
exploreKFoldValidationSpace(knnClf, cleanedFeatureMatrix, cleanedResultVector, 5)

In [None]:
# Feature importances of the forest that was fit on the full feature matrix above.
for (i, importance) in enumerate(forestClf.feature_importances_):
    print(featureLabels[i], importance)

## Some more contour plots to help us visualize the data

In [None]:
from matplotlib import colors
import itertools

In [None]:
def printColorMap(algo, Xall, y):
    """Plot the decision regions of `algo` for every ordered pair of the first 10 features.

    For each ordered pair (a, b) with a != b, `algo` is re-fit on just those two columns
    of Xall. Its predictions over a coarse grid are drawn as a light colour mesh and the
    training points are scattered on top, coloured by y. Feature a is on the x axis and
    feature b on the y axis, matching the "a v/s b" title of the subplot.

    Args:
        algo: estimator exposing fit(X, y) and predict(X); it is left fit on the last pair.
        Xall: 2-D feature array; columns 0-9 are plotted and titled from featureLabels.
        y: label vector with one entry per row of Xall.
    """
    # Number of grid steps along each axis of the prediction mesh
    nSplits = 20

    # setup parameters
    cmap_light = colors.ListedColormap(['#FAAAAA', '#AFAAAA', '#AAFAAA', '#AAAFAA', '#AAAAFA', '#AAAAAF'])
    cmap_bold = colors.ListedColormap(['#F00000', '#0F0000', '#00F000', '#000F00', '#0000F0', '#00000F'])

    # Only the first 10 columns are used: 10 * 9 = 90 ordered pairs fit the 20 x 5 grid below
    nFeatures = 10
    fig, axes = plt.subplots(20, 5, figsize=(15,50))
    plt.tight_layout()
    axesArr = axes.flatten()

    for (i, (xFeature, yFeature)) in enumerate(itertools.permutations(range(nFeatures), 2)):
        # Select the two columns by explicit index so that their order follows the pair.
        # A boolean mask always returns columns in ascending order, which would make the
        # plots for (a, b) and (b, a) identical apart from the title.
        X = Xall[:, [xFeature, yFeature]]

        algo.fit(X, y)
        # Plot the decision boundary. For that, we will assign a color to each
        # point in the mesh [x_min, x_max]x[y_min, y_max].
        x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
        y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
        # Step sizes that split each axis range into roughly nSplits intervals
        h_x = float(x_max - x_min) / nSplits
        h_y = float(y_max - y_min) / nSplits

        xx, yy = np.meshgrid(np.arange(x_min, x_max, h_x),
                             np.arange(y_min, y_max, h_y))
        Z = algo.predict(np.c_[xx.ravel(), yy.ravel()])

        # Put the result into a color plot
        Z = Z.reshape(xx.shape)

        axesArr[i].pcolormesh(xx, yy, Z, cmap=cmap_light)

        # Plot also the training points
        axesArr[i].scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold)
        axesArr[i].set_xlim(xx.min(), xx.max())
        axesArr[i].set_ylim(yy.min(), yy.max())
        axesArr[i].set_title("%s v/s %s" % (featureLabels[xFeature], featureLabels[yFeature]))

In [None]:
# Pairwise decision-region plots for the random forest; the classifier is re-fit for every pair.
printColorMap(forestClf, cleanedFeatureMatrix, cleanedResultVector)

Let us also quickly take a look at the confusion matrix for the overall model. Because maybe we should not care about the confidence of the predictions, and just weight them lower.

In [None]:
from sklearn import metrics
from matplotlib import cm

In [None]:
def printConfusionMatrix(algo, X, y, title):
    """Print and plot the row-normalized confusion matrix averaged over 5 stratified folds.

    Args:
        algo: estimator exposing fit(X, y) and predict(X); it is re-fit for each fold.
        X: 2-D feature array.
        y: label vector with one entry per row of X.
        title: title of the plotted matrix.

    Returns:
        (finalPCM, fig): finalPCM[i, j] is the fraction of samples of true class i that were
        predicted as class j, averaged over the folds in which class i occurred in the test
        set; fig is the matplotlib figure showing log(finalPCM + 1).
    """
    nFolds = 5
    skf = cross_validation.StratifiedKFold(y, nFolds)
    # Every distinct label is a class, including a label whose value is 0
    classLabels = np.unique(y)
    nClasses = len(classLabels)
    print("nClasses = %s" % nClasses)
    sumPCM = np.zeros([nClasses, nClasses])
    # Number of folds in which each true class occurred in the test set
    foldsWithSupport = np.zeros(nClasses)
    for train, test in skf:
        X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
        print("Number of distinct classes in training set = %s, test set = %s" % (np.unique(y[train]), np.unique(y[test])))
        y_pred = algo.fit(X_train, y_train).predict(X_test)
        # This has the raw number of entries (e.g. [610  12   1   0  32   1]).
        # Passing the labels keeps the shape fixed even when a fold is missing a class.
        cmraw = metrics.confusion_matrix(y_test, y_pred, labels=classLabels)
        # Since the total number of entries for each mode is different, convert each row to
        # fractions of its row sum (e.g. 656). Rows without any test sample are skipped so
        # that they neither divide by zero nor pull the average down.
        rowTotals = cmraw.sum(axis=1)
        hasSupport = rowTotals > 0
        sumPCM[hasSupport] += cmraw[hasSupport].astype(float) / rowTotals[hasSupport, np.newaxis]
        foldsWithSupport[hasSupport] += 1

    # Average each row over the folds that contributed to it
    finalPCM = sumPCM / np.maximum(foldsWithSupport, 1)[:, np.newaxis]
    logFinalPCM = np.log(finalPCM + 1)
    np.set_printoptions(precision=0, suppress=True)
    # np.set_printoptions(precision=4, suppress=False)
    print(finalPCM * 100)

    # Use a larger font for this figure only; the previous size is put back once it is shown
    oldSize = mpl.rcParams['font.size']
    mpl.rcParams['font.size'] = 16
    try:
        (fig, ax) = plt.subplots()
        # First element is "" because of http://stackoverflow.com/questions/3529666/matplotlib-matshow-labels
        ax.set_xticklabels(["","walk", "", "bus", "", "car", ""])
        ax.set_yticklabels(["","walk", "cycle", "bus", "train", "car", "air"])
        cax = ax.matshow(logFinalPCM, cmap=cm.gray)
        ax.set_title(title, color='green', weight='bold', size=16, y=1.1)

        fig.colorbar(cax)
        ax.set_ylabel('True label', size="large")
        ax.set_xlabel('Predicted label', size="large")
        fig.tight_layout()
        plt.show()
    finally:
        mpl.rcParams['font.size'] = oldSize
    return (finalPCM, fig)

## Model selection

In [None]:
# Accumulator for per-model confusion matrices (not appended to in the cells visible here).
modelCMList = []
# Directory that receives the saved confusion-matrix images; it is concatenated directly
# with the file names, so the trailing slash is required.
saveDir = "/tmp/ml_results/"

In [None]:
import os
# Create the output directory if needed; an existing directory is not an error.
os.makedirs(saveDir, exist_ok=True)

In [None]:
# Confusion matrix of a random forest on the generic features.
forestClf = ensemble.RandomForestClassifier()
printConfusionMatrix(forestClf, genericCleanedFM, cleanedResultVector, "Generic features, random forest")

In [None]:
# Confusion matrix of a random forest on the columns listed in Spatial_indices.
forestClf = ensemble.RandomForestClassifier()
printConfusionMatrix(forestClf, cleanedFeatureMatrix[:,Spatial_indices], cleanedResultVector, "Spatial Features, random forest")

In [None]:
# Confusion matrix of a random forest on all features; the figure is saved under saveDir.
forestClf = ensemble.RandomForestClassifier()
currCM, fig = printConfusionMatrix(forestClf, cleanedFeatureMatrix, cleanedResultVector, "All features, random forest")
fig.savefig(saveDir+"cm_all_random_forest.png", bbox_inches="tight")

Adding start and end points does improve the accuracy of the bus and train. Train trips in particular, are significantly improved.

In [None]:
# Confusion matrix of nearest neighbour on the generic features.
knnClf = neighbors.KNeighborsClassifier()
printConfusionMatrix(knnClf, genericCleanedFM, cleanedResultVector, "Generic features, k-nn")

In [None]:
# Confusion matrix of nearest neighbour on all features; the figure is saved under saveDir.
knnClf = neighbors.KNeighborsClassifier()
currCM, fig = printConfusionMatrix(knnClf, cleanedFeatureMatrix, cleanedResultVector, "All features, k-nn")
fig.savefig(saveDir+"cm_all_k_nn.png", bbox_inches="tight")

knn does significantly worse, primarily because of bus trips. I suspect this is because different people make the same trip using different modes. Time for per-user trips?

In [None]:
# Confusion matrix of an untuned linear SVM on all features; the figure is saved under saveDir.
svmClf = svm.LinearSVC()
currCM, fig = printConfusionMatrix(svmClf, cleanedFeatureMatrix, cleanedResultVector, "All features, Linear SVM")
fig.savefig(saveDir+"cm_all_linear_svm.png", bbox_inches="tight")

For parametric models, it is particularly important to tune the parameters correctly. We use GridSearchCV from sklearn's grid_search module to find the best parameters for the linear SVC.

In [None]:
# sklearn.grid_search was deprecated in scikit-learn 0.18 and removed in 0.20,
# where GridSearchCV lives in sklearn.model_selection.  Try the original
# location first so that older environments behave exactly as before, and
# fall back to the new location on current releases.
try:
    from sklearn.grid_search import GridSearchCV
except ImportError:
    from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

First, we do some coarse tuning on the parameters.

In [None]:
# Coarse grid over the regularisation strength C, the primal/dual formulation
# and the class weighting, evaluated with cv=5.
# NOTE(review): class_weight='auto' was deprecated in scikit-learn 0.17 in
# favour of 'balanced' and removed in later releases; it is left unchanged
# here and will need updating when running on a newer scikit-learn.
tuned_parameters = [{'C': [0.01, 0.1, 1, 10, 100, 1000],
                     'dual': [True, False],
                     'class_weight' : [None, 'auto']}]
baseClf = svm.LinearSVC()
clf = GridSearchCV(baseClf, tuned_parameters, cv=5)
clf.fit(cleanedFeatureMatrix, cleanedResultVector)
# get_params is a method: the bare attribute reference that used to stand here
# was a statement with no effect.  Call it and print the result instead.
print(clf.get_params())
# Last expression of the cell, so the best estimator is shown as cell output.
clf.best_estimator_

Then, we do some fine tuning around the result of the coarse parameter tuning.

In [None]:
# Fine search over C only (75, 80, ..., 120) with all other LinearSVC
# parameters at their defaults, evaluated with cv=5.
tuned_parameters = [{'C': list(range(75, 125, 5))}]
baseClf = svm.LinearSVC()
clf = GridSearchCV(baseClf, tuned_parameters, cv=5)
clf.fit(cleanedFeatureMatrix, cleanedResultVector)
# get_params must be called; printing the bare attribute only shows the
# bound-method repr rather than the parameter dictionary.
print(clf.get_params())
print(clf.best_estimator_)
print(clf.best_score_)

In [None]:
# Fine search over C (55, 60, ..., 120) for the primal formulation
# (dual=False), evaluated with cv=5.
tuned_parameters = [{'C': list(range(55, 125, 5))}]
baseClf = svm.LinearSVC(dual=False)
clf = GridSearchCV(baseClf, tuned_parameters, cv=5)
clf.fit(cleanedFeatureMatrix, cleanedResultVector)
# get_params must be called; printing the bare attribute only shows the
# bound-method repr rather than the parameter dictionary.
print(clf.get_params())
print(clf.best_estimator_)
print(clf.best_score_)

In [None]:
# Linear SVM with explicitly chosen parameters.
# NOTE(review): C=80 / dual=False are presumably the winners of the grid
# searches above - confirm against their printed output.
tunedSvmClf = svm.LinearSVC(C=80, dual=False)
currCM, fig = printConfusionMatrix(tunedSvmClf, cleanedFeatureMatrix, cleanedResultVector, "All features, SVM(C=80,dual=F)")
fig.savefig(saveDir+"cm_all_tuned_svm.png", bbox_inches="tight")

In [None]:
from sklearn.kernel_approximation import Nystroem, AdditiveChi2Sampler, RBFSampler, SkewedChi2Sampler
from sklearn.linear_model import SGDClassifier
# sklearn.lda was deprecated in scikit-learn 0.17 and later removed; the class
# is now sklearn.discriminant_analysis.LinearDiscriminantAnalysis.  Try the
# original location first so older environments behave exactly as before.
try:
    from sklearn.lda import LDA
except ImportError:
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn import preprocessing

In [None]:
# Map the scaled features through RBFSampler (default settings) and train an
# SGDClassifier on the transformed features.
rbf_feature = RBFSampler()
cleanedFeatureMatrix_features = rbf_feature.fit_transform(preprocessing.scale(cleanedFeatureMatrix))
sgdClf = SGDClassifier()
currCM, fig = printConfusionMatrix(sgdClf, cleanedFeatureMatrix_features, cleanedResultVector, "All features, SGD w RBF kernel")
fig.savefig(saveDir+"cm_all_rbf_kernel_sgd.png", bbox_inches="tight")

In [None]:
# Same as the previous RBF + SGD experiment but with gamma=1.  The return
# value is discarded, so no figure is saved for this variant.
rbf_feature = RBFSampler(gamma=1)
cleanedFeatureMatrix_features = rbf_feature.fit_transform(preprocessing.scale(cleanedFeatureMatrix))
sgdClf = SGDClassifier()
printConfusionMatrix(sgdClf, cleanedFeatureMatrix_features, cleanedResultVector, "All features, SGD w RBF kernel (gamma=1)")

In [None]:
# RBFSampler features (default settings) on the scaled data, classified with LDA.
rbf_feature = RBFSampler()
cleanedFeatureMatrix_features = rbf_feature.fit_transform(preprocessing.scale(cleanedFeatureMatrix))
ldaClf = LDA()
currCM, fig = printConfusionMatrix(ldaClf, cleanedFeatureMatrix_features, cleanedResultVector, "All features, LDA w RBF kernel")
fig.savefig(saveDir+"cm_all_rbf_kernel_lda.png", bbox_inches="tight")

In [None]:
# RBFSampler features (default settings) on the scaled data, classified with
# a default LinearSVC.
rbf_feature = RBFSampler()
cleanedFeatureMatrix_features = rbf_feature.fit_transform(preprocessing.scale(cleanedFeatureMatrix))
linearSVMClf = svm.LinearSVC()
currCM, fig = printConfusionMatrix(linearSVMClf, cleanedFeatureMatrix_features, cleanedResultVector, "All features, SVM w RBF kernel")
fig.savefig(saveDir+"cm_all_rbf_kernel_svm.png", bbox_inches="tight")

In [None]:
# Nystroem approximation with a polynomial kernel on the scaled data,
# classified with a default SGDClassifier.
nys_feature = Nystroem(kernel='polynomial')
cleanedFeatureMatrix_features = nys_feature.fit_transform(preprocessing.scale(cleanedFeatureMatrix))
sgdClf = SGDClassifier()
currCM, fig = printConfusionMatrix(sgdClf, cleanedFeatureMatrix_features, cleanedResultVector, "All features, SGD w poly kernel")
fig.savefig(saveDir+"cm_all_poly_kernel_sgd.png", bbox_inches="tight")

In [None]:
# Nystroem approximation with a polynomial kernel on the scaled data,
# classified with LDA.
nys_feature = Nystroem(kernel='polynomial')
cleanedFeatureMatrix_features = nys_feature.fit_transform(preprocessing.scale(cleanedFeatureMatrix))
ldaClf = LDA()
currCM, fig = printConfusionMatrix(ldaClf, cleanedFeatureMatrix_features, cleanedResultVector, "All features, LDA w poly kernel")
fig.savefig(saveDir+"cm_all_poly_kernel_lda.png", bbox_inches="tight")

In [None]:
# Nystroem approximation with a polynomial kernel on the scaled data,
# classified with a default LinearSVC.
nys_feature = Nystroem(kernel='polynomial')
cleanedFeatureMatrix_features = nys_feature.fit_transform(preprocessing.scale(cleanedFeatureMatrix))
svmClf = svm.LinearSVC()
currCM, fig = printConfusionMatrix(svmClf, cleanedFeatureMatrix_features, cleanedResultVector, "All features, SVM w poly kernel")
fig.savefig(saveDir+"cm_all_poly_kernel_svm.png", bbox_inches="tight")

In [None]:
# Nystroem approximation with a sigmoid kernel on the scaled data,
# classified with a default SGDClassifier.
nys_feature = Nystroem(kernel='sigmoid')
cleanedFeatureMatrix_features = nys_feature.fit_transform(preprocessing.scale(cleanedFeatureMatrix))
sgdClf = SGDClassifier()
currCM, fig = printConfusionMatrix(sgdClf, cleanedFeatureMatrix_features, cleanedResultVector, "All features, SGD w sigmoid kernel")
fig.savefig(saveDir+"cm_all_sigmoid_kernel_sgd.png", bbox_inches="tight")

In [None]:
# Nystroem approximation with a sigmoid kernel on the scaled data,
# classified with LDA.
nys_feature = Nystroem(kernel='sigmoid')
cleanedFeatureMatrix_features = nys_feature.fit_transform(preprocessing.scale(cleanedFeatureMatrix))
ldaClf = LDA()
currCM, fig = printConfusionMatrix(ldaClf, cleanedFeatureMatrix_features, cleanedResultVector, "All features, LDA w sigmoid kernel")
fig.savefig(saveDir+"cm_all_sigmoid_kernel_lda.png", bbox_inches="tight")

In [None]:
# Nystroem approximation with a sigmoid kernel on the scaled data,
# classified with a default LinearSVC.
nys_feature = Nystroem(kernel='sigmoid')
cleanedFeatureMatrix_features = nys_feature.fit_transform(preprocessing.scale(cleanedFeatureMatrix))
svmClf = svm.LinearSVC()
# The plot title now names the sigmoid kernel, matching both the Nystroem
# kernel used above and the output file name below.
currCM, fig = printConfusionMatrix(svmClf, cleanedFeatureMatrix_features, cleanedResultVector, "All features, SVM w sigmoid kernel")
fig.savefig(saveDir+"cm_all_sigmoid_kernel_svm.png", bbox_inches="tight")

In [None]:
# Disabled experiment: Nystroem approximation with a chi2 kernel + SGD.
# NOTE(review): presumably commented out because chi2 kernels expect
# non-negative inputs, which preprocessing.scale does not produce - TODO
# confirm.  The title string below still says 'sigmoid'; fix it if this cell
# is ever re-enabled.
# nys_feature = Nystroem(kernel='chi2')
# cleanedFeatureMatrix_features = nys_feature.fit_transform(preprocessing.scale(cleanedFeatureMatrix))
# sgdClf = SGDClassifier()
# printConfusionMatrix(sgdClf, cleanedFeatureMatrix_features, cleanedResultVector, "All features, SGD w sigmoid kernel")

### Only for transport trips

As we can see, the prediction rate is best for walk and bike, which are the ones for which we get the most data from moves. It may be a mistake to use the same model for both types of trips because moves will do a good job for walk/bike and a horrible job for transport, because we don't allow users to specify 'transport' in the output.

These also have zero carbon footprint. Let us see how well we do on the motorized trips alone.

In [None]:
# Boolean row mask: True where feature column 2 equals 4.
# NOTE(review): column 2 is presumably the automatically detected mode and 4
# its generic 'transport' code (the per-user query uses {'mode': 4} for the
# transport-only case) - confirm against the feature-matrix builder.
transportTrips = cleanedFeatureMatrix[:,2] == 4
# Number of sections selected by the mask.
print(np.count_nonzero(transportTrips))

In [None]:
# Random forest on the generic features, restricted to the rows selected by transportTrips.
forestClf = ensemble.RandomForestClassifier()
printConfusionMatrix(forestClf, genericCleanedFM[transportTrips], cleanedResultVector[transportTrips], "Generic features, motorized only, random forest")

In [None]:
# Random forest on all features, restricted to the rows selected by transportTrips.
forestClf = ensemble.RandomForestClassifier()
printConfusionMatrix(forestClf, cleanedFeatureMatrix[transportTrips], cleanedResultVector[transportTrips], "All features, transport only, random forest")

In [None]:
# k-nn on the generic features, restricted to the rows selected by transportTrips.
knnClf = neighbors.KNeighborsClassifier()
printConfusionMatrix(knnClf, genericCleanedFM[transportTrips], cleanedResultVector[transportTrips], "Generic features, transport only, k-nn")

In [None]:
# k-nn on all features, restricted to the rows selected by transportTrips.
knnClf = neighbors.KNeighborsClassifier()
printConfusionMatrix(knnClf, cleanedFeatureMatrix[transportTrips], cleanedResultVector[transportTrips], "All features, transport only, k-nn")

As we can see, we are actually able to predict car trips with a fair degree of accuracy. But bus and train trips are pretty much a tossup. Ignore the entries for 0 and 1 above, since we stripped out all walk and bike trips, and so these are only trips which moves misclassified, and not the entire dataset. Now we know why the Zheng paper only attempted to distinguish between bus and car trips, and not bus, train and car. The new features helped in the decision tree case, but not by that much, and did not help us at all in the knn case.

## User-specific models

In [None]:
def getUserModelComparison(isTransportOnly, minSections=150):
    '''
    Cross-validate a separate random forest for every user with enough data.

    For each distinct user_id in Sections, the user's confirmed 'move'
    sections are turned into a feature matrix / result vector and evaluated
    with kFoldValidationWithProb using a fresh RandomForestClassifier.
    Only the full feature set is used, because the point of this comparison
    is personalization rather than feature selection.

    isTransportOnly: if true, the per-user query additionally requires
        {'mode': 4}.
    minSections: users whose result vector has fewer entries than this are
        reported and skipped (default 150, the previously hard-coded value).

    Returns (userIdList, resultArray, labels).  resultArray has one row per
    entry of labels, in the same order, and one column per retained user, so
    labels[i] always describes resultArray[i].
    '''
    userIds = Sections.distinct("user_id")

    labels = ["Number of sections", "% walk+bike trips",
              "% autoclassified", "% auto classified walk",
              "% auto classified bike", "% auto classified bus",
              "% auto classified train", "% auto classified car",
              "auto classified accuracy", "overall accuracy"]

    # One accumulator per label, declared in the same order as `labels`.
    userIdList = []
    numberOfSections = []
    percentWalkBikeSections = []
    percentAutoClassified = []
    percentAutoClassifiedWalk = []
    percentAutoClassifiedBike = []
    percentAutoClassifiedBus = []
    percentAutoClassifiedTrain = []
    percentAutoClassifiedCar = []
    autoClassifiedAccuracy = []
    overallAccuracy = []

    for userId in userIds:
        if not isTransportOnly:
            query = {"$and": [{'type': 'move'}, {'confirmed_mode': {'$ne': ''}}, {'user_id': userId}]}
        else:
            query = {"$and": [{'type': 'move'}, {'confirmed_mode': {'$ne': ''}}, {'mode': 4}, {'user_id': userId}]}

        (userFeatureMatrix, userResultVector) = generateFeatureMatrixAndResultVector(query)

        # We only focus on users who have enough history with us.  The query
        # selects confirmed sections, so that is what the message reports.
        if len(userResultVector) < minSections:
            print("Skipping user with userId %s who has %s confirmed sections" % (userId, len(userResultVector)))
            continue

        # Only query the walk/bike count for users that are actually kept.
        # NOTE(review): the mode ids are matched as strings here, whereas other
        # queries in this notebook compare confirmed_mode against integers
        # (e.g. {'confirmed_mode': 1}) - confirm which type the database stores.
        wbQuery = {"$and": [{'type': 'move'}, {'confirmed_mode': {'$in': ['1', '3', '7']}}, {'user_id': userId}]}
        walkBikeTripCount = Sections.find(wbQuery).count()

        forestClf = ensemble.RandomForestClassifier()
        # NOTE(review): 5 and 0.95 are presumably the number of folds and the
        # probability threshold for auto-classification - TODO confirm against
        # kFoldValidationWithProb.
        (pac5, pacm5, hcs5, s5) = kFoldValidationWithProb(forestClf, userFeatureMatrix, userResultVector, 5, 0.95)
        userIdList.append(userId)

        numberOfSections.append(len(userResultVector))
        percentWalkBikeSections.append(float(walkBikeTripCount)/len(userResultVector))
        percentAutoClassified.append(pac5.mean())
        percentAutoClassifiedWalk.append(pacm5[0].mean())
        percentAutoClassifiedBike.append(pacm5[1].mean())
        percentAutoClassifiedBus.append(pacm5[2].mean())
        percentAutoClassifiedTrain.append(pacm5[3].mean())
        percentAutoClassifiedCar.append(pacm5[4].mean())
        autoClassifiedAccuracy.append(hcs5.mean())
        overallAccuracy.append(s5.mean())

    # percentWalkBikeSections used to be computed but left out of the array,
    # which shifted every later row by one relative to `labels`.
    resultArray = np.array([numberOfSections, percentWalkBikeSections, percentAutoClassified,
                            percentAutoClassifiedWalk, percentAutoClassifiedBike,
                            percentAutoClassifiedBus, percentAutoClassifiedTrain,
                            percentAutoClassifiedCar, autoClassifiedAccuracy, overallAccuracy])
    # Guard the row/label correspondence that the plotting code relies on.
    assert resultArray.shape[0] == len(labels), "labels and resultArray rows are out of sync"
    print(resultArray.shape)
    return (userIdList, resultArray, labels)

In [None]:
def displayUserVariation(userIds, ra, labels):
    '''
    Draw per-user statistics as line plots on two vertically stacked axes.

    ra holds one row per statistic and one column per user, and labels[i] is
    used as the legend text for ra[i].  Rows 1, -2 and -1 are drawn on the
    upper axes and row 0 on the lower axes; every plotted row is also printed.
    Only the length of userIds is used (for the x positions).
    '''
    fig, (topAxes, bottomAxes) = plt.subplots(2, 1, figsize=(25, 25))
    xPositions = np.arange(len(userIds))

    def plotRow(targetAxes, rowIndex):
        # Echo the raw values, draw the row, then refresh the legend
        print(ra[rowIndex])
        targetAxes.plot(xPositions, ra[rowIndex], linewidth=2, label=labels[rowIndex])
        targetAxes.legend(loc='best')

    for rowIndex in (1, -2, -1):
        plotRow(topAxes, rowIndex)
    plotRow(bottomAxes, 0)

In [None]:
# Per-user comparison without the transport-only restriction.
(userIds, userResultArray, labels) = getUserModelComparison(isTransportOnly=False)

In [None]:
# Line plots of the per-user statistics computed in the previous cell.
displayUserVariation(userIds, userResultArray, labels)

In [None]:
# Rows of userResultArray (and the matching entries of labels) to chart.
selIndices = [2, -2, -1, 1]
# NOTE: this changes the font size globally for every later figure as well.
mpl.rcParams['font.size'] = 16
# The selected rows are multiplied by 100 before being charted (y axis label: "Percentage").
fig, axes = displayHelpers.showCategoryChart(np.arange(len(userIds)), np.multiply([userResultArray[i] for i in selIndices], 100), [labels[i] for i in selIndices],
                                             ['b', 'g', 'c', 'm', 'y', 'k', 'w'],
                                             "Percentage",
                                             "Accuracy of various predictions",
                                             figsize=(12,5), width=0.2)
# Second y axis sharing the same x axis, used for row 0 of userResultArray.
ax2 = axes.twinx()
# ax2.bar(np.arange(len(userResultArray[0])) + 3 * 0.2, userResultArray[0], 0.2, color = 'c')
nSecLine, = ax2.plot(userResultArray[0], 'r-+', linewidth=2, label="Number of confirmed sections")
# Head-room above 100% on the percentage axis.
axes.set_ylim(top=115)
# Horizontal reference lines at 98% and 85%, each with a text annotation.
axes.axhline(98, linewidth=2, label = "98%")
axes.axhline(85, linewidth=2, label = "85%")
# NOTE(review): x=15.5 is hard-coded; presumably just past the last user on
# the x axis - confirm if the number of users changes.
axes.text(x=15.5, y=98.5, s="98%")
axes.text(x=15.5, y=85.5, s="85%")
# Move the legend created by showCategoryChart below the plot area.
axes.get_legend().set_bbox_to_anchor((0.4, -0.1))
print(axes.get_legend_handles_labels())
# Separate legend for the line on the secondary axis, also below the plot.
ax2.legend([nSecLine], ["Number of sections"], loc="upper right", framealpha=0.3, bbox_to_anchor=(1, -0.1))
ax2.set_ylabel("Count (number of sections)")
axes.set_xlabel("Users")
ax2.set_ylim(bottom=0)
fig.show()

In [None]:
# Scatter plot of row 1 of userResultArray (x) against its last row (y),
# one point per user.  No axis labels are set.
fig, ax = plt.subplots()
ax.plot(userResultArray[1], userResultArray[-1], "o")

Let us pick users with really high accuracy. Unfortunately, they also have the least confirmed sections. Let us see whether this is because they are heavy walkers or otherwise.

In [None]:
# Display the ids at positions 5, 6 and 2 of userIds.
# NOTE(review): these positions are hard-coded and depend on the order in
# which users were returned - re-check them if the data changes.
userIds[5], userIds[6], userIds[2]

In [None]:
# For each of the selected users, drop the sections whose confirmed mode is
# 2, 4 or 8 as well as any mode with 10 or fewer instances, then cross-validate
# a random forest on what is left.
for i in (2, 5, 6):
    userId = userIds[i]
    query = {"$and": [{'type': 'move'}, {'confirmed_mode': {'$ne': ''}}, {'user_id': userId}]}
    userFeatureMatrix, userResultVector = generateFeatureMatrixAndResultVector(query)

    # Modes 2, 4 and 8 are always removed
    indicesToStrip = (userResultVector == 2) | (userResultVector == 4) | (userResultVector == 8)
    for mode in (1, 3, 5, 6, 7, 9):
        # nFolds = 5
        if np.count_nonzero(userResultVector == mode) > 10:
            continue
        # Too few examples of this mode for this user: report and remove them
        print("number of instances of mode %s = %s" % (mode, np.count_nonzero(userResultVector == mode)))
        indicesToStrip |= (userResultVector == mode)

    userStrippedIndices = ~indicesToStrip
    strippedUserFeatureMatrix = userFeatureMatrix[userStrippedIndices]
    strippedUserResultVector = userResultVector[userStrippedIndices]
    forestClf = ensemble.RandomForestClassifier()
    printConfusionMatrix(forestClf, strippedUserFeatureMatrix, strippedUserResultVector,
                         "All features, random forest, user %s" % i)

So there's quite a bit of variability in both the overall accuracy, and in the number of trips for a user. The two don't seem to be correlated though. We get some fairly uneven improvement - for some users, the general classification is over 90%. We are also able to classify over 80% of the trips for some users.

But that might just be due to a higher ratio of walk trips, which are classified more accurately. I can explore this only for transport, but first, I'm going to try to build a gesture library and build the associated features. Then maybe Mogeng can continue some of the exploration.

In [None]:
# Per-user comparison restricted to sections matched by the transport-only query.
(userIdsTransOnly, userResultArrayTransOnly, labelsTransOnly) = getUserModelComparison(isTransportOnly=True)

In [None]:
# Line plots of the transport-only per-user statistics.
displayUserVariation(userIdsTransOnly, userResultArrayTransOnly, labelsTransOnly)

So looking at transport-only trips, and focusing on users with enough transport history (50+ motorized transport trips), we are able to get an overall accuracy of around 70 - 80% even for the motorized trips. However, there are some clear outliers, like the one who has only 60% accuracy. Also, because our current threshold for high confidence is set so high, the high confidence predictions are > 95% correct as before. We have to decide what to use.

We can autoclassify 20 - 50% of the motorized transport trips. In general, this is related to the number of trips - there is a very clear spike in the data for user 4. But the correlation is not exact. In particular, user 5 has > 50 trips, but only ~ 10% autoclassified trips.

It might be worthwhile to take a closer look at these 6 users, see what their transport trips look like, and get a sense of what the difference between user 4 and user 5 is, for example. This might help us figure out how to build better user models.

In [None]:
def buildRouteLibrary(userId, threshold):
    '''
    Placeholder for building a per-user route library (not implemented).

    The body of this function currently consists solely of this docstring,
    so calling it does nothing and returns None; neither userId nor
    threshold is used.

    Here we attempt to build a route library for each user.
    Then, the probability of the top match can be a factor in our machine learning.
    Let us just start with the start and end points instead of a full dynamic time warp.

    Sketch of the intended implementation (kept as text, never executed):

    userSections = Sections.find({"$and": [{'type': 'move'}, {'confirmed_mode': {'$ne': ''}}, {'user_id': userId}]})
    existingRoutes = RouteLibrary()
    for section in userSections:
        existingRoutes.update(section)
    return existingRoutes
    '''

## A summary comparison of models

In [None]:
# Reset the list of confusion matrices; each model cell below appends one
# entry, in the order the models are evaluated.
modelCMList = []
# Output directory for the saved figures (trailing slash required because
# file names are appended with plain string concatenation).
saveDir = "/tmp/ml_results/"

In [None]:
import os
# exist_ok=True keeps this cell re-runnable when the directory already exists.
os.makedirs(saveDir, exist_ok=True)

### Generic model

In [None]:
# Random forest on the columns listed in genericFeatureIndices.  The confusion
# matrix is appended to modelCMList and the figure saved to saveDir.
# NOTE(review): the trailing 5 passed to exploreKFoldValidationSpace is
# presumably the number of folds - TODO confirm.
forestClf = ensemble.RandomForestClassifier()
exploreKFoldValidationSpace(forestClf, cleanedFeatureMatrix[:,genericFeatureIndices], cleanedResultVector, 5)
currCM, fig = printConfusionMatrix(forestClf, cleanedFeatureMatrix[:,genericFeatureIndices], cleanedResultVector, "Generic")
modelCMList.append(currCM)
fig.savefig(saveDir+"cm_generic.png", bbox_inches="tight")

### Generic + Advanced model

In [None]:
# Random forest on the generic + advanced feature columns.
# NOTE(review): assumes the *FeatureIndices objects are Python lists, so that
# `+` concatenates them rather than adding element-wise - TODO confirm.
forestClf = ensemble.RandomForestClassifier()
exploreKFoldValidationSpace(forestClf, cleanedFeatureMatrix[:,genericFeatureIndices+AdvancedFeatureIndices], cleanedResultVector, 5)
currCM, fig = printConfusionMatrix(forestClf, cleanedFeatureMatrix[:,genericFeatureIndices+AdvancedFeatureIndices], cleanedResultVector, "Generic + Advanced")
modelCMList.append(currCM)
fig.savefig(saveDir+"cm_generic_advanced.png", bbox_inches="tight")

### Generic + Advanced + Location model

In [None]:
# Random forest on the generic + advanced + location feature columns.
# NOTE(review): assumes the *FeatureIndices objects are Python lists, so that
# `+` concatenates them rather than adding element-wise - TODO confirm.
forestClf = ensemble.RandomForestClassifier()
exploreKFoldValidationSpace(forestClf, cleanedFeatureMatrix[:,genericFeatureIndices+AdvancedFeatureIndices
                                                            +LocationFeatureIndices], cleanedResultVector, 5)
currCM, fig = printConfusionMatrix(forestClf, cleanedFeatureMatrix[:,genericFeatureIndices+AdvancedFeatureIndices
                                                            +LocationFeatureIndices], cleanedResultVector, "Generic + Advanced + Location")
modelCMList.append(currCM)
fig.savefig(saveDir+"cm_generic_advanced_location.png", bbox_inches="tight")

### Generic + BusTrain model

In [None]:
# Random forest on the generic + bus/train feature columns.
# NOTE(review): assumes the *FeatureIndices objects are Python lists, so that
# `+` concatenates them rather than adding element-wise - TODO confirm.
forestClf = ensemble.RandomForestClassifier()
exploreKFoldValidationSpace(forestClf, cleanedFeatureMatrix[:,genericFeatureIndices+BusTrainFeatureIndices], cleanedResultVector, 5)
currCM, fig = printConfusionMatrix(forestClf, cleanedFeatureMatrix[:,genericFeatureIndices+BusTrainFeatureIndices], cleanedResultVector, "Generic + BusTrain")
modelCMList.append(currCM)
fig.savefig(saveDir+"cm_generic_bustrain.png", bbox_inches="tight")

### Generic + Advanced + BusTrain model

In [None]:
# Random forest on the generic + advanced + bus/train feature columns.
# NOTE(review): assumes the *FeatureIndices objects are Python lists, so that
# `+` concatenates them rather than adding element-wise - TODO confirm.
forestClf = ensemble.RandomForestClassifier()
exploreKFoldValidationSpace(forestClf, cleanedFeatureMatrix[:,genericFeatureIndices+AdvancedFeatureIndices+BusTrainFeatureIndices], cleanedResultVector, 5)
currCM, fig = printConfusionMatrix(forestClf, cleanedFeatureMatrix[:,genericFeatureIndices+AdvancedFeatureIndices+BusTrainFeatureIndices], cleanedResultVector, "Generic + Advanced + BusTrain")
modelCMList.append(currCM)
fig.savefig(saveDir+"cm_generic_advanced_bustrain.png", bbox_inches="tight")

### Generic + Advanced + BusTrain + Location model

In [None]:
# Random forest on the generic + advanced + bus/train + location feature columns.
# NOTE(review): assumes the *FeatureIndices objects are Python lists, so that
# `+` concatenates them rather than adding element-wise - TODO confirm.
forestClf = ensemble.RandomForestClassifier()
exploreKFoldValidationSpace(forestClf, cleanedFeatureMatrix[:,genericFeatureIndices+AdvancedFeatureIndices+BusTrainFeatureIndices
                                                            +LocationFeatureIndices], cleanedResultVector, 5)
currCM, fig = printConfusionMatrix(forestClf, cleanedFeatureMatrix[:,genericFeatureIndices+AdvancedFeatureIndices+BusTrainFeatureIndices
                                                            +LocationFeatureIndices], cleanedResultVector, "Gen + Adv + BusTrain + Loc")
modelCMList.append(currCM)
fig.savefig(saveDir+"cm_generic_advanced_bustrain_location.png", bbox_inches="tight")

### Generic + Advanced + BusTrain + Location + Time model

In [None]:
# Random forest on the full feature matrix (no column selection).  The
# confusion matrix is appended to modelCMList and the figure saved to saveDir.
forestClf = ensemble.RandomForestClassifier()
exploreKFoldValidationSpace(forestClf, cleanedFeatureMatrix, cleanedResultVector, 5)
currCM, fig = printConfusionMatrix(forestClf, cleanedFeatureMatrix, cleanedResultVector, "Gen + Adv + BusTrain + Loc + Time")
modelCMList.append(currCM)
fig.savefig(saveDir+"cm_generic_advanced_bustrain_location_time.png", bbox_inches="tight")

It seems that we get the best prediction results with all the features included.

In [None]:
# Stack the diagonal of every collected confusion matrix (times 100) into one
# matrix with a row per model.
# NOTE(review): reading the diagonal as per-class accuracy in percent assumes
# each confusion matrix is normalised per true class - TODO confirm.
cmDiagList = []
modelLabels = ["Generic", "G+A", "G+A+L", "G+B", "G+A+B", "G+A+B+L", "G+A+B+L+T"]
for currCM in modelCMList:
    cmDiagList.append(np.diag(currCM) * 100)
accuracyMatrix = np.vstack(cmDiagList)
# Sanity check: the number of labels should equal the number of rows.
print(len(modelLabels), accuracyMatrix.shape)
# Print at two precisions, then restore the previous print options so the
# precision=0 setting does not leak into every later cell of the notebook.
savedPrintOptions = np.get_printoptions()
try:
    np.set_printoptions(precision=4)
    print(accuracyMatrix)
    np.set_printoptions(precision=0)
    print(accuracyMatrix)
finally:
    np.set_printoptions(**savedPrintOptions)

In [None]:
# Build a printable table: one row per model, the model label in the first
# column followed by the rounded diagonal entries of its confusion matrix.
modelLabels = ["Generic", "G+A", "G+A+L", "G+B", "G+A+B", "G+A+B+L", "G+A+B+L+T"]
cmDiagList = [np.diag(cm) * 100 for cm in modelCMList]
# Object dtype lets the string labels share an array with the numbers
accuracyMatrix = np.round(np.vstack(cmDiagList), decimals=0).astype(object)
print(len(modelLabels), accuracyMatrix.shape)
outputMatrix = np.insert(accuracyMatrix, 0, modelLabels, axis=1)
print(outputMatrix)