In [1]:
from __future__ import absolute_import, division, print_function

# Run Kernel Density Estimate and GridSearch for bandwidth parameter

In [2]:
import copy
import glob
import json
import os
import pickle
import random
import re
import sys
import time
from collections import Counter

import googlemaps
import matplotlib as mpl
import matplotlib.path as mplPath
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import scipy as sp
import scipy.stats as ss
from sklearn.datasets.species_distributions import construct_grids
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors.kde import KernelDensity

%matplotlib inline


# Data

In [3]:
# Get CA coordinates
# The outline is read from ca_shape.csv only when x_coords / y_coords are not
# already defined in this kernel, so re-running the cell skips the file read.
# Only NameError is caught: a missing file or a missing column must surface
# instead of being silently swallowed.
try:
    x_coords
    y_coords
except NameError:
    infile = os.path.join('..', '..', 'data', 'ca_shape.csv')
    ca_shape = pd.read_csv(infile)
    x_coords = ca_shape['longitude'].tolist()
    y_coords = ca_shape['latitude'].tolist()


In [4]:
# import cleaned and pickled dataframe
# The pickle is read only when flickr_all_clean is not already defined in this
# kernel (the read is timed and reported). Only NameError is caught so that
# I/O or unpickling failures are not hidden.
# NOTE: unpickling executes arbitrary code -- only load pickles you created.
try:
    flickr_all_clean.shape
except NameError:
    start_time = time.time()
    flickr_all_clean = pd.read_pickle(
        os.path.join('..', '..', 'data', 'flickr_all_clean.df'))
    print("--- %s seconds ---" % (time.time() - start_time))


--- 29.890805006 seconds ---


In [63]:
# Subset the photos to rows whose `title_tags` value matches the keyword
# (rows with a missing `title_tags` count as non-matching).
keyword = 'football'
matches_keyword = flickr_all_clean['title_tags'].str.contains(keyword, na=False)
df = flickr_all_clean[matches_keyword]
print(df.shape)

(18104, 8)


# Run KDE

In [64]:
# Build the KDE training coordinates and the evaluation grid for the cutout.
start_time = time.time()

# Cutout bounding box as [lower, upper] bounds. Set both lists to [] to keep
# every photo with non-zero coordinates instead.
# (An earlier, tighter cutout was lng [-122.7, -121.8], lat [36.9, 38.3].)
limit_lng = [-123.194178, -121.375941]
limit_lat = [36.911135, 38.202246]

# Compute the row selection once and reuse it for both coordinate columns.
use_cutout = len(limit_lng) != 0 and len(limit_lat) != 0
if use_cutout:
    keep_rows = ((df['longitude'] > limit_lng[0]) &
                 (df['longitude'] < limit_lng[1]) &
                 (df['latitude'] > limit_lat[0]) &
                 (df['latitude'] < limit_lat[1]))
else:
    keep_rows = (df['longitude'] != 0.0) & (df['latitude'] != 0.0)

# Historical naming: yin holds the longitudes, xin holds the latitudes.
yin = np.array(df.loc[keep_rows, 'longitude'].tolist())
xin = np.array(df.loc[keep_rows, 'latitude'].tolist())

# build arrays: XY has columns (lng, lat); Xtrain has columns (lat, lng)
XY = np.vstack([yin.ravel(), xin.ravel()]).T
Xtrain = np.vstack([xin, yin]).T

# Grid extent. Index 0 of each limit list is the lower bound and index 1 the
# upper bound. Without a cutout the extent falls back to the selected data
# (previously this case failed with an IndexError on the empty limit lists).
if use_cutout:
    lng_min, lng_max = limit_lng[0], limit_lng[1]
    lat_min, lat_max = limit_lat[0], limit_lat[1]
else:
    lng_min, lng_max = yin.min(), yin.max()
    lat_min, lat_max = xin.min(), xin.max()

# Set up the data grid for the contour plot (100 x 100, ascending in both axes)
xgrid = np.linspace(lng_min, lng_max, 100)
ygrid = np.linspace(lat_min, lat_max, 100)
X, Y = np.meshgrid(xgrid, ygrid)

# Flattened grid points with columns (lat, lng), matching Xtrain's column order
xy = np.vstack([Y.ravel(), X.ravel()]).T

# NOTE(review): this cell does not build `bbPath`, the California outline path
# used by the land-mask step. It can be constructed from the outline
# coordinates as:
#   mplPath.Path(np.column_stack((x_coords, y_coords)), closed=True)


In [65]:
# Sanity check: sizes of the evaluation grid and of the training set.
for coords in (xy, Xtrain):
    print(coords.shape)

(10000, 2)
(17798, 2)


In [66]:
# Create land mask for input data
patch_time = time.time()

# Polygon of the California outline with (longitude, latitude) vertices.
bbPath = mplPath.Path(np.column_stack((x_coords, y_coords)), closed=True)

# Test all training points against the outline in one vectorized call.
# The points are rebuilt from xin (latitudes) / yin (longitudes) in degrees,
# so re-running this cell always yields the same result instead of masking
# and rescaling an already converted Xtrain.
land_mask_input = bbPath.contains_points(np.column_stack((yin, xin)))
print("--- Masking: %s seconds ---" % (time.time() - patch_time))

# Keep only points inside the outline, as (lat, lng) converted to radians.
Xtrain = np.deg2rad(np.column_stack((xin, yin))[land_mask_input])


--- Masking: 8.95744299889 seconds ---


In [67]:
# Show the land mask and the number of True entries in it.
print(land_mask_input)
print(np.count_nonzero(land_mask_input))

[ True  True  True ...,  True  True  True]
17761


In [68]:
# Peek at the first training row after masking and conversion to radians.
first_point = Xtrain[0]
print(first_point)

[ 0.66096772 -2.13366316]


In [69]:
# Size of the training set that remains after land masking.
print(np.shape(Xtrain))

(17761, 2)


# Run GridSearch

In [72]:

# Grid-search the KDE bandwidth with cross-validation and report the scores.
patch_time = time.time()

# Candidate bandwidths. They are in the same units as Xtrain.
parameters = {'bandwidth': [0.001, 0.002, 0.003, 0.004, 0.005]}

# NOTE(review): no `metric` is passed, so distances use KernelDensity's
# default metric. If Xtrain holds (lat, lng) in radians, metric='haversine'
# may be the intended choice -- confirm before relying on the bandwidth.
kde = KernelDensity(algorithm='ball_tree', rtol=1e-4)

# No `scoring` or `cv` is given: candidates are ranked with the estimator's
# own score method under GridSearchCV's default cross-validation splits.
clf = GridSearchCV(kde, parameters)
clf.fit(Xtrain)

print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()
print("Grid scores on development set:")
print()
# One line per candidate: mean test score (+/- two standard deviations).
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))
print()
print("--- GridSearch: %s seconds ---" % (time.time() - patch_time))


Best parameters set found on development set:

{'bandwidth': 0.004}

Grid scores on development set:

30055.297 (+/-25925.800) for {'bandwidth': 0.001}
44653.751 (+/-8374.583) for {'bandwidth': 0.002}
46922.539 (+/-4741.707) for {'bandwidth': 0.003}
46931.656 (+/-3284.146) for {'bandwidth': 0.004}
46343.560 (+/-2402.643) for {'bandwidth': 0.005}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

--- GridSearch: 139.628803968 seconds ---


In [None]:
# Grid scores recorded from a previous GridSearch run, kept for reference.
# They are stored as comments so that this cell remains valid Python.
# 30055.297 (+/-25925.800) for {'bandwidth': 0.001}
# 44653.751 (+/-8374.583) for {'bandwidth': 0.002}
# 46922.539 (+/-4741.707) for {'bandwidth': 0.003}
# 46931.656 (+/-3284.146) for {'bandwidth': 0.004}
# 46343.560 (+/-2402.643) for {'bandwidth': 0.005}
