In [1]:
# This ensures Plotly output works in multiple places:
# plotly_mimetype: VS Code notebook UI
# notebook: "Jupyter: Export to HTML" command in VS Code
# See https://plotly.com/python/renderers/#multiple-renderers
import plotly.io as pio
pio.renderers.default = "plotly_mimetype+notebook"

In [2]:
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)  

In [32]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots
from sklearn.cluster import KMeans, AgglomerativeClustering
import random
import time
from geopy.distance import geodesic

In [4]:
from sklearn.metrics import silhouette_score

# Clustering of pokemon spawns

### What's in this notebook:
#### EDA
- plot different pokemon spawns on map, visually analyse patterns

#### K-means clustering
- cluster spawns by latitude and longitude
(to-do: use different methods to find optimal number of clusters)
- clutser pokemons based on occurences in clusters


*Potential analysis: Travelling Salesman Problem*


### About the data

The dataset contains Pokemon Go spawns in the San Francisco Bay Area over a three-day period from 26th to 28th Jul 2016. It jas the following fields: 
* num: number of spawn instances
* name: name of the pokemon
* lat & lng: latitude and longitude of the spawn
* encounter_ms: unix timestamp of the encounter
* disappear_ms: unix timestamp of when it disappears

Data source: Kaggle https://www.kaggle.com/datasets/kveykva/sf-bay-area-pokemon-go-spawns

## Data cleaning

In [5]:
df_complete = pd.read_csv('data/pokemon-spawns.csv')

### Sense check latitudes and longitudes

In [6]:
print('Latitude: \n', df_complete['lat'].describe(), '\n')
print('Longitude: \n', df_complete['lng'].describe(), '\n')

Latitude: 
 count    314105.000000
mean         37.291383
std           1.918667
min          32.188748
25%          37.322103
50%          37.615938
75%          37.788990
max          43.164284
Name: lat, dtype: float64 

Longitude: 
 count    314105.000000
mean        -95.059814
std          73.256881
min        -122.611804
25%        -122.337013
50%        -122.106334
75%        -118.260931
max         139.840451
Name: lng, dtype: float64 



### Detect outliers
It appears that some spawns are outside of the SF Bay Area. To identify outliers, I tried two methods: 
1. z-scores < 3: z-score measures the number of standard deviations a data point is away from the mean. Outliers are any data points with z-score > 3. This is based on the statistical rule that for a normal distribution, 99.7% of all observed data would fall within 3 standard deviations of the mean. 
<br>
2. inter-quartile range (IQR): the inter-quartile range refers to the difference between the 3rd and 1st quartile. This method considers data points 1.5 IQR above 3rd quartile or 1.5 IQR below 1st quartile to be the outlier, which is mathematically comparable to the z-score < 3 method*. 

    <i>\* By definition, 50% of data lies between 1st and 3rd quartile. With a normal distribution, ~50% of data lies between +/- 0.7 standard deviations. This means that the IQR is roughly comparable to to 0.7 x 2 = 1.4 standard deviations, and the lower/upper bounds are defined such that they are around 3 standard deviations from the mean (0.7 + 1.5 x 1.4 = 2.8).</i> 

**1. Z-Score**

In [9]:
from scipy import stats
df_complete['lat_zscore'] = np.abs(stats.zscore(df_complete['lat']))
df_complete['lng_zscore'] = np.abs(stats.zscore(df_complete['lng']))

In [10]:
print('Latitude (z-score < 3): \n', df_complete[df_complete['lat_zscore']<3]['lat'].describe(), '\n')
print('Longitude (z-score < 3): \n', df_complete[df_complete['lng_zscore']<3]['lng'].describe(), '\n')

Latitude (z-score < 3): 
 count    306444.000000
mean         37.145566
std           1.703384
min          32.188748
25%          37.315651
50%          37.598394
75%          37.779847
max          40.818783
Name: lat, dtype: float64 

Longitude (z-score < 3): 
 count    287022.000000
mean       -117.214987
std          13.420221
min        -122.611804
25%        -122.371368
50%        -122.159748
75%        -121.901929
max         -73.357929
Name: lng, dtype: float64 



**2. Inter-Quartile Range**

In [11]:
q1 = np.percentile(df_complete['lat'], 25, method='midpoint')
q3 = np.percentile(df_complete['lat'], 75, method='midpoint')
iqr = q3 - q1
lat_lower_bound = q1-1.5*iqr
lat_upper_bound = q3+1.5*iqr

q1 = np.percentile(df_complete['lng'], 25, method='midpoint')
q3 = np.percentile(df_complete['lng'], 75, method='midpoint')
iqr = q3 - q1

lng_lower_bound = q1-1.5*iqr
lng_upper_bound = q3+1.5*iqr

In [12]:
print('Latitude (IQR rule): \n', df_complete[(df_complete['lat']>lat_lower_bound)
                                               &(df_complete['lat']<lat_upper_bound)]['lat'].describe(), '\n')
print('Latitude (IQR rule): \n', df_complete[(df_complete['lng']>lng_lower_bound)
                                               &(df_complete['lng']<lng_upper_bound)]['lng'].describe(), '\n')


Latitude (IQR rule): 
 count    222939.000000
mean         37.635558
std           0.196614
min          36.621860
25%          37.477212
50%          37.682257
75%          37.784168
max          37.998836
Name: lat, dtype: float64 

Latitude (IQR rule): 
 count    248309.000000
mean       -121.889323
std           1.114851
min        -122.611804
25%        -122.403057
50%        -122.210387
75%        -121.996139
max        -118.164244
Name: lng, dtype: float64 



Both the z-score and IQR methods resulted in a data set that contains points significantly outside the SF Bay Area. So in the end, I simply took the border coordinates of the SF Bay Area since that was the author's intended scope of the data set. 

In [13]:
df = df_complete[(df_complete['lng']>=-122.6445)
                &(df_complete['lng']<-121.5871)
                &(df_complete['lat']>=37.1897)
                &(df_complete['lat']<38.2033)
                ]

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 221979 entries, 0 to 314104
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   s2_id         221979 non-null  int64  
 1   s2_token      221979 non-null  object 
 2   num           221979 non-null  int64  
 3   name          221979 non-null  object 
 4   lat           221979 non-null  float64
 5   lng           221979 non-null  float64
 6   encounter_ms  221979 non-null  int64  
 7   disppear_ms   221979 non-null  int64  
 8   lat_zscore    221979 non-null  float64
 9   lng_zscore    221979 non-null  float64
dtypes: float64(4), int64(4), object(2)
memory usage: 18.6+ MB


Then i converted pokemon name field to dummy variables (for analysing patterns later)

In [14]:
pokemon_dummies = pd.get_dummies(df['name'], prefix='pkm')
df = pd.concat([df, pokemon_dummies], axis=1).dropna()

In [15]:
df['spawn_latlng'] = list(zip(df['lat'], df['lng']))

## Exploratory Data Analysis

### Scatter map for all pokemon species
First I will plot the data on a map to check if there are any visible clusters. It looks like there are many small clusters and that are arbitrarily shaped (e.g. the clusters along the waterfront appears to be elongated whereas the inland clusters are more spherical. 

In [16]:
# the mapbox access token can be created for free by registering on mapbox. the token allows access to more base map options
with open('../mapboxtoken.txt', 'r') as file:
    mapboxtoken = file.read()

In [17]:
# set the GPS coordinates for map centre
map_centre = {'lat':37.6965, 'lon':-122.1158}

In [19]:
# the data set has >300k data points, which makes plotting very slow
# therefore I selected a random sample of 10% of the points to plot on the map
random.seed(42)
plot_sample_indices = random.sample(df.reset_index().index.tolist(), int(len(df)/100))
df_plot_sample = df.iloc[plot_sample_indices].copy(deep=True)

In [20]:
fig = go.Figure()
fig.add_trace(go.Scattermapbox( lat=df_plot_sample['lat'],  
                                lon=df_plot_sample['lng'], 
                                mode='markers',
                                marker=go.scattermapbox.Marker(size=df_plot_sample['num']/20, 
                                                              opacity=.7)
                              ))
# update mapbox style and layout
fig.update_mapboxes(
    domain=dict(column=2),
    style='mapbox://styles/mapbox/light-v10',
    zoom=9.5,
    center={'lat':37.6965, 'lon':-122.30}
)

fig.update_layout(height=600, margin_t=50, mapbox_accesstoken=mapboxtoken, showlegend=False)

fig.show()

### Scatter map by species
Besides clustering the spawns, I am also interested to see whether certain pokemon species are more concentrated in some areas but not others. So I created scatter maps for a few relatively common species. 

*I used a different data visualisation package called Folium (I used Plotly for the previous chart) purely for aesthetic reason because Folium allows me to replace circular markers with pokemon icons.*

As the maps below show, the different pokemon species do have different geographic distributions. Pikachus are more concentrated in the south (Palo Alto, Mountain View), whereas Squirtle and Bulbasaur are more frequently found in the north of San Mateo. Jigglypuff is most evenly distributed among the four species selected. 

In [25]:
top_50_pokemons = df['name'].str.lower().value_counts().head(50).index.tolist()

In [33]:
df_plot_sample_pkm_dict = {}
for pkm in top_50_pokemons:#'pikachu', 'squirtle', 'bulbasaur', 'jigglypuff', 'voltorb', 'krabby', 'staryu']:
    df_plot_sample_pkm_dict[pkm]= df_plot_sample[df_plot_sample['name'].str.lower()==pkm] 
    print(pkm, len(df_plot_sample_pkm_dict[pkm]))

pidgey 342
zubat 277
rattata 209
spearow 127
weedle 87
paras 94
ekans 77
eevee 53
doduo 54
caterpie 48
magikarp 46
venonat 50
nidoran♂ 40
nidoran♀ 29
mankey 34
growlithe 40
meowth 36
clefairy 38
poliwag 27
krabby 39
staryu 29
goldeen 24
pidgeotto 20
oddish 17
bellsprout 13
psyduck 22
sandshrew 12
geodude 17
bulbasaur 23
cubone 12
pikachu 15
diglett 17
pinsir 8
rhyhorn 20
horsea 11
tentacool 5
abra 14
jigglypuff 14
exeggcute 15
machop 6
magnemite 4
voltorb 12
golbat 7
ponyta 6
slowpoke 10
raticate 5
squirtle 10
gastly 2
kakuna 2
dratini 7


In [54]:
df_median_distances = pd.DataFrame()
df_mean_distances = pd.DataFrame()

for i, pkm in enumerate(top_50_pokemons):
    df_ = df_plot_sample_pkm_dict[pkm]
    df_median_distances[pkm] = [None]*49
    df_mean_distances[pkm] = [None]*49
    for j, pkm2 in enumerate(top_50_pokemons[i+1:]):
        array_of_min_distances = df_['spawn_latlng'].apply(lambda x: min([geodesic(x, y).kilometers
                                                                          for y in df_plot_sample_pkm_dict[pkm2]['spawn_latlng'].tolist()]))
        df_['dist_'+pkm2] = array_of_min_distances
        df_median_distances.loc[i+j, pkm] = array_of_min_distances.median()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

In [49]:
df_median_distances.iloc[4]['pidgey']

1.9763337171206992

In [53]:
df_median_distances.loc[4,'pidgey'] = 2
df_median_distances.iloc[4]['pidgey'] 

2

In [28]:
import folium.plugins

m = folium.plugins.DualMap(location=[37.6965,-122.1158], tiles="Cartodb Positron", zoom_start=9)

for i in range(len(df_plot_sample_pkm_dict['pikachu'])-1):#stations['features']:
    lon, lat = df_plot_sample_pkm_dict['pikachu'].iloc[i]['lng'], df_plot_sample_pkm_dict['pikachu'].iloc[i]['lat']
    icon_path = '../static/img/pkm-pikachu.ico'
    icon = folium.features.CustomIcon(icon_path,
                                      icon_size=(20, 20))
    marker = folium.map.Marker([lat, lon], icon=icon,
                              )
    m.m1.add_child(marker)

for i in range(len(df_plot_sample_pkm_dict['squirtle'])-1):#stations['features']:
    lon, lat = df_plot_sample_pkm_dict['squirtle'].iloc[i]['lng'], df_plot_sample_pkm_dict['squirtle'].iloc[i]['lat']
    icon_path = '../static/img/pkm-squirtle.ico'
    icon = folium.features.CustomIcon(icon_path,
                                      icon_size=(20, 20))
    
    marker = folium.map.Marker([lat, lon], icon=icon,
                              )
    m.m2.add_child(marker)
    
for i in range(len(df_plot_sample_pkm_dict['voltorb'])-1):#stations['features']:
    lon, lat = df_plot_sample_pkm_dict['voltorb'].iloc[i]['lng'], df_plot_sample_pkm_dict['voltorb'].iloc[i]['lat']
    icon_path = '../static/img/pkm-voltorb.ico'
    icon = folium.features.CustomIcon(icon_path,
                                      icon_size=(20, 20))
    
    marker = folium.map.Marker([lat, lon], icon=icon,
                              )
    m.m2.add_child(marker)

In [29]:
n = folium.plugins.DualMap(location=[37.6965,-122.1158], tiles="Cartodb Positron", zoom_start=9)

for i in range(len(df_plot_sample_pkm_dict['bulbasaur'])-1):#stations['features']:
    lon, lat = df_plot_sample_pkm_dict['bulbasaur'].iloc[i]['lng'], df_plot_sample_pkm_dict['bulbasaur'].iloc[i]['lat']
    icon_path = '../static/img/pkm-bulbasaur.ico'
    icon = folium.features.CustomIcon(icon_path,
                                      icon_size=(20, 20))
    marker = folium.map.Marker([lat, lon], icon=icon,
                              )
    n.m1.add_child(marker)
    
    
for i in range(len(df_plot_sample_pkm_dict['staryu'])-1):#stations['features']:
    lon, lat = df_plot_sample_pkm_dict['staryu'].iloc[i]['lng'], df_plot_sample_pkm_dict['staryu'].iloc[i]['lat']
    icon_path = '../static/img/pkm-staryu.ico'
    icon = folium.features.CustomIcon(icon_path,
                                      icon_size=(20, 20))
    marker = folium.map.Marker([lat, lon], icon=icon,
                              )
    n.m1.add_child(marker)
    
for i in range(len(df_plot_sample_pkm_dict['krabby'])-1):#stations['features']:
    lon, lat = df_plot_sample_pkm_dict['krabby'].iloc[i]['lng'], df_plot_sample_pkm_dict['krabby'].iloc[i]['lat']
    icon_path = '../static/img/pkm-krabby.ico'
    icon = folium.features.CustomIcon(icon_path,
                                      icon_size=(20, 20))
    marker = folium.map.Marker([lat, lon], icon=icon,
                              )
    n.m1.add_child(marker)

for i in range(len(df_plot_sample_pkm_dict['jigglypuff'])-1):#stations['features']:
    lon, lat = df_plot_sample_pkm_dict['jigglypuff'].iloc[i]['lng'], df_plot_sample_pkm_dict['jigglypuff'].iloc[i]['lat']
    icon_path = '../static/img/pkm-jigglypuff.ico'
    icon = folium.features.CustomIcon(icon_path,
                                      icon_size=(20, 20))
    
    marker = folium.map.Marker([lat, lon], icon=icon,
                              )
    n.m2.add_child(marker)

In [30]:
print('L: Pikachu, R:Squirtle')
m

L: Pikachu, R:Squirtle


In [31]:
print('L: Bulbasaur, R: Jigglypuff')
n

L: Bulbasaur, R: Jigglypuff


In [547]:
df_bulbasaur = df_plot_sample[df_plot_sample['name']=='Bulbasaur']

df_bulbasaur['dist_staryu'] = df_bulbasaur['spawn_latlng'].apply(lambda x: min([geodesic(x, staryu).kilometers for staryu in df_staryu['spawn_latlng'].tolist()]))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [550]:
df_bulbasaur['dist_jigglypuff'] = df_bulbasaur['spawn_latlng'].apply(lambda x: min([geodesic(x, jigglypuff).kilometers for jigglypuff in df_jigglypuff['spawn_latlng'].tolist()]))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [551]:
df_bulbasaur['dist_pikachu'] = df_bulbasaur['spawn_latlng'].apply(lambda x: min([geodesic(x, pikachu).kilometers for pikachu in df_pikachu['spawn_latlng'].tolist()]))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [554]:
for pkm in ['staryu', 'jigglypuff', 'pikachu']:
    print(pkm, df_bulbasaur['dist_'+pkm].median())

staryu 0.8456425849850513
jigglypuff 1.6657160318251796
pikachu 4.016245204207474


In [None]:
# heatmap and scatter maps created with plotly
# fig = make_subplots(rows=3, cols=2, column_widths=[0.1, 0.1],vertical_spacing=0.1, horizontal_spacing=0.1,
#                     subplot_titles=('Pikachu Scatter', 'Pikachu Density', 
#                                    'Squirtle Scatter', 'Squirtle Density', 
#                                    'Bulbasaur Scatter', 'Bulbasaur Density'),
#                     specs=[[{'type':'mapbox'}]*2,  
#                            [{'type':'mapbox'}]*2,
#                            [{'type':'mapbox'}]*2,
#                           ])

# df__ = df[df['name']=='Pikachu']
# fig.add_trace(go.Scattermapbox( lat=df__['lat'],  
#                                 lon=df__['lng'], 
#                                 mode='markers',
#                                 marker=go.scattermapbox.Marker(size=5,
#                                                               opacity=.7),
#                                ),
#                             row=1, col=1)

# fig.add_trace(go.Densitymapbox(lat=df__['lat'],  
#                                lon=df__['lng'], 
#                                radius=3,
#                               ),
#                                row=1, col=2
#                                )

# df__ = df[df['name']=='Squirtle']

# fig.add_trace(go.Scattermapbox( lat=df__['lat'],  
#                                 lon=df__['lng'], 
#                                 mode='markers',
#                                 marker=go.scattermapbox.Marker(size=5, 
#                                                               opacity=1),
#                               ),
#                             row=2, col=1)

# fig.add_trace(go.Densitymapbox(lat=df__['lat'],  
#                                lon=df__['lng'], 
#                                radius=3,
#                               ),
#                                row=2, col=2
#                                )

# df__ = df[df['name']=='Bulbasaur']

# fig.add_trace(go.Scattermapbox( lat=df__['lat'],  
#                                 lon=df__['lng'], 
#                                 mode='markers',
#                                 marker=go.scattermapbox.Marker(size=5, 
#                                                               opacity=1),
#                               ),
#                             row=3, col=1)

# fig.add_trace(go.Densitymapbox(lat=df__['lat'],  
#                                lon=df__['lng'], 
#                                radius=3,
#                               ),
#                                row=3, col=2
#                                )



# # update mapbox style and layout
# fig.update_mapboxes(
#     domain=dict(column=2),
#     style='open-street-map', #'mapbox://styles/mapbox/light-v10',
#     zoom=7,
#     center=map_centre,
# )

# fig.update_layout(height=1000, margin_t=50, showlegend=False)

# fig.show()

## K-Means Clustering
K-Means clustering is an unsupervised learning algorithm that partitions the data into K non-overlapping subgroups. It takes an iterative approach to find the optimal clustering that minimizes the within-cluster sum of squares (sum of squared distances between each data point and the centroid of its assigned cluster.

Step 1 - select K random points as initial centroids \
Step 2 - calculate the Euclidean distance between each point and each centroid, and assign each point to its nearest centroid \
Step 3 - update centroids by calculating the mid point of each cluster \
Step 4 - repeat steps 2 and 3 until no changes

To determine the optimal number of clusters, I will explore two methods: 
1. Elbow method
2. Silhouette score

In [None]:
# start = time.time()
# df_lat_lng = np.array(df[['lat', 'lng']]).tolist()
# kmeans = KMeans(n_clusters=50, random_state=42, n_init="auto").fit(df_lat_lng)
# df_centroids = pd.DataFrame(kmeans.cluster_centers_, columns=['lat', 'lng'])
# df_centroids['size'] = pd.Series(kmeans.labels_).value_counts()
# print('{:,.0f}s'.format(time.time() - start))

# # reset index as cluster_id
# df_centroids = df_centroids.reset_index(names='cluster_id')
# df_centroids

# # label each spawn with cluster id
# df['cluster_id'] = kmeans.labels_

# df['centroid_lat'] = df['cluster_id'].map(df_centroids.set_index('cluster_id')['lat'])
# df['centroid_lng'] = df['cluster_id'].map(df_centroids.set_index('cluster_id')['lng'])
# df['centroid_latlng'] = list(zip(df['centroid_lat'], df['centroid_lng']))
# df['spawn_latlng'] = list(zip(df['lat'], df['lng']))

### Elbow method
The Elbow method is a graphical technique which involves plotting the within-cluster sum of squares (WCSS) against the number of clusters. The WCSS measures the sum of squared distances (geodesic distance is used instead of Euclidean because we are clustering GPS coordinates). The idea is to choose the number of clusters at which the decrease in WCSS slows (diminishing returns to additional clusters set in). 

In [57]:
from scipy.spatial import distance
from geopy.distance import geodesic
df_lat_lng = np.array(df[['lat', 'lng']]).tolist()

In [58]:
wcss

{}

In [59]:
# within-cluster sum of squares / WCSS (sum of the distances between each point and the centroid of the cluster it belongs to)
start = time.time()
wcss = {}
kmeans_dict = {}
df_centroids_dict = {}


for i in np.arange(10, 101, 10):
# for i in np.arange(5, 101, 10):
# for i in [2]:
    if i not in wcss.keys():
        kmeans = KMeans(n_clusters=i, random_state=42, n_init="auto").fit(df_lat_lng)
        kmeans_dict[i] = kmeans

        df_centroids = pd.DataFrame(kmeans.cluster_centers_, columns=['lat', 'lng']).reset_index(names='cluster_id')
        df_centroids['size'] = pd.Series(kmeans.labels_).value_counts()
        df_centroids_dict[i] = df_centroids

        df['cluster_id_'+str(i)] = kmeans.labels_
        df['centroid_lat_'+str(i)] = df['cluster_id_'+str(i)].map(df_centroids.set_index('cluster_id')['lat'])
        df['centroid_lng_'+str(i)] = df['cluster_id_'+str(i)].map(df_centroids.set_index('cluster_id')['lng'])
        df['centroid_latlng_'+str(i)] = list(zip(df['centroid_lat_'+str(i)], df['centroid_lng_'+str(i)]))
        df['geodesic_dist_'+str(i)] = df.apply(lambda x: geodesic(x['spawn_latlng'], 
                                                                           x['centroid_latlng_'+str(i)]).kilometers, axis=1)

        wcss[i] = df['geodesic_dist_'+str(i)].apply(lambda x: x**2).sum()

        print(i, 'clusters completed')
print('{:,.0f}s'.format(time.time() - start))

10 clusters completed
20 clusters completed
30 clusters completed
40 clusters completed
50 clusters completed
60 clusters completed
70 clusters completed
80 clusters completed
90 clusters completed
100 clusters completed
1,865s


The graph suggests that 30 is the optimal number of clusters as that is the point where the slope starts to decrease slower. 

In [60]:
df_ = pd.DataFrame(wcss, index=['wcss']).transpose().reset_index(names='clusters')
fig = go.Figure()
fig.add_trace(go.Scatter(x=df_['clusters'], y=df_['wcss'], mode='lines+markers'))
fig.update_layout(plot_bgcolor='rgba(0, 0, 0, 0)',
                  title='Elbow method',
                  xaxis=dict(title='Number of clusters'),
                  yaxis=dict(title='total within sum of squares'),
                  height=400, width=600
                 )
fig.show()

The 'Kneedle' algorithm also finds 30 as the point where the curve starts to flatten. \
<br>
*Kneedle algorithm was publisehd by Satopää, Albrecht, Irwin, and Raghavan (2011) [https://raghavan.usc.edu/papers/kneedle-simplex11.pdf]

In [61]:
from kneed import KneeLocator
kn = KneeLocator(df_['clusters'], y=df_['wcss'], curve='convex', direction='decreasing')
print(kn.knee)

30


### Silhouette score
The silhouette score is a measure of how similar an object is to its own cluster compared to other clusters. The method involves calculating for each data point:
1. intra-cluster distance (ICD) = the avgerage distance between the point and all other points in its cluster
2. nearest-cluster distance (NCD) = the average distance between the point and all points in the nearest neighbouring cluster
3. silhouette score = (NCD-ICD) / max(ICD, NDC)

The optimal number of clusters is the one that resulted in the highest average silouette score across all points. 

As shown below, the highest avearge silhouette score is 0.55, from splitting the data into two clusters. This contradicts with the Elbow method which found 20 to be the optimal number of clusters. Since k-means clustering responds poorly to non-convex shaped (e..g elongated) clusters, I will attempt a density-based clustering method in the next part. 

In [None]:
# score = silhouette_score(df_lat_lng, kmeans.labels_, metric='euclidean')

In [371]:
geodesic_scores_dict = {}
for i in kmeans_dict.keys(): #np.arange(10, 101, 10):
    print(i)
    start = time.time()
    geodesic_score = silhouette_score(df_lat_lng, kmeans_dict[i].labels_, metric="haversine", sample_size=3000)
    geodesic_scores_dict[i] = geodesic_score
    print('{:,.0f}s'.format(time.time() - start))

10
2s
20
2s
30
2s
40
2s
50
2s


In [372]:
pd.DataFrame(geodesic_scores_dict, index=['silhouette_score_total']).transpose().reset_index(names='clusters').sort_values('clusters')

Unnamed: 0,clusters,silhouette_score_total
0,10,0.445521
1,20,0.408463
2,30,0.368117
3,40,0.364196
4,50,0.35447


## DBSCAN

DBSCAN is another unsupervised algorithm that identifies distinctive clusters in the data. Unlike K-Means, which is based on the concept of centroids, DBSCAN views clusters as regions of high density. The algorithm takes two parameters - the minimum sample size (n) and epsilon (eps). DBSCAN separates the data set into core samples, non-core samples, and outliers. 

Core sample: a data point that has at least n other points within the distance of eps \
Non-core sample: a data point that is the neighbour of a core sample but not a core sample itself \
Outliers: a sample that is not a neighbour of any other core sample


In [62]:
from sklearn.cluster import DBSCAN

In [63]:
sample_indices = random.sample(df.reset_index().index.tolist(), 10000)
df_sample = df.iloc[sample_indices].copy(deep=True)

In [64]:
df_sample_lat_lng =  np.array(df_sample[['lat', 'lng']]).tolist()

In [589]:
db = DBSCAN(eps=0.007, min_samples=10, metric='haversine', n_jobs=-1).fit(df_sample_lat_lng)


In [591]:
df_sample['cluster_id_db'] = db.labels_


In [592]:
df_sample['cluster_id_db'].nunique()

103

In [593]:
colors = (px.colors.qualitative.Pastel1[:-1]
          +px.colors.qualitative.Set3
          +px.colors.qualitative.Pastel2
          +px.colors.qualitative.Set1
          +px.colors.qualitative.Set2
          +px.colors.qualitative.Antique
          +px.colors.qualitative.Bold
          +px.colors.qualitative.Prism
          +px.colors.qualitative.Pastel
           +px.colors.qualitative.Pastel1
          +px.colors.qualitative.Vivid
          +px.colors.qualitative.Safe
      
          
          +['rgb(0,0,0)'])
df_sample['cluster_color'] = df_sample['cluster_id_db'].apply(lambda x: colors[x])

In [594]:
df_sample['cluster_id_db'].value_counts()

 0     1956
-1     1913
 1     1024
 8      703
 6      447
       ... 
 85       7
 89       7
 96       6
 95       6
 90       4
Name: cluster_id_db, Length: 103, dtype: int64

Frequently occurring pokemons

In [543]:
df_sample_core = df_sample.iloc[db.core_sample_indices_].copy(deep=True)

In [544]:
gt_1000 = df_sample_core[pokemon_dummy_cols].sum()>10
gt_1000.sum()

52

In [545]:
'pkm_Staryu' in gt_1000

True

In [596]:
df_db_clusters = df_sample_core.groupby('cluster_id_db').agg({'num':'sum'}).reset_index()
df_db_clusters

# add pokemon spawn counts to the cluster summary
pokemon_dummy_cols = [col_name for col_name in df.columns.tolist() if 'pkm' in col_name]
df_db_cluster_pokemons = df_sample_core.groupby('cluster_id_db').agg({pkm:'sum' for pkm in pokemon_dummy_cols}).reset_index()
df_db_clusters = df_db_clusters.merge(df_db_cluster_pokemons, on='cluster_id_db')

In [597]:
df_ = df_db_clusters[pokemon_dummy_cols]#[[col for col in pokemon_dummy_cols if gt_1000[col]]]
df_corr = df_.corr().round(2)
df_corr

Unnamed: 0,pkm_Abra,pkm_Aerodactyl,pkm_Alakazam,pkm_Arbok,pkm_Arcanine,pkm_Beedrill,pkm_Bellsprout,pkm_Bulbasaur,pkm_Butterfree,pkm_Caterpie,...,pkm_Venusaur,pkm_Vileplume,pkm_Voltorb,pkm_Vulpix,pkm_Wartortle,pkm_Weedle,pkm_Weepinbell,pkm_Weezing,pkm_Wigglytuff,pkm_Zubat
pkm_Abra,1.00,,,-0.05,,0.33,0.43,0.46,-0.04,0.44,...,,,0.36,0.41,0.19,0.42,0.50,,-0.04,0.52
pkm_Aerodactyl,,,,,,,,,,,...,,,,,,,,,,
pkm_Alakazam,,,,,,,,,,,...,,,,,,,,,,
pkm_Arbok,-0.05,,,1.00,,-0.02,-0.03,-0.03,-0.02,-0.04,...,,,-0.03,-0.04,-0.02,-0.03,-0.02,,-0.02,-0.03
pkm_Arcanine,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
pkm_Weedle,0.42,,,-0.03,,0.96,0.99,0.98,0.53,0.98,...,,,0.75,0.60,0.84,1.00,0.82,,-0.02,0.91
pkm_Weepinbell,0.50,,,-0.02,,0.70,0.82,0.91,-0.01,0.86,...,,,0.78,0.73,0.44,0.82,1.00,,-0.01,0.98
pkm_Weezing,,,,,,,,,,,...,,,,,,,,,,
pkm_Wigglytuff,-0.04,,,-0.02,,-0.02,-0.02,-0.02,-0.01,-0.02,...,,,-0.02,-0.03,-0.02,-0.02,-0.01,,1.00,-0.02


In [607]:
mask = np.triu(np.ones_like(df_corr, dtype=bool))
df_corr_viz = df_corr.mask(mask)


corr_dict = {}
for col in [col for col in pokemon_dummy_cols if gt_1000[col]]: #pokemon_dummy_cols:#
    if len(df_corr_viz.loc[(abs(df_corr_viz[col])>.0), col])> 0:
        corr_dict[col] = dict(df_corr_viz.loc[(abs(df_corr_viz[col])>=.97)&(df_corr_viz.index != col), col])
        print(col[4:], corr_dict[col])

Abra {}
Bellsprout {'pkm_Bulbasaur': 0.98, 'pkm_Caterpie': 0.98, 'pkm_Dratini': 0.98, 'pkm_Eevee': 0.98, 'pkm_Goldeen': 0.99, 'pkm_Golduck': 0.98, 'pkm_Horsea': 0.98, 'pkm_Koffing': 0.97, 'pkm_Magikarp': 0.97, 'pkm_Nidoran♀': 0.97, 'pkm_Oddish': 0.97, 'pkm_Poliwag': 0.99, 'pkm_Psyduck': 0.98, 'pkm_Seadra': 0.98, 'pkm_Spearow': 0.97, 'pkm_Squirtle': 0.98, 'pkm_Weedle': 0.99}
Bulbasaur {'pkm_Caterpie': 0.99, 'pkm_Clefairy': 0.97, 'pkm_Dratini': 0.99, 'pkm_Eevee': 0.98, 'pkm_Goldeen': 0.97, 'pkm_Golduck': 0.99, 'pkm_Koffing': 0.97, 'pkm_Nidoran♀': 0.98, 'pkm_Nidoran♂': 0.98, 'pkm_Oddish': 0.99, 'pkm_Pidgey': 0.97, 'pkm_Spearow': 0.98, 'pkm_Squirtle': 0.98, 'pkm_Venonat': 0.97, 'pkm_Weedle': 0.98, 'pkm_Zubat': 0.97}
Caterpie {'pkm_Dratini': 0.98, 'pkm_Eevee': 0.98, 'pkm_Goldeen': 0.98, 'pkm_Golduck': 0.99, 'pkm_Koffing': 0.97, 'pkm_Nidoran♀': 0.97, 'pkm_Oddish': 0.98, 'pkm_Pidgey': 0.97, 'pkm_Poliwag': 0.98, 'pkm_Spearow': 0.98, 'pkm_Squirtle': 0.98, 'pkm_Venonat': 0.97, 'pkm_Weedle': 0.98

In [606]:
df_corr_viz.index != col

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [599]:
corr_dict['pkm_Squirtle']

{'pkm_Venonat': 0.96, 'pkm_Weedle': 0.98}

In [443]:
geodesic_score = silhouette_score(np.array(df_sample[['lat', 'lng']].iloc[db.core_sample_indices_]).tolist(), 
                                  db.labels_[db.core_sample_indices_], metric="haversine", #sample_size=3000
                                 )

In [444]:
geodesic_score

-0.07033980956192097

In [595]:
fig = go.Figure()
# for cl in sorted(df_sample['cluster_id_db'].unique(), reverse=True):
fig.add_trace(go.Scattermapbox( lat=df_sample['lat'],  
                                lon=df_sample['lng'], 
                                mode='markers',
                                marker=go.scattermapbox.Marker(size=10, 
                                                              opacity=1, 
                                                              color=df_sample['cluster_color'], 
                                                              colorbar=dict(title='cluster'))
                              ))
# update mapbox style and layout
fig.update_mapboxes(
    domain=dict(column=2),
    style='open-street-map', 
    #'mapbox://styles/mapbox/light-v10',
    zoom=7.8,
    center=map_centre
)

fig.update_layout(height=600, margin_t=50, showlegend=False, mapbox_accesstoken=mapboxtoken
                 )

fig.show()

In [None]:
for i, df_cen in df_centroids_dict.items():
    df_cen['unique_species'] = df_cen.merge(df.groupby(['cluster_id_'+str(i)]).agg({'name':'nunique'}).reset_index(), 
                                            left_on=['cluster_id'], right_on=['cluster_id_'+str(i)])['name']
    
    df_cen['spawns'] = df_cen.merge(df.groupby(['cluster_id_'+str(i)]).agg({'num':'sum'}).reset_index(), 
                                            left_on=['cluster_id'], right_on=['cluster_id_'+str(i)])['num']
    
#df_centroids['unique_species'] = df_centroids.merge(df.groupby(['cluster_id']).agg({'name':'nunique'}).reset_index(), on=['cluster_id'])['name']

In [None]:
fig = go.Figure()
fig.add_trace(go.Scattermapbox( lat=df_centroids['lat'],  
                                lon=df_centroids['lng'], 
                                #name=cluster,
                                mode='markers',
                                customdata = df_centroids_dict[20][['cluster_id', 'size', 'unique_species'
                                                          ]],
                               hovertemplate='''<b>%{customdata[0]}</b>
                                                <br>lat: %{lat}, lng: %{lon}
                                                <br>spawns: %{customdata[1]}
                                                <br>distinct species: %{customdata[2]}
                                                 '''
                                         ,
                                marker=go.scattermapbox.Marker(size=df_centroids_dict[20]['spawns']/10000, 
                                                               color=df_centroids_dict[20]['unique_species'],
                                                             
                                                               #df_centroids_dict[pokemon]['size']*.5,
                                                              #color='red',
                                                              opacity=0.7, 
                                                              colorscale='Magma_r', 
                                                               colorbar=dict(title='distinct species')
                               ),
                               )
             )

# update mapbox style and layout
fig.update_mapboxes(
    #domain=dict(column=2, row=1),
    style='open-street-map', #'mapbox://styles/mapbox/light-v10',
    zoom=8.5,
    center={'lat':37.65, 'lon':-122.2},
   # title_text='Pikachu'
)

fig.update_layout(height=700, 
                  margin_t=50, showlegend=False)

fig.show()

In [None]:
pokemon_dummies = [col_name for col_name in df.columns.tolist() if 'pkm' in col_name]
for pkm in pokemon_dummies:
    df_centroids[pkm] = df_centroids['cluster_id'].map(df.groupby('cluster_id').agg({pkm:'sum'})[pkm])

In [None]:
df_ = df_centroids[pokemon_dummies]
df_corr = df_.corr().round(2)

In [None]:
mask = np.triu(np.ones_like(df_corr, dtype=bool))
df_corr_viz = df_corr.mask(mask)

In [None]:
df_corr

In [None]:
corr_dict = {}
for col in pokemon_dummies:
    if len(df_corr_viz.loc[(abs(df_corr_viz[col])>.8), col])> 0:
        corr_dict[col] = dict(df_corr_viz.loc[(abs(df_corr_viz[col])>.8), col])
        print(col[4:], corr_dict[col])

In [None]:
pval_dict = {}
for pkm1 in corr_dict.keys():
    print(pkm1)
    pvals_list = []
    for pkm2 in corr_dict[pkm1].keys():
        pval = pearsonr(df[pkm1], df[pkm2])[1].round(5)
        pvals_list.append({pkm2: pval})
    pval_dict[pkm1] = pvals_list

pval_dict

In [None]:
fig = go.Figure()
df_ = df_centroids[df_centroids['cluster_id'].isin(df[(df['pkm_Pidgey']==1)|(df['pkm_Zubat']==1)]['cluster_id'])]
df_['has_pidgey'] = 1*df_centroids['cluster_id'].isin(df[(df['pkm_Pidgey']==1)]['cluster_id'])
df_['has_zubat'] = 1*df_centroids['cluster_id'].isin(df[(df['pkm_Zubat']==1)]['cluster_id'])

df_['pidgey_zubat'] = df_.apply(lambda x: 'Only Pidgey' if (x['has_pidgey']==1)&(x['has_zubat']==0)
                                          else 'Both', axis=1)

df_.loc[(df_['has_zubat']==1)&(df_['has_pidgey']==0), 'pidgey_zubat'] = 'Only Zubat'

for i in ['Only Pidgey', 'Only Zubat', 'Both'
         ]:
    df__ = df_[df_['pidgey_zubat']==i]
    fig.add_trace(go.Scattermapbox( lat=df__ ['lat'],  
                                    lon=df__ ['lng'], 
                                    #name=cluster,
                                    mode='markers',
                                    customdata = df__ [['cluster_id', 'pidgey_zubat', 'size']],
                                   hovertemplate='''<b>%{customdata[0]}</b>
                                                    <br>lat: %{lat}, lng: %{lon}
                                                    <br>pidgey/zubat: %{customdata[1]}
                                                    <br>size: %{customdata[2]}
                                                     '''
                                             ,
                                   name=i,
                                    marker=go.scattermapbox.Marker(size=df__['size']/100, 
#                                                                    color=df_['pidgey_zubat'],

                                                                   #df_centroids_dict[pokemon]['size']*.5,
                                                                  #color='red',
                                                                  opacity=0.7, 
    #                                                               colorscale='Magma_r', 
    #                                                                colorbar=dict(title='distinct species')
                                   ),
                                   )
                 )

# update mapbox style and layout
fig.update_mapboxes(
    #domain=dict(column=2, row=1),
    style='open-street-map', #'mapbox://styles/mapbox/light-v10',
    zoom=8.5,
    center={'lat':37.65, 'lon':-122.2},
   # title_text='Pikachu'
)

fig.update_layout(height=700, 
                  margin_t=50, showlegend=False)

fig.show()

In [None]:
fig = go.Figure()
df_ = df_centroids[df_centroids['cluster_id'].isin(df[(df['pkm_Pidgey']==1)|(df['pkm_Rattata']==1)]['cluster_id'])]
df_['has_pidgey'] = 1*df_centroids['cluster_id'].isin(df[(df['pkm_Pidgey']==1)]['cluster_id'])
df_['has_rattata'] = 1*df_centroids['cluster_id'].isin(df[(df['pkm_Rattata']==1)]['cluster_id'])

df_['pidgey_rattata'] = df_.apply(lambda x: 'Only Pidgey' if (x['has_pidgey']==1)&(x['has_rattata']==0)
                                          else 'Both', axis=1)

df_.loc[(df_['has_rattata']==1)&(df_['has_pidgey']==0), 'pidgey_rattata'] = 'Only Rattata'

for i in ['Only Pidgey', 'Only Rattata', 'Both'
         ]:
    df__ = df_[df_['pidgey_rattata']==i]
    fig.add_trace(go.Scattermapbox( lat=df__ ['lat'],  
                                    lon=df__ ['lng'], 
                                    #name=cluster,
                                    mode='markers',
                                    customdata = df__ [['cluster_id', 'pidgey_rattata', 'size']],
                                   hovertemplate='''<b>%{customdata[0]}</b>
                                                    <br>lat: %{lat}, lng: %{lon}
                                                    <br>pidgey/rattata: %{customdata[1]}
                                                    <br>size: %{customdata[2]}
                                                     '''
                                             ,
                                   name=i,
                                    marker=go.scattermapbox.Marker(size=df__['size']/100, 
#                                                                    color=df_['pidgey_zubat'],

                                                                   #df_centroids_dict[pokemon]['size']*.5,
                                                                  #color='red',
                                                                  opacity=0.7, 
    #                                                               colorscale='Magma_r', 
    #                                                                colorbar=dict(title='distinct species')
                                   ),
                                   )
                 )

# update mapbox style and layout
fig.update_mapboxes(
    #domain=dict(column=2, row=1),
    style='open-street-map', #'mapbox://styles/mapbox/light-v10',
    zoom=8.5,
    center={'lat':37.65, 'lon':-122.2},
   # title_text='Pikachu'
)

fig.update_layout(height=700, 
                  margin_t=50, showlegend=False)

fig.show()

In [111]:
pokemon_dummies = [col_name for col_name in df.columns.tolist() if 'pkm' in col_name]
df_ = df[pokemon_dummies]
df_corr = df_.corr().round(2)

In [112]:
mask = np.triu(np.ones_like(df_corr, dtype=bool))
df_corr_viz = df_corr.mask(mask)

In [None]:
# set the self correlation to Nna
for pkm in pokemon_dummies:
    df_corr.loc[pkm, pkm] = None

In [None]:
corr_dict = {}
for col in pokemon_dummies:
    if len(df_corr_viz.loc[(abs(df_corr_viz[col])>.05), col])> 0:
        corr_dict[col] = dict(df_corr_viz.loc[(abs(df_corr_viz[col])>.03), col])
        print(col[4:], corr_dict[col])

In [None]:
from scipy.stats import pearsonr

In [None]:
pval_dict = {}
for pkm1 in corr_dict.keys():
    print(pkm1)
    pvals_list = []
    for pkm2 in corr_dict[pkm1].keys():
        pval = pearsonr(df[pkm1], df[pkm2])[1].round(5)
        pvals_list.append({pkm2: pval})
    pval_dict[pkm1] = pvals_list

In [None]:
pval_dict

In [None]:
pd.DataFrame(corr_dict)

In [None]:
pval_dict

In [None]:
df_pvals = pd.DataFrame(index=pokemon_dummies)
for col_pkm in pokemon_dummies:
    pvals_list = []
    for row_pkm in pokemon_dummies:
        pvals_list.append(pearsonr(df[col_pkm],df[row_pkm])[1])
    df_pvals[col_pkm] = pvals_list

In [None]:
# FOR LEARNING, NOT SUITABLE FOR LARGE NUMBER OF CLUSTERS

# As a learning exercise, I will implement K-Means clustering from scratch. 
# def k_means_from_scratch(data, k):
#     random.seed(0)
#     centroids = random.sample(data,k)
#     counter = 0
#     while counter < 1000:
#         clustering = {cluster:np.empty(shape=(0,2)) for cluster in np.arange(k)}
#         # for each data point
#         for i, spwn in enumerate(data):
#             # calculate distance from each centroid
#             distances = [((spwn[0]-c[0])**2+(spwn[1]-c[1])**2)**.5 for c in centroids]
#             # add data point to the cluster of the centroid that it is closest to
#             clustering[np.argmin(distances)] = np.vstack([clustering[np.argmin(distances)],spwn])

#         # calculate new centroids as the mid point of each cluster 
#         new_centroids = np.array([clustering[c].mean(axis=0) for c in np.arange(k)])

#         # update centroids until there are no longer changing
#         if np.array_equiv(new_centroids,centroids):
#             print(f'Converged, final centroids: {centroids}')
#             break

#         centroids = new_centroids
#         counter += 1
        
#         clustering_dict = {key:{'centroid':centroids[key], 
#                                 'size':len(clustering[key]),
#                                 'points':clustering[key]} for key, values in clustering.items()}

#     return clustering_dict

### Agglomerative clustering
An alternative clustering method is Agglomerative Clustering. It is a hierarchical clustering starting with each data point being in its own cluster. Clusters that are closest to each other are paired up in each iteration until the desired number of clusters is reached or until the distance threshold is met. 

In [None]:


# from sklearn.cluster import AgglomerativeClustering
df_lat_lng = np.array(df[['lat', 'lng']].head(10000)).tolist()
aggclustering = AgglomerativeClustering(n_clusters=10, #distance_threshold=0.0001, 
                                        compute_distances=True).fit(df_lat_lng)

In [None]:
# pikachu_spawns = np.array(df[df['name']=='Pikachu'][['lat', 'lng']]).tolist()
# pikachu_clusters = k_means_from_scratch(pikachu_spawns, 10)
# df_pikachu_centroids = pd.DataFrame([i['centroid'] for i in list(pikachu_clusters.values())], columns=['lat', 'lng'])
# df_pikachu_centroids['size'] = [i['size'] for i in list(pikachu_clusters.values())]

In [None]:
df

In [None]:
df_counts = df.groupby('name').agg({'s2_id':'count'}).reset_index().sort_values('name').rename(columns={'s2_id':'count'})
# df_counts[(df_counts['count']>1000)]
df_counts['count'].describe()

In [None]:
df_centroids_dict = {}
for pokemon in ['Pikachu', 'Squirtle', 'Bulbasaur','Charmander', 'Raticate']:
    print(pokemon, len(df[df['name']==pokemon]))
    df_lat_lng = np.array(df[df['name']==pokemon][['lat', 'lng']]).tolist()
    kmeans = KMeans(n_clusters=20, random_state=42, n_init="auto").fit(df_lat_lng)
    df_centroids = pd.DataFrame(kmeans.cluster_centers_, columns=['lat', 'lng'])
    df_centroids['size'] = pd.Series(kmeans.labels_).value_counts()
    df_centroids_dict[pokemon] = df_centroids

In [None]:
from sklearn.cluster import AgglomerativeClustering
df_aggcentroids_dict = {}
for pokemon in ['Pikachu', #'Squirtle', 'Bulbasaur','Charmander', 'Raticate'
               ]:
    print(pokemon, len(df[df['name']==pokemon]))
    df_lat_lng = np.array(df[df['name']==pokemon][['lat', 'lng']]).tolist()
    aggclustering = AgglomerativeClustering(n_clusters=None, distance_threshold=0.1, compute_distances=True).fit(df_lat_lng)
#     df_aggcentroids = pd.DataFrame({'label':aggclustering.labels_, 'distances':aggclustering.distances_})
    
#     df_aggcentroids_dict[pokemon] = df_aggcentroids

In [None]:
df_ = pd.DataFrame(df_lat_lng).rename(columns={0:'lat', 1:'lng'})

In [None]:
df_

In [None]:

# df_ = df[df['name']==pokemon][['lat', 'lng']]
df_['cluster_label'] = pd.Series(aggclustering.labels_)

df_counts = df_.groupby('cluster_label').agg({'lat':'count'}).reset_index().rename(columns={'lat':'size'})
clusters_gt_10 = df_counts[df_counts['size']>10]['cluster_label'].to_list()


df_ = df_[df_['cluster_label'].isin(clusters_gt_10)]

df_

In [None]:
clusters_gt_10

In [None]:
fig = go.Figure()

for cluster in clusters_gt_10:
    df__ = df_[df_['cluster_label']==cluster]
    fig.add_trace(go.Scattermapbox(
        lat=df__['lat'],  
        lon=df__['lng'], 
        name=cluster,
        mode='markers',
        marker=go.scattermapbox.Marker(
            size=5, #df_centroids_dict[pokemon]['size']*.5,
            #color='red',
            opacity=.7
        )
    ))

# update mapbox style and layout
fig.update_layout(
    mapbox_style='stamen-toner', #'mapbox://styles/mapbox/light-v10',
    mapbox_zoom=8,
    mapbox_center={'lat':37.65, 'lon':-122.2},
    width=800,
    height=600
)

fig.show()

In [None]:
pd.Series(aggclustering.distances_).describe()

In [None]:
pd.Series(aggclustering.distances_).value_counts()

In [None]:
# kmeans = KMeans(n_clusters=20, random_state=42, n_init="auto").fit()
# df_pikachu_centroids_sklearn = pd.DataFrame(kmeans.cluster_centers_, columns=['lat', 'lng'])
# df_pikachu_centroids_sklearn['size'] = pd.Series(kmeans.labels_).value_counts()

In [None]:
fig = go.Figure()

for pokemon in [#'Pikachu', 
                #'Bulbasaur', 
                'Squirtle', 
                'Charmander', 
                'Raticate']:
    
    fig.add_trace(go.Scattermapbox(
        lat=df_centroids_dict[pokemon]['lat'],  
        lon=df_centroids_dict[pokemon]['lng'], 
        name=pokemon,
        mode='markers',
        marker=go.scattermapbox.Marker(
            size=df_centroids_dict[pokemon]['size']*.5,
            #color='red',
            opacity=.7
        )
    ))

# update mapbox style and layout
fig.update_layout(
    mapbox_style='open-street-map', #'stamen-toner', #'mapbox://styles/mapbox/light-v10',
    mapbox_zoom=8,
    mapbox_center={'lat':37.65, 'lon':-122.2},
    width=800,
    height=600
)

fig.show()

In [None]:

df_counts[(df_counts['count']>np.percentile(df_counts['count'], 50))&(df_counts['count']<np.percentile(df_counts['count'], 75))].sort_values('count').head()

rare_pokemons = df_counts[df_counts['count']<10]['name'].tolist()

In [None]:
df_counts[df_counts['count']<10]['name'].tolist()

In [None]:
fig = px.density_mapbox(df[df['name'].isin(rare_pokemons)], lat='lat', lon='lng',
                        mapbox_style='open-street-map',
                        zoom=8.3,
                        center={'lat':37.65, 'lon':-122.2},
                        radius=7,
                       width=800,
                       height=600)

fig.show()

In [None]:
rare_pokemons = df_counts[df_counts['count']<5]['name'].tolist()
fig = px.density_mapbox(df[df['name'].isin(rare_pokemons)], lat='lat', lon='lng',
                        mapbox_style='open-street-map',
                        zoom=8.3,
                        center={'lat':37.65, 'lon':-122.2},
                        radius=7,
                       width=800,
                       height=600)
fig.show()