In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots
from sklearn.cluster import KMeans
import random
import time

In [184]:
df_complete = pd.read_csv('data/pokemon-spawns.csv')
pokemon_dummies = pd.get_dummies(df_complete['name'], prefix='pkm')
df = df_complete[(df_complete['lng']>=-123)
                &(df_complete['lng']<-120)
                &(df_complete['lat']>=37)
                &(df_complete['lat']<38)
                ]
df = pd.concat([df, pokemon_dummies], axis=1).dropna()

In [185]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 221979 entries, 0 to 314104
Columns: 148 entries, s2_id to pkm_Zubat
dtypes: float64(6), object(2), uint8(140)
memory usage: 44.9+ MB


In [188]:
df_complete[(df_complete['lng']>=-123)
                &(df_complete['lng']<-120)
                &(df_complete['lat']>=37)
                &(df_complete['lat']<38)
                ].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 221979 entries, 0 to 314104
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   s2_id         221979 non-null  int64  
 1   s2_token      221979 non-null  object 
 2   num           221979 non-null  int64  
 3   name          221979 non-null  object 
 4   lat           221979 non-null  float64
 5   lng           221979 non-null  float64
 6   encounter_ms  221979 non-null  int64  
 7   disppear_ms   221979 non-null  int64  
dtypes: float64(2), int64(4), object(2)
memory usage: 15.2+ MB


In [186]:
df.columns[:10]

Index(['s2_id', 's2_token', 'num', 'name', 'lat', 'lng', 'encounter_ms',
       'disppear_ms', 'pkm_Abra', 'pkm_Aerodactyl'],
      dtype='object')

## Aim
I will use this data to try to asnwer the following questions: 
* How are Pokemons distributed across the SF Bay Area? (clustering, spatial distances, etc)
* Which pokemons often occur in the same area? (spatial correlation)
* What is the shortest route that will allow someone to spawn every pokemon? (Travelling Salesman Problem)


## Exploratory Data Analysis

First I will plot the data on a map using Mapbox. Plotly allow you to choose the base map from various public raster tile servers e.g. OpenStreetMap, Stamen Terrain. 

In [187]:
df['name'].value_counts().head(50)

Pidgey        32532
Zubat         30678
Rattata       20774
Spearow       11783
Weedle         9281
Paras          8919
Ekans          8007
Eevee          7445
Doduo          5670
Caterpie       5585
Magikarp       4754
Venonat        4145
Nidoran♂       3990
Nidoran♀       3686
Mankey         3372
Growlithe      3333
Meowth         3265
Clefairy       3146
Poliwag        2591
Krabby         2477
Staryu         2318
Goldeen        2295
Pidgeotto      1999
Oddish         1920
Bellsprout     1884
Psyduck        1858
Sandshrew      1796
Geodude        1787
Bulbasaur      1766
Cubone         1456
Pikachu        1435
Diglett        1366
Pinsir         1298
Rhyhorn        1290
Horsea         1247
Tentacool      1120
Abra           1055
Jigglypuff     1041
Exeggcute       998
Machop          975
Magnemite       965
Voltorb         948
Golbat          901
Ponyta          813
Slowpoke        747
Raticate        639
Squirtle        603
Gastly          598
Kakuna          524
Dratini         488


In [345]:
fig = make_subplots(rows=3, cols=2, column_widths=[0.1, 0.1],vertical_spacing=0.1, horizontal_spacing=0.1,
                    subplot_titles=('Pikachu Scatter', 'Pikachu Density', 
                                   'Squirtle Scatter', 'Squirtle Density', 
                                   'Bulbasaur Scatter', 'Bulbasaur Density'),
                    specs=[[{'type':'mapbox'}]*2,  
                           [{'type':'mapbox'}]*2,
                           [{'type':'mapbox'}]*2,
                          ])

df__ = df[df['name']=='Pikachu']
fig.add_trace(go.Scattermapbox( lat=df__['lat'],  
                                lon=df__['lng'], 
                                #name=cluster,
                                mode='markers',
                                marker=go.scattermapbox.Marker(size=5, #df_centroids_dict[pokemon]['size']*.5,
                                                              #color='red',
                                                              opacity=.7),
                               subplot='mapbox'),
                            row=1, col=1)

fig.add_trace(go.Densitymapbox(lat=df__['lat'],  
                               lon=df__['lng'], 
                               #name=cluster,
                               radius=3,
                               subplot='mapbox1'),
                               row=1, col=2
                               )

df__ = df[df['name']=='Squirtle']

fig.add_trace(go.Scattermapbox( lat=df__['lat'],  
                                lon=df__['lng'], 
                                #name=cluster,
                                mode='markers',
                                marker=go.scattermapbox.Marker(size=5, #df_centroids_dict[pokemon]['size']*.5,
                                                              #color='red',
                                                              opacity=1),
                               subplot='mapbox'),
                            row=2, col=1)

fig.add_trace(go.Densitymapbox(lat=df__['lat'],  
                               lon=df__['lng'], 
                               #name=cluster,
                               radius=3,
                               subplot='mapbox1'),
                               row=2, col=2
                               )

df__ = df[df['name']=='Bulbasaur']

fig.add_trace(go.Scattermapbox( lat=df__['lat'],  
                                lon=df__['lng'], 
                                #name=cluster,
                                mode='markers',
                                marker=go.scattermapbox.Marker(size=5, #df_centroids_dict[pokemon]['size']*.5,
                                                              #color='red',
                                                              opacity=1),
                               subplot='mapbox'),
                            row=3, col=1)

fig.add_trace(go.Densitymapbox(lat=df__['lat'],  
                               lon=df__['lng'], 
                               #name=cluster,
                               radius=3,
                               subplot='mapbox1'),
                               row=3, col=2
                               )



# update mapbox style and layout
fig.update_mapboxes(
    domain=dict(column=2),
    style='open-street-map', #'mapbox://styles/mapbox/light-v10',
    zoom=7.8,
    center={'lat':37.65, 'lon':-122.2},
   # title_text='Pikachu'
)

fig.update_layout(height=1000, margin_t=50, showlegend=False)

fig.show()

In [354]:
fig = make_subplots(rows=1, cols=2, column_widths=[0.1, 0.1],vertical_spacing=0.1, horizontal_spacing=0.1,
                    subplot_titles=('Pidgey', 'Zubat'),
                    specs=[[{'type':'mapbox'}]*2])

df__ = df[df['name']=='Pidgey']
fig.add_trace(go.Scattermapbox( lat=df__['lat'],  
                                lon=df__['lng'], 
                                #name=cluster,
                                mode='markers',
                                marker=go.scattermapbox.Marker(size=5, #df_centroids_dict[pokemon]['size']*.5,
                                                              #color='red',
                                                              opacity=.7),
                               name='Pidgey',
                               subplot='mapbox'),
              
                            row=1, col=1)


df__ = df[df['name']=='Zubat']

fig.add_trace(go.Scattermapbox( lat=df__['lat'],  
                                lon=df__['lng'], 
                                #name=cluster,
                                mode='markers',
                                marker=go.scattermapbox.Marker(size=5, #df_centroids_dict[pokemon]['size']*.5,
                                                              #color='red',
                                                              opacity=1),
                               name='Zubat',
                               subplot='mapbox1'),
                            row=1, col=1)


# update mapbox style and layout
fig.update_mapboxes(
    domain=dict(column=2, row=1),
    style='open-street-map', #'mapbox://styles/mapbox/light-v10',
    zoom=7.8,
    center={'lat':37.65, 'lon':-122.2},
   # title_text='Pikachu'
)

fig.update_layout(height=1000, margin_t=50, showlegend=False)

fig.show()

In [127]:
fig = px.density_mapbox(df[df['name']=='Voltorb'], lat='lat', lon='lng',
                        mapbox_style='open-street-map',
                        zoom=8.3,
                        center={'lat':37.65, 'lon':-122.2},
                        radius=7,
                       width=800,
                       height=600)
fig.show()

In [129]:
df[['lat', 'lng']].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 314105 entries, 0 to 308533
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   lat     221979 non-null  float64
 1   lng     221979 non-null  float64
dtypes: float64(2)
memory usage: 7.2 MB


### K-Means Clustering
As a learning exercise, I will implement K-Means clustering from scratch. K-Means clustering is an unsupervised learning algorithm that partitions the data into K non-overlapping subgroups. It takes an iterative approach to find the optimal clustering that minimizes the within-cluster sum of squares (sum of squared distances between each data point and the centroid of its assigned cluster.

In [233]:
# from sklearn.cluster import AgglomerativeClustering
# df_lat_lng = np.array(df[['lat', 'lng']]).tolist()
# aggclustering = AgglomerativeClustering(n_clusters=None, distance_threshold=0.1, compute_distances=True).fit(df_lat_lng)

In [402]:
start = time.time()
df_lat_lng = np.array(df[['lat', 'lng']]).tolist()
kmeans = KMeans(n_clusters=50, random_state=42, n_init="auto").fit(df_lat_lng)
df_centroids = pd.DataFrame(kmeans.cluster_centers_, columns=['lat', 'lng'])
df_centroids['size'] = pd.Series(kmeans.labels_).value_counts()
print('{:,.0f}s'.format(time.time() - start))

2s


In [403]:
# reset index as cluster_id
df_centroids = df_centroids.reset_index(names='cluster_id')
df_centroids

# label each spawn with cluster id
df['cluster_id'] = kmeans.labels_

In [404]:
df_centroids['unique_species'] = df_centroids.merge(df.groupby(['cluster_id']).agg({'name':'nunique'}).reset_index(), on=['cluster_id'])['name']
#df_centroids['unique_species'] = df_centroids.merge(df.groupby(['cluster_id']).agg({'name':'nunique'}).reset_index(), on=['cluster_id'])['name']

In [405]:
fig = go.Figure()
fig.add_trace(go.Scattermapbox( lat=df_centroids['lat'],  
                                lon=df_centroids['lng'], 
                                #name=cluster,
                                mode='markers',
                                customdata = df_centroids[['cluster_id', 'size', 'unique_species']],
                               hovertemplate='''<b>%{customdata[0]}</b>
                                                <br>lat: %{lat}, lng: %{lon}
                                                <br>spawns: %{customdata[1]}
                                                <br>distinct species: %{customdata[2]}
                                                 '''
                                         ,
                                marker=go.scattermapbox.Marker(size=df_centroids['size']/100, 
                                                               color=df_centroids['unique_species'],
                                                             
                                                               #df_centroids_dict[pokemon]['size']*.5,
                                                              #color='red',
                                                              opacity=0.7, 
                                                              colorscale='Magma_r', 
                                                               colorbar=dict(title='distinct species')
                               ),
                               )
             )

# update mapbox style and layout
fig.update_mapboxes(
    #domain=dict(column=2, row=1),
    style='open-street-map', #'mapbox://styles/mapbox/light-v10',
    zoom=8.5,
    center={'lat':37.65, 'lon':-122.2},
   # title_text='Pikachu'
)

fig.update_layout(height=700, 
                  margin_t=50, showlegend=False)

fig.show()

In [420]:
df.groupby('cluster_id').agg({'pkm_Pidgey':'sum'})

Unnamed: 0_level_0,pkm_Pidgey
cluster_id,Unnamed: 1_level_1
0,829
1,1218
2,600
3,804
4,426
5,456
6,733
7,401
8,885
9,178


In [None]:
pokemon_dummies = [col_name for col_name in df.columns.tolist() if 'pkm' in col_name]

In [425]:
for pkm in pokemon_dummies:
    df_centroids[pkm] = df_centroids['cluster_id'].map(df.groupby('cluster_id').agg({pkm:'sum'})[pkm])


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented fr


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented fr

In [429]:
df_ = df_centroids[pokemon_dummies]
df_corr = df_.corr().round(2)

In [430]:
mask = np.triu(np.ones_like(df_corr, dtype=bool))
df_corr_viz = df_corr.mask(mask)

In [431]:
df_corr

Unnamed: 0,pkm_Abra,pkm_Aerodactyl,pkm_Alakazam,pkm_Arbok,pkm_Arcanine,pkm_Beedrill,pkm_Bellsprout,pkm_Bulbasaur,pkm_Butterfree,pkm_Caterpie,...,pkm_Victreebel,pkm_Vileplume,pkm_Voltorb,pkm_Vulpix,pkm_Wartortle,pkm_Weedle,pkm_Weepinbell,pkm_Weezing,pkm_Wigglytuff,pkm_Zubat
pkm_Abra,1.00,-0.06,-0.11,0.36,0.15,-0.34,-0.38,-0.25,-0.22,-0.39,...,,-0.11,-0.08,0.34,-0.16,-0.40,-0.10,-0.08,-0.02,-0.15
pkm_Aerodactyl,-0.06,1.00,-0.03,-0.06,-0.03,0.17,0.10,0.02,0.18,0.30,...,,0.16,0.04,0.07,0.65,0.20,-0.06,-0.03,0.18,0.57
pkm_Alakazam,-0.11,-0.03,1.00,-0.13,-0.07,0.05,-0.11,-0.04,-0.06,-0.10,...,,-0.02,0.01,-0.11,-0.06,-0.10,-0.08,-0.04,-0.04,-0.10
pkm_Arbok,0.36,-0.06,-0.13,1.00,0.26,-0.21,-0.44,-0.09,-0.13,-0.42,...,,-0.02,-0.14,0.18,-0.25,-0.44,-0.18,-0.08,0.15,-0.14
pkm_Arcanine,0.15,-0.03,-0.07,0.26,1.00,0.02,-0.14,0.20,-0.08,-0.08,...,,-0.07,0.00,0.45,-0.16,-0.12,-0.01,-0.07,0.21,0.01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
pkm_Weedle,-0.40,0.20,-0.10,-0.44,-0.12,0.34,0.86,0.50,0.44,0.95,...,,0.26,0.09,-0.07,0.38,1.00,0.36,0.02,0.03,0.63
pkm_Weepinbell,-0.10,-0.06,-0.08,-0.18,-0.01,0.10,0.48,0.31,0.20,0.36,...,,0.22,0.28,0.07,0.02,0.36,1.00,-0.13,-0.10,0.20
pkm_Weezing,-0.08,-0.03,-0.04,-0.08,-0.07,-0.08,-0.08,-0.05,-0.10,0.04,...,,-0.04,-0.02,-0.08,-0.04,0.02,-0.13,1.00,0.23,-0.05
pkm_Wigglytuff,-0.02,0.18,-0.04,0.15,0.21,0.11,-0.04,-0.03,-0.07,0.00,...,,-0.04,0.14,-0.01,0.01,0.03,-0.10,0.23,1.00,0.01


In [435]:
corr_dict = {}
for col in pokemon_dummies:
    if len(df_corr_viz.loc[(abs(df_corr_viz[col])>.8), col])> 0:
        corr_dict[col] = dict(df_corr_viz.loc[(abs(df_corr_viz[col])>.8), col])
        print(col[4:], corr_dict[col])

Aerodactyl {'pkm_Porygon': 0.83}
Bellsprout {'pkm_Caterpie': 0.81, 'pkm_Oddish': 0.87, 'pkm_Staryu': 0.83, 'pkm_Weedle': 0.86}
Bulbasaur {'pkm_Charizard': 0.93}
Caterpie {'pkm_Oddish': 0.81, 'pkm_Staryu': 0.84, 'pkm_Weedle': 0.95}
Clefairy {'pkm_Eevee': 0.81}
Cloyster {'pkm_Drowzee': 0.83, 'pkm_Hypno': 0.97}
Cubone {'pkm_Geodude': 0.84, 'pkm_Graveler': 0.82, 'pkm_Rhyhorn': 0.92, 'pkm_Sandshrew': 0.89}
Dewgong {'pkm_Ninetales': 0.89}
Diglett {'pkm_Ekans': 0.89, 'pkm_Growlithe': 0.9}
Dodrio {'pkm_Doduo': 0.94}
Doduo {'pkm_Pinsir': 0.83}
Ekans {'pkm_Growlithe': 0.95, 'pkm_Meowth': 0.84}
Geodude {'pkm_Graveler': 0.82, 'pkm_Ponyta': 0.86, 'pkm_Rhyhorn': 0.84, 'pkm_Sandshrew': 0.91}
Golbat {'pkm_Zubat': 0.91}
Goldeen {'pkm_Horsea': 0.88, 'pkm_Krabby': 0.82, 'pkm_Poliwag': 0.9, 'pkm_Squirtle': 0.83, 'pkm_Staryu': 0.9, 'pkm_Weedle': 0.81}
Graveler {'pkm_Rhyhorn': 0.81, 'pkm_Sandshrew': 0.85}
Growlithe {'pkm_Meowth': 0.82}
Gyarados {'pkm_Lickitung': 0.85}
Horsea {'pkm_Poliwag': 0.87, 'pkm_Squir

In [436]:
pval_dict = {}
for pkm1 in corr_dict.keys():
    print(pkm1)
    pvals_list = []
    for pkm2 in corr_dict[pkm1].keys():
        pval = pearsonr(df[pkm1], df[pkm2])[1].round(5)
        pvals_list.append({pkm2: pval})
    pval_dict[pkm1] = pvals_list

pval_dict

pkm_Aerodactyl
pkm_Bellsprout
pkm_Bulbasaur
pkm_Caterpie
pkm_Clefairy
pkm_Cloyster
pkm_Cubone
pkm_Dewgong
pkm_Diglett
pkm_Dodrio
pkm_Doduo
pkm_Ekans
pkm_Geodude
pkm_Golbat
pkm_Goldeen
pkm_Graveler
pkm_Growlithe
pkm_Gyarados
pkm_Horsea
pkm_Kakuna
pkm_Krabby
pkm_Machamp
pkm_Machop
pkm_Magikarp
pkm_Magnemite
pkm_Magneton
pkm_Nidoran (f)
pkm_Nidoran♀
pkm_Paras
pkm_Pidgey
pkm_Poliwag
pkm_Poliwrath
pkm_Ponyta
pkm_Psyduck
pkm_Rhyhorn
pkm_Staryu


{'pkm_Aerodactyl': [{'pkm_Porygon': 0.96121}],
 'pkm_Bellsprout': [{'pkm_Caterpie': 0.0},
  {'pkm_Oddish': 5e-05},
  {'pkm_Staryu': 1e-05},
  {'pkm_Weedle': 0.0}],
 'pkm_Bulbasaur': [{'pkm_Charizard': 0.84129}],
 'pkm_Caterpie': [{'pkm_Oddish': 0.0},
  {'pkm_Staryu': 0.0},
  {'pkm_Weedle': 0.0}],
 'pkm_Clefairy': [{'pkm_Eevee': 0.0}],
 'pkm_Cloyster': [{'pkm_Drowzee': 0.95729}, {'pkm_Hypno': 0.99072}],
 'pkm_Cubone': [{'pkm_Geodude': 0.00056},
  {'pkm_Graveler': 0.43318},
  {'pkm_Rhyhorn': 0.00342},
  {'pkm_Sandshrew': 0.00055}],
 'pkm_Dewgong': [{'pkm_Ninetales': 0.99413}],
 'pkm_Diglett': [{'pkm_Ekans': 0.0}, {'pkm_Growlithe': 0.0}],
 'pkm_Dodrio': [{'pkm_Doduo': 0.04177}],
 'pkm_Doduo': [{'pkm_Pinsir': 0.0}],
 'pkm_Ekans': [{'pkm_Growlithe': 0.0}, {'pkm_Meowth': 0.0}],
 'pkm_Geodude': [{'pkm_Graveler': 0.38488},
  {'pkm_Ponyta': 0.01007},
  {'pkm_Rhyhorn': 0.00117},
  {'pkm_Sandshrew': 0.00013}],
 'pkm_Golbat': [{'pkm_Zubat': 0.0}],
 'pkm_Goldeen': [{'pkm_Horsea': 0.0003},
  {'pkm_K

In [408]:
fig = go.Figure()
df_ = df_centroids[df_centroids['cluster_id'].isin(df[(df['pkm_Pidgey']==1)|(df['pkm_Zubat']==1)]['cluster_id'])]
df_['has_pidgey'] = 1*df_centroids['cluster_id'].isin(df[(df['pkm_Pidgey']==1)]['cluster_id'])
df_['has_zubat'] = 1*df_centroids['cluster_id'].isin(df[(df['pkm_Zubat']==1)]['cluster_id'])

df_['pidgey_zubat'] = df_.apply(lambda x: 'Only Pidgey' if (x['has_pidgey']==1)&(x['has_zubat']==0)
                                          else 'Both', axis=1)

df_.loc[(df_['has_zubat']==1)&(df_['has_pidgey']==0), 'pidgey_zubat'] = 'Only Zubat'

for i in ['Only Pidgey', 'Only Zubat', 'Both'
         ]:
    df__ = df_[df_['pidgey_zubat']==i]
    fig.add_trace(go.Scattermapbox( lat=df__ ['lat'],  
                                    lon=df__ ['lng'], 
                                    #name=cluster,
                                    mode='markers',
                                    customdata = df__ [['cluster_id', 'pidgey_zubat', 'size']],
                                   hovertemplate='''<b>%{customdata[0]}</b>
                                                    <br>lat: %{lat}, lng: %{lon}
                                                    <br>pidgey/zubat: %{customdata[1]}
                                                    <br>size: %{customdata[2]}
                                                     '''
                                             ,
                                   name=i,
                                    marker=go.scattermapbox.Marker(size=df__['size']/100, 
#                                                                    color=df_['pidgey_zubat'],

                                                                   #df_centroids_dict[pokemon]['size']*.5,
                                                                  #color='red',
                                                                  opacity=0.7, 
    #                                                               colorscale='Magma_r', 
    #                                                                colorbar=dict(title='distinct species')
                                   ),
                                   )
                 )

# update mapbox style and layout
fig.update_mapboxes(
    #domain=dict(column=2, row=1),
    style='open-street-map', #'mapbox://styles/mapbox/light-v10',
    zoom=8.5,
    center={'lat':37.65, 'lon':-122.2},
   # title_text='Pikachu'
)

fig.update_layout(height=700, 
                  margin_t=50, showlegend=False)

fig.show()

In [416]:
fig = go.Figure()
df_ = df_centroids[df_centroids['cluster_id'].isin(df[(df['pkm_Pidgey']==1)|(df['pkm_Rattata']==1)]['cluster_id'])]
df_['has_pidgey'] = 1*df_centroids['cluster_id'].isin(df[(df['pkm_Pidgey']==1)]['cluster_id'])
df_['has_rattata'] = 1*df_centroids['cluster_id'].isin(df[(df['pkm_Rattata']==1)]['cluster_id'])

df_['pidgey_rattata'] = df_.apply(lambda x: 'Only Pidgey' if (x['has_pidgey']==1)&(x['has_rattata']==0)
                                          else 'Both', axis=1)

df_.loc[(df_['has_rattata']==1)&(df_['has_pidgey']==0), 'pidgey_rattata'] = 'Only Rattata'

for i in ['Only Pidgey', 'Only Rattata', 'Both'
         ]:
    df__ = df_[df_['pidgey_rattata']==i]
    fig.add_trace(go.Scattermapbox( lat=df__ ['lat'],  
                                    lon=df__ ['lng'], 
                                    #name=cluster,
                                    mode='markers',
                                    customdata = df__ [['cluster_id', 'pidgey_rattata', 'size']],
                                   hovertemplate='''<b>%{customdata[0]}</b>
                                                    <br>lat: %{lat}, lng: %{lon}
                                                    <br>pidgey/rattata: %{customdata[1]}
                                                    <br>size: %{customdata[2]}
                                                     '''
                                             ,
                                   name=i,
                                    marker=go.scattermapbox.Marker(size=df__['size']/100, 
#                                                                    color=df_['pidgey_zubat'],

                                                                   #df_centroids_dict[pokemon]['size']*.5,
                                                                  #color='red',
                                                                  opacity=0.7, 
    #                                                               colorscale='Magma_r', 
    #                                                                colorbar=dict(title='distinct species')
                                   ),
                                   )
                 )

# update mapbox style and layout
fig.update_mapboxes(
    #domain=dict(column=2, row=1),
    style='open-street-map', #'mapbox://styles/mapbox/light-v10',
    zoom=8.5,
    center={'lat':37.65, 'lon':-122.2},
   # title_text='Pikachu'
)

fig.update_layout(height=700, 
                  margin_t=50, showlegend=False)

fig.show()

In [409]:
pokemon_dummies = [col_name for col_name in df.columns.tolist() if 'pkm' in col_name]
df_ = df[pokemon_dummies]
df_corr = df_.corr().round(2)

In [410]:
mask = np.triu(np.ones_like(df_corr, dtype=bool))
df_corr_viz = df_corr.mask(mask)

In [411]:
# set the self correlation to Nna
for pkm in pokemon_dummies:
    df_corr.loc[pkm, pkm] = None

In [412]:
corr_dict = {}
for col in pokemon_dummies:
    if len(df_corr_viz.loc[(abs(df_corr_viz[col])>.05), col])> 0:
        corr_dict[col] = dict(df_corr_viz.loc[(abs(df_corr_viz[col])>.03), col])
        print(col[4:], corr_dict[col])

Caterpie {'pkm_Pidgey': -0.07, 'pkm_Rattata': -0.05, 'pkm_Spearow': -0.04, 'pkm_Zubat': -0.06}
Doduo {'pkm_Pidgey': -0.07, 'pkm_Rattata': -0.05, 'pkm_Spearow': -0.04, 'pkm_Zubat': -0.06}
Eevee {'pkm_Ekans': -0.04, 'pkm_Paras': -0.04, 'pkm_Pidgey': -0.08, 'pkm_Rattata': -0.06, 'pkm_Spearow': -0.04, 'pkm_Weedle': -0.04, 'pkm_Zubat': -0.07}
Ekans {'pkm_Paras': -0.04, 'pkm_Pidgey': -0.08, 'pkm_Rattata': -0.06, 'pkm_Spearow': -0.05, 'pkm_Weedle': -0.04, 'pkm_Zubat': -0.08}
Magikarp {'pkm_Pidgey': -0.06, 'pkm_Rattata': -0.05, 'pkm_Spearow': -0.04, 'pkm_Zubat': -0.06}
Nidoran♂ {'pkm_Pidgey': -0.06, 'pkm_Rattata': -0.04, 'pkm_Zubat': -0.05}
Paras {'pkm_Pidgey': -0.08, 'pkm_Rattata': -0.07, 'pkm_Spearow': -0.05, 'pkm_Weedle': -0.04, 'pkm_Zubat': -0.08}
Pidgey {'pkm_Poliwag': -0.05, 'pkm_Psyduck': -0.04, 'pkm_Rattata': -0.13, 'pkm_Sandshrew': -0.04, 'pkm_Spearow': -0.1, 'pkm_Staryu': -0.04, 'pkm_Venonat': -0.06, 'pkm_Weedle': -0.09, 'pkm_Zubat': -0.17}
Rattata {'pkm_Spearow': -0.08, 'pkm_Venonat

In [322]:
from scipy.stats import pearsonr

In [396]:
pval_dict = {}
for pkm1 in corr_dict.keys():
    print(pkm1)
    pvals_list = []
    for pkm2 in corr_dict[pkm1].keys():
        pval = pearsonr(df[pkm1], df[pkm2])[1].round(5)
        pvals_list.append({pkm2: pval})
    pval_dict[pkm1] = pvals_list

pkm_Caterpie
pkm_Doduo
pkm_Eevee
pkm_Ekans
pkm_Magikarp
pkm_Nidoran♂
pkm_Paras
pkm_Pidgey
pkm_Rattata
pkm_Spearow
pkm_Venonat
pkm_Weedle


In [397]:
pval_dict

{'pkm_Caterpie': [{'pkm_Pidgey': 0.0},
  {'pkm_Rattata': 0.0},
  {'pkm_Spearow': 0.0},
  {'pkm_Zubat': 0.0}],
 'pkm_Doduo': [{'pkm_Pidgey': 0.0},
  {'pkm_Rattata': 0.0},
  {'pkm_Spearow': 0.0},
  {'pkm_Zubat': 0.0}],
 'pkm_Eevee': [{'pkm_Ekans': 0.0},
  {'pkm_Paras': 0.0},
  {'pkm_Pidgey': 0.0},
  {'pkm_Rattata': 0.0},
  {'pkm_Spearow': 0.0},
  {'pkm_Weedle': 0.0},
  {'pkm_Zubat': 0.0}],
 'pkm_Ekans': [{'pkm_Paras': 0.0},
  {'pkm_Pidgey': 0.0},
  {'pkm_Rattata': 0.0},
  {'pkm_Spearow': 0.0},
  {'pkm_Weedle': 0.0},
  {'pkm_Zubat': 0.0}],
 'pkm_Magikarp': [{'pkm_Pidgey': 0.0},
  {'pkm_Rattata': 0.0},
  {'pkm_Spearow': 0.0},
  {'pkm_Zubat': 0.0}],
 'pkm_Nidoran♂': [{'pkm_Pidgey': 0.0},
  {'pkm_Rattata': 0.0},
  {'pkm_Zubat': 0.0}],
 'pkm_Paras': [{'pkm_Pidgey': 0.0},
  {'pkm_Rattata': 0.0},
  {'pkm_Spearow': 0.0},
  {'pkm_Weedle': 0.0},
  {'pkm_Zubat': 0.0}],
 'pkm_Pidgey': [{'pkm_Poliwag': 0.0},
  {'pkm_Psyduck': 0.0},
  {'pkm_Rattata': 0.0},
  {'pkm_Sandshrew': 0.0},
  {'pkm_Spearow': 0

In [390]:
pd.DataFrame(corr_dict)

Unnamed: 0,pkm_Caterpie,pkm_Doduo,pkm_Eevee,pkm_Ekans,pkm_Magikarp,pkm_Nidoran♂,pkm_Paras,pkm_Pidgey,pkm_Rattata,pkm_Spearow,pkm_Venonat,pkm_Weedle
pkm_Pidgey,-0.07,-0.07,-0.08,-0.08,-0.06,-0.06,-0.08,,,,,
pkm_Rattata,-0.05,-0.05,-0.06,-0.06,-0.05,-0.04,-0.07,-0.13,,,,
pkm_Spearow,-0.04,-0.04,-0.04,-0.05,-0.04,,-0.05,-0.1,-0.08,,,
pkm_Zubat,-0.06,-0.06,-0.07,-0.08,-0.06,-0.05,-0.08,-0.17,-0.13,-0.09,-0.06,-0.08
pkm_Ekans,,,-0.04,,,,,,,,,
pkm_Paras,,,-0.04,-0.04,,,,,,,,
pkm_Weedle,,,-0.04,-0.04,,,-0.04,-0.09,-0.07,-0.05,,
pkm_Poliwag,,,,,,,,-0.05,,,,
pkm_Psyduck,,,,,,,,-0.04,,,,
pkm_Sandshrew,,,,,,,,-0.04,,,,


In [401]:
pval_dict

{'pkm_Caterpie': [{'pkm_Pidgey': 0.0},
  {'pkm_Rattata': 0.0},
  {'pkm_Spearow': 0.0},
  {'pkm_Zubat': 0.0}],
 'pkm_Doduo': [{'pkm_Pidgey': 0.0},
  {'pkm_Rattata': 0.0},
  {'pkm_Spearow': 0.0},
  {'pkm_Zubat': 0.0}],
 'pkm_Eevee': [{'pkm_Ekans': 0.0},
  {'pkm_Paras': 0.0},
  {'pkm_Pidgey': 0.0},
  {'pkm_Rattata': 0.0},
  {'pkm_Spearow': 0.0},
  {'pkm_Weedle': 0.0},
  {'pkm_Zubat': 0.0}],
 'pkm_Ekans': [{'pkm_Paras': 0.0},
  {'pkm_Pidgey': 0.0},
  {'pkm_Rattata': 0.0},
  {'pkm_Spearow': 0.0},
  {'pkm_Weedle': 0.0},
  {'pkm_Zubat': 0.0}],
 'pkm_Magikarp': [{'pkm_Pidgey': 0.0},
  {'pkm_Rattata': 0.0},
  {'pkm_Spearow': 0.0},
  {'pkm_Zubat': 0.0}],
 'pkm_Nidoran♂': [{'pkm_Pidgey': 0.0},
  {'pkm_Rattata': 0.0},
  {'pkm_Zubat': 0.0}],
 'pkm_Paras': [{'pkm_Pidgey': 0.0},
  {'pkm_Rattata': 0.0},
  {'pkm_Spearow': 0.0},
  {'pkm_Weedle': 0.0},
  {'pkm_Zubat': 0.0}],
 'pkm_Pidgey': [{'pkm_Poliwag': 0.0},
  {'pkm_Psyduck': 0.0},
  {'pkm_Rattata': 0.0},
  {'pkm_Sandshrew': 0.0},
  {'pkm_Spearow': 0

In [324]:
df_pvals = pd.DataFrame(index=pokemon_dummies)
for col_pkm in pokemon_dummies:
    pvals_list = []
    for row_pkm in pokemon_dummies:
        pvals_list.append(pearsonr(df[col_pkm],df[row_pkm])[1])
    df_pvals[col_pkm] = pvals_list


An input array is constant; the correlation coefficient is not defined.


An input array is constant; the correlation coefficient is not defined.


An input array is constant; the correlation coefficient is not defined.


An input array is constant; the correlation coefficient is not defined.


An input array is constant; the correlation coefficient is not defined.


An input array is constant; the correlation coefficient is not defined.


An input array is constant; the correlation coefficient is not defined.


An input array is constant; the correlation coefficient is not defined.


An input array is constant; the correlation coefficient is not defined.


An input array is constant; the correlation coefficient is not defined.


An input array is constant; the correlation coefficient is not defined.


An input array is constant; the correlation coefficient is not defined.


An input array is constant; the correlation coefficient is not defined.


An input array is constant; the corre


An input array is constant; the correlation coefficient is not defined.


An input array is constant; the correlation coefficient is not defined.


An input array is constant; the correlation coefficient is not defined.


An input array is constant; the correlation coefficient is not defined.


An input array is constant; the correlation coefficient is not defined.


An input array is constant; the correlation coefficient is not defined.


An input array is constant; the correlation coefficient is not defined.


An input array is constant; the correlation coefficient is not defined.


An input array is constant; the correlation coefficient is not defined.


An input array is constant; the correlation coefficient is not defined.


An input array is constant; the correlation coefficient is not defined.



KeyboardInterrupt: 

In [13]:
# Step 1 - select K random points as initial centroids
# Step 2 - calculate the Euclidean distance between each point and each centroid, and assign each point to its nearest centroid
# Step 3 - update centroids by calculating the mid point of each cluster
# Step 4 - repeat steps 2 and 3 until no changes

In [469]:
# '''FOR LEARNING, NOT SUITABLE FOR LARGE NUMBER OF CLUSTERS'''
# def k_means_from_scratch(data, k):
#     random.seed(0)
#     centroids = random.sample(data,k)
#     counter = 0
#     while counter < 1000:
#         clustering = {cluster:np.empty(shape=(0,2)) for cluster in np.arange(k)}
#         # for each data point
#         for i, spwn in enumerate(data):
#             # calculate distance from each centroid
#             distances = [((spwn[0]-c[0])**2+(spwn[1]-c[1])**2)**.5 for c in centroids]
#             # add data point to the cluster of the centroid that it is closest to
#             clustering[np.argmin(distances)] = np.vstack([clustering[np.argmin(distances)],spwn])

#         # calculate new centroids as the mid point of each cluster 
#         new_centroids = np.array([clustering[c].mean(axis=0) for c in np.arange(k)])

#         # update centroids until there are no longer changing
#         if np.array_equiv(new_centroids,centroids):
#             print(f'Converged, final centroids: {centroids}')
#             break

#         centroids = new_centroids
#         counter += 1
        
#         clustering_dict = {key:{'centroid':centroids[key], 
#                                 'size':len(clustering[key]),
#                                 'points':clustering[key]} for key, values in clustering.items()}

#     return clustering_dict

In [474]:
# pikachu_spawns = np.array(df[df['name']=='Pikachu'][['lat', 'lng']]).tolist()
# pikachu_clusters = k_means_from_scratch(pikachu_spawns, 10)
# df_pikachu_centroids = pd.DataFrame([i['centroid'] for i in list(pikachu_clusters.values())], columns=['lat', 'lng'])
# df_pikachu_centroids['size'] = [i['size'] for i in list(pikachu_clusters.values())]

In [495]:
df

Unnamed: 0,s2_id,s2_token,num,name,lat,lng,encounter_ms,disppear_ms,pkm_Abra,pkm_Aerodactyl,...,pkm_Victreebel,pkm_Vileplume,pkm_Voltorb,pkm_Vulpix,pkm_Wartortle,pkm_Weedle,pkm_Weepinbell,pkm_Weezing,pkm_Wigglytuff,pkm_Zubat
0,-9.185795e+18,8085808cc6d,13.0,Weedle,37.793592,-122.408721,1.469520e+12,1.469520e+12,0,0,...,0,0,0,0,0,1,0,0,0,0
1,-9.185795e+18,8085808b51d,16.0,Pidgey,37.794746,-122.406420,1.469520e+12,1.469520e+12,0,0,...,0,0,0,0,0,0,0,0,0,0
2,-9.185795e+18,8085808b271,41.0,Zubat,37.794999,-122.404384,1.469521e+12,1.469520e+12,0,0,...,0,0,0,0,0,0,0,0,0,1
3,-9.185794e+18,808580f3587,16.0,Pidgey,37.795644,-122.407128,-1.000000e+00,1.469520e+12,0,0,...,0,0,0,0,0,0,0,0,0,0
4,-9.185794e+18,808580f4b1d,60.0,Poliwag,37.795592,-122.406331,1.469521e+12,1.469520e+12,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
308529,,,,,,,,,0,0,...,0,0,0,0,0,0,0,0,0,0
308530,,,,,,,,,0,0,...,0,0,0,0,0,0,0,0,0,0
308531,,,,,,,,,0,0,...,0,0,0,0,0,0,0,0,0,0
308532,,,,,,,,,0,0,...,0,0,0,0,0,0,0,0,0,0


In [576]:
df_counts = df.groupby('name').agg({'s2_id':'count'}).reset_index().sort_values('name').rename(columns={'s2_id':'count'})
# df_counts[(df_counts['count']>1000)]
df_counts['count'].describe()

count      137.000000
mean      1620.284672
std       4490.644184
min          2.000000
25%         34.000000
50%        168.000000
75%       1247.000000
max      32532.000000
Name: count, dtype: float64

In [520]:
df_centroids_dict = {}
for pokemon in ['Pikachu', 'Squirtle', 'Bulbasaur','Charmander', 'Raticate']:
    print(pokemon, len(df[df['name']==pokemon]))
    df_lat_lng = np.array(df[df['name']==pokemon][['lat', 'lng']]).tolist()
    kmeans = KMeans(n_clusters=20, random_state=42, n_init="auto").fit(df_lat_lng)
    df_centroids = pd.DataFrame(kmeans.cluster_centers_, columns=['lat', 'lng'])
    df_centroids['size'] = pd.Series(kmeans.labels_).value_counts()
    df_centroids_dict[pokemon] = df_centroids

Pikachu 1435
Squirtle 603
Bulbasaur 1766
Charmander 326
Raticate 639


In [546]:
from sklearn.cluster import AgglomerativeClustering
df_aggcentroids_dict = {}
for pokemon in ['Pikachu', #'Squirtle', 'Bulbasaur','Charmander', 'Raticate'
               ]:
    print(pokemon, len(df[df['name']==pokemon]))
    df_lat_lng = np.array(df[df['name']==pokemon][['lat', 'lng']]).tolist()
    aggclustering = AgglomerativeClustering(n_clusters=None, distance_threshold=0.1, compute_distances=True).fit(df_lat_lng)
#     df_aggcentroids = pd.DataFrame({'label':aggclustering.labels_, 'distances':aggclustering.distances_})
    
#     df_aggcentroids_dict[pokemon] = df_aggcentroids

Pikachu 1435


In [566]:
df_ = pd.DataFrame(df_lat_lng).rename(columns={0:'lat', 1:'lng'})

In [567]:
df_

Unnamed: 0,lat,lng
0,37.759913,-122.422614
1,37.304823,-121.948866
2,37.304638,-121.954732
3,37.304638,-121.954732
4,37.304823,-121.948866
...,...,...
1430,37.845754,-122.404119
1431,37.874600,-122.257900
1432,37.874600,-122.257900
1433,37.874600,-122.257900


In [569]:

# df_ = df[df['name']==pokemon][['lat', 'lng']]
df_['cluster_label'] = pd.Series(aggclustering.labels_)

df_counts = df_.groupby('cluster_label').agg({'lat':'count'}).reset_index().rename(columns={'lat':'size'})
clusters_gt_10 = df_counts[df_counts['size']>10]['cluster_label'].to_list()


df_ = df_[df_['cluster_label'].isin(clusters_gt_10)]

df_

Unnamed: 0,lat,lng,cluster_label
0,37.759913,-122.422614,58
1,37.304823,-121.948866,1
2,37.304638,-121.954732,1
3,37.304638,-121.954732,1
4,37.304823,-121.948866,1
...,...,...,...
1426,37.840859,-122.366056,69
1427,37.840859,-122.366056,69
1428,37.845754,-122.404119,28
1429,37.845754,-122.404119,28


In [561]:
clusters_gt_10

[]

In [608]:
fig = go.Figure()

for cluster in clusters_gt_10:
    df__ = df_[df_['cluster_label']==cluster]
    fig.add_trace(go.Scattermapbox(
        lat=df__['lat'],  
        lon=df__['lng'], 
        name=cluster,
        mode='markers',
        marker=go.scattermapbox.Marker(
            size=5, #df_centroids_dict[pokemon]['size']*.5,
            #color='red',
            opacity=.7
        )
    ))

# update mapbox style and layout
fig.update_layout(
    mapbox_style='stamen-toner', #'mapbox://styles/mapbox/light-v10',
    mapbox_zoom=8,
    mapbox_center={'lat':37.65, 'lon':-122.2},
    width=800,
    height=600
)

fig.show()

In [541]:
pd.Series(aggclustering.distances_).describe()

count    1434.000000
mean        0.034606
std         0.294408
min         0.000000
25%         0.000000
50%         0.000000
75%         0.007563
max         7.895866
dtype: float64

In [540]:
pd.Series(aggclustering.distances_).value_counts()

0.000000    949
0.026456      1
0.032189      1
0.032140      1
0.032059      1
           ... 
0.009427      1
0.009243      1
0.009199      1
0.009179      1
7.895866      1
Length: 486, dtype: int64

In [521]:
# kmeans = KMeans(n_clusters=20, random_state=42, n_init="auto").fit()
# df_pikachu_centroids_sklearn = pd.DataFrame(kmeans.cluster_centers_, columns=['lat', 'lng'])
# df_pikachu_centroids_sklearn['size'] = pd.Series(kmeans.labels_).value_counts()

In [522]:
fig = go.Figure()

for pokemon in [#'Pikachu', 
                #'Bulbasaur', 
                'Squirtle', 
                'Charmander', 
                'Raticate']:
    
    fig.add_trace(go.Scattermapbox(
        lat=df_centroids_dict[pokemon]['lat'],  
        lon=df_centroids_dict[pokemon]['lng'], 
        name=pokemon,
        mode='markers',
        marker=go.scattermapbox.Marker(
            size=df_centroids_dict[pokemon]['size']*.5,
            #color='red',
            opacity=.7
        )
    ))

# update mapbox style and layout
fig.update_layout(
    mapbox_style='open-street-map', #'stamen-toner', #'mapbox://styles/mapbox/light-v10',
    mapbox_zoom=8,
    mapbox_center={'lat':37.65, 'lon':-122.2},
    width=800,
    height=600
)

fig.show()

In [583]:

df_counts[(df_counts['count']>np.percentile(df_counts['count'], 50))&(df_counts['count']<np.percentile(df_counts['count'], 75))].sort_values('count').head()

rare_pokemons = df_counts[df_counts['count']<10]['name'].tolist()

In [587]:
df_counts[df_counts['count']<10]['name'].tolist()

['Alakazam',
 'Charizard',
 'Cloyster',
 'Dewgong',
 'Gyarados',
 'Machamp',
 'Muk',
 'Ninetales',
 'Omastar',
 'Poliwrath',
 'Snorlax',
 'Vileplume']

In [607]:
fig = px.density_mapbox(df[df['name'].isin(rare_pokemons)], lat='lat', lon='lng',
                        mapbox_style='open-street-map',
                        zoom=8.3,
                        center={'lat':37.65, 'lon':-122.2},
                        radius=7,
                       width=800,
                       height=600)

fig.show()

In [606]:
rare_pokemons = df_counts[df_counts['count']<5]['name'].tolist()
fig = px.density_mapbox(df[df['name'].isin(rare_pokemons)], lat='lat', lon='lng',
                        mapbox_style='open-street-map',
                        zoom=8.3,
                        center={'lat':37.65, 'lon':-122.2},
                        radius=7,
                       width=800,
                       height=600)
fig.show()