# Find Cities in Songs

1. Load json of Dylan songs/lyrics
2. Use the `spaCy` package for named entity recognition (NER)
3. Cross reference results of ner with csv of cities and their coordinates, to produce csv with cities, lat/lon, and count of references in songs

## Load Song Data

In [8]:
import pandas as pd

In [9]:
# Read the song records from JSON and key them by song title
df = pd.read_json('../data/songs.json').set_index('title')

In [10]:
# Peek at the first 8 entries
df.head(8)

Unnamed: 0_level_0,albums,author,lyrics,url
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
‘Cross The Green Mountain,"[The Bootleg Series, Vol 8: Tell Tale Signs]",,,https://bobdylan.com/songs/cross-green-mountain/
‘Til I Fell In Love With You,[Time Out Of Mind],Bob Dylan,"Well, my nerves are exploding and my body’s te...",https://bobdylan.com/songs/til-i-fell-love-you/
"10,000 Men",[Under The Red Sky],Bob Dylan,Ten thousand men on a hill\r\nTen thousand men...,https://bobdylan.com/songs/10000-men/
2 Dollars and 99 Cents,"[The Bootleg Series, Vol. 11: The Basement Tap...",Bob Dylan,,https://bobdylan.com/songs/2-dollars-and-99-ce...
2 X 2,[Under The Red Sky],Bob Dylan,"One by one, they followed the sun\r\nOne by on...",https://bobdylan.com/songs/2-x-2/
32-20 Blues,"[The Bootleg Series, Vol 8: Tell Tale Signs]",Robert Johnson,,https://bobdylan.com/songs/32-20-blues/
900 Miles from My Home,"[The Bootleg Series, Vol. 11: The Basement Tap...","Traditional, arranged by Bob Dylan",,https://bobdylan.com/songs/900-miles-my-home/
A Fool Such As I,"[The Bootleg Series, Vol. 11: The Basement Tap...",B. Abner,,https://bobdylan.com/songs/fool-such-i/


## Identify Places in Lyrics

In [11]:
import re
import itertools

In [12]:
# Reference table of world cities (one row per city)
city_meta_df = pd.read_csv('../data/simplemaps-worldcities-basic.csv')

In [13]:
# City names to exclude from matching.
# NOTE(review): presumably these are dropped because they collide with
# ordinary words, first names, or parts of longer place names -- confirm.
drop_cities = [
    'Man', 'San', 'Orleans', 'Mary', 'York',
    'Young', 'Same', 'Ye', 'Darwin', 'Orange',
    'George', 'Bo', 'Leo', 'Gay', 'Buy',
    'Split', 'Nice', 'Nancy', 'Montana', 'Florida',
]

# Keep only the rows whose city name is not in the exclusion list
city_meta_df = city_meta_df.loc[~city_meta_df['city'].isin(drop_cities)]

In [14]:
# Index of known city names, used for membership tests in get_cities
city_list = pd.Index(city_meta_df['city'])

def get_cities(word_list):
    """Return the distinct entries of ``word_list`` that are known city names.

    Membership is tested against the module-level ``city_list``. The result
    is a list built from a set, so duplicates are removed and the order of
    the returned names carries no meaning.
    """
    matches = set()
    for candidate in word_list:
        if candidate in city_list:
            matches.add(candidate)
    return list(matches)

def n_grams(word_list, n=2):
    """Return every run of ``n`` consecutive words joined by single spaces.

    Parameters
    ----------
    word_list : list of str
        Words in their original order.
    n : int, default 2
        Number of words per gram.

    Returns
    -------
    list of str
        The ``len(word_list) - n + 1`` grams in order of appearance, or an
        empty list when ``word_list`` holds fewer than ``n`` words.
    """
    # A sequence of length L has L - n + 1 windows of width n. The previous
    # bound (L - n) silently dropped the final window, so a two-word input
    # such as ['New', 'York'] produced no bigram at all.
    return [' '.join(word_list[start:start + n])
            for start in range(len(word_list) - n + 1)]

def extract_cities(text):
    """Return the known city names mentioned in ``text``.

    Candidates are the capitalized words matched by ``[A-Z][a-z]+`` plus the
    bigrams formed from consecutive candidates; each group is filtered
    through ``get_cities``.

    Caveats visible in the implementation:
      * The pattern only matches unaccented ASCII letters.
      * Bigrams pair capitalized words that follow each other in the list of
        matches, so the two words need not be adjacent in the text itself.

    Parameters
    ----------
    text : str
        Song lyrics. Any non-string value (for example ``None`` or ``NaN``
        for a song without lyrics) yields an empty list.

    Returns
    -------
    list of str
        Single-word matches followed by two-word matches.
    """
    # re.findall raises TypeError on non-strings; treat missing lyrics as
    # "no places" so that one bad row cannot abort a whole-column apply.
    if not isinstance(text, str):
        return []
    cap_words = re.findall(r'([A-Z][a-z]+)', text)
    two_grams = n_grams(cap_words, n=2)
    return get_cities(cap_words) + get_cities(two_grams)

In [15]:
# List the city names found in each song's lyrics
df['places'] = df['lyrics'].apply(extract_cities)

## Count place references

In [16]:
from collections import Counter, defaultdict

c = Counter()          # count appearances of each place
p = defaultdict(list)  # map places to the titles of songs mentioning them
l = defaultdict(list)  # map places to the lyrics of songs mentioning them

# Walk the title index alongside the places and lyrics columns
for title, places, lyrics in zip(df.index, df['places'], df['lyrics']):
    c.update(places)
    for place in places:
        p[place].append(title)
        l[place].append(lyrics)

### Merge place count data with city/coordinates data

In [17]:
# Place counts as a frame, most common first
places_df = pd.DataFrame(c.most_common(), columns=['city', 'cnt'])

# Place -> song titles as a frame
song_map = pd.DataFrame(list(p.items()), columns=['city', 'songs'])

# Place -> lyrics as a frame
lyrics_map = pd.DataFrame(list(l.items()), columns=['city', 'lyrics'])

# Join counts, songs, lyrics and city metadata on the city name
city_df = (places_df
           .merge(song_map, on='city')
           .merge(lyrics_map, on='city')
           .merge(city_meta_df, on='city'))

In [18]:
# Several cities can share a name. Sort by population within each name so
# that the most populous one comes last, then keep only that last row.
city_df = city_df.sort_values(by=['city', 'pop'])
city_df = city_df.drop_duplicates(subset='city', keep='last')

# Drop some columns
city_df = city_df.drop(columns=['city_ascii', 'pop', 'iso2', 'iso3'])

In [19]:
# Most-referenced places first, then preview the top ten
city_df = city_df.sort_values(by='cnt', ascending=False)
city_df.head(10)

Unnamed: 0,city,cnt,songs,lyrics,lat,lng,country,province
1,New York,6,"[Hard Times In New York Town, I Shall Be Free ...","[Come you ladies and you gentlemen, a-listen t...",40.749979,-73.980017,United States of America,New York
0,New Orleans,6,"[Blind Willie McTell, Bob Dylan’s New Orleans ...","[Seen the arrow on the doorpost\r\nSaying, “Th...",29.995002,-90.039967,United States of America,Louisiana
2,Memphis,5,"[Gypsy Lou, Kingsport Town, Someone’s Got A Ho...","[If you getcha one girl, better get two\r\nCas...",35.119987,-89.999995,United States of America,Tennessee
9,London,4,"[Jack-A-Roe, Not Dark Yet, Something’s Burning...","[Oh, there was a wealthy merchant, in London h...",51.499995,-0.116722,United Kingdom,Westminster
3,El Paso,4,"[Billy 1, Billy 4, She’s Your Lover Now, Wante...",[There’s guns across the river aimin’ at ya\r\...,31.779984,-106.509995,United States of America,Texas
5,San Francisco,4,"[California, Maybe Someday, She’s Your Lover N...",[I’m goin’ down south\r\n’Neath the borderline...,37.740008,-122.459978,United States of America,California
7,Jackson,4,"[Catfish, George Jackson, Outlaw Blues, Rambli...",[Lazy stadium night\r\nCatfish on the mound\r\...,32.298815,-90.184997,United States of America,Mississippi
12,Lincoln,4,"[Long Ago, Far Away, Senor (Tales Of Yankee Po...","[To preach of peace and brotherhood\r\nOh, wha...",40.819975,-96.680001,United States of America,Nebraska
30,Juarez,2,"[Just Like Tom Thumb’s Blues, Wanted Man]",[When you’re lost in the rain in Juarez\r\nAnd...,-37.665519,-59.80003,Argentina,Ciudad de Buenos Aires
18,Chicago,2,"[Cold Irons Bound, The Death Of Emmett Till]",[I’m beginning to hear voices and there’s no o...,41.829991,-87.750055,United States of America,Illinois


In [20]:
# Write the table as CSV, leaving out the row index
city_df.to_csv(path_or_buf='../data/city_counts.csv', index=False)

# Write the same table as JSON
city_df.to_json(path_or_buf='../data/city_counts.json')

In [21]:
# City metadata indexed by city name.
# NOTE(review): this rebinds `c`, which earlier held the Counter of place
# references, and no later visible cell reads it -- re-running the merge
# cell after this one would fail; consider a distinct name or removing it.
c = city_meta_df.set_index('city')

In [34]:
def geojsonify_row(idx, row):
    """Build a GeoJSON-style Point ``Feature`` dict for one city row.

    Parameters
    ----------
    idx
        Identifier stored under ``properties["id"]``.
    row
        Object exposing ``lng``, ``lat``, ``city``, ``cnt``, ``songs`` and
        ``lyrics`` attributes.

    Returns
    -------
    dict
        Feature whose coordinates are ordered ``[lng, lat]``.
    """
    point = {
        "type": "Point",
        "coordinates": [row.lng, row.lat],
    }
    details = {
        "id": idx,
        "name": row.city,
        "cnt": row.cnt,
        "songs": row.songs,
        "lyrics": row.lyrics,
    }
    return {"type": 'Feature', 'geometry': point, "properties": details}

In [35]:
def geojsonify(df):
    """Convert every row of ``df`` into a feature dict.

    Each ``(index, row)`` pair from ``df.iterrows()`` is passed to
    ``geojsonify_row``; the resulting dicts are returned as a list in row
    order.
    """
    features = []
    for idx, row in df.iterrows():
        features.append(geojsonify_row(idx, row))
    return features

In [36]:
import json

# A GeoJSON document must be a single object, so wrap the per-city features
# in a FeatureCollection rather than serializing a bare list. Consumers that
# dispatch on the top-level "type" (Bokeh's GeoJSONDataSource among them)
# need this wrapper.
j = {"type": "FeatureCollection", "features": geojsonify(city_df)}
with open('../data/places.geojson', 'w') as fp:
    json.dump(j, fp)

## Mapping

In [38]:
from ipyleaflet import Map, GeoJSON

In [39]:
# Load a GeoJSON file from the notebook's working directory.
# NOTE(review): no visible cell writes 'info.geojson' or reads `data`
# afterwards -- confirm whether this cell is still needed.
with open('info.geojson') as f:
    data = json.load(f)

In [41]:
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
# Configure Bokeh so that show() renders figures inside the notebook
output_notebook()

In [42]:
from bokeh.tile_providers import CARTODBPOSITRON

In [43]:
from bokeh.models import GeoJSONDataSource

In [44]:
from bokeh.sampledata.sample_geojson import geojson

In [45]:
# Serialize `j` (built in the GeoJSON export cell) and hand the resulting
# string to Bokeh as a GeoJSON data source
geo_source = GeoJSONDataSource(geojson=json.dumps(j))

In [46]:
# Plot one circle per feature in `geo_source`, using its 'x' and 'y' columns.
# NOTE(review): this rebinds `p`, which earlier held the place -> songs
# mapping; re-running the merge cell after this one would fail.
p = figure()
p.circle(x='x', y='y', size=15, alpha=0.7, source=geo_source)

In [47]:
# p.add_tile(CARTODBPOSITRON)

In [48]:
# Display the circle plot
show(p)

In [245]:
# Rebind `geo_source` to the sample GeoJSON imported from bokeh.sampledata.
# NOTE(review): no later visible cell attaches this source to a glyph --
# confirm whether this experiment cell is still needed.
geo_source = GeoJSONDataSource(geojson=geojson)

In [246]:
# New figure with mercator-typed axes; range bounds supplied in web mercator
# coordinates. Only the tile layer is added here -- no glyphs are drawn.
p = figure(x_range=(-2000000, 6000000), y_range=(-1000000, 7000000),
           x_axis_type="mercator", y_axis_type="mercator")

# Add the CARTODBPOSITRON tile provider as the background layer
p.add_tile(CARTODBPOSITRON)

In [248]:
# Display the tiled mercator figure
show(p)

In [15]:
# Save to csv
# NOTE(review): this repeats the earlier save cell but writes under 'data/'
# instead of '../data/', and its execution count is out of sequence --
# confirm which path is intended and drop the duplicate cell.
city_df.to_csv('data/city_counts.csv', index=False)

# Save to Json
city_df.to_json('data/city_counts.json')