# Web Scraping Notebook
- **Second half of data compilation.**
    - This notebook continues from the "SQL_Data_Cleaning" notebook
    - Artist features will be scraped from website and added to `ticket_clean_df`.
    - The five features are as follows:
        - Years active
            - Difference between date of show and date of first album release ("Founded Date" is less effective because it is the date of birth for any artists that go by their own name, e.g. Paul Simon).
        - Miles from hometown
            - Distance between "Founded in" location and Oakland, CA
        - US Region
            - Based on artist_hometown column, assign US region to each artist. 
            - If artist is not from the US, then 'International' will be the assigned region.
        - Local or Not
            - If artist is within 75 miles of Fox Theater, then they are considered local.
            - Otherwise, the artist is labeled as from the US or international.
        - Genre
            - List of genres under header "Genres"
    - Data will be scraped from musicbrainz.org (see screenshot below)

![Screen%20Shot%202019-10-13%20at%2012.00.21%20PM.png](attachment:Screen%20Shot%202019-10-13%20at%2012.00.21%20PM.png)

---

In [813]:
from __future__ import print_function, division

import os

import pandas as pd
import numpy as np
import pickle
import collections

import time
from tqdm.notebook import tqdm

import re
from bs4 import BeautifulSoup
import requests
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
# from selenium.webdriver.common.keys import Keys
import time
from geopy.geocoders import Nominatim
from geopy.distance import geodesic

# import chromedriver_binary

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 50)
pd.set_option('display.precision', 3)

In [5]:
import requests

requests.__path__

['/Users/andrewgraves/anaconda3/lib/python3.7/site-packages/requests']

In [644]:
with open('Data/sql_shows_artists.pkl','rb') as read_file:
    ticket_clean_df = pickle.load(read_file)

# Selenium Loop
- Headliner name will be entered into search bar on musicbrainz.org and first option of results will be selected for scraping artist data.
- The following data will be scraped:
    - Artist "start" date, i.e. date of first album release
    - Artist "founded" date
    - Artist hometown
    - Artist genre list    

In [8]:
headliners_to_scrape = ticket_clean_df.headliner.values

In [12]:
# artist_age_list = []
# found_date_list = []
# hometown_list = []
# genre_list = []

# driver = webdriver.Chrome(ChromeDriverManager().install());
# url = 'http://musicbrainz.org'
# driver.get(url);

# for artist in headliners_to_scrape:
    
#     query = driver.find_element_by_id('headerid-query');
#     query.send_keys(artist);
#     query.send_keys(Keys.RETURN);
    
#     name_selector = '//bdi[text()[contains(., artist)]]'
#     search = driver.find_element_by_xpath(name_selector)
    
#     try:
#         search.click();
#         current_url = driver.current_url
    
#         soup = BeautifulSoup(driver.page_source, 'html.parser')
#         soup.prettify();
        
#         if [loc for loc in soup.find_all(class_='c')]:
#             artist_age_list.append([loc for loc in soup.find_all(class_='c')][3].text)
#         else:
#             artist_age_list.append(np.nan)       
#         if soup.find(class_='begin-date'):
#             found_date_list.append(soup.find(class_='begin-date').text[:4])
#         else:
#             found_date_list.append(np.nan)
#         if soup.find(class_='begin_area'):
#             hometown_list.append(soup.find(class_='begin_area').text);
#         else:
#             hometown_list.append(np.nan)
#         if soup.find(class_='genre-list'):
#             genre_list.append(soup.find(class_='genre-list').text.split(','))
#         else:
#             genre_list.append(np.nan)
#     except:
#         artist_age_list.append(np.nan)
#         found_date_list.append(np.nan)
#         hometown_list.append(np.nan)
#         genre_list.append(np.nan)
        
    

[WDM] - Current google-chrome version is 83.0.4103
[WDM] - Get LATEST driver version for 83.0.4103
[WDM] - There is no [mac64] chromedriver for browser 83.0.4103 in cache
[WDM] - Get LATEST driver version for 83.0.4103
[WDM] - Trying to download new driver from http://chromedriver.storage.googleapis.com/83.0.4103.39/chromedriver_mac64.zip


 


[WDM] - Driver has been saved in cache [/Users/andrewgraves/.wdm/drivers/chromedriver/mac64/83.0.4103.39]


---

## Add scraped features to `ticket_clean_df`

In [613]:
# Compile all scraped features into a single dataframe.

# scraped_features_dict = {'artist_found_date': [int(year) if type(year) == str and len(year) > 1 else np.nan for year in found_date_list],
#                          'artist_start_date': [int(year) if type(year) == str and len(year) > 1 else np.nan for year in artist_age_list],
#                          'artist_hometown': hometown_list,
#                          'artist_genre': genre_list}
# scraped_features_df = pd.DataFrame(scraped_features_dict)
# scraped_features_df

In [636]:
# with open('Data/Web_Scrape_1/scraped_features.pkl', 'wb') as to_write:
#     pickle.dump(scraped_features_df, to_write)

In [645]:
with open('Data/Web_Scrape_2/fill_nan.pkl','rb') as read_file:
    fill_nan_df = pickle.load(read_file)

In [646]:
fill_nan_df.drop(['headliner'], axis=1, inplace=True)

In [647]:
with open('Data/Web_Scrape_1/scraped_features.pkl','rb') as read_file:
    scraped_features_df = pickle.load(read_file)

In [648]:
ticket_clean_df['artist_start_date']= scraped_features_df['artist_start_date']
ticket_clean_df['artist_hometown'] = scraped_features_df['artist_hometown']
ticket_clean_df['artist_genre'] = scraped_features_df['artist_genre']
ticket_clean_df

Unnamed: 0,event_date,year,month,season,day_of_week,time_of_week,num_shows,show_type,headliner,support,num_support,tickets_sold,gross_usd,venue_capacity,percentage_sold,ticket_price_min,ticket_price_max,artist_start_date,artist_hometown,artist_genre
0,2009-02-06,2009,February,Winter,Friday,Weekend,1,single headliner,Social Distortion,"The Black Tibetans, The Devil Makes Three",2,2800.0,91000.0,2800.0,100.0,32.5,32.5,1983.0,"Fullerton, California, United States","[punk, punk rock]"
1,2009-02-07,2009,February,Winter,Saturday,Weekend,1,single headliner,Michael Franti & Spearhead,"ALO (Animal Liberation Orchestra), Solillaquis...",2,2800.0,88563.0,2800.0,100.0,32.5,32.5,1994.0,"San Francisco, California, United States",[(none)]
2,2009-02-13,2009,February,Winter,Friday,Weekend,1,single headliner,Will Downing,Gerald Albright,1,1947.0,74893.0,1947.0,100.0,39.5,49.5,1988.0,"Brooklyn, New York, New York, United States",[soul]
3,2009-02-20,2009,February,Winter,Friday,Weekend,1,festival,K'naan,"Julian Marley, K'naan, Lee ""Scratch"" Perry, Ro...",5,2163.0,81763.0,2800.0,78.0,37.5,40.0,2006.0,"Mogadishu, Banaadir, Somalia",[hip hop]
4,2009-02-21,2009,February,Winter,Saturday,Weekend,1,single headliner,CAKE,The Lovemakers,1,2800.0,98000.0,2800.0,100.0,35.0,35.0,1994.0,"Sacramento, California, United States","[alternative rock, indie rock, rock, pop rock]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
853,2019-08-14,2019,August,Summer,Wednesday,Weekday,1,single headliner,Kirk Franklin,Travele Judon,1,1288.0,85120.0,1847.0,70.0,45.0,85.0,1993.0,"Fort Worth, Texas, United States",[gospel]
854,2019-08-23,2019,August,Summer,Friday,Weekend,1,single headliner,Daniel Caesar,Koffee,1,2957.0,107410.0,2957.0,100.0,35.0,49.5,2017.0,"Toronto, Ontario, Canada",[r&b]
855,2019-08-31,2019,August,Summer,Saturday,Weekend,1,single headliner,Bryan Ferry,Femme Schmidt,1,1931.0,172015.0,1931.0,100.0,59.5,249.5,1973.0,"Washington, Sunderland, Tyne and Wear, England...","[glam rock, art pop, art rock, pop, pop rock]"
856,2019-09-05,2019,September,Autumn,Thursday,Weekend,2,multiple shows,King Crimson,,0,1610.5,155522.0,1900.0,85.0,65.0,149.5,1969.0,"London, England, United Kingdom","[progressive rock, art rock, free improvisat..."


In [649]:
ticket_clean_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 858 entries, 0 to 857
Data columns (total 20 columns):
event_date           858 non-null object
year                 858 non-null int64
month                858 non-null object
season               858 non-null object
day_of_week          858 non-null object
time_of_week         858 non-null object
num_shows            858 non-null int64
show_type            858 non-null object
headliner            858 non-null object
support              644 non-null object
num_support          858 non-null int64
tickets_sold         858 non-null float64
gross_usd            858 non-null float64
venue_capacity       858 non-null float64
percentage_sold      858 non-null float64
ticket_price_min     858 non-null float64
ticket_price_max     858 non-null float64
artist_start_date    846 non-null float64
artist_hometown      796 non-null object
artist_genre         858 non-null object
dtypes: float64(7), int64(3), object(10)
memory usage: 134.2+ KB


# Create Desired Features

- Using the scraped data, the following features will be created:
    - Genre
        - All scraped genres will be grouped into one of the predetermined main genres, and the first genre in each artist's genre list will be matched to one of those main genres.
    - Years active
        - Difference between date of show and date of first album release
    - Miles from hometown
        - Distance between "Founded in" location and Oakland, CA
    - US Region
        - Is the artist from the US, and if so, what region?
    - Location Class
        - Is the band local, from the US, or international?

## Insert `main_genre` column and update `artist_genre` column

Main Genres:

1. Pop
2. Religious
3. Rock
4. Punk
5. Indie
6. Hip Hop
7. Soul
8. Folk
9. Experimental
10. Electronic
11. Metal
12. Reggae
13. Country
14. Jazz
15. Blues
16. Comedy
17. World
18. None

In [650]:
first_genre_list = [np.nan if genre_list[0] == '(none)' else genre_list[0] for genre_list in ticket_clean_df.artist_genre.values]

In [651]:
ticket_clean_df['main_genre'] = first_genre_list

In [652]:
# Fill missing values with fill_nan_df (generated from second round of webscraping at end of notebook)

ticket_clean_df.fillna(fill_nan_df, inplace=True)

In [653]:
ticket_clean_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 858 entries, 0 to 857
Data columns (total 21 columns):
event_date           858 non-null object
year                 858 non-null int64
month                858 non-null object
season               858 non-null object
day_of_week          858 non-null object
time_of_week         858 non-null object
num_shows            858 non-null int64
show_type            858 non-null object
headliner            858 non-null object
support              644 non-null object
num_support          858 non-null int64
tickets_sold         858 non-null float64
gross_usd            858 non-null float64
venue_capacity       858 non-null float64
percentage_sold      858 non-null float64
ticket_price_min     858 non-null float64
ticket_price_max     858 non-null float64
artist_start_date    846 non-null float64
artist_hometown      841 non-null object
artist_genre         858 non-null object
main_genre           858 non-null object
dtypes: float64(7), int64(3), 

In [654]:
genre_dict = {'Pop': ['pop', 'dance-pop', 'electropop', 'new wave', 'alternative pop', 'futurepop', 'singer-songwriter'],
              'Religious': ['chamber pop', 'gospel', 'christian rock'],
#               'Indie Pop': ['indie pop', 'synth-pop', 'art pop'],
              'Rock': ['rock', 'alternative rock', 'funk rock', 'progressive rock', 'psychedelic rock', 'art rock', 'glam rock', 'grunge', 'acoustic rock', 'funk', 'hard rock', 'southern rock', 'stoner rock', 'electronic rock', 'bluegrass', 'progressive bluegrass', 'production music', 'livetronica', 'experimental rock'],
              'Punk': ['punk', 'punk rock', 'celtic punk', 'pop punk', 'ska', 'garage punk', 'hardcore punk', 'emo'],    \
              'Indie': ['indie', 'indie rock', 'garage rock', 'post-rock', 'dream pop', 'shoegaze', 'indie pop', 'synth-pop', 'art pop', 'post-punk', 'indie alternative'],
              'Hip Hop': ['hip hop', 'r&b', 'west coast hip hop', 'alternative r&b', 'trap'],
              'Soul': ['soul', 'blue-eyed soul', 'neo soul'],
              'Folk': ['folk', 'americana', 'indie folk', 'folk rock', 'freak folk', 'contemporary folk', 'psychedelic folk', 'alternative folk', 'folk pop', 'folk-pop'],
              'Experimental': ['ambient', 'ambient minimal', 'experimental', 'dark ambient', 'avant-garde', 'avant-rock', 'minimal'],
              'Electronic': ['electronic', 'indietronica', 'dubstep', 'downtempo', 'electro house', 'house', 'dub', 'rave', 'alternative dance', 'indie dance', 'electronica', 'synthwave', 'dancehall', 'deep house', 'future house', 'progressive house', 'drum and bass', 'progressive trance', 'ambient house', 'production music', 'idm', 'trance', 'progressive trance', 'dance', 'electro', 'electro swing', 'moombahcore'],
              'Metal': ['metal', 'funk metal', 'progressive metal', 'thrash metal', 'nu metal', 'death metal', 'symphonic metal', 'heavy metal', 'alternative metal', 'doom metal'],
              'Reggae': ['reggae', 'calypso'],
              'Country': ['country', 'alternative country', 'country pop'],    \
              'Jazz': ['jazz', 'acid jazz', 'jazz fusion'],
              'Blues': ['blues', 'blues rock'],
              'Comedy': ['comedy', 'comedy rock'],
              'World': ['bolero', 'cumbia', 'latin'],
              'Children Music': ["children's music", 'preschool'],
              'Podcast': ['podcast'],
              'None': [np.nan]}


In [658]:
# ticket_clean_df[ticket_clean_df.main_genre == 'synthwave']

In [656]:
simplified_genres = pd.Series()

for index, genre in ticket_clean_df.main_genre.iteritems():
    for key, value in genre_dict.items():
        if genre in value:
            simplified_genres.at[index] = key
            break

In [657]:
# Replace genre lists with main genre.

ticket_clean_df['artist_genre'] = simplified_genres

### Manually Fill Missing Values Not Captured by Web Scraper

- For the following columns (before they are used below to create the `years_active` and `miles_from_home` features below)
    - Support
    - Artist Start Date
    - Artist Hometown

In [659]:
ticket_clean_df.support.fillna('No support', inplace=True)

In [660]:
# Manually fill in missing hometown data

ticket_clean_df.loc[168, 'artist_hometown'] = 'Chobham, Surrey, England'
ticket_clean_df.loc[362, 'artist_hometown'] = 'Moldova'
ticket_clean_df.loc[368, 'artist_hometown'] = 'Blackpool, Lancashire, England'
ticket_clean_df.loc[374, 'artist_hometown'] = 'Cape Town, South Africa'
ticket_clean_df.loc[406, 'artist_hometown'] = 'Los Angeles, California'
ticket_clean_df.loc[432, 'artist_hometown'] = 'Oakland, California'
ticket_clean_df.loc[489, 'artist_hometown'] = 'Asheville, North Carolina'
ticket_clean_df.loc[517, 'artist_hometown'] = 'Linz, Austria'
ticket_clean_df.loc[531, 'artist_hometown'] = 'Canyon Lake, California'
ticket_clean_df.loc[532, 'artist_hometown'] = 'Canyon Lake, California'
ticket_clean_df.loc[534, 'artist_hometown'] = 'London, England'
ticket_clean_df.loc[598, 'artist_hometown'] = 'Atlanta, Georgia'
ticket_clean_df.loc[611, 'artist_hometown'] = 'Jersey City, New Jersey'
ticket_clean_df.loc[698, 'artist_hometown'] = 'New York City, New York'
ticket_clean_df.loc[773, 'artist_hometown'] = 'Houston, Texas'
ticket_clean_df.loc[813, 'artist_hometown'] = 'Canyon Lake, California'
ticket_clean_df.loc[823, 'artist_hometown'] = 'Llandudno, Wales'

In [661]:
# Manually fill in missing artist_start_date data

ticket_clean_df.loc[368, 'artist_start_date'] = 1962
ticket_clean_df.loc[430, 'artist_start_date'] = 2012
ticket_clean_df.loc[484, 'artist_start_date'] = 2013
ticket_clean_df.loc[525, 'artist_start_date'] = 2014
ticket_clean_df.loc[544, 'artist_start_date'] = 2012
ticket_clean_df.loc[574, 'artist_start_date'] = 2013
ticket_clean_df.loc[681, 'artist_start_date'] = 2012
ticket_clean_df.loc[721, 'artist_start_date'] = 2017
ticket_clean_df.loc[774, 'artist_start_date'] = 2013
ticket_clean_df.loc[775, 'artist_start_date'] = 2013
ticket_clean_df.loc[833, 'artist_start_date'] = 1996
ticket_clean_df.loc[848, 'artist_start_date'] = 2011

In [662]:
ticket_clean_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 858 entries, 0 to 857
Data columns (total 21 columns):
event_date           858 non-null object
year                 858 non-null int64
month                858 non-null object
season               858 non-null object
day_of_week          858 non-null object
time_of_week         858 non-null object
num_shows            858 non-null int64
show_type            858 non-null object
headliner            858 non-null object
support              858 non-null object
num_support          858 non-null int64
tickets_sold         858 non-null float64
gross_usd            858 non-null float64
venue_capacity       858 non-null float64
percentage_sold      858 non-null float64
ticket_price_min     858 non-null float64
ticket_price_max     858 non-null float64
artist_start_date    858 non-null float64
artist_hometown      858 non-null object
artist_genre         858 non-null object
main_genre           858 non-null object
dtypes: float64(7), int64(3), 

In [663]:
# with open('Data/ticket_cleaner.pkl', 'wb') as to_write:
#     pickle.dump(ticket_clean_df, to_write)

## Insert `years_active` column

In [690]:
with open('Data/ticket_cleaner.pkl','rb') as read_file:
    ticket_clean_df = pickle.load(read_file)

In [691]:
years_active_list = []

for index, rows in ticket_clean_df.iterrows():
    years_active = int(rows['year']) - int(rows['artist_start_date'])
    if years_active < 0:
        years_active = 0
        years_active_list.append(years_active)
    else:
        years_active_list.append(years_active)

ticket_clean_df.insert(18, 'years_active', years_active_list)

## Insert `miles_from_home` column

In [723]:
ticket_clean_df[716:]

Unnamed: 0,event_date,year,month,season,day_of_week,time_of_week,num_shows,show_type,headliner,support,num_support,tickets_sold,gross_usd,venue_capacity,percentage_sold,ticket_price_min,ticket_price_max,artist_start_date,years_active,artist_hometown,artist_genre,main_genre
716,2018-04-07,2018,April,Spring,Saturday,Weekend,1,single headliner,Chromeo,Phantoms,1,2828.0,109054.0,2828.0,100.0,38.5,38.5,2004.0,14,"Montreal (city), Quebec, Canada",Pop,dance-pop
717,2018-04-13,2018,April,Spring,Friday,Weekend,1,single headliner,Camila Cabello,Bazzi,1,2828.0,118415.0,2828.0,100.0,39.5,49.5,2018.0,0,"Cojímar, La Habana (historical, until 2011), Cuba",Pop,dance-pop
718,2018-04-16,2018,April,Spring,Monday,Weekday,1,single headliner,King Krule,No support,0,2635.0,92375.0,2800.0,95.0,35.0,35.0,2013.0,5,"London, England, United Kingdom",Jazz,jazz fusion
719,2018-05-09,2018,May,Spring,Wednesday,Weekday,1,single headliner,George Ezra,Noah Kahan,1,2249.0,78825.0,2800.0,81.0,35.0,35.0,2014.0,4,"Bristol, England, United Kingdom",Blues,blues
720,2018-05-10,2018,May,Spring,Thursday,Weekend,1,single headliner,Unknown Mortal Orchestra,Makeness,1,1671.0,49537.0,2800.0,60.0,29.5,29.5,2011.0,7,"Portland, Oregon, United States",Rock,psychedelic rock
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
853,2019-08-14,2019,August,Summer,Wednesday,Weekday,1,single headliner,Kirk Franklin,Travele Judon,1,1288.0,85120.0,1847.0,70.0,45.0,85.0,1993.0,26,"Fort Worth, Texas, United States",Religious,gospel
854,2019-08-23,2019,August,Summer,Friday,Weekend,1,single headliner,Daniel Caesar,Koffee,1,2957.0,107410.0,2957.0,100.0,35.0,49.5,2017.0,2,"Toronto, Ontario, Canada",Hip Hop,r&b
855,2019-08-31,2019,August,Summer,Saturday,Weekend,1,single headliner,Bryan Ferry,Femme Schmidt,1,1931.0,172015.0,1931.0,100.0,59.5,249.5,1973.0,46,"Washington, Sunderland, Tyne and Wear, England...",Rock,glam rock
856,2019-09-05,2019,September,Autumn,Thursday,Weekend,2,multiple shows,King Crimson,No support,0,1610.5,155522.0,1900.0,85.0,65.0,149.5,1969.0,50,"London, England, United Kingdom",Rock,progressive rock


In [725]:
# Fix any necessary artist hometowns to be readable for geolocator:

ticket_clean_df.loc[48, 'artist_hometown'] = 'Liverpool, England, United Kingdom'
ticket_clean_df.loc[240, 'artist_hometown'] = 'Waitsfield, Vermont, United States'
ticket_clean_df.loc[292, 'artist_hometown'] = 'Saratov, Russia'
ticket_clean_df.loc[293, 'artist_hometown'] = 'Saratov, Russia'
ticket_clean_df.loc[329, 'artist_hometown'] = 'Auckland, New Zealand'
ticket_clean_df.loc[330, 'artist_hometown'] = 'Auckland, New Zealand'
ticket_clean_df.loc[644, 'artist_hometown'] = 'Cincinnati, Ohio, United States'
ticket_clean_df.loc[717, 'artist_hometown'] = 'Cojímar, La Habana, Cuba'

In [724]:
geolocator = Nominatim(user_agent='myGeocoder')
test_loc = geolocator.geocode('Cojímar, La Habana, Cuba')
test_lat = venue_loc.latitude
test_long = venue_loc.longitude

### Create `location_df`

In [672]:
venue_address = '1807 Telegraph Ave, Oakland, California'

In [676]:
# Venue latitude and longitude series:

geolocator = Nominatim(user_agent='myGeocoder')
venue_loc = geolocator.geocode(venue_address)
venue_lat = [venue_loc.latitude]*858
venue_long = [venue_loc.longitude]*858

In [727]:
# hometown_lat.keys()

In [701]:
# hometown_lat = {}
# hometown_long = {}

In [726]:
# Artist hometown latitude and longitude:

# geolocator = Nominatim(user_agent='myGeocoder')

# for index, loc in tqdm(ticket_clean_df.artist_hometown.iteritems()):
#     loc_data = geolocator.geocode(loc)
#     hometown_lat[index] = loc_data.latitude
#     hometown_long[index] = loc_data.longitude

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [819]:
# Create dataframe of all latitudes and longitudes:

# locations_dict = {'Headliner': ticket_clean_df.headliner.values,
#                   'Venue Latitude': venue_lat,
#                   'Venue Longitude': venue_long,
#                   'Hometown Latitude': list(hometown_lat.values()),
#                   'Hometown Longitude': list(hometown_long.values())}
# locations_df = pd.DataFrame(locations_dict)
# locations_df.head(10)

In [734]:
# with open('Data/hometown_lat_long.pkl', 'wb') as to_write:
#     pickle.dump(locations_df, to_write)

In [736]:
with open('Data/hometown_lat_long.pkl','rb') as read_file:
    locations_df = pickle.load(read_file)

In [737]:
# Calculate miles from home and add to main dataframe.

distance_list = []
for index, rows in tqdm(locations_df.iterrows()):
    venue_coord = (rows['Venue Latitude'], rows['Venue Longitude'])
    hometown_coord = (rows['Hometown Latitude'], rows['Hometown Longitude'])
    distance = geodesic(venue_coord, hometown_coord).miles
    distance_list.append(distance)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [738]:
ticket_clean_df.insert(20, 'miles_from_home', distance_list)

In [739]:
ticket_clean_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 858 entries, 0 to 857
Data columns (total 23 columns):
event_date           858 non-null object
year                 858 non-null int64
month                858 non-null object
season               858 non-null object
day_of_week          858 non-null object
time_of_week         858 non-null object
num_shows            858 non-null int64
show_type            858 non-null object
headliner            858 non-null object
support              858 non-null object
num_support          858 non-null int64
tickets_sold         858 non-null float64
gross_usd            858 non-null float64
venue_capacity       858 non-null float64
percentage_sold      858 non-null float64
ticket_price_min     858 non-null float64
ticket_price_max     858 non-null float64
artist_start_date    858 non-null float64
years_active         858 non-null int64
artist_hometown      858 non-null object
miles_from_home      858 non-null float64
artist_genre         858 non-n

## Insert `us_region` and `local_or_not` columns

### `us_region`

- Based on artist_hometown column, assign US region to each artist. 
- If artist is not from the US, then 'International' will be the assigned region.

In [None]:
region_dict = {
                'Northeast': ['Maryland', 'MD', 'Delaware', 'DE', 'District of Columbia', 'DC', 'New Jersey', 'NJ', 'Pennsylvania', 'PA', 'New York', 'NY', 'Massachusetts', 'MA', 
                              'Connecticut', 'CT', 'Rhode Island', 'RI', 'New Hampshire', 'NH', 'Vermont', 'VT', 'Maine', 'ME', 'Virginia', 'VA'],
               'West Coast': ['California', 'CA', 'Oregon', 'OR', 'Washington', 'WA'],
                  'Rockies': ['Nevada', 'NV', 'Utah', 'UT', 'Idaho', 'ID', 'Montana', 'MT', 'Wyoming', 'WY', 'Colorado', 'CO'],
                'Southwest': ['Arizona', 'AZ', 'New Mexico', 'NM'],
                  'Midwest': ['Missouri', 'MO', 'Kansas', 'KS', 'North Dakota', 'ND', 'South Dakota', 'SD', 'Nebraska', 'NE', 'Minnesota', 'MN', 'Iowa', 'IA', 'Wisconsin', 'WI', 
                              'Illinois', 'IL', 'Indiana', 'IN', 'Michigan', 'MI', 'Ohio', 'OH'],
                    'South': ['Texas', 'TX', 'Oklahoma', 'OK', 'Arkansas', 'AR', 'Louisiana', 'LA', 'Mississippi', 'MS', 'Alabama', 'AL', 'Georgia', 'GA', 'Florida', 'FL', 
                              'Kentucky', 'KY', 'Tennessee', 'TN', 'North Carolina', 'NC', 'South Carolina', 'SC', 'West Virginia', 'WV'],
                   'Hawaii': ['Hawaii', 'HI'],
                   'Alaska': ['Alaska', 'AK'],
                       'US': ['US', 'U.S.', 'U.S.A.', 'USA', 'United States']
              }

In [743]:
# ticket_clean_df.artist_hometown.values

In [814]:
region_feature_dict = {}

for index, loc in ticket_clean_df.artist_hometown.iteritems():
    for key, region in region_dict.items():
        for state in region:
            if state in [l.strip() for l in loc.split(',')]:
                if index in region_feature_dict.keys():
                    break
                else:
                    region_feature_dict[index] = key
                    break
                    
for i in range(858):
    if i not in region_feature_dict.keys():
        region_feature_dict[i] = 'International'
        
region_feature_dict = collections.OrderedDict(sorted(region_feature_dict.items()))

In [805]:
pd.Series(region_feature_dict).value_counts()

International    271
West Coast       206
Northeast        165
South            127
Midwest           59
Rockies           15
US                 8
Southwest          4
Hawaii             2
Alaska             1
dtype: int64

In [806]:
ticket_clean_df.insert(20, 'us_region', region_feature_dict.values())

### `local_or_not`
- If artist is within 75 miles of Fox Theater, then they are considered local.
- Otherwise, the artist is labeled as from the US or international.

In [827]:
local_dict = {}

for index, rows in ticket_clean_df.iterrows():
    if rows.us_region in region_dict.keys():
        if rows.miles_from_home <= 75:
            local_dict[index] = 'Local'
        else:
            local_dict[index] = 'US'
    else:
        local_dict[index] = 'International'

In [828]:
pd.Series(local_dict).value_counts()

US               534
International    271
Local             53
dtype: int64

In [832]:
ticket_clean_df.insert(21, 'local_or_not', local_dict.values())

In [833]:
ticket_clean_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 858 entries, 0 to 857
Data columns (total 25 columns):
event_date           858 non-null object
year                 858 non-null int64
month                858 non-null object
season               858 non-null object
day_of_week          858 non-null object
time_of_week         858 non-null object
num_shows            858 non-null int64
show_type            858 non-null object
headliner            858 non-null object
support              858 non-null object
num_support          858 non-null int64
tickets_sold         858 non-null float64
gross_usd            858 non-null float64
venue_capacity       858 non-null float64
percentage_sold      858 non-null float64
ticket_price_min     858 non-null float64
ticket_price_max     858 non-null float64
artist_start_date    858 non-null float64
years_active         858 non-null int64
artist_hometown      858 non-null object
us_region            858 non-null object
local_or_not         858 non-nu

In [834]:
ticket_clean_df

Unnamed: 0,event_date,year,month,season,day_of_week,time_of_week,num_shows,show_type,headliner,support,num_support,tickets_sold,gross_usd,venue_capacity,percentage_sold,ticket_price_min,ticket_price_max,artist_start_date,years_active,artist_hometown,us_region,local_or_not,miles_from_home,artist_genre,main_genre
0,2009-02-06,2009,February,Winter,Friday,Weekend,1,single headliner,Social Distortion,"The Black Tibetans, The Devil Makes Three",2,2800.0,91000.0,2800.0,100.0,32.5,32.5,1983.0,26,"Fullerton, California, United States",West Coast,US,364.697,Punk,punk
1,2009-02-07,2009,February,Winter,Saturday,Weekend,1,single headliner,Michael Franti & Spearhead,"ALO (Animal Liberation Orchestra), Solillaquis...",2,2800.0,88563.0,2800.0,100.0,32.5,32.5,1994.0,15,"San Francisco, California, United States",West Coast,Local,8.441,Hip Hop,hip hop
2,2009-02-13,2009,February,Winter,Friday,Weekend,1,single headliner,Will Downing,Gerald Albright,1,1947.0,74893.0,1947.0,100.0,39.5,49.5,1988.0,21,"Brooklyn, New York, New York, United States",Northeast,US,2567.264,Soul,soul
3,2009-02-20,2009,February,Winter,Friday,Weekend,1,festival,K'naan,"Julian Marley, K'naan, Lee ""Scratch"" Perry, Ro...",5,2163.0,81763.0,2800.0,78.0,37.5,40.0,2006.0,3,"Mogadishu, Banaadir, Somalia",International,International,9577.129,Hip Hop,hip hop
4,2009-02-21,2009,February,Winter,Saturday,Weekend,1,single headliner,CAKE,The Lovemakers,1,2800.0,98000.0,2800.0,100.0,35.0,35.0,1994.0,15,"Sacramento, California, United States",West Coast,Local,68.028,Rock,alternative rock
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
853,2019-08-14,2019,August,Summer,Wednesday,Weekday,1,single headliner,Kirk Franklin,Travele Judon,1,1288.0,85120.0,1847.0,70.0,45.0,85.0,1993.0,26,"Fort Worth, Texas, United States",South,US,1447.369,Religious,gospel
854,2019-08-23,2019,August,Summer,Friday,Weekend,1,single headliner,Daniel Caesar,Koffee,1,2957.0,107410.0,2957.0,100.0,35.0,49.5,2017.0,2,"Toronto, Ontario, Canada",International,International,2261.183,Hip Hop,r&b
855,2019-08-31,2019,August,Summer,Saturday,Weekend,1,single headliner,Bryan Ferry,Femme Schmidt,1,1931.0,172015.0,1931.0,100.0,59.5,249.5,1973.0,46,"Washington, Sunderland, Tyne and Wear, England...",West Coast,US,5152.435,Rock,glam rock
856,2019-09-05,2019,September,Autumn,Thursday,Weekend,2,multiple shows,King Crimson,No support,0,1610.5,155522.0,1900.0,85.0,65.0,149.5,1969.0,50,"London, England, United Kingdom",International,International,5361.522,Rock,progressive rock


In [835]:
# with open('Data/ticket_final.pkl', 'wb') as to_write:
#     pickle.dump(ticket_clean_df, to_write)

---

# Data gathering and cleaning completed.
- **See "Modeling_for_Gross_USD_Target" notebook for regression modeling.**
    - In that notebook, the remaining work will be completed to determine the best regression model.
    
---

---

# Web Scraping - Round #2
- In an attempt fill in the NaN values from web scraping attempt #1, the code below scrapes artist info from wikipedia.

In [232]:
remaining_headliners_to_scrape = ticket_clean_df[ticket_clean_df.artist_genre.isna() == True].headliner

In [233]:
# hometown2_dict = {}
# genre2_dict = {}

# while len(remaining_headliners_to_scrape.values) > 50:

#     driver = webdriver.Chrome(ChromeDriverManager().install());
#     url = 'https://en.wikipedia.org/wiki/Radiohead'
#     driver.get(url);

#     for index, artist in remaining_headliners_to_scrape.iteritems():

#         query = driver.find_element_by_id('searchInput');
#         query.send_keys(artist);
#         query.send_keys(Keys.RETURN);

#         try:
#             soup = BeautifulSoup(driver.page_source, 'html.parser')
#             soup.prettify();

#     #         tables = soup.find_all('table')
#             table = soup.find('table', class_ = 'infobox vcard plainlist')
#             names = table.find_all('th')
#             info = table.find_all('td')

#             for idx, name in enumerate(names):
#                 if name.text.strip() == 'Origin':
#                     hometown_idx = idx-1
#                 elif name.text.strip() == 'Born':
#                     hometown_idx = idx-1
#                 else:
#                     pass
#                 if name.text.strip() == 'Genres':
#                     genre_idx = idx-1

#             for idx, i in enumerate(info):
#                 if idx == hometown_idx:
#                     hometown2_dict[index] = i.text.strip()
#                 if idx == genre_idx:
#                     genre2_dict[index] = i.text.strip()

#         except:
#             hometown2_dict[index] = 'NaN'
#             genre2_dict[index] = 'NaN'
    
#     remaining_headliners_to_scrape = pd.Series()
    
#     for index, headliner in ticket_clean_df.headliner.iteritems():
#         for key, value in hometown2_dict.items():
#             if key == index and value == 'NaN':
#                 remaining_headliners_to_scrape.at[index] = headliner
                
#     print('Remaining headliners to scrape: ', len(remaining_headliners_to_scrape.values))

[WDM] - Current google-chrome version is 83.0.4103
[WDM] - Get LATEST driver version for 83.0.4103


 


[WDM] - Driver [/Users/andrewgraves/.wdm/drivers/chromedriver/mac64/83.0.4103.39/chromedriver] found in cache
[WDM] - Current google-chrome version is 83.0.4103
[WDM] - Get LATEST driver version for 83.0.4103
[WDM] - Driver [/Users/andrewgraves/.wdm/drivers/chromedriver/mac64/83.0.4103.39/chromedriver] found in cache


Remaining headliners to scrape:  60
 
Remaining headliners to scrape:  48


In [639]:
# with open('Data/Web_Scrape_2/hometown2_dict.pkl', 'wb') as to_write:
#     pickle.dump(hometown2_dict, to_write)

In [640]:
# with open('Data/Web_Scrape_2/genre2_dict.pkl', 'wb') as to_write:
#     pickle.dump(genre2_dict, to_write)

In [637]:
with open('Data/Web_Scrape_2/hometown2_dict.pkl','rb') as read_file:
    hometown2_dict = pickle.load(read_file)

In [638]:
with open('Data/Web_Scrape_2/genre2_dict.pkl','rb') as read_file:
    genre2_dict = pickle.load(read_file)

## Data Cleaning

- The data that is scraped is fairly messy and will require significant cleaning

In [239]:
# Clean scraped hometown data

hometown2_clean = {}
char_list = ['1','2']

for key, value in hometown2_dict.items():
    if value == 'NaN':
        hometown2_clean[key] = np.nan
    else:
        hometown2_clean[key] = re.sub("|".join(char_list), "", value.split(')')[-1].split('liforniaAtlanta')[0]).strip().replace('[]', '')

In [384]:
# Clean scraped genre data

genre2_clean = {}
sub_dict = {
            'rock  ': ['rockblues', 'rockambient' , 'rockfolk', 'rockindie', 'rockacoustic', 'rocksoft'],
            'pop  ': ['popindie', 'popalternative', 'poppsychedelic', 'PopRock',  'PopRockFolk', 'Popart', 'Vocalsguitarviolin'],
            'Folk  ': ['folkalternative', 'Folkindie'],
            'wave  ': ['wavesynth'],
            'dance  ': ['dancealternative'],
            'hop  ': ['hopR&B', 'hopR&Balternative', 'hoptrap', 'R&Bpop', 'R&Bpopneo', 'hophyphy', 'hoptrapbasshousewonkyexperimental'],
            'fusion  ': ['fusionpost-punkhip'],
            'house  ': ['HouseEDM']
           }

for key, value in genre2_dict.items():
    if value == 'NaN':
        genre2_clean[key] = np.nan
    else:
        clean_genre = re.sub("1|2|3|4|5", '', value)
        clean_genre = clean_genre.split('\n')[0].split(',')[0].strip().replace('[]', '')
        for genre, sub_list in sub_dict.items():
            clean_genre = re.sub("|".join(sub_list), genre, clean_genre)
            
#         print(clean_genre.split())
        
        if len(clean_genre.split()) == 1:
            genre2_clean[key] = clean_genre
        elif clean_genre.split()[1] == genre:
            genre2_clean[key] = clean_genre.split(genre)[0] + genre
        else:
            genre2_clean[key] = clean_genre.split('  ')[0]
        

## Compile scraped data into dataframe that will be merged with `ticket_clean_df`

In [578]:
fill_nan_df = pd.DataFrame(ticket_clean_df[ticket_clean_df.artist_genre.isna() == True].headliner)
fill_nan_df


Unnamed: 0,headliner
1,Michael Franti & Spearhead
21,M. Ward
23,John Vanderslice
47,Monsters Of Folk
49,Mika
...,...
831,Maggie Rogers
833,Chris D'Elia
841,Rainbow Kitten Surprise
842,Rainbow Kitten Surprise


In [388]:
hometown2_df = pd.DataFrame(hometown2_clean.values(), index=hometown2_clean.keys(), columns=['artist_hometown'])
genre2_df = pd.DataFrame(genre2_clean.values(), index=genre2_clean.keys(), columns=['artist_genre'])

In [579]:
fill_nan_df = fill_nan_df.merge(hometown2_df, left_index=True, right_index=True).merge(genre2_df, left_index=True, right_index=True)


## Fix web scraping errors

- The selenium web scraping script made several errors (entering repeat info for consecutive rows, etc.) that will be manually fixed below

In [580]:
fill_nan_df.loc[21, 'artist_hometown'] = 'Portland, Oregon'
fill_nan_df.loc[21, 'artist_genre'] = 'Indie Folk'
fill_nan_df.loc[49, 'artist_hometown'] = 'Beirut, Lebanon'
fill_nan_df.loc[49, 'artist_genre'] = 'Pop'
fill_nan_df.loc[92, 'artist_hometown'] = 'Rhinebeck, New York'
fill_nan_df.loc[92, 'artist_genre'] = 'Pop'
fill_nan_df.loc[95, 'artist_hometown'] = 'Charlottesville, Virginia'
fill_nan_df.loc[95, 'artist_genre'] = 'Indie Rock'
fill_nan_df.loc[148, 'artist_hometown'] = 'Louisville, Kentucky'
fill_nan_df.loc[148, 'artist_genre'] = 'Indie Rock'
fill_nan_df.loc[159, 'artist_hometown'] = 'Dublin, Ireland'
fill_nan_df.loc[159, 'artist_genre'] = 'Folk'
fill_nan_df.loc[178, 'artist_hometown'] = 'Los Angeles, California'
fill_nan_df.loc[178, 'artist_genre'] = 'Preschool'
fill_nan_df.loc[179, 'artist_hometown'] = 'Los Angeles, California'
fill_nan_df.loc[179, 'artist_genre'] = "Children's Music"
fill_nan_df.loc[194, 'artist_genre'] = 'Indie Pop'
fill_nan_df.loc[198, 'artist_hometown'] = 'California City, California'
fill_nan_df.loc[198, 'artist_genre'] = 'Comedy'
fill_nan_df.loc[203, 'artist_hometown'] = 'Los Angeles, California'
fill_nan_df.loc[203, 'artist_genre'] = 'Indie Folk'
fill_nan_df.loc[209, 'artist_hometown'] = 'Rhinebeck, New York'
fill_nan_df.loc[209, 'artist_genre'] = 'Pop'
fill_nan_df.loc[213, 'artist_hometown'] = 'San Jose, CA'
fill_nan_df.loc[213, 'artist_genre'] = 'Doom Metal'
fill_nan_df.loc[245, 'artist_hometown'] = 'Los Angeles, California'
fill_nan_df.loc[245, 'artist_genre'] = "Children's Music"
fill_nan_df.loc[246, 'artist_hometown'] = 'Los Angeles, California'
fill_nan_df.loc[246, 'artist_genre'] = "Children's Music"
fill_nan_df.loc[282, 'artist_hometown'] = 'Juazeiro, Bahia, Brazil'
fill_nan_df.loc[282, 'artist_genre'] = 'Reggae'
fill_nan_df.loc[393, 'artist_hometown'] = 'Stockholm, Sweden'
fill_nan_df.loc[393, 'artist_genre'] = 'Folk'
fill_nan_df.loc[410, 'artist_hometown'] = 'London, England'
fill_nan_df.loc[410, 'artist_genre'] = 'Folk'
fill_nan_df.loc[417, 'artist_hometown'] = 'San Francisco, California'
fill_nan_df.loc[417, 'artist_genre'] = 'Indie Rock'
fill_nan_df.loc[422, 'artist_hometown'] = 'New York City, New York'
fill_nan_df.loc[422, 'artist_genre'] = 'Comedy'
fill_nan_df.loc[426, 'artist_hometown'] = 'New York City, New York'
fill_nan_df.loc[426, 'artist_genre'] = 'Comedy'
fill_nan_df.loc[476, 'artist_hometown'] = 'Nashville, Tennessee'
fill_nan_df.loc[476, 'artist_genre'] = 'Electronic'
fill_nan_df.loc[486, 'artist_hometown'] = 'Oakland, California'
fill_nan_df.loc[486, 'artist_genre'] = 'Hip Hop'
fill_nan_df.loc[487, 'artist_hometown'] = 'Oakland, California'
fill_nan_df.loc[487, 'artist_genre'] = 'Hip Hop'
fill_nan_df.loc[496, 'artist_hometown'] = 'Louisville, Kentucky'
fill_nan_df.loc[496, 'artist_genre'] = 'R&B'
fill_nan_df.loc[497, 'artist_hometown'] = 'Santa Cruz, California'
fill_nan_df.loc[497, 'artist_genre'] = 'Livetronica'
fill_nan_df.loc[533, 'artist_hometown'] = 'Los Angeles, California'
fill_nan_df.loc[533, 'artist_genre'] = 'Podcast'
fill_nan_df.loc[544, 'artist_hometown'] = 'Los Angeles, California'
fill_nan_df.loc[544, 'artist_genre'] = 'Podcast'
fill_nan_df.loc[550, 'artist_hometown'] = 'Casablanca, Morocco'
fill_nan_df.loc[550, 'artist_genre'] = 'Comedy'
fill_nan_df.loc[557, 'artist_hometown'] = 'Los Angeles, California'
fill_nan_df.loc[570, 'artist_hometown'] = 'Vancouver, British Columbia, Canada'
fill_nan_df.loc[570, 'artist_genre'] = 'Electronic'
fill_nan_df.loc[580, 'artist_hometown'] = 'Compton, California'
fill_nan_df.loc[580, 'artist_genre'] = 'Hip Hop'
fill_nan_df.loc[581, 'artist_hometown'] = 'Compton, California'
fill_nan_df.loc[581, 'artist_genre'] = 'Hip Hop'
fill_nan_df.loc[590, 'artist_hometown'] = 'Jackson, Mississippi'
fill_nan_df.loc[590, 'artist_genre'] = 'Comedy'
fill_nan_df.loc[601, 'artist_hometown'] = 'Los Angeles, California'
fill_nan_df.loc[601, 'artist_genre'] = 'Podcast'
fill_nan_df.loc[608, 'artist_hometown'] = 'Stockholm, Sweden'
fill_nan_df.loc[608, 'artist_genre'] = 'House'
fill_nan_df.loc[613, 'artist_hometown'] = 'London, England'
fill_nan_df.loc[613, 'artist_genre'] = "Children's Music"
fill_nan_df.loc[618, 'artist_hometown'] = 'New York City, New York'
fill_nan_df.loc[618, 'artist_genre'] = 'Comedy'
fill_nan_df.loc[644, 'artist_hometown'] = 'Cincinatti, Ohio'
fill_nan_df.loc[644, 'artist_genre'] = 'Comedy'
fill_nan_df.loc[654, 'artist_hometown'] = 'Bergenfield, New Jersey'
fill_nan_df.loc[654, 'artist_genre'] = 'Indie Pop'
fill_nan_df.loc[656, 'artist_hometown'] = 'Dublin, Ireland'
fill_nan_df.loc[656, 'artist_genre'] = 'Folk'
fill_nan_df.loc[658, 'artist_hometown'] = 'New York City, New York'
fill_nan_df.loc[658, 'artist_genre'] = 'Comedy'
fill_nan_df.loc[665, 'artist_hometown'] = 'Sheffield, South Yorkshire, United Kingdom'
fill_nan_df.loc[665, 'artist_genre'] = 'Electronic'
fill_nan_df.loc[679, 'artist_hometown'] = 'New York City, New York'
fill_nan_df.loc[679, 'artist_genre'] = 'Comedy'
fill_nan_df.loc[681, 'artist_hometown'] = 'Los Angeles, California'
fill_nan_df.loc[681, 'artist_genre'] = 'Podcast'
fill_nan_df.loc[685, 'artist_hometown'] = 'Chicago, Illinois'
fill_nan_df.loc[685, 'artist_genre'] = 'Electronic'
fill_nan_df.loc[697, 'artist_hometown'] = 'Stockholm, Sweden'
fill_nan_df.loc[697, 'artist_genre'] = 'Folk'
fill_nan_df.loc[702, 'artist_hometown'] = 'Compton, California'
fill_nan_df.loc[702, 'artist_genre'] = 'Hip Hop'
fill_nan_df.loc[705, 'artist_genre'] = 'Hip Hop'
fill_nan_df.loc[706, 'artist_hometown'] = 'New Canaan, Connecticut'
fill_nan_df.loc[706, 'artist_genre'] = 'Art Pop'
fill_nan_df.loc[707, 'artist_hometown'] = 'West Bromwich, Staffordshire, England'
fill_nan_df.loc[707, 'artist_genre'] = 'Rock'
fill_nan_df.loc[746, 'artist_hometown'] = 'St. Louis, Missouri'
fill_nan_df.loc[746, 'artist_genre'] = 'Blues Rock'
fill_nan_df.loc[747, 'artist_hometown'] = 'St. Louis, Missouri'
fill_nan_df.loc[747, 'artist_genre'] = 'Blues Rock'
fill_nan_df.loc[748, 'artist_hometown'] = 'Middlebury, Vermont'
fill_nan_df.loc[748, 'artist_genre'] = 'Reggae'
fill_nan_df.loc[757, 'artist_hometown'] = 'Frankenmuth, Michigan'
fill_nan_df.loc[757, 'artist_genre'] = 'Hard Rock'
fill_nan_df.loc[760, 'artist_hometown'] = 'Vancouver, British Columbia, Canada'
fill_nan_df.loc[760, 'artist_genre'] = 'Electronic'
fill_nan_df.loc[785, 'artist_hometown'] = 'Austin, Texas'
fill_nan_df.loc[785, 'artist_genre'] = 'Americana'
fill_nan_df.loc[787, 'artist_hometown'] = 'Philadelphia, Pennsylvania'
fill_nan_df.loc[787, 'artist_genre'] = 'Indie Rock'
fill_nan_df.loc[798, 'artist_hometown'] = 'St. Paul, Minnesota'
fill_nan_df.loc[798, 'artist_genre'] = 'Indie Rock'
fill_nan_df.loc[802, 'artist_hometown'] = 'New York City, New York'
fill_nan_df.loc[802, 'artist_genre'] = 'Comedy'
fill_nan_df.loc[821, 'artist_hometown'] = 'Concord, Massachusetts'
fill_nan_df.loc[821, 'artist_genre'] = 'Pop Punk'
fill_nan_df.loc[828, 'artist_hometown'] = 'Gothenburg, Sweden'
fill_nan_df.loc[828, 'artist_genre'] = 'Indie Folk'
fill_nan_df.loc[833, 'artist_hometown'] = 'Montclair, New Jersey'
fill_nan_df.loc[833, 'artist_genre'] = 'Comedy'
fill_nan_df.loc[848, 'artist_hometown'] = 'Los Angeles, California'
fill_nan_df.loc[848, 'artist_genre'] = 'Podcast'

In [581]:
fill_nan_df.loc[131, 'artist_hometown'] = 'Meols, Wirral, Merseyside, England'
fill_nan_df.loc[242, 'artist_hometown'] = 'Belgium'
fill_nan_df.loc[363, 'artist_hometown'] = 'Chicago, Illinois'
fill_nan_df.loc[430, 'artist_hometown'] = 'Los Angeles, California'
fill_nan_df.loc[516, 'artist_hometown'] = 'United States'
fill_nan_df.loc[525, 'artist_hometown'] = 'Los Angeles, California'
fill_nan_df.loc[551, 'artist_hometown'] = 'Los Angeles, California'
fill_nan_df.loc[552, 'artist_hometown'] = 'Los Angeles, California'
fill_nan_df.loc[600, 'artist_hometown'] = 'Atlanta, Georgia'
fill_nan_df.loc[689, 'artist_hometown'] = 'Tours, France'
fill_nan_df.loc[694, 'artist_hometown'] = 'United States'
fill_nan_df.loc[721, 'artist_hometown'] = 'Marietta, Georgia'
fill_nan_df.loc[796, 'artist_hometown'] = 'United States'
fill_nan_df.loc[815, 'artist_hometown'] = 'Long Island, New York'

In [582]:
fill_nan_df.loc[131, 'artist_genre'] = 'Electronic'
fill_nan_df.loc[242, 'artist_genre'] = 'Electronic'
fill_nan_df.loc[363, 'artist_genre'] = 'Comedy'
fill_nan_df.loc[430, 'artist_genre'] = 'Podcast'
fill_nan_df.loc[516, 'artist_genre'] = 'Metal'
fill_nan_df.loc[525, 'artist_genre'] = 'Electronic'
fill_nan_df.loc[551, 'artist_genre'] = 'Electronic'
fill_nan_df.loc[552, 'artist_genre'] = 'Electronic'
fill_nan_df.loc[600, 'artist_genre'] = 'Hip Hop'
fill_nan_df.loc[689, 'artist_genre'] = 'Electronic'
fill_nan_df.loc[694, 'artist_genre'] = 'Rock'
fill_nan_df.loc[721, 'artist_genre'] = 'Rock'
fill_nan_df.loc[796, 'artist_genre'] = 'Rock'
fill_nan_df.loc[815, 'artist_genre'] = 'Pop'

In [584]:
# Convert genres to lowercase, change column name

fill_nan_df.artist_genre = fill_nan_df.artist_genre.str.lower()
fill_nan_df.artist_genre = fill_nan_df.artist_genre.str.strip()
fill_nan_df.rename(columns={"artist_genre": "main_genre"}, inplace=True)

## Save `fill_nan_df` as pickle file

In [642]:
# with open('Data/Web_Scrape_2/fill_nan.pkl', 'wb') as to_write:
#     pickle.dump(fill_nan_df, to_write)