In [1]:
#--------- Python Libraries ---------#

import requests
import json
import re
from html.parser import HTMLParser
from bs4 import BeautifulSoup
import csv
import pandas as pd
from datetime import date
import datetime
import dateutil.parser

import SpotifyLookup

import string
import Levenshtein as lev




In [2]:
#--------- Web crawling for URLs ---------#

source = requests.get('https://www.nme.com/reviews/live').text
soup = BeautifulSoup(source, 'lxml')
print(source)

<!doctype html >
<html lang="en-GB">
<head>
    <title>Live Reviews | NME</title>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <link rel="icon" type="image/png" href="https://www.nme.com/wp-content/uploads/2019/12/logo-nme@64w.png">
	<!-- This site is optimized with the Yoast SEO Premium plugin v14.9 - https://yoast.com/wordpress/plugins/seo/ -->
	<meta name="description" content="Live reviews from the best gigs and concerts around the world. Team NME are down the front, reporting on what things are really like in the crowd" />
	<meta name="robots" content="index, follow, max-snippet:-1, max-image-preview:large, max-video-preview:-1" />
	<link rel="canonical" href="https://www.nme.com/reviews/live" />
	<link rel="next" href="https://www.nme.com/reviews/live/page/2" />
	<meta property="og:locale" content="en_GB" />
	<meta property="og:locale:alternate" content="en_US" />
	<meta property="og:type" content="article" />
	<meta pr

In [3]:
list_of_URLs = []

# Loop through all review links to find concert reviews specifically
for link in soup.find_all('div',{'class' : 'td-module-thumb'}):
    link = link.find('a').get('href')
    list_of_URLs.append(link)

# Remove duplicate URLs in list
list_of_URLs = list(set(list_of_URLs))
print(list_of_URLs)

['https://www.nme.com/reviews/declan-mckenna-live-london-zeros-lafayete-review-2747071', 'https://www.nme.com/reviews/sam-fender-newcastle-virgin-money-unity-arena-review-2728106', 'https://www.nme.com/reviews/megan-thee-stallion-livestream-review-2741646', 'https://www.nme.com/reviews/sleaford-mods-live-london-100-club-review-billy-nomates-2752388', 'https://www.nme.com/reviews/live/notting-hill-carnival-virtual-digital-review-jeremiah-asiamah-alicai-harley-2742613', 'https://www.nme.com/reviews/idles-livestream-abbey-road-studios-review-2741734', 'https://www.nme.com/reviews/life-live-hull-moon-factory-review-2734352', 'https://www.nme.com/reviews/live/goose-island-presents-supergrass-live-in-oxford-immersive-virtual-reality-extravaganza-2736894', 'https://www.nme.com/reviews/live/bdrmm-live-in-hull-review-radar-2728052', 'https://www.nme.com/reviews/margo-price-live-nashville-brooklyn-bowl-review-2749940']


In [4]:
# 2 test URLs
#list_of_URLs = ['https://www.nme.com/reviews/megan-thee-stallion-livestream-review-2741646']

# Create empty dictionary
critic_review_dict = {} 
critic_review_dict['concerts'] = []

# Loop - For every link in the URL_links do the following with the reviews:
for i in list_of_URLs:

    # Reset values
    critic_name = 'NME'
    author = ''
    url = ''
    headline = ''
    text = ''
    rating = 0
    tribute_rating = 0
    max_rating = 5
    artist_name = ''
    venue_name = ''
    publication_date = ''
    lang = 'en'
    genre = []
    venue_place = ''
    concert_date = ''
    
    url = i
    
    # Get the source code of the webpage as text
    source = requests.get(i).text
    soup = BeautifulSoup(source, 'lxml')
    
    ######### Clean Soup for correct text mining #########
    
    # kill all script and style elements
    for script in soup(["script", "style"]):
        script.decompose()    # rip it out
    
    # Find headline
    try:
        headline = soup.find('h1').text
        print('Headline found')
    
    except:
        print('No headline found')
    
    # Find text
    try:
        text = soup.find('div',{'id' : 'td-outer-wrap'})
        text = text.find('article')
        text = text.find_all('p')
        
        text_list = []
        
        for i in text:
            sub_text = i.getText()
            text_list.append(sub_text)
        
        text = text_list
        
        text = [re.sub(r'<.+?>',r'',str(a)) for a in text]
        
        for i in text:
            if 'credit:' in i.lower():
                text.remove(i)
        
        for i in text:
            if 'read more:' in i.lower():
                text.remove(i)
        
        for i in text:
            if 'a post shared by' in i.lower():
                text.remove(i)
        
        
        
        text = '\n\n'.join(text)

        print('Text found')
    
    except:
        print('No text found')
    
    # Find artist name
    try:
        all_links_in_text = soup.select('p a[href]')
        
        for i in all_links_in_text:
            if '/artist' in i.get('href'):
                artist_name = i.text
                artist_name = artist_name.lstrip()
                print('Artist name found')
                break
            else:
                continue        
                
        
    except:
        artist_name = ''
        print('No artist name found')
        continue
    
    # Find venue name
    print('No venue name found')
    
    
    # Find publication date
    try:    
        publication_date = soup.find('meta',{'property' : 'article:published_time'})
        publication_date = publication_date['content']
        print('publication date found')

    except:
        publication_date = ''
        print('No publication_date found') 
        continue
    
    # Find author
    try:
        author = soup.find('a', {'class' : 'tdb-author-name'}).text
        print('Author found')
        
    except:
        print('No author found')
    
    # Find rating
    try:
        rating = soup.find('meta',{'itemprop' : 'ratingValue'})
        rating = rating['content']
        tribute_rating = (5/max_rating)*rating
        tribute_rating = round(tribute_rating, 2)
        print('Rating found')
    except:
        print('No rating found')
    
    ########## Spotify lookup ############
    try:
        # Preprocessing: Remove all within () and punctuation, set lowercase
        preprocessed_name = re.sub("([\(\[]).*?([\)\]])", "\g<1>\g<2>", artist_name)
        preprocessed_name = preprocessed_name.lower()
        preprocessed_name = preprocessed_name.translate(str.maketrans('', '', string.punctuation))
        print('Name preprocessing:', artist_name, '=' ,preprocessed_name)

        # Reset spotify values
        spotify_match = False
        artist_name_spotify = ""
        artist_img_spotify = ""
        artist_spotify_id = ""
        subgenres_spotify = []
        artist_popularity_spotify = ""

        # Make Spotify lookup on raw artist name
        results = spotify.search(q='artist:' + artist_name, type='artist')
        items = results['artists']['items']
        
        # If spotify returns no results on raw artist name search, make Spotify lookup on preprocessed artist name

        if len(items) == 0:
            print('No result on raw name, trying preprocessed name...')
            results = spotify.search(q='artist:' + preprocessed_name, type='artist')
            items = results['artists']['items']

        # If spotify returns results on raw artist name
        if len(items) > 0:

            print('found', len(items), 'results')
            print('searching for exact string match...')

            # First check all results for exact string match
            for ii in range(len(items)):
                artist = items[ii]

                # Set lower case and match songkick name with spotify name
                if artist_name.lower() == artist['name'].lower():

                    spotify_match = True
                    print('Found exact string match on index', ii, artist['name'])

                    artist_name_spotify = artist['name']
                    artist_img_spotify = artist['images'][0]['url']
                    artist_spotify_id = artist['id']
                    subgenres_spotify = artist['genres']
                    artist_popularity_spotify = artist['popularity']

                    break

                else:
                    print('No exact match on result from index ', ii, artist['name'])

            # If no match, then check all spotify results for fuzzy string match above 90%
            if spotify_match == False:
                print('searching for fuzzy string match...')

                for iii in range(len(items)):
                    artist = items[iii]

                    Distance = lev.distance(preprocessed_name,artist['name'].lower()),
                    Ratio = lev.ratio(preprocessed_name,artist['name'].lower())

                    if Ratio > 0.90:

                        spotify_match = True
                        print('Found fuzzy string match on index', iii, artist['name'])
                        print('with ratio:', Ratio)

                        artist_name_spotify = artist['name']
                        artist_img_spotify = artist['images'][0]['url']
                        artist_spotify_id = artist['id']
                        subgenres_spotify = artist['genres']
                        artist_popularity_spotify = artist['popularity']
                        break

                    else:
                        print('No fuzzy match on result from index ', iii, artist['name'])

        else:
            print('no Spotify search results at all from the preprocessed string')
    except:
        print('unsuccesful spotify lookup')
    
    print('')
    print('Critic:', critic_name)
    print('Author:', author)
    print('URL:', url)
    print('Artist:', artist_name)
    print('Venue:', venue_name)
    print('Venue Place:', venue_place)
    print('Rating:', str(rating))
    print('Max Rating:', str(max_rating))
    print('Tribute Rating:', str(tribute_rating))
    print('Publication Date:', publication_date)
    print('Concert Date:', concert_date)
    print('artist spotify name:', artist_name_spotify)
    print('artist spotify image:', artist_img_spotify)
    print('artist spotify ID:', artist_spotify_id)
    print('artist spotify subgenres:', subgenres_spotify)
    print('artist spotify popularity:', artist_popularity_spotify)
    print('Language of review:', lang)
    print('Headline:', headline)
    print('Text:', text)
    print('')

    
    # Set dictionary values

    temp_dict = {}
    temp_dict['critic_name'] = critic_name
    temp_dict['author'] = author
    temp_dict['url'] = url
    temp_dict['artist_name'] = artist_name
    temp_dict['artist_name_spotify'] = artist_name_spotify
    temp_dict['artist_img_spotify'] = artist_img_spotify
    temp_dict['artist_spotify_id'] = artist_spotify_id
    temp_dict['artist_popularity_spotify'] = artist_popularity_spotify
    temp_dict['genre'] = genre
    temp_dict['subgenre_spotify'] = subgenres_spotify
    temp_dict['publication_date'] = publication_date
    temp_dict['concert_date'] = concert_date
    temp_dict['venue_name'] = venue_name
    temp_dict['venue_place'] = venue_place
    temp_dict['rating'] = rating
    temp_dict['max_rating'] = max_rating
    temp_dict['tribute_rating'] = tribute_rating
    temp_dict['review_language'] = lang
    temp_dict['headline'] = headline
    temp_dict['text'] = text

    critic_review_dict['concerts'].append(temp_dict)

    
    
    


Headline found
Text found
Artist name found
No venue name found
publication date found
Author found
No rating found
Name preprocessing: Declan McKenna = declan mckenna
found 1 results
searching for exact string match...
Found exact string match on index 0 Declan McKenna

Critic: NME
Author: Caitlin O'Reilly
URL: https://www.nme.com/reviews/declan-mckenna-live-london-zeros-lafayete-review-2747071
Artist: Declan McKenna
Venue: 
Venue Place: 
Rating: 3
Max Rating: 5
Tribute Rating: 0
Publication Date: 2020-09-07T18:33:41+00:00
Concert Date: 
artist spotify name: Declan McKenna
artist spotify image: https://i.scdn.co/image/f5d6134fd136c452fc065d96883e9d65f47076c8
artist spotify ID: 2D4FOOOtWycb3Aw9nY5n3c
artist spotify subgenres: ['indie pop', 'modern alternative rock', 'modern rock', 'rock']
artist spotify popularity: 72
Language of review: en
Headline: Declan McKenna live in London: baby Bowie skips through glam-pop for the pandemic age
Text: Lafayette, September 4: the glam-rock-loving 

found 6 results
searching for exact string match...
No exact match on result from index  0 Nick Cave & The Bad Seeds
Found exact string match on index 1 Nick Cave

Critic: NME
Author: Mark Beaumont
URL: https://www.nme.com/reviews/sleaford-mods-live-london-100-club-review-billy-nomates-2752388
Artist: Nick Cave
Venue: 
Venue Place: 
Rating: 4
Max Rating: 5
Tribute Rating: 0
Publication Date: 2020-09-14T12:41:50+00:00
Concert Date: 
artist spotify name: Nick Cave
artist spotify image: https://i.scdn.co/image/09cdeb20ab054927616a66a87986313f07b83c9e
artist spotify ID: 1RM5gp0RFfjpJhCYFPB30p
artist spotify subgenres: ['melancholia']
artist spotify popularity: 50
Language of review: en
Headline: Sleaford Mods live in London: a rallying cry for the swelling ranks of angry pandemic jobseekers
Text: The 100 Club, September 12: the Nottingham duo celebrate the release of top 10 compilation album 'All That Glue', enlisting Billy Nomates for the festivities

If it was fitting for Nick Cave to op

Headline found
Text found
Artist name found
No venue name found
publication date found
Author found
No rating found
Name preprocessing: LIFE = life
found 10 results
searching for exact string match...
No exact match on result from index  0 Lifehouse
No exact match on result from index  1 Rexx Life Raj
No exact match on result from index  2 Young Stoner Life Records
No exact match on result from index  3 Cali Life Style
No exact match on result from index  4 Easy Life
No exact match on result from index  5 This Wild Life
No exact match on result from index  6 Dada Life
No exact match on result from index  7 Life.Church Worship
No exact match on result from index  8 Have A Nice Life
No exact match on result from index  9 Life Line Hub
searching for fuzzy string match...
No fuzzy match on result from index  0 Lifehouse
No fuzzy match on result from index  1 Rexx Life Raj
No fuzzy match on result from index  2 Young Stoner Life Records
No fuzzy match on result from index  3 Cali Life Style

Headline found
Text found
Artist name found
No venue name found
publication date found
Author found
No rating found
Name preprocessing: bdrmm = bdrmm
found 1 results
searching for exact string match...
Found exact string match on index 0 bdrmm

Critic: NME
Author: Rhys Buchanan
URL: https://www.nme.com/reviews/live/bdrmm-live-in-hull-review-radar-2728052
Artist: bdrmm
Venue: 
Venue Place: 
Rating: 5
Max Rating: 5
Tribute Rating: 0
Publication Date: 2020-08-14T12:42:18+00:00
Concert Date: 
artist spotify name: bdrmm
artist spotify image: https://i.scdn.co/image/be016388683ff69d3868fa658c24f0c27be85368
artist spotify ID: 4Cx5LnF4WNJIn9SSqyeq9C
artist spotify subgenres: ['bedroom pop', 'chamber psych', 'english indie rock', 'garage psych', 'hull indie']
artist spotify popularity: 37
Language of review: en
Headline: bdrmm live in Hull: shoegazers trigger all the senses in sweaty homecoming showcase
Text: In their first live set since dropping their debut album, the shoegazers summon up a d

In [5]:
# Create json object with utf-8
with open('NME_critic_reviews.json', 'w', encoding='utf-8') as json_file:
    json.dump(critic_review_dict, json_file, ensure_ascii=False)

In [7]:
with open('NME_critic_reviews.json') as json_file:
    data = json.load(json_file)
    jstr = json.dumps(data, indent=4)
    print(jstr)

{
    "concerts": [
        {
            "critic_name": "NME",
            "author": "Rhys Buchanan",
            "url": "https://www.nme.com/reviews/live/bdrmm-live-in-hull-review-radar-2728052",
            "artist_name": "bdrmm",
            "artist_name_spotify": "bdrmm",
            "artist_img_spotify": "https://i.scdn.co/image/be016388683ff69d3868fa658c24f0c27be85368",
            "artist_spotify_id": "4Cx5LnF4WNJIn9SSqyeq9C",
            "artist_popularity_spotify": 37,
            "genre": [],
            "subgenre_spotify": [
                "bedroom pop",
                "chamber psych",
                "english indie rock",
                "garage psych",
                "hull indie"
            ],
            "publication_date": "2020-08-14T12:42:18+00:00",
            "concert_date": "",
            "venue_name": "",
            "venue_place": "",
            "rating": "5",
            "max_rating": 5,
            "review_language": "en",
            "headline": "bdrmm l