In [1]:
from time import sleep
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
def reading_top(page):
    """Scrape one PopVortex chart page into a DataFrame.

    Every ``div.chart-content`` element on the page is treated as one chart
    entry. Title, artists and genre are read from *inside* that entry, so an
    entry with a missing field can no longer shift the data of the entries
    that follow it.

    Parameters
    ----------
    page : str
        URL of the chart page to download.

    Returns
    -------
    pandas.DataFrame
        One row per chart entry, in page order, with the columns
        ``title`` (str), ``singers`` (list of str, artist text split on
        ``&``) and ``genre`` (list of str, genre text split on ``/``).
        Names are stripped of surrounding whitespace. The frame is empty
        (but still has the three columns) when no entry is found.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    requests.RequestException
        On connection problems or when the request times out.
    """
    headers = {"Accept-Language": "en-US,en;q=0.5"}
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(page, headers=headers, timeout=30)
    # Fail loudly on an HTTP error instead of parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    # The genre row is recognised by its label; three spellings of the label
    # are checked (English, Portuguese and Japanese).
    genre_markers = ('Genre', 'Gênero', 'ジャンル')

    def split_clean(text, separator):
        # Split on the separator, trim each piece and drop empty leftovers.
        pieces = (piece.strip() for piece in text.split(separator))
        return [piece for piece in pieces if piece]

    rows = []
    for entry in soup.select("div.chart-content"):
        title_tag = entry.select_one("p.title-artist cite.title")
        artist_tag = entry.select_one("p.title-artist em.artist")
        if title_tag is None or artist_tag is None:
            # Not a song entry (or malformed markup): nothing usable to record.
            continue

        genre_text = ''
        for item in entry.select("ul li"):
            if any(marker in str(item) for marker in genre_markers):
                genre_text = item.get_text()
                # Remove the label so only the genre name(s) remain.
                for marker in genre_markers:
                    genre_text = genre_text.replace(marker + ': ', '')
                break

        rows.append({
            "title": title_tag.get_text(),
            # NOTE: splitting on '&' also splits act names that contain '&'.
            "singers": split_clean(artist_tag.get_text(), '&'),
            "genre": split_clean(genre_text, '/'),
        })

    # each key becomes a column
    return pd.DataFrame(rows, columns=["title", "singers", "genre"])


In [3]:
# Scrape the main (international) TOP 100 chart into the `songs` DataFrame
songs = reading_top(
    'https://www.popvortex.com/music/charts/top-100-songs.php'
)

In [4]:
# Show the scraped international chart (columns: title, singers, genre)
songs

Unnamed: 0,title,singers,genre
0,Unholy,"[Sam Smith , Kim Petras]",[Pop]
1,Eagle (feat. KB),[Transformation Worship],"[Hip-Hop , Rap]"
2,I'm Good (Blue),"[David Guetta , Bebe Rexha]",[Dance]
3,Everywhere,[Fleetwood Mac],[Rock]
4,wait in the truck,"[HARDY , Lainey Wilson]",[Country]
...,...,...,...
95,Sand In My Boots,[Morgan Wallen],[Country]
96,No Se Va (En Vivo),[Grupo Frontera],[Regional Mexicano]
97,Perfectly Loved (feat. TobyMac),[Rachael Lampa],[Christian & Gospel]
98,How Far I'll Go,[Auli'i Cravalho],[Soundtrack]


# Lab | Web Scraping Multiple Pages
I want to add the TOP 100 charts for several more countries. First I will use web scraping to get a list of the links and the names of the countries, and then I will scrape each link to get the top 100 for that country.

In [5]:
# GETTING THE TOP 100 LINKS FOR EACH COUNTRY
headers = {"Accept-Language": "en-US,en;q=0.5"}
url = 'https://www.popvortex.com/charts/international-charts.php'

# A timeout keeps a stalled connection from hanging the notebook forever.
response = requests.get(url, headers=headers, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

# Country names come from the <h2 class="country"> headings
countries = [heading.get_text() for heading in soup.select('h2.country')]

# Keep the href of every "Top 100 Songs" anchor found inside a list item
links = []
for item in soup.select(' ul li'):
    anchor = item.find("a", string="Top 100 Songs")
    if anchor is not None:
        links.append(anchor.get('href'))

# Drop the trailing entries we do not want to scrape: the last two links and
# the last country heading. The original note says this keeps 25 of each.
# TODO confirm against the live page if its layout changes.
links = links[0:-2]
countries = countries[0:-1]

# The hrefs are presumably relative ("../music/..."); resolve them against the
# URL of the page they came from to obtain absolute URLs.
links = [urljoin(url, link) for link in links]

# Countries and links are paired by position in the next cell, so they must line up.
assert len(links) == len(countries), (
    "Found " + str(len(links)) + " links for " + str(len(countries)) + " countries"
)

# example
print(links[-1])
print(countries[-1])

https://www.popvortex.com/music/itunes-charts/top-100-songs-uk.php
United Kingdom (UK)


In [6]:
# READING TOP COUNTRIES
# One DataFrame per country, keyed by country name. A random pause of 1-3 seconds
# is taken before each request so the site is not hit in a tight loop.
tops = {}
for position, country in enumerate(countries):
    wait_time = np.random.randint(1, 4)  # upper bound is exclusive: 1, 2 or 3
    print(f"I will sleep for {wait_time} second/s before reading {country}.")
    sleep(wait_time)
    tops[country] = reading_top(links[position])

I will sleep for 1 second/s before reading Australia.
I will sleep for 2 second/s before reading Austria.
I will sleep for 2 second/s before reading Belgium.
I will sleep for 2 second/s before reading Brasil.
I will sleep for 2 second/s before reading Canada.
I will sleep for 2 second/s before reading Denmark.
I will sleep for 3 second/s before reading Finland.
I will sleep for 2 second/s before reading France.
I will sleep for 2 second/s before reading Germany.
I will sleep for 2 second/s before reading Greece.
I will sleep for 3 second/s before reading India.
I will sleep for 2 second/s before reading Ireland.
I will sleep for 2 second/s before reading Italy.
I will sleep for 1 second/s before reading Japan.
I will sleep for 3 second/s before reading Mexico.
I will sleep for 2 second/s before reading Netherlands.
I will sleep for 2 second/s before reading New Zealand.
I will sleep for 1 second/s before reading Norway.
I will sleep for 2 second/s before reading Philippines.
I will sle

In [7]:
# Record, in a 'from' column, which chart each row was scraped from
songs['from'] = 'international'
for country_name, country_chart in tops.items():
    country_chart['from'] = country_name

In [8]:
# Concatenating everything
# One concat call for the international chart followed by every country chart
# (dict order), instead of re-copying the growing frame once per country.
songs = pd.concat([songs, *tops.values()], axis=0)

# dropping duplicates: only the first occurrence of each title is kept, so a
# song stays attributed to the first chart it appeared in
songs = songs.drop_duplicates(subset=['title'])

# The index+1 is the position in the top 100 chart, so I keep the old index+1
# NOTE: this cell rebinds `songs`; re-running it on its own would shift the
# positions again, so re-run from the scraping cells instead.
songs = songs.reset_index()
songs['index'] += 1

In [9]:
# Show the combined table: 'index' is the chart position, 'from' the source chart
songs

Unnamed: 0,index,title,singers,genre,from
0,1,Unholy,"[Sam Smith , Kim Petras]",[Pop],international
1,2,Eagle (feat. KB),[Transformation Worship],"[Hip-Hop , Rap]",international
2,3,I'm Good (Blue),"[David Guetta , Bebe Rexha]",[Dance],international
3,4,Everywhere,[Fleetwood Mac],[Rock],international
4,5,wait in the truck,"[HARDY , Lainey Wilson]",[Country],international
...,...,...,...,...,...
2525,96,Too Good For Giving Up,[Liam Gallagher],[Alternative],United Kingdom (UK)
2526,97,That'll Do (feat. Paddy Maloney and the Black ...,[Peter Gabriel],[Rock],United Kingdom (UK)
2527,98,Waterfall (feat. The Dunwells),[Tricia Longford],"[Singer, Songwriter]",United Kingdom (UK)
2528,99,ILLELLA,[MAMAMOO],[K-Pop],United Kingdom (UK)


# Recommendation

In [10]:
def recommend_song(record):
    """Recommend a random song that shares an artist or a genre with ``record``.

    The lookup is done by exact title in the notebook-level ``songs``
    DataFrame, which must provide the columns ``title``, ``singers`` (list of
    names), ``genre`` (list of names), ``index`` (chart position) and ``from``
    (source chart). Artist and genre names are compared after stripping
    surrounding whitespace.

    Parameters
    ----------
    record : str
        Exact title of the song the recommendation is based on.

    Returns
    -------
    pandas.Series or str
        The ``title``, ``genre`` and ``singers`` of the chosen song. The string
        ``"We don't have a recommendation for you."`` is returned when the
        title is unknown or when no other song shares an artist or a genre.
        A one-line message naming the chart position and source chart of the
        chosen song is printed as a side effect.
    """
    no_recommendation = "We don't have a recommendation for you."

    # first I check if the song is in the table
    matching_rows = songs[songs['title'] == record]
    if matching_rows.empty:
        return no_recommendation

    # Positional access: after filtering, the row keeps its original label, so
    # a label lookup with [0] only worked for the very first row of the table.
    reference = matching_rows.iloc[0]
    wanted_artists = {artist.strip() for artist in reference['singers']}
    wanted_genres = {genre.strip() for genre in reference['genre']}

    # One boolean mask per criterion instead of growing a frame with concat
    shares_artist = songs['singers'].apply(
        lambda names: any(name.strip() in wanted_artists for name in names)
    )
    shares_genre = songs['genre'].apply(
        lambda genres: any(genre.strip() in wanted_genres for genre in genres)
    )

    # We never recommend the song we were given
    is_other_song = songs['title'] != record
    recommendations = songs[(shares_artist | shares_genre) & is_other_song]
    recommendations = recommendations.drop_duplicates(subset=['title'])

    # Nothing else shares an artist or genre: randint(0, 0) would raise
    if recommendations.empty:
        return no_recommendation

    # Picking a random candidate
    chosen_one = recommendations.iloc[np.random.randint(0, high=len(recommendations))]
    print('We recommend you the song number '+str(chosen_one['index'])+' from the '+chosen_one['from']+' TOP 100 charts:')
    return chosen_one[['title','genre','singers']]

In [11]:
# Example call; the pick is random (no seed is set), so the result changes between runs
recommend_song('Unholy')

We recommend you the song number 64 from the Finland TOP 100 charts:


title      Bad Romance
genre            [Pop]
singers    [Lady Gaga]
Name: 749, dtype: object