# Scraping a list of top 100 song lists

In [1]:
from bs4 import BeautifulSoup

import requests # allows us to access information on any page
import pandas as pd

In [2]:
# find a url and store it in a variable
url= 'https://playback.fm/charts/top-100-songs/2015'

In [3]:
# download html with a get request
response = requests.get(url)
response.status_code # 200 status code means OK!

200

In [None]:
response.content

In [5]:
# parse html (create the 'soup')
soup = BeautifulSoup(response.content, "html.parser")

In [None]:
# check that the html code looks like it should
soup

In [None]:
#myTable > tbody > tr:nth-child(1) > td.mobile-hide > a > span.song

#myTable > tbody > tr:nth-child(5) > td:nth-child(2) > a

In [None]:
soup.select("td.mobile-hide")

In [None]:
soup.select("td:nth-child(2) > a")

In [None]:
print(soup.prettify())

In [14]:
soup.select("td:nth-child(2) > a")[0]

<a class="artist" href="/artist/mark-ronson-top-songs" itemprop="byArtist">
Mark Ronson featuring Bruno Mars
</a>

In [56]:
soup.select("td:nth-child(2) > a")[0].get_text()

'\nMark Ronson featuring Bruno Mars\n'

In [58]:
soup.select("td.mobile-hide > a > span.song")[0].get_text()

'Uptown Funk'

In [59]:
#initialize empty lists
artist = []
song = []

# define the number of iterations of our for loop
# by checking how many elements are in the retrieved result set
# (this is equivalent but more robust than just explicitly defining 250 iterations)
num_iter = len(soup.select("td:nth-child(2) > a"))

tClist = soup.select("td:nth-child(2) > a")
spanlist = soup.select("td.mobile-hide > a > span.song")
# iterate through the result set and retrive all the data
for i in range(num_iter):
    artist.append(tClist[i].get_text())
    song.append(spanlist[i].get_text())

print(artist)
print(song)


['\nMark Ronson featuring Bruno Mars\n', '\nEd Sheeran\n', '\nWiz Khalifa featuring Charlie Puth\n', '\nFetty Wap\n', '\nMaroon 5\n', '\nWalk the Moon\n', '\nTaylor Swift\n', '\nSilentó\n', '\nThe Weeknd\n', '\nThe Weeknd\n', '\nOMI\n', '\nThe Weeknd\n', '\nEllie Goulding\n', '\nHozier\n', '\nTaylor Swift featuring Kendrick Lamar\n', '\nMajor Lazer and DJ Snake featuring MØ\n', '\nJason Derulo\n', '\nTaylor Swift\n', '\nSkrillex and Diplo featuring Justin Bieber\n', '\nRachel Platten\n', '\nFetty Wap featuring Remy Boyz\n', '\nMeghan Trainor\n', '\nFifth Harmony featuring Kid Ink\n', '\nOmarion featuring Chris Brown and Jhené Aiko\n', '\nAndy Grammer\n', '\nSam Smith\n', '\nSelena Gomez featuring A$AP Rocky\n', '\nMeghan Trainor\n', '\nTaylor Swift\n', '\nDrake\n', '\nDavid Guetta featuring Nicki Minaj, Bebe Rexha, and Afrojack\n', '\nFlo Rida featuring Sage the Gemini and Lookas\n', '\nJustin Bieber\n', '\nEd Sheeran\n', '\nAdele\n', '\nShawn Mendes\n', '\nTove Lo\n', '\nNick Jonas\n'

In [60]:
# each list becomes a column

artist_song_df= pd.DataFrame({"artist":artist,"song":song})

In [61]:
artist_song_df.head()

Unnamed: 0,artist,song
0,\nMark Ronson featuring Bruno Mars\n,Uptown Funk
1,\nEd Sheeran\n,Thinking Out Loud
2,\nWiz Khalifa featuring Charlie Puth\n,See You Again
3,\nFetty Wap\n,Trap Queen
4,\nMaroon 5\n,Sugar


In [64]:
artist_song_df['artist'] = artist_song_df['artist'].str.replace('\n', ' ')


In [65]:
artist_song_df.head()

Unnamed: 0,artist,song
0,Mark Ronson featuring Bruno Mars,Uptown Funk
1,Ed Sheeran,Thinking Out Loud
2,Wiz Khalifa featuring Charlie Puth,See You Again
3,Fetty Wap,Trap Queen
4,Maroon 5,Sugar


# Adding 100 more songs to the df by more webscraping

In [145]:
# find url and store it in a variable
url= 'https://www.popvortex.com/music/charts/top-100-songs.php'

In [146]:
# download html with a get request
response = requests.get(url)
response.status_code # 200 status code means OK!

200

In [None]:
#response.content

In [147]:
# parse html (create the 'soup')
soup = BeautifulSoup(response.content, "html.parser")

In [None]:
# check that the html code looks like it should
soup

In [None]:
#chart-position-6 > div.chart-content.col-xs-12.col-sm-8 > p > cite
#chart-position-6 > div.chart-content.col-xs-12.col-sm-8 > p > em

In [None]:
print(soup.prettify())

In [158]:
soup.select("div.chart-content.col-xs-12.col-sm-8 > p > cite")[0].get_text()

'Unholy'

In [159]:
soup.select("div.chart-content.col-xs-12.col-sm-8 > p > em")[0].get_text()

'Sam Smith & Kim Petras'

In [160]:
#initialize empty lists
artist = []
song = []

# define the number of iterations of our for loop
# by checking how many elements are in the retrieved result set
# (this is equivalent but more robust than just explicitly defining 250 iterations)
num_iter = len(soup.select("div.chart-content.col-xs-12.col-sm-8 > p > cite"))

artist_list = soup.select("div.chart-content.col-xs-12.col-sm-8 > p > cite")
song_list = soup.select("div.chart-content.col-xs-12.col-sm-8 > p > em")
# iterate through the result set and retrive all the data
for i in range(num_iter):
    artist.append(artist_list[i].get_text())
    song.append(song_list[i].get_text())

print(artist)
print(song)


['Unholy', 'Eagle (feat. KB)', 'Everywhere', "I'm Good (Blue)", 'wait in the truck', 'A Thousand Years', 'Thank God', 'You Proof', 'Son Of A Sinner', "I Ain't Worried", 'Unstoppable', 'CUFF IT', 'Bedroom Singer', 'Left and Right', 'TRUCK BED', 'She Had Me At Heads Carolina', 'Wasted On You', 'here lies country music', 'As It Was', 'The Kind of Love We Make', 'Fall In Love', 'the mockingbird & THE CROW', 'Shallow', 'Under the Influence', 'Celestial', 'Lose Yourself', 'Life Is a Highway', 'Sunroof', 'Love Me Like You Do', 'Shivers', 'Super Freaky Girl', 'High Heels', 'Victoria’s Secret', 'Hold Me Closer', 'I Like You (A Happier Song) [feat. Doja Cat]', 'About Damn Time', 'Running Up That Hill (A Deal with God)', '2 Be Loved (Am I Ready)', 'Numb', 'You, Me, And Whiskey', 'Earned It', "You'll Be In My Heart", 'Rock and a Hard Place', 'Next Thing You Know', 'Cold Heart (PNAU Remix)', 'Soul', 'Vegas (From the Original Motion Picture Soundtrack ELVIS)', "Something in the Orange (Z&E's Version

In [161]:
# each list becomes a column

artist_song_df_2= pd.DataFrame({"artist":artist,"song":song})

In [162]:
artist_song_df_2.head()

Unnamed: 0,artist,song
0,Unholy,Sam Smith & Kim Petras
1,Eagle (feat. KB),Transformation Worship
2,Everywhere,Fleetwood Mac
3,I'm Good (Blue),David Guetta & Bebe Rexha
4,wait in the truck,HARDY & Lainey Wilson


In [165]:
# concatenate with the first df
music_df = pd.concat([artist_song_df, artist_song_df_2], axis=0)
music_df=music_df.reset_index(drop=True)

In [166]:
music_df.head()

Unnamed: 0,artist,song
0,Mark Ronson featuring Bruno Mars,Uptown Funk
1,Ed Sheeran,Thinking Out Loud
2,Wiz Khalifa featuring Charlie Puth,See You Again
3,Fetty Wap,Trap Queen
4,Maroon 5,Sugar


In [167]:
music_df.shape

(200, 2)

# Song recommendation

In [117]:
import random

In [141]:
secure_random = random.SystemRandom()

In [143]:
favorite_song = input("One of my favorite songs is: ")
#print(favorite_song)

One of my favorite songs is: Trap Queen


In [None]:
#print(favorite_song)
# artist_song_df[(artist_song_df['song'].isin(['']))]
# len(artist_song_df[(artist_song_df['song'].isin(['Shake It Off']))])

In [144]:
if len(music_df[(music_df['song'].isin([favorite_song]))])>0:
    print("Our song recommendation is: ", secure_random.choice(music_df['song'])) 
else:
    print("We are sorry that we do not have a song recommendation for you now.")

Our song recommendation is:  Elastic Heart
