# ULTIMATE MUSIC DATABASE web scraping

In [17]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime, timedelta
import time
import random

# Set headers
headers = requests.utils.default_headers()
headers.update({ 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'})

In [6]:
def weekly_hits (date):
    
    url = 'http://umdmusic.com/default.asp?Lang=English&Chart=D&ChDate=' + str(date)
    req = requests.get(url, headers)
    soup = BeautifulSoup(req.content, 'html.parser')

    my_table = soup.find_all("table", attrs={'border': '1', 'width': '100%', 'bgcolor': '#CCFFFF'})

    title = []
    artist = []

    rows = my_table[2].find_all('tr')
    for row in rows:
        cells = row.find_all('td', attrs={'style': 'font-size:10pt;font-family:Arial;padding-left:0.1in'})
        for cell in cells:
            title.append(cell.contents[0].get_text().strip())
            artist.append(cell.contents[2].strip())
    
    year = [str(date)[0:4]] * len(title)
    
    df = pd.DataFrame([year, artist, title]).transpose()
    df.columns = ['year', 'artist', 'title']

    return (df)

In [7]:
weekly_hits(20200620)

Unnamed: 0,year,artist,title
0,2020,DaBABY featuring RODDY RICCH,ROCKSTAR
1,2020,MEGAN THEE STALLION,Savage
2,2020,THE WEEKND,Blinding Lights
3,2020,DOJA CAT,Say So
4,2020,JUSTIN BIEBER featuring QUAVO,Intentions
...,...,...,...
95,2020,BRETT YOUNG,Catch
96,2020,SURF MESA featuring EMILEE,ily
97,2020,POLO G,21
98,2020,FUTURE featuring YOUNGBOY NEVER BROKE AGAIN,Trillionaire


In [26]:
# Scraping from 2020-06-20 backwards, till 2000-01-01, 1 week earlier on each iteration

initial_date = '20200620'
date_object = datetime.strptime(initial_date, '%Y%m%d')

df = pd.DataFrame()

while date_object > datetime.strptime('20000101', '%Y%m%d'):
    df = pd.concat([df, weekly_hits(date_object.strftime('%Y%m%d'))], ignore_index=True)
    # Substrating 7 days to date
    date_object = date_object - timedelta(days=7)
    # Adding random delay to avoid getting banned
    time.sleep(random.randint(0, 5))

In [27]:
df

Unnamed: 0,year,artist,title
0,2020,DaBABY featuring RODDY RICCH,ROCKSTAR
1,2020,MEGAN THEE STALLION,Savage
2,2020,THE WEEKND,Blinding Lights
3,2020,DOJA CAT,Say So
4,2020,JUSTIN BIEBER featuring QUAVO,Intentions
...,...,...,...
106795,2000,BACKSTREET BOYS,Largen Than Life
106796,2000,GINUWINE / TYRESE / CASE,The Best Man I Can Be
106797,2000,BETH HART,L.A. Song
106798,2000,LIMP BIZKIT,Re-Arranged


In [28]:
df.isnull().sum()

year      0
artist    0
title     0
dtype: int64

In [29]:
df.to_csv('weekly_hot100_from_umdb_2000-2020.csv', encoding='utf-8', index=False)

In [31]:
datetime.strptime('20200621', '%Y%m%d') > datetime.strptime('20200620', '%Y%m%d')

True

In [32]:
# Scraping from 2000-01-01 backwards, till 1990-01-01, 1 week earlier on each iteration

initial_date = '20000101'
date_object = datetime.strptime(initial_date, '%Y%m%d')

df = pd.DataFrame()

while date_object > datetime.strptime('19900101', '%Y%m%d'):
    df = pd.concat([df, weekly_hits(date_object.strftime('%Y%m%d'))], ignore_index=True)
    # Substrating 7 days to date
    date_object = date_object - timedelta(days=7)
    # Adding random delay to avoid getting banned
    time.sleep(random.randint(0, 5))

In [33]:
df

Unnamed: 0,year,artist,title
0,2000,SANTANA featuring ROB THOMAS,Smooth
1,2000,BRIAN McKNIGHT,Back At One
2,2000,JESSICA SIMPSON,I Wanna Love You Forever
3,2000,WHITNEY HOUSTON,My Love Is Your Love
4,2000,SAVAGE GARDEN,I Knew I Loved You
...,...,...,...
52195,1990,AFTER 7,Heat Of The Moment
52196,1990,NEW KIDS ON THE BLOCK,Cover Girl
52197,1990,HOOTERS,500 Miles
52198,1990,SARAYA,Back To The Bullet


In [34]:
df.to_csv('weekly_hot100_from_umdb_1990-1999.csv', encoding='utf-8', index=False)

In [35]:
df.isnull().sum()

year      0
artist    0
title     0
dtype: int64

In [36]:
# Scraping from 1990-01-01 backwards, till 1970-01-01, 1 week earlier on each iteration

initial_date = '19900106'
date_object = datetime.strptime(initial_date, '%Y%m%d')

df = pd.DataFrame()

while date_object > datetime.strptime('19700101', '%Y%m%d'):
    df = pd.concat([df, weekly_hits(date_object.strftime('%Y%m%d'))], ignore_index=True)
    # Substrating 7 days to date
    date_object = date_object - timedelta(days=7)
    # Adding random delay to avoid getting banned
    time.sleep(random.randint(0, 5))

In [37]:
df

Unnamed: 0,year,artist,title
0,1990,PHIL COLLINS,Another Day In Paradise
1,1990,JANET JACKSON,Rhythm Nation
2,1990,LINDA RONSTADT & AARON NEVILLE,Don't Know Much
3,1990,TECHNOTRONIC,Pump Up The Jam
4,1990,TAYLOR DAYNE,With Every Beat Of My Heart
...,...,...,...
111001,1970,LES McCANN & EDDIE HARRIS,Compared To What / Cold Duck
111002,1970,THE CANNONBALL ADDERLY QUINTET,Country Preacher
111003,1970,MARVIN GAYE,How Can I Forget You / Gonna Give Her All The ...
111004,1970,BROOK BENTON,Rainy Night In Georgia


In [38]:
df.isnull().sum()

year      0
artist    0
title     0
dtype: int64

In [39]:
df.to_csv('weekly_hot100_from_umdb_1970-1989.csv', encoding='utf-8', index=False)

In [40]:
# On January 6, 1962 they started publishing the chart on Saturdays
# From August 4, 1958 until December 25, 1961, they were published on Mondays

# Scraping from 1969-12-27 backwards, till 1962-01-06, 1 week earlier on each iteration

initial_date = '19691227'
date_object = datetime.strptime(initial_date, '%Y%m%d')

df = pd.DataFrame()

while date_object > datetime.strptime('19620101', '%Y%m%d'):
    df = pd.concat([df, weekly_hits(date_object.strftime('%Y%m%d'))], ignore_index=True)
    # Substrating 7 days to date
    date_object = date_object - timedelta(days=7)
    # Adding random delay to avoid getting banned
    time.sleep(random.randint(0, 5))

In [41]:
df

Unnamed: 0,year,artist,title
0,1969,DIANA ROSS & THE SUPREMES,Someday We'll Be Together
1,1969,"PETER, PAUL AND MARY",Leaving On A Jet Plane
2,1969,B.J. THOMAS,Raindrops Keep Fallin' On My Head
3,1969,CREEDENCE CLEARWATER REVIVAL,Down On The Corner / Fortunate Son
4,1969,STEAM,Na Na Hey Hey Kiss Him Goodbye
...,...,...,...
54630,1962,RAY CHARLES,But On The Other Hand Baby
54631,1962,PAUL ANKA,The Bells Of My Wedding
54632,1962,JIM REEVES,Losing Your Love
54633,1962,CHARLIE DRAKE,My Boomerang Won't Come Back


In [42]:
df.isnull().sum()

year      0
artist    0
title     0
dtype: int64

In [43]:
df.to_csv('weekly_hot100_from_umdb_1962-1969.csv', encoding='utf-8', index=False)