# Web Scraping Wikipedia


## Import Libraries



In [1]:
import requests
from bs4 import BeautifulSoup

import pandas as pd

## The Function that will do the web scraping

In [2]:
# Ask for the Wikipedia discography page to scrape, then download and parse it.
# Surrounding whitespace from copy/paste is stripped before the request.
URL = input("Please copy and paste the URL of the group discrography you want to scrape: ").strip()

# A timeout keeps the cell from hanging forever on a stalled connection.
page = requests.get(URL, timeout=30)
# Fail loudly on a 4xx/5xx response instead of silently parsing an error page.
page.raise_for_status()

soup = BeautifulSoup(page.content, "html.parser")

Please copy and paste the URL of the group discrography you want to scrape: https://en.wikipedia.org/wiki/Red_Velvet_discography


This covers the albums (studio, reissue, and compilation) and the extended plays. Singles, soundtrack appearances, etc. are not included.

In [3]:
# Collect the text of every header/data cell, one list per table row, from
# each table whose class is 'wikitable plainrowheaders'.
data = []

for table in soup.find_all('table', attrs={'class': 'wikitable plainrowheaders'}):
    # html.parser does not add a <tbody> that is missing from the markup, so
    # fall back to the table itself rather than failing on None.
    table_body = table.find('tbody')
    if table_body is None:
        table_body = table
    for row in table_body.find_all('tr'):
        data.append([cell.get_text(strip=True) for cell in row.select('th, td')])

In [25]:
# Display the raw scraped rows (one list of cell texts per table row).
data


[['Title', 'Details', 'Peak chart positions', 'Sales'],
 ['KOR[1]',
  'FRADig.[2]',
  'JPN[3]',
  'JPN Hot[4]',
  'US Heat.[5]',
  'US World[6]'],
 ['Korean'],
 ['The Red',
  'Released: September 9, 2015[7]Label:SM EntertainmentFormats:CD,digital download,streaming',
  '1',
  '—',
  '47',
  '—',
  '24',
  '1',
  'KOR: 79,838[8]JPN: 4,478[9]US: 3,000[10]'],
 ['Perfect Velvet',
  'Released: November 17, 2017Label: SM EntertainmentFormats: CD, digital download, streaming',
  '2',
  '95',
  '20',
  '52',
  '3',
  '1',
  'KOR: 117,759[11]JPN: 12,799[A][12]US: 3,000[13]'],
 ['Japanese'],
 ['Bloom',
  'Scheduled: February 2, 2022Label:Avex Trax,SM Entertainment JapanFormats: CD, digital download, streaming',
  'TBA',
  'TBA'],
 ['"—" denotes releases that did not chart or were not released in that region.'],
 ['Title', 'Details', 'Peak chart positions', 'Sales'],
 ['KOR[1]',
  'FRA Dig.[14]',
  'JPN[15]',
  'JPN Hot[16]',
  'US Heat.[5]',
  'US World[6]'],
 ['The Perfect Red Velvet',
  'Relea

In [26]:
# Take the first 30 scraped rows (indices 0-29) as a separate list.
# Per the original note, these hold only the albums and the extended plays.
data_1 = data[:30]
print(data_1)

[['Title', 'Details', 'Peak chart positions', 'Sales'], ['KOR[1]', 'FRADig.[2]', 'JPN[3]', 'JPN Hot[4]', 'US Heat.[5]', 'US World[6]'], ['Korean'], ['The Red', 'Released: September 9, 2015[7]Label:SM EntertainmentFormats:CD,digital download,streaming', '1', '—', '47', '—', '24', '1', 'KOR: 79,838[8]JPN: 4,478[9]US: 3,000[10]'], ['Perfect Velvet', 'Released: November 17, 2017Label: SM EntertainmentFormats: CD, digital download, streaming', '2', '95', '20', '52', '3', '1', 'KOR: 117,759[11]JPN: 12,799[A][12]US: 3,000[13]'], ['Japanese'], ['Bloom', 'Scheduled: February 2, 2022Label:Avex Trax,SM Entertainment JapanFormats: CD, digital download, streaming', 'TBA', 'TBA'], ['"—" denotes releases that did not chart or were not released in that region.'], ['Title', 'Details', 'Peak chart positions', 'Sales'], ['KOR[1]', 'FRA Dig.[14]', 'JPN[15]', 'JPN Hot[16]', 'US Heat.[5]', 'US World[6]'], ['The Perfect Red Velvet', 'Released: January 29, 2018Label: SM EntertainmentFormats: CD, digital downl

In [39]:
# Drop single-cell rows (section labels such as 'Korean' and the chart footnote).
# Rebuilding the list avoids popping while iterating by index, which skips the
# element after each removal and raises IndexError once the list has shrunk.
data_1 = [row for row in data_1 if len(row) != 1]


Now I want to create several DataFrames:

one for each album type (studio, reissue, compilation) and one for the extended plays.



### Album Dataframe

In [27]:
def column_values(a, b, rows=None):
    """Collect the title, details and sales cells of rows ``a`` to ``b - 1``.

    Parameters
    ----------
    a, b : int
        Start (inclusive) and end (exclusive) row indices.
    rows : list of list, optional
        Scraped table rows to read. Defaults to the notebook-level ``data_1``
        so existing two-argument calls keep working.

    Returns
    -------
    tuple of (list, list, list)
        ``(titles, details, sales)`` with one entry per row.

    Raises
    ------
    IndexError
        If a requested index is outside ``rows`` or a row has fewer than two
        cells.
    """
    if rows is None:
        rows = data_1

    titles = []
    details = []
    sales = []

    for i in range(a, b):
        row = rows[i]
        titles.append(row[0])
        details.append(row[1])
        # Rows of up to 11 cells end with the sales cell. Longer rows take the
        # second-to-last cell instead (presumably an extra trailing column
        # follows sales there -- confirm against the page).
        sales.append(row[-1] if len(row) <= 11 else row[-2])

    return titles, details, sales

In [42]:
# Print the first 25 rows of data_1 next to their index.
# Enumerating a slice (instead of indexing up to a fixed 25) avoids an
# IndexError when fewer rows are available.
for index, row in enumerate(data_1[:25]):
    print(index)
    print(row)


0
['Title', 'Details', 'Peak chart positions', 'Sales']
1
['KOR[1]', 'FRADig.[2]', 'JPN[3]', 'JPN Hot[4]', 'US Heat.[5]', 'US World[6]']
2
['The Red', 'Released: September 9, 2015[7]Label:SM EntertainmentFormats:CD,digital download,streaming', '1', '—', '47', '—', '24', '1', 'KOR: 79,838[8]JPN: 4,478[9]US: 3,000[10]']
3
['Perfect Velvet', 'Released: November 17, 2017Label: SM EntertainmentFormats: CD, digital download, streaming', '2', '95', '20', '52', '3', '1', 'KOR: 117,759[11]JPN: 12,799[A][12]US: 3,000[13]']
4
['Bloom', 'Scheduled: February 2, 2022Label:Avex Trax,SM Entertainment JapanFormats: CD, digital download, streaming', 'TBA', 'TBA']
5
['Title', 'Details', 'Peak chart positions', 'Sales']
6
['KOR[1]', 'FRA Dig.[14]', 'JPN[15]', 'JPN Hot[16]', 'US Heat.[5]', 'US World[6]']
7
['The Perfect Red Velvet', 'Released: January 29, 2018Label: SM EntertainmentFormats: CD, digital download, streaming,SMC', '1', '87', '29', '49', '7', '3', 'KOR: 120,957[17]JPN: 12,799[A][12]']
8
['Titl

In [43]:
# Studio albums: rows 2-4 of data_1 (the end index is exclusive).
title, detail, sale = column_values(2, 5)

studio_records = list(zip(title, detail, sale))
studio_album = pd.DataFrame(data=studio_records, columns=['Title', 'Details', 'Sales'])
studio_album

Unnamed: 0,Title,Details,Sales
0,The Red,"Released: September 9, 2015[7]Label:SM Enterta...","KOR: 79,838[8]JPN: 4,478[9]US: 3,000[10]"
1,Perfect Velvet,"Released: November 17, 2017Label: SM Entertain...","KOR: 117,759[11]JPN: 12,799[A][12]US: 3,000[13]"
2,Bloom,"Scheduled: February 2, 2022Label:Avex Trax,SM ...",TBA


In [45]:
# Reissue albums: row 7 of data_1 (the end index is exclusive).
title, detail, sale = column_values(7, 8)

reissue_records = list(zip(title, detail, sale))
reissue_album = pd.DataFrame(data=reissue_records, columns=['Title', 'Details', 'Sales'])
reissue_album

Unnamed: 0,Title,Details,Sales
0,The Perfect Red Velvet,"Released: January 29, 2018Label: SM Entertainm...","KOR: 120,957[17]JPN: 12,799[A][12]"


In [46]:
# Compilation albums: row 10 of data_1 (the end index is exclusive).
title, detail, sale = column_values(10, 11)

compilation_records = list(zip(title, detail, sale))
compilation_album = pd.DataFrame(data=compilation_records, columns=['Title', 'Details', 'Sales'])
compilation_album

Unnamed: 0,Title,Details,Sales
0,The ReVe Festival: Finale,"Released: December 23, 2019Label: SM Entertain...","KOR: 194,290[20]JPN: 7,442(Phy.)[21]JPN: 590(D..."


In [47]:
# Extended plays: rows 13-24 of data_1 (the end index is exclusive).
title, detail, sale = column_values(13, 25)

extended_records = list(zip(title, detail, sale))
extended_album = pd.DataFrame(data=extended_records, columns=['Title', 'Details', 'Sales'])
extended_album

Unnamed: 0,Title,Details,Sales
0,Ice Cream Cake,"Released: March 18, 2015Label: SM Entertainmen...","KOR: 90,970[29]JPN: 2,796[30]US: 3,000[10]"
1,The Velvet,"Released: March 17, 2016Label: SM Entertainmen...","KOR: 56,944[31]JPN: 2,104[32]"
2,Russian Roulette,"Released: September 7, 2016Label: SM Entertain...","KOR: 87,693[33]JPN: 3,446[34]US: 2,000[10]"
3,Rookie,"Released: February 1, 2017Label: SM Entertainm...","KOR: 118,747[35]JPN: 4,351[36]"
4,The Red Summer,"Released: July 10, 2017Label: SM Entertainment...","KOR: 110,705[37]JPN: 4,659[38]US: 2,000[39]"
5,Summer Magic,"Released: August 6, 2018Label: SM Entertainmen...","KOR: 174,531[40]JPN: 8,004(Phy.)[41]JPN: 946(D..."
6,RBB,"Released: November 30, 2018Label: SM Entertain...","KOR: 108,796[44]JPN: 5,411[45]US: 10,000[46]"
7,The ReVe Festival: Day 1,"Released: June 19, 2019Label: SM Entertainment...","KOR: 193,669[47]JPN: 4,855[48]"
8,The ReVe Festival: Day 2,"Released: August 20, 2019Label: SM Entertainme...","KOR: 140,537[49]JPN: 3,960(Phy.)[50]JPN: 427(D..."
9,Queendom,"Released: August 16, 2021Label: SM Entertainme...","KOR: 360,070[53]JPN: 12,452(Phy.)[54]"


### Chart DataFrame

In [48]:
# Column names for the chart DataFrames: row 1 of data_1, which in the run
# shown holds the six chart labels (e.g. 'KOR[1]'), footnote markers included.
names_chart = data_1[1]

In [51]:
def chart_values(a, b, rows=None):
    """Collect the six peak-chart cells of rows ``a`` to ``b - 1``.

    Which cells are read depends on the row length:

    * 4 cells or fewer: no chart cells are read; every chart gets '—'.
    * exactly 9 cells: cells 2-7 map to the six charts in order.
    * 5-8 cells: the FRA chart gets '—' and cells 2-6 fill the other five.
    * 10 cells or more: cells 2, 4, 5, 6, 8 and 9 are used.

    Parameters
    ----------
    a, b : int
        Start (inclusive) and end (exclusive) row indices.
    rows : list of list, optional
        Scraped table rows to read. Defaults to the notebook-level ``data_1``
        so existing two-argument calls keep working.

    Returns
    -------
    tuple of six lists
        ``(KOR, FRA, JPN, JPN_Hot, US_Heat, US_World)``, one entry per row.

    Raises
    ------
    IndexError
        If a requested index is outside ``rows``, or a row has 5 or 6 cells
        (too short for the indices used by the 5-8 cell layout).
    """
    if rows is None:
        rows = data_1

    placeholder = '—'
    # One output list per chart, in the order documented above.
    charts = [[] for _ in range(6)]

    for i in range(a, b):
        row = rows[i]
        size = len(row)
        # For each chart, the cell index to read; None means "use placeholder".
        if size <= 4:
            positions = (None, None, None, None, None, None)
        elif size == 9:
            positions = (2, 3, 4, 5, 6, 7)
        elif size < 9:
            positions = (2, None, 3, 4, 5, 6)
        else:
            positions = (2, 4, 5, 6, 8, 9)

        for chart, position in zip(charts, positions):
            chart.append(placeholder if position is None else row[position])

    return tuple(charts)
    

In [52]:
# Studio album charts: rows 2-4 of data_1 (the end index is exclusive).
kchart, fchart, jchart, j1chart, uchart, u1chart = chart_values(2, 5)

studio_chart_rows = list(zip(kchart, fchart, jchart, j1chart, uchart, u1chart))
studio_chart = pd.DataFrame(data=studio_chart_rows, columns=names_chart)
studio_chart

Unnamed: 0,KOR[1],FRADig.[2],JPN[3],JPN Hot[4],US Heat.[5],US World[6]
0,1,—,47,—,24,1
1,2,95,20,52,3,1
2,—,—,—,—,—,—


In [53]:
# Reissue album charts: row 7 of data_1 (the end index is exclusive).
kchart, fchart, jchart, j1chart, uchart, u1chart = chart_values(7, 8)

reissue_chart_rows = list(zip(kchart, fchart, jchart, j1chart, uchart, u1chart))
reissue_chart = pd.DataFrame(data=reissue_chart_rows, columns=names_chart)
reissue_chart

Unnamed: 0,KOR[1],FRADig.[2],JPN[3],JPN Hot[4],US Heat.[5],US World[6]
0,1,87,29,49,7,3


In [54]:
# Extended play charts: rows 13-24 of data_1 (the end index is exclusive).
kchart, fchart, jchart, j1chart, uchart, u1chart = chart_values(13, 25)

extended_chart_rows = list(zip(kchart, fchart, jchart, j1chart, uchart, u1chart))
extended_chart = pd.DataFrame(data=extended_chart_rows, columns=names_chart)
extended_chart

Unnamed: 0,KOR[1],FRADig.[2],JPN[3],JPN Hot[4],US Heat.[5],US World[6]
0,1,—,76,—,24,2
1,1,—,75,—,—,8
2,1,174,40,—,18,2
3,1,—,43,—,21,1
4,1,108,27,—,8,1
5,1,42,12,24,3,3
6,3,92,33,51,1,2
7,1,53,20,46,5,7
8,1,52,17,54,11,6
9,2,—,11,23,16,11
