<a href="https://colab.research.google.com/github/ilmaaliyaf/indonesian-songs-lyrics/blob/main/scrape_kapanlagi_lyrics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
from bs4 import BeautifulSoup
import concurrent.futures # for multi-threading
import time
import csv
from tqdm import tqdm
import pickle
from google.colab import files

def get_lxml(link, timeout=30):
    '''
        Download a web page and parse it with BeautifulSoup's lxml parser.

        input:
            link: URL of the page to download
            timeout: seconds to wait for the server; passed to requests.get
                     so a stalled connection raises an error instead of
                     blocking the calling (worker) thread forever
        output: BeautifulSoup object built from the response body
    '''
    # Browser-like User-Agent sent with every page request.
    headers = {"User-Agent" : "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}
    res = requests.get(link, headers=headers, timeout=timeout).content
    return BeautifulSoup(res, 'lxml')

def get_lyrics(link):
    '''
        get lyrics of a song
        input: link of the lyrics page on kapanlagi.com
        output: string of lyrics, different line separated by slash-n;
                empty string when the page holds only the
                'Belum Ada Lirik' placeholder (or no lyric lines at all)
    '''
    # Every element matching the '.lirik_line' selector is one lyric line.
    lines = [l.text.strip() for l in get_lxml(link).select('.lirik_line')]
    s = '\n'.join(lines)
    # The placeholder is matched case-insensitively so that a different
    # capitalisation ('Belum ada lirik') is recognised as well.
    return '' if s.casefold() == 'belum ada lirik' else s


In [None]:
def get_songs(a):
    '''
    get songs details: title, artist, link of the lyrics in kapanlagi.com
    input : the-first-alphabet of the song (or 'num')
    output : list of dict with the keys 'link', 'title' and 'artist'
    '''
    started = time.time()
    songs_id = []

    i = 1
    link = f'https://lirik.kapanlagi.com/lagu/{a}_id/' # link of the first page

    # Probe each index page with a HEAD request; the first answer that is not
    # 200 ends the pagination. The timeout keeps a stalled connection from
    # blocking this worker forever.
    while requests.head(link, timeout=30).status_code == 200:
        soup = get_lxml(link)

        # anchors that have an href, no class, and 'artis' in the href
        title_tags = [tag for tag in soup.find_all('a', {'href': True, 'class': False})
                      if 'artis' in tag.get('href')]
        # text of every class-less span; paired with the titles by position
        # NOTE(review): assumes spans and title anchors line up one-to-one - confirm
        artists = [span.text.strip() for span in soup.find_all('span', {'class': False})]

        # one dict per (title, artist) pair, appended to the running list
        songs_id += [{'link': title_tag.get('href'),
                      'title': title_tag.text.strip(),
                      'artist': artist}
                     for title_tag, artist in zip(title_tags, artists)]

        # next page if any
        i += 1
        link = f'https://lirik.kapanlagi.com/lagu/{a}_id/index{i}.html'

    # print result
    toc = time.time()
    try:
        # Elapsed time is reported relative to the notebook-level `tic`.
        elapsed = toc - tic
    except NameError:
        # `tic` was never set: report this call's own duration instead of
        # crashing after all the pages have already been scraped.
        elapsed = toc - started
    print(f'{a} \t#songs: {len(songs_id)} \ttime passed: {round(elapsed, 1)}s')

    return songs_id

# Index buckets to crawl: one per letter 'a'..'z', followed by 'num'.
alphabets = [chr(code) for code in range(ord('a'), ord('z') + 1)]
alphabets.append('num')
num_pages = {}

# Start time of the whole crawl; get_songs reads it to report elapsed time.
tic = time.time()

with concurrent.futures.ThreadPoolExecutor() as executor:
    # One get_songs call per bucket, run on the thread pool.
    future = executor.map(get_songs, alphabets)
    # Flatten the per-bucket lists into one list of song dicts.
    songs_kapanlagi = [
        entry
        for bucket_songs in future
        for entry in bucket_songs
    ]
    time.sleep(1e-8)

e 	#songs: 207 	time passed: 11.8s
f 	#songs: 203 	time passed: 12.2s
g 	#songs: 547 	time passed: 25.3s
i 	#songs: 674 	time passed: 34.7s
h 	#songs: 862 	time passed: 38.7s
j 	#songs: 862 	time passed: 41.5s
d 	#songs: 1251 	time passed: 46.9s
o 	#songs: 244 	time passed: 52.5s
c 	#songs: 1055 	time passed: 52.6s
l 	#songs: 848 	time passed: 53.9s
n 	#songs: 466 	time passed: 55.9s
q 	#songs: 11 	time passed: 57.2s
v 	#songs: 45 	time passed: 63.8s
u 	#songs: 229 	time passed: 65.6s
a 	#songs: 1695 	time passed: 65.9s
b 	#songs: 1874 	time passed: 75.3s
x 	#songs: 10 	time passed: 76.4s
z 	#songs: 22 	time passed: 85.1s
w 	#songs: 283 	time passed: 86.7s
r 	#songs: 607 	time passed: 87.1s
y 	#songs: 278 	time passed: 89.5s
num 	#songs: 155 	time passed: 91.9s
p 	#songs: 1236 	time passed: 101.6s
m 	#songs: 1741 	time passed: 105.6s
k 	#songs: 2188 	time passed: 106.4s
t 	#songs: 1913 	time passed: 148.2s
s 	#songs: 2639 	time passed: 184.9s


In [None]:
from google.colab import drive

# Mount Google Drive at /gdrive.
drive.mount('/gdrive')

DIR = "/gdrive/My Drive/Colab Notebooks/dataset/kapanlagi/"

# Persist the scraped song list as a pickle file inside DIR.
with open(f'{DIR}songs_kapanlagi.pickle', mode='wb') as f:
    pickle.dump(songs_kapanlagi, file=f)

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


## Checkpoint

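[Run this first.](#scrollTo=itRP2d0VL-wD) The cell below then reloads the previously saved song list from Google Drive, so the song index does not have to be scraped again.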

In [None]:
from google.colab import drive
import pickle

# Mount Google Drive at /gdrive.
drive.mount('/gdrive')

DIR = "/gdrive/My Drive/Colab Notebooks/dataset/kapanlagi/"

# Restore the song list from the pickle file in DIR.
# Only unpickle files you trust: pickle.load can execute arbitrary code.
with open(f'{DIR}songs_kapanlagi.pickle', mode='rb') as f:
    songs_kapanlagi = pickle.load(f)

# Number of restored songs, shown as the cell output.
len(songs_kapanlagi)

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


22145

In [None]:
# j = 4
step = 100  # number of songs fetched per batch

# Explicit CSV header. Deriving it from songs_kapanlagi[-1] dropped the
# 'lyrics' column whenever the last song had not been scraped yet, because
# extrasaction='ignore' silently discards keys that are not in the header.
keys = ['link', 'title', 'artist', 'lyrics']

for j in range(220, 223):
    print('\n', j)
    # Slice the batch first instead of building the link list of every song.
    links = [song['link'] for song in songs_kapanlagi[j*step:(j+1)*step]]
    # print('#total songs: ', len(links))

    with concurrent.futures.ThreadPoolExecutor() as executor:
        # executor.map yields results in the order of `links`, so result i
        # belongs to song i + j*step of the full list.
        future = executor.map(get_lyrics, links)
        for i, lyrics in enumerate(tqdm(future, total=len(links))):
            k = i+j*step
            # Placeholder text is matched case-insensitively so that both
            # 'Belum Ada Lirik' and 'Belum ada lirik' become an empty string.
            is_placeholder = lyrics.casefold() == 'belum ada lirik'
            songs_kapanlagi[k]['lyrics'] = '' if is_placeholder else lyrics
            time.sleep(1e-2)

    # save dict to csv
    # 'w' makes a re-run replace the file instead of appending a second header
    # and duplicate rows; newline='' is what the csv module expects, and it
    # matters here because lyrics contain embedded newlines.
    # NOTE(review): every file receives the whole song list, not only batch j;
    # songs without a 'lyrics' key get an empty cell - confirm this is intended.
    with open(DIR + f'lyrisc_{j}.csv', 'w', newline='', encoding='utf-8') as csv_file:
        dict_writer = csv.DictWriter(csv_file, fieldnames=keys, extrasaction='ignore')
        dict_writer.writeheader()
        dict_writer.writerows(songs_kapanlagi)

  0%|          | 0/100 [00:00<?, ?it/s]


 220


100%|██████████| 100/100 [00:15<00:00,  6.39it/s]
  0%|          | 0/45 [00:00<?, ?it/s]


 221


100%|██████████| 45/45 [00:06<00:00,  7.10it/s]
0it [00:00, ?it/s]



 222
