# 1 - Captura dos Dados - Letras.Mus.Br

In [1]:
import requests
import re
import json
import pandas as pd
from tqdm import tqdm
from bs4 import BeautifulSoup as soup

## 1.1 Extração dos Links das Músicas

In [2]:
# Extract the song links for the requested genres or artists
def musicLinks(criteria,search,length):
    """Collect song page URLs from letras.mus.br.

    Parameters
    ----------
    criteria : str
        'genre' to read each genre's "mais-acessadas" listing, or 'artist'
        to read each artist's "mais_tocadas" listing.
    search : iterable of str
        Genre or artist slugs substituted into the listing URL.
    length : int
        Maximum number of links to take per slug.

    Returns
    -------
    list of str
        The collected URLs. An empty list is returned (after printing
        'Invalid Criteria') when ``criteria`` is not recognised.
    """
    # Created up front so an invalid criteria returns [] instead of hitting
    # an unbound local at the final return.
    lyrics_ref = []
    if criteria not in ('genre', 'artist'):
        print('Invalid Criteria')
        return lyrics_ref

    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0'}
    # A non-positive length collects nothing, as the original counting loop did.
    limit = max(length, 0)

    if criteria == 'genre':
        search_by_genre = 'https://www.letras.mus.br/mais-acessadas/{}/'
        for genre in search:
            page = requests.get(search_by_genre.format(genre), headers=headers)
            page_soup = soup(page.content, "lxml")
            links_container = page_soup.findAll("ol",{"class":"top-list_mus cnt-list--col1-3"})
            # Re-parse only the matched list(s) so anchors elsewhere on the page are ignored.
            links_array = soup(str(links_container), "lxml").findAll("a")
            # Slicing stops at the anchors actually present, so a short
            # listing no longer raises IndexError.
            selected = links_array[:limit]
            for anchor in selected:
                lyrics_ref.append(str("https://www.letras.mus.br"+anchor.attrs['href']).replace("['","").replace("']",""))
            print('{} songs by {} were found and retrieved.'.format(str(len(selected)),genre))
    else:
        search_by_artist = 'https://www.letras.mus.br/{}/mais_tocadas.html'
        for artist in search:
            page = requests.get(search_by_artist.format(artist), headers=headers)
            page_soup = soup(page.content, "lxml")
            links_container = page_soup.findAll("li",{"class":"cnt-list-row -song is-visible"})
            if len(links_container)>=length:
                print('Collecting {} songs by {}.'.format(len(links_container),artist))
            else:
                print('Only {} songs by {} were found and retrieved.'.format(len(links_container),artist))
            # Each matched <li> carries the song URL in its data-shareurl attribute.
            lyrics_ref.extend(str(item.attrs['data-shareurl']) for item in links_container[:limit])
    return lyrics_ref

In [3]:
# List of genres to extract, taken from "www.letras.mus.br/estilos"
#genres = ['rock-roll', 'punk-rock', 'heavy-metal', 'tecnopop', 'hard-rock', 'power-pop', 'soft-rock', 'post-rock', 'poprock', 'progressivo']
#genres = ['samba','pagode']
#artists = ['ariana-grande']

# Slugs passed to musicLinks; the alternative selections are kept commented out above.
search = ['rock-roll']

In [4]:
# Collect up to 100 song links for every genre slug in `search`.
links = musicLinks('genre',search,100)

100 songs by rock-roll were found and retrieved.


In [5]:
# Sanity check: number of links returned by musicLinks.
len(links)

100

In [6]:
# Export the collected links to CSV as a single 'music' column, without the index
music_links = pd.DataFrame({'music': links})
music_links.to_csv('music_links.csv',index=False)

In [7]:
# Reload the saved links and keep only the 'music' column (a Series of URLs).
music_links = pd.read_csv('music_links.csv')
music_links = music_links.music

In [8]:
# Display the reloaded Series of song URLs.
music_links

0              https://www.letras.mus.br/bon-jovi/4932/
1              https://www.letras.mus.br/bon-jovi/4923/
2          https://www.letras.mus.br/linkin-park/23091/
3     https://www.letras.mus.br/creedence-clearwater...
4               https://www.letras.mus.br/cazuza/43861/
                            ...                        
95           https://www.letras.mus.br/the-beatles/189/
96               https://www.letras.mus.br/kiss/520817/
97        https://www.letras.mus.br/rod-stewart/108434/
98             https://www.letras.mus.br/bon-jovi/4964/
99         https://www.letras.mus.br/raul-seixas/48326/
Name: music, Length: 100, dtype: object

## 1.2 Extração das Letras

In [9]:
# Remove pontuação de um texto
#Utilizada para garantir posteriomente uma tokenização de sentenças correta.
def remove_punctuation(text):  
    # re.sub(replace_expression, replace_string, target)
    new_text = text.replace(",","").replace("-"," ")
    new_text = re.sub(r"\|,|;|:|-|!|\?", " ", new_text)
    return new_text

In [10]:
def lyricFormat(string):
    """Turn the scraped lyrics container into one '.'-delimited string.

    ``string`` is passed through ``str()`` first, so it may be the result
    list returned by ``findAll`` or a plain string. The wrapper markup is
    removed and every ``</p>`` / ``<br/>`` becomes a '.', so each lyric
    line ends with a period.

    Returns the cleaned text without surrounding whitespace and without a
    final '.'; returns an empty string when nothing is left after cleaning.
    """
    lyric = str(string)
    # Opening bracket of the list repr together with the lyrics wrapper div.
    lyric = lyric.replace('[<div class="cnt-letra p402_premium"> <p>','')
    # Note: this drops every ']' in the text, not only the closing one of the list repr.
    lyric = lyric.replace("]","")
    lyric = lyric.replace('<p>','')
    lyric = lyric.replace('</p>','.')
    lyric = lyric.replace('<br/>','.')
    lyric = lyric.replace('</div>','')
    lyric = lyric.strip()
    # endswith() is safe on an empty string, whereas lyric[-1] raised IndexError.
    if lyric.endswith('.'):
        lyric = lyric[:-1]
    return lyric

In [11]:
# Accumulators for the dataset: one independent list per column, plus the
# list of links that could not be processed.
lang, exib, genero, titulo, artista, letras, bad_links_ref = ([] for _ in range(7))

In [12]:
# Download every song page and extract language, view count, genre, title,
# artist and lyrics.
# The request headers never change, so they are built once outside the loop.
agent = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0'}
for ref in tqdm(music_links, desc= 'Baixando letras: '):
    try:
        page = requests.get(ref, headers=agent)
        page_soup = soup(page.content, "lxml")
        # Metadata (language) is read from the "head_scripts_vars" script tag:
        # the text after the last '=' of its first ';'-terminated statement
        # is parsed as JSON.
        metadata = page_soup.find("script", id="head_scripts_vars")
        metadata = str(metadata).replace('\n','').strip()
        metadata = json.loads(metadata.split(';')[0].split('=')[-1])
        lyric_lang = metadata["lyric_lang"]
        # View/access counter: dots are removed before taking the first run
        # of digits (presumably thousands separators - verify against the page).
        cnt_info_exib = page_soup.findAll("div",{"class":"cnt-info_exib"})[0].text
        cnt_info_exib = cnt_info_exib.replace(".","")
        cnt_info_exib = int(re.findall(r"\d+", cnt_info_exib)[0])
        # Genre, artist and title are taken by position among the
        # itemprop="name" spans (NOTE(review): positions depend on the page
        # markup - confirm if the site layout changes).
        song_info = page_soup.findAll("span",{"itemprop":"name"})
        genero_musica = song_info[1].text
        titulo_musica = song_info[3].text
        artista_musica = song_info[2].text
        lyrics_container = (page_soup.findAll("div",{"class":"cnt-letra p402_premium"}))
        letras_musica = lyricFormat(lyrics_container)
        letras_musica = remove_punctuation(letras_musica)
    except Exception as err:
        # Best-effort scraping: record the failing link and move on.
        # Catching Exception (not a bare except) lets Ctrl-C still stop the
        # loop, and the cause is reported instead of being discarded.
        print('Fail: {} ({!r})'.format(ref, err))
        bad_links_ref.append(ref)
    else:
        # Append only after every field was extracted, so all the column
        # lists keep the same length.
        lang.append(lyric_lang)
        genero.append(genero_musica)
        exib.append(cnt_info_exib)
        titulo.append(titulo_musica)
        artista.append(artista_musica)
        letras.append(letras_musica)
print("Número de links com falha: "+str(len(bad_links_ref)))

Baixando letras: 100%|██████████| 100/100 [00:38<00:00,  2.62it/s]

Número de links com falha: 0





## 1.3 Exportar Dataset (CSV)

In [13]:
# Assemble the scraped column lists into one DataFrame, keeping the
# column order lang, genero, exib, titulo, artista, letras.
Dataset = pd.DataFrame(dict(
    lang=lang,
    genero=genero,
    exib=exib,
    titulo=titulo,
    artista=artista,
    letras=letras,
))

In [14]:
# Preview the first five rows of the assembled dataset.
Dataset.head(5)

Unnamed: 0,lang,genero,exib,titulo,artista,letras
0,en,Rock and Roll,2822230,Livin' On a Prayer,Bon Jovi,(Once upon a time not so long ago).Tommy used ...
1,en,Rock and Roll,3612178,Always,Bon Jovi,This Romeo is bleeding.But you can't see his b...
2,en,Rock and Roll,3995342,In The End,Linkin Park,(It starts with one).One thing I don't know wh...
3,en,Rock and Roll,2213512,Have You Ever Seen The Rain?,Creedence Clearwater Revival,Someone told me long ago.There's a calm before...
4,pt,Rock and Roll,3586479,Exagerado,Cazuza,Amor da minha vida.Daqui até a eternidade.Noss...


In [15]:
# List the distinct values of the 'genero' column.
Dataset.genero.unique()

array(['Rock and Roll'], dtype=object)

In [16]:
# Save the dataset to CSV. No index=False is passed here, so the row index is
# written as an extra unnamed first column.
Dataset.to_csv('datasetLetras.csv')

## Adicionais
## 1.4 Filtrando por lang (CSV)

In [None]:
# Keep only the rows whose 'lang' column is "en".
dataset_en = Dataset[Dataset["lang"]=="en"]

In [None]:
def count_types(df,atype):
    """Print how many rows of ``df`` hold each distinct value of column ``atype``.

    One line is printed per distinct value, formatted as "<value> <count>",
    in order of first appearance in the column. Nothing is returned.
    """
    column = df[atype]
    # unique() keeps first-appearance order, so the output is the same on
    # every run (iterating a set of strings is not).
    for t in column.unique():
        # format() also accepts non-string labels, where `t + " "` raised TypeError.
        print("{} {}".format(t, int((column == t).sum())))

In [None]:
# Print the number of rows per genre in the English-only subset.
count_types(dataset_en,"genero")

In [None]:
def balanceData(df,fieldName,value):
    """Return a copy of ``df`` holding at most ``value`` rows per distinct ``fieldName`` value.

    For every distinct value of column ``fieldName`` (in order of first
    appearance) the first ``value`` matching rows are kept. The groups are
    concatenated and the result gets a fresh 0..n-1 index; the columns are
    unchanged.
    """
    subDataframes = []
    # unique() gives a stable first-appearance order; iterating a set made
    # the row order of the result vary between runs.
    for t in df[fieldName].unique():
        subDataframes.append(df[df[fieldName]==t][0:value])
    if not subDataframes:
        # pd.concat raises on an empty list; an empty input yields an empty result.
        return df.iloc[0:0].reset_index(drop=True)
    data = pd.concat(subDataframes)
    # Rebuild the index of the new table, discarding the old one. drop=True
    # avoids the temporary 'index' column, so this also works when the index
    # is named or the frame already has an 'index' column.
    data = data.reset_index(drop=True)
    return data

In [None]:
# Read the dataset
def read_data(path, sep):
    """Load a delimited text file and discard pandas' auto-named columns.

    ``path`` and ``sep`` are forwarded to ``pd.read_csv``. Any column whose
    name matches ``^Unnamed`` is dropped from the returned DataFrame.
    """
    frame = pd.read_csv(path, sep=sep)
    is_unnamed = frame.columns.str.contains('^Unnamed')
    return frame.loc[:, ~is_unnamed]

In [None]:
# Load the comma-separated subgenre dataset through read_data.
df = read_data('datasetSubgenresRockPop.csv',',')

In [None]:
# Keep only the rows whose 'lang' column is "en".
df = df[df["lang"]=="en"]

In [None]:
# List the distinct genres left after the language filter.
df.genero.unique()

In [None]:
# Boolean mask selecting the rows whose genre is one of the chosen subgenres.
# isin() replaces the chain of ==/| comparisons with a single membership test.
selecao = df['genero'].isin([
    'Tecnopop',
    'Pop Rock',
    'Power-Pop',
    'Heavy Metal',
    'Punk Rock',
    'Soft Rock',
])

In [None]:
# Apply the boolean mask, keeping only the selected genres.
df = df[selecao]

In [None]:
# Print the number of rows per genre after the selection.
count_types(df,'genero')

In [None]:
# Cap every genre at 100 rows.
df = balanceData(df,'genero',100)

In [None]:
# Save the balanced subset to CSV. No index=False is passed, so the row index
# is written as an extra unnamed first column.
df.to_csv('EnRockPop100.csv')