In [1]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from joblib import Parallel, delayed
import time
import pyarrow
import warnings
warnings.filterwarnings('ignore')  

In [2]:
# Chrome driver configuration

opciones = Options()

opciones.headless = False    # if True the browser window is not shown (headless = not visible)

# Command-line flags handed to Chrome.
for flag in ('--start-maximized',      # start with a maximized window
             '--incognito'):
    opciones.add_argument(flag)
# opciones.add_argument('Cookies')    # keeps the cookies
# opciones.add_extension('./Ad_Blocker/adblock.crx')       # adblocker

# Drop the 'enable-automation' switch and turn the automation extension off.
opciones.add_experimental_option('excludeSwitches', ['enable-automation'])
opciones.add_experimental_option('useAutomationExtension', False)

In [3]:
# Build one festival-listing URL per country code

url = 'https://www.songkick.com/es/festivals/countries/'

countries = 'es us uk au de ca br id nl fr tr mx se it ar ie'.split()

url_countries = [f'{url}{code}' for code in countries]

In [4]:
PATH = ChromeDriverManager().install()     # installs the Chrome driver; the result is passed to webdriver.Chrome(...) by the scraping functions

In [5]:
def fest_country(url):
    """Scrape one Songkick country listing and return its festivals.

    Opens a dedicated Chrome session, loads ``url``, tries to dismiss the
    cookie banner and collects every link matched by
    ``//p[@class="artists summary"]//a``.

    Args:
        url: Address of a Songkick "festivals by country" page.

    Returns:
        pandas.DataFrame with columns ``Festival`` (first line of the link
        text) and ``url`` (the link's ``href``). The frame is empty, but keeps
        both columns, when no link is found.
    """
    driver = webdriver.Chrome(PATH, options=opciones)
    try:
        driver.get(url)
        time.sleep(5)  # fixed pause after navigation, presumably to let the page render

        try:
            driver.find_element(By.ID, 'onetrust-accept-btn-handler').click()  # accept cookies
        except Exception:
            # Best effort: the banner may be absent or not clickable.
            pass
        time.sleep(4)

        fest_list = [
            {'Festival': fest.text.split('\n')[0], 'url': fest.get_property('href')}
            for fest in driver.find_elements(By.XPATH, '//p[@class="artists summary"]//a')
        ]
    finally:
        # Always close the browser; otherwise each call leaves a Chrome
        # process running in the worker.
        driver.quit()

    return pd.DataFrame(fest_list, columns=['Festival', 'url'])

In [6]:
# Shared joblib runner (6 concurrent workers, progress messages on); reused by the later scraping cells.
paralelo = Parallel(n_jobs=6, verbose=True)

# Run fest_country once per country URL; the result is a list with one DataFrame per URL.
lst_df_fest = paralelo(delayed(fest_country)(url) for url in url_countries)

[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  16 out of  16 | elapsed:  3.7min finished


In [7]:
# Stack the per-country frames into a single table with a fresh 0..n-1 index.
df_fest = pd.concat(lst_df_fest, ignore_index=True)
df_fest.tail()

Unnamed: 0,Festival,url
385,INDIE-PENDANCE FESTIVAL 2023,https://www.songkick.com/es/festivals/50496-in...
386,Indiependence Festival 2023,https://www.songkick.com/es/festivals/137571-i...
387,All together Now 2023,https://www.songkick.com/es/festivals/2236829-...
388,Ballaghdream Arts festival 2023,https://www.songkick.com/es/festivals/3540758-...
389,Playing Fields Festivals 2023,https://www.songkick.com/es/festivals/3551033-...


In [8]:
# Load the festival names stored in ../data/Dod_fest.csv (its 'Festival' column feeds the Songkick search below).
dod_fest = pd.read_csv('../data/Dod_fest.csv')
dod_fest

Unnamed: 0,Festival
0,WAN 2023 Madrid
1,Inverfest 2023
2,Actual Festival 2023
3,Microsonidos 2023
4,Horteralia 2023
...,...
255,Primavera Sound Buenos Aires 2023
256,Primavera Sound Santiago de Chile 2023
257,Lollapalooza Chile 2023
258,Lollapalooza Argentina 2023


In [13]:
# Songkick search endpoint; each festival name is appended as the query with spaces turned into '+'.
url = 'https://www.songkick.com/es/search?utf8=%E2%9C%93&query='
dod_url = [f"{url}{name.replace(' ', '+')}" for name in dod_fest['Festival']]

In [14]:
def check_dod_fest(url):
    """Search Songkick for one festival and return the link of its first hit.

    Args:
        url: Search URL ending in ``query=<name>`` where spaces in the name
            were replaced by ``+``.

    Returns:
        dict with keys ``'Festival'`` (name recovered from the query string)
        and ``'url'``. ``'url'`` is ``None`` when the page reports no results,
        when the first result is labelled ``FESTIVAL PASADO``, or when no link
        can be read. Both values are ``None`` if the page could not be loaded.
    """
    driver = webdriver.Chrome(PATH, options=opciones)
    try:
        try:
            driver.get(url)
            time.sleep(4)
        except Exception:
            return {'Festival': None, 'url': None}

        temp = {'Festival': url.split('query=')[-1].replace('+', ' '), 'url': None}

        try:
            driver.find_element(By.ID, 'onetrust-accept-btn-handler').click()  # accept cookies
        except Exception:
            # Best effort: the banner may be absent or not clickable.
            pass

        # find_elements returns an empty list when nothing matches, whereas
        # find_element raises, so comparing its result to [] never detected
        # the "no results" page.
        try:
            no_results = driver.find_elements(By.XPATH, '//div[@class="no-results"]')
        except Exception:
            no_results = []
        if no_results:
            return temp  # Songkick has no entry for this festival

        try:
            first_label = driver.find_element(
                By.XPATH,
                '/html/body/div[4]/div/div/div[2]/div/ul/li[1]/div[1]/span').text
        except Exception:
            first_label = None
        if first_label == 'FESTIVAL PASADO':
            return temp

        xpath = '/html/body/div[4]/div/div/div[2]/div/ul/li/div[1]'

        try:
            # NOTE(review): an XPath starting with '//' is evaluated against the
            # whole document even when called on an element; use './/' if the
            # lookup was meant to be scoped to the first result.
            temp['url'] = (driver.find_element(By.XPATH, xpath)
                           .find_element(By.XPATH, '//p[@class="summary"]//a')
                           .get_property('href'))  # url if it exists
        except Exception:
            temp['url'] = None
        return temp
    finally:
        # Always close the browser, on every return path, so no Chrome
        # process is left behind in the worker.
        driver.quit()
   

In [15]:
# Run check_dod_fest once per search URL with the shared joblib runner; yields one dict per URL.
lst_dod_fest = paralelo(delayed(check_dod_fest)(url) for url in dod_url)

[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:  2.7min
[Parallel(n_jobs=6)]: Done 188 tasks      | elapsed: 16.2min
[Parallel(n_jobs=6)]: Done 260 out of 260 | elapsed: 22.2min finished


In [16]:
pd.DataFrame(lst_dod_fest).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260 entries, 0 to 259
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Festival  260 non-null    object
 1   url       128 non-null    object
dtypes: object(2)
memory usage: 4.2+ KB


In [17]:
# Rebind dod_fest to the search results (one row per dict in lst_dod_fest).
dod_fest = pd.DataFrame(lst_dod_fest)
dod_fest.head()

Unnamed: 0,Festival,url
0,WAN 2023 Madrid,
1,Inverfest 2023,
2,Actual Festival 2023,
3,Microsonidos 2023,https://www.songkick.com/es/concerts/40648056-...
4,Horteralia 2023,


In [18]:
# Keep only the festivals for which a Songkick link was found.
dod_fest = dod_fest.dropna(subset=['url'])

In [19]:
# Rows of the stacked table that repeat an earlier row.
pd.concat([df_fest, dod_fest], axis=0).loc[lambda stacked: stacked.duplicated()]

Unnamed: 0,Festival,url
20,Mallorca Live Festival 2023,https://www.songkick.com/es/festivals/1520684-...
37,Les Nits de Barcelona 2023,https://www.songkick.com/es/festivals/3547261-...
43,Azkena Rock Festival 2023,https://www.songkick.com/es/festivals/1515-azk...
64,Resurrection Fest 2023,https://www.songkick.com/es/festivals/71471-re...
65,Rock Imperium Festival 2023,https://www.songkick.com/es/festivals/3373532-...
92,Bilbao BBK Live 2023,https://www.songkick.com/es/festivals/6331-bil...
99,Mad Cool Festival 2023,https://www.songkick.com/es/festivals/1481913-...
223,Leeds Festival 2023,https://www.songkick.com/es/festivals/58-leeds...
228,Lollapalooza 2023,https://www.songkick.com/es/festivals/1235-lol...
237,Rock am Ring 2023,https://www.songkick.com/es/festivals/1237-roc...


In [20]:
# Labels of the dod_fest rows whose (Festival, url) pair already appeared earlier
# in the stacked table. Both frames carry overlapping integer labels, so the
# flags are sliced by position to the rows that came from dod_fest; that way a
# repeat inside df_fest can never contribute a label that is then applied to dod_fest.
index_dup = (
    pd.concat([df_fest, dod_fest], axis=0)[['Festival', 'url']]
    .duplicated()
    .iloc[len(df_fest):]
    .loc[lambda flags: flags]
    .index
)

In [21]:
dod_fest.loc[index_dup]

Unnamed: 0,Festival,url
20,Mallorca Live Festival 2023,https://www.songkick.com/es/festivals/1520684-...
37,Les Nits de Barcelona 2023,https://www.songkick.com/es/festivals/3547261-...
43,Azkena Rock Festival 2023,https://www.songkick.com/es/festivals/1515-azk...
64,Resurrection Fest 2023,https://www.songkick.com/es/festivals/71471-re...
65,Rock Imperium Festival 2023,https://www.songkick.com/es/festivals/3373532-...
92,Bilbao BBK Live 2023,https://www.songkick.com/es/festivals/6331-bil...
99,Mad Cool Festival 2023,https://www.songkick.com/es/festivals/1481913-...
223,Leeds Festival 2023,https://www.songkick.com/es/festivals/58-leeds...
228,Lollapalooza 2023,https://www.songkick.com/es/festivals/1235-lol...
237,Rock am Ring 2023,https://www.songkick.com/es/festivals/1237-roc...


In [22]:
# Remove the rows flagged in index_dup, discard rows with missing values and renumber.
dod_fest = (
    dod_fest
    .drop(index=index_dup)
    .dropna()
    .reset_index(drop=True)
)
dod_fest.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118 entries, 0 to 117
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Festival  118 non-null    object
 1   url       118 non-null    object
dtypes: object(2)
memory usage: 2.0+ KB


In [24]:
# Append the extra festivals to the country listing and renumber the rows.
df_fest = pd.concat([df_fest, dod_fest], ignore_index=True)
df_fest

Unnamed: 0,Festival,url
0,Mallorca Live Festival 2023,https://www.songkick.com/es/festivals/1520684-...
1,Primavera a la Ciutat 2023,https://www.songkick.com/es/festivals/3452756-...
2,Primavera In The City - Madrid 2023,https://www.songkick.com/es/festivals/3500869-...
3,Primavera Sound Festival 2023,https://www.songkick.com/es/festivals/618-prim...
4,Primavera Pack 2023,https://www.songkick.com/es/festivals/3495900-...
...,...,...
503,NOS Alive 2023,https://www.songkick.com/es/festivals/1192678-...
504,Primavera Sound São Paulo 2023,https://www.songkick.com/es/festivals/3439946-...
505,Sziget Festival 2023,https://www.songkick.com/es/festivals/608-szig...
506,Corona Capital 2023,https://www.songkick.com/es/festivals/106236-c...


In [25]:
def fest_detail(url):
    """Scrape line-up, date and venue details from one festival page.

    Args:
        url: Address of the festival page to open.

    Returns:
        dict with keys ``'Cartel'`` (list with the text of every
        ``//ul[@class="festival"]//li`` item), ``'Date'``, ``'Location'`` and
        ``'info'`` (text of the matching elements). A field is ``None`` when
        it cannot be read; all four are ``None`` when the browser cannot be
        started or the page cannot be loaded.
    """
    empty = {'Cartel': None, 'Date': None, 'Location': None, 'info': None}

    try:
        driver = webdriver.Chrome(PATH, options=opciones)
    except Exception:
        return empty

    def text_or_none(xpath):
        # Text of the first element matching `xpath`, or None if the lookup fails.
        try:
            return driver.find_element(By.XPATH, xpath).text
        except Exception:
            return None

    try:
        try:
            driver.get(url)
            time.sleep(4)
        except Exception:
            return empty

        try:
            driver.find_element(By.ID,
                                'onetrust-accept-btn-handler').click()  # accept cookies
        except Exception:
            # Best effort: the banner may be absent or not clickable.
            pass
        time.sleep(2)

        try:
            # List of the festival's artists
            cart = [artist.text
                    for artist in driver.find_elements(By.XPATH,
                                                       '//ul[@class="festival"]//li')]
        except Exception:
            cart = None

        date = text_or_none('//div[@class="date-and-name"]//p')        # festival date
        location = text_or_none('//div[@class="venue-info-details"]//a')  # venue name
        info = text_or_none(
            '//div[@class="venue-info-details"]//p[@class="venue-hcard"]')  # venue details

        return {'Cartel': cart, 'Date': date, 'Location': location, 'info': info}
    finally:
        # Always close the browser, on every return path, so no Chrome
        # process is left behind in the worker.
        driver.quit()

In [26]:
# Run fest_detail once per festival URL with the shared joblib runner; yields one dict per row of df_fest.
lst_new_cols = paralelo(delayed(fest_detail)(url) for url in df_fest.url)

[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:  5.8min
[Parallel(n_jobs=6)]: Done 188 tasks      | elapsed: 26.9min
[Parallel(n_jobs=6)]: Done 438 tasks      | elapsed: 60.9min
[Parallel(n_jobs=6)]: Done 508 out of 508 | elapsed: 69.6min finished


In [27]:
# Attach the scraped detail columns side by side (axis=1 aligns the two frames on their index).
# NOTE(review): re-running this cell appends the same columns again.
df_fest = pd.concat([df_fest,pd.DataFrame(lst_new_cols)],axis=1)
df_fest.head()

Unnamed: 0,Festival,url,Cartel,Date,Location,info
0,Mallorca Live Festival 2023,https://www.songkick.com/es/festivals/1520684-...,"[Black Eyed Peas, Chemical Brothers, The Dandy...",jueves 18 mayo 2023 – sábado 20 mayo 2023,Antic Aquapark,"Camí Cala Figuera, 1\n07181\nCalvia, Spain\nww..."
1,Primavera a la Ciutat 2023,https://www.songkick.com/es/festivals/3452756-...,"[Jake Bugg, Pet Shop Boys, Confidence Man, La ...",martes 30 mayo 2023 – miércoles 31 mayo 2023,Sala Apolo,"C/ Nou De La Rambla 113\n08004\nBarcelona, Spa..."
2,Primavera In The City - Madrid 2023,https://www.songkick.com/es/festivals/3500869-...,"[Bleachers, Julia Jacklin, PUP, Black Country,...",lunes 05 junio 2023 – domingo 11 junio 2023,,
3,Primavera Sound Festival 2023,https://www.songkick.com/es/festivals/618-prim...,"[Kendrick Lamar, Calvin Harris, Skrillex, Hals...",jueves 08 junio 2023 – domingo 11 junio 2023,Ciudad del Rock,"Autovía A3: Sentido Valencia Salida 33A, 35\n2..."
4,Primavera Pack 2023,https://www.songkick.com/es/festivals/3495900-...,"[Kendrick Lamar, Calvin Harris, Skrillex, Hals...",jueves 08 junio 2023 – sábado 10 junio 2023,Ciudad del Rock,"Autovía A3: Sentido Valencia Salida 33A, 35\n2..."


In [28]:
# Persist the raw scrape (index not written) so the cleaning steps below can start from this file.
df_fest.to_parquet('../data/df_fest_raw.parquet',index=False,engine='pyarrow')

In [29]:
df_fest.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 508 entries, 0 to 507
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Festival  508 non-null    object
 1   url       508 non-null    object
 2   Cartel    508 non-null    object
 3   Date      508 non-null    object
 4   Location  433 non-null    object
 5   info      433 non-null    object
dtypes: object(6)
memory usage: 23.9+ KB


In [30]:
# Festivals whose line-up is not available on songkick.com

(
    df_fest
    .loc[lambda d: d['Cartel'].notna()]
    .loc[lambda d: d['Cartel'].apply(len) == 0]
)

Unnamed: 0,Festival,url,Cartel,Date,Location,info
98,P!NK: Summer Carnival 2024,https://www.songkick.com/es/festivals/3487862-...,[],viernes 23 febrero 2024,,
99,P!NK: Summer Carnival 2024,https://www.songkick.com/es/festivals/3487862-...,[],viernes 01 marzo 2024,Optus Stadium,"333 Victoria Park Drive\n6100\nBurswood, WA, A..."
121,Damage Done Fest 2023,https://www.songkick.com/es/festivals/3343435-...,[],sábado 26 agosto 2023,Uferpark,"Schlengendeich 21\n21107\nHamburg, Germany"
263,Konya Müzik Festivali 2023,https://www.songkick.com/es/festivals/3537027-...,[],jueves 22 junio 2023 – domingo 25 junio 2023,,
309,Lollapalooza 2023,https://www.songkick.com/es/festivals/1235-lol...,[],miércoles 28 junio 2023 – sábado 01 julio 2023,Gärdet,"Valhallavägen\n10252\nStockholm, Sweden"
372,Doolin Folk Festival Early Bird 2023,https://www.songkick.com/es/festivals/3456578-...,[],viernes 09 junio 2023,Hotel Doolin,"Clare, Ireland"
381,Otherside Campervan(Must Hold Valid Weekend Fe...,https://www.songkick.com/es/festivals/3514075-...,[],viernes 07 julio 2023 – domingo 09 julio 2023,Rock Farm,"C15 FNP4\nSlane, Ireland"
390,Microsonidos 2023,https://www.songkick.com/es/concerts/40648056-...,[],viernes 10 febrero 2023,Microsonidos,"Murcia, Spain\nwww.12ymedio.com/"
392,MUTEK Barcelona 2023,https://www.songkick.com/es/concerts/40977295-...,[],viernes 17 marzo 2023,MUTEK Barcelona @Bridge_48,"Bridge_48 Carrer Llull, 48\n08005\nBarcelona, ..."
393,Ressons Penedès 2023,https://www.songkick.com/es/concerts/40927420-...,[],domingo 30 abril 2023,Elyssia by Freixenet,"Partida, Carrer de la Torre del Gall, s/n. San..."


In [31]:
# Index labels of the rows whose line-up list is present but empty.
index_empty_cartel = (
    df_fest
    .loc[lambda d: d['Cartel'].notna()]
    .loc[lambda d: d['Cartel'].apply(len) == 0]
    .index
)

In [34]:
# Drop the rows whose line-up we cannot access

df_fest = df_fest.drop(index=index_empty_cartel)


In [35]:
# Renumber the rows 0..n-1 after the drop; the old labels are discarded.
df_fest = df_fest.reset_index(drop=True)

In [36]:
# Solo nos quedan nulos en Location e info
df_fest.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 480 entries, 0 to 479
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Festival  480 non-null    object
 1   url       480 non-null    object
 2   Cartel    480 non-null    object
 3   Date      480 non-null    object
 4   Location  409 non-null    object
 5   info      409 non-null    object
dtypes: object(6)
memory usage: 22.6+ KB


In [37]:
df_fest[(df_fest['info'].isna())|(df_fest['Location'].isna())]

Unnamed: 0,Festival,url,Cartel,Date,Location,info
2,Primavera In The City - Madrid 2023,https://www.songkick.com/es/festivals/3500869-...,"[Bleachers, Julia Jacklin, PUP, Black Country,...",lunes 05 junio 2023 – domingo 11 junio 2023,,
7,Sonar 2023,https://www.songkick.com/es/festivals/1535-son...,"[Fever Ray, Bad Gyal, Richie Hawtin, Charlotte...",jueves 15 junio 2023 – sábado 17 junio 2023,,
13,Vida Festival 2023,https://www.songkick.com/es/festivals/739019-v...,"[The Libertines, AURORA, Julieta Venegas, Sued...",jueves 29 junio 2023 – sábado 01 julio 2023,,
16,Brunch In the Park - Barcelona 2023,https://www.songkick.com/es/festivals/3466699-...,[Fatboy Slim],domingo 02 julio 2023,,
22,Dreambeach 2023,https://www.songkick.com/es/festivals/697694-d...,"[Armin van Buuren, The Prodigy, Carl Cox, Apas...",miércoles 09 agosto 2023 – domingo 13 agosto 2023,,
...,...,...,...,...,...,...
455,Caudal Fest 2023,https://www.songkick.com/es/festivals/2620959-...,"[Vetusta Morla, Leiva, La La Love You, Veintiu...",viernes 15 septiembre 2023 – sábado 16 septiem...,,
456,Ebrovisión 2023,https://www.songkick.com/es/festivals/358-ebro...,"[Mujeres, Karavana]",jueves 31 agosto 2023 – domingo 03 septiembre ...,,
463,Glastonbury Festival 2023,https://www.songkick.com/es/festivals/585-glas...,"[Arctic Monkeys, Blondie, Rudimental, Lewis Ca...",miércoles 21 junio 2023 – domingo 25 junio 2023,,
468,Rock en Seine 2023,https://www.songkick.com/es/festivals/7031-roc...,"[Billie Eilish, The Strokes, Florence + The Ma...",miércoles 23 agosto 2023 – domingo 27 agosto 2023,,


In [38]:
# Replace the remaining missing values with 'Unknown'

df_fest = df_fest.fillna('Unknown')

In [39]:
df_fest.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 480 entries, 0 to 479
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Festival  480 non-null    object
 1   url       480 non-null    object
 2   Cartel    480 non-null    object
 3   Date      480 non-null    object
 4   Location  480 non-null    object
 5   info      480 non-null    object
dtypes: object(6)
memory usage: 22.6+ KB


Observamos que la estructura de la fecha es la siguiente:
* Si tiene fecha de inicio y fecha de fin, vienen separadas por ' – ' (raya corta o *en dash*, U+2013, rodeada de espacios), no por el guion '-'
* Parece que todas tienen estructura día de semana / día / mes / año.

Procederemos de la siguiente manera:
1. Crearemos columna fecha de inicio y fecha de fin
2. Si no tiene fecha de fin, esta será igual a la fecha de inicio

In [40]:
# Add an empty End_Date column at position 4, then rename Date to Start_Date
df_fest.insert(4, 'End_Date', '')
df_fest = df_fest.rename(columns={'Date': 'Start_Date'})

In [41]:
def split_date(x):
    """Return the end date contained in a scraped date string.

    A date range is written as ``'<start> – <end>'``: the separator is an
    en dash (U+2013) with one space on each side, not an ASCII hyphen and
    not an em dash. The text after the last separator is returned; a string
    without the separator is returned unchanged, so a single-day event gets
    an end date equal to its start date.

    Args:
        x: Date text. Values that are not strings are tolerated.

    Returns:
        The end-date text, or ``x`` itself when it cannot be split.
    """
    try:
        return x.split(' \u2013 ')[-1]
    except (AttributeError, TypeError):
        # Not a str (e.g. None or NaN): hand the value back untouched.
        return x

In [42]:
# Fill End_Date from the still-combined Start_Date text (split_date is applied to every value).
df_fest['End_Date'] = df_fest['Start_Date'].apply(split_date)

In [43]:
df_fest.head()

Unnamed: 0,Festival,url,Cartel,Start_Date,End_Date,Location,info
0,Mallorca Live Festival 2023,https://www.songkick.com/es/festivals/1520684-...,"[Black Eyed Peas, Chemical Brothers, The Dandy...",jueves 18 mayo 2023 – sábado 20 mayo 2023,sábado 20 mayo 2023,Antic Aquapark,"Camí Cala Figuera, 1\n07181\nCalvia, Spain\nww..."
1,Primavera a la Ciutat 2023,https://www.songkick.com/es/festivals/3452756-...,"[Jake Bugg, Pet Shop Boys, Confidence Man, La ...",martes 30 mayo 2023 – miércoles 31 mayo 2023,miércoles 31 mayo 2023,Sala Apolo,"C/ Nou De La Rambla 113\n08004\nBarcelona, Spa..."
2,Primavera In The City - Madrid 2023,https://www.songkick.com/es/festivals/3500869-...,"[Bleachers, Julia Jacklin, PUP, Black Country,...",lunes 05 junio 2023 – domingo 11 junio 2023,domingo 11 junio 2023,Unknown,Unknown
3,Primavera Sound Festival 2023,https://www.songkick.com/es/festivals/618-prim...,"[Kendrick Lamar, Calvin Harris, Skrillex, Hals...",jueves 08 junio 2023 – domingo 11 junio 2023,domingo 11 junio 2023,Ciudad del Rock,"Autovía A3: Sentido Valencia Salida 33A, 35\n2..."
4,Primavera Pack 2023,https://www.songkick.com/es/festivals/3495900-...,"[Kendrick Lamar, Calvin Harris, Skrillex, Hals...",jueves 08 junio 2023 – sábado 10 junio 2023,sábado 10 junio 2023,Ciudad del Rock,"Autovía A3: Sentido Valencia Salida 33A, 35\n2..."


In [44]:
def split_start_date(x):
    """Return the start date contained in a scraped date string.

    A date range is written as ``'<start> – <end>'``: the separator is an
    en dash (U+2013) with one space on each side, not an ASCII hyphen and
    not an em dash. The text before the first separator is returned; a
    string without the separator is returned unchanged.

    Args:
        x: Date text. Values that are not strings are tolerated.

    Returns:
        The start-date text, or ``x`` itself when it cannot be split.
    """
    try:
        return x.split(' \u2013 ')[0]
    except (AttributeError, TypeError):
        # Not a str (e.g. None or NaN): hand the value back untouched.
        return x

In [45]:
# Overwrite Start_Date with only its start part (split_start_date is applied to every value).
df_fest['Start_Date'] = df_fest['Start_Date'].apply(split_start_date)

In [46]:
df_fest.head()

Unnamed: 0,Festival,url,Cartel,Start_Date,End_Date,Location,info
0,Mallorca Live Festival 2023,https://www.songkick.com/es/festivals/1520684-...,"[Black Eyed Peas, Chemical Brothers, The Dandy...",jueves 18 mayo 2023,sábado 20 mayo 2023,Antic Aquapark,"Camí Cala Figuera, 1\n07181\nCalvia, Spain\nww..."
1,Primavera a la Ciutat 2023,https://www.songkick.com/es/festivals/3452756-...,"[Jake Bugg, Pet Shop Boys, Confidence Man, La ...",martes 30 mayo 2023,miércoles 31 mayo 2023,Sala Apolo,"C/ Nou De La Rambla 113\n08004\nBarcelona, Spa..."
2,Primavera In The City - Madrid 2023,https://www.songkick.com/es/festivals/3500869-...,"[Bleachers, Julia Jacklin, PUP, Black Country,...",lunes 05 junio 2023,domingo 11 junio 2023,Unknown,Unknown
3,Primavera Sound Festival 2023,https://www.songkick.com/es/festivals/618-prim...,"[Kendrick Lamar, Calvin Harris, Skrillex, Hals...",jueves 08 junio 2023,domingo 11 junio 2023,Ciudad del Rock,"Autovía A3: Sentido Valencia Salida 33A, 35\n2..."
4,Primavera Pack 2023,https://www.songkick.com/es/festivals/3495900-...,"[Kendrick Lamar, Calvin Harris, Skrillex, Hals...",jueves 08 junio 2023,sábado 10 junio 2023,Ciudad del Rock,"Autovía A3: Sentido Valencia Salida 33A, 35\n2..."


In [47]:
# Persist the cleaned table (index not written).
df_fest.to_parquet('../data/df_fest_clean.parquet',index=False,engine='pyarrow')

In [48]:
# Unique artist names across every line-up. Sorted because the iteration order
# of a set of strings can differ between interpreter runs, which would make the
# exported list change order on every re-run.
grupos = sorted({grupo for cartel in df_fest.Cartel for grupo in cartel})

In [49]:
len (grupos)

5174

In [50]:
# Write the artist list as a single-column CSV ('Grupos'), without the index.
pd.DataFrame({'Grupos':grupos}).to_csv('../data/grupos.csv',index=False)