In [1]:
import pandas as pd

In [2]:
# Load the combined events table from the raw-data folder (path is relative to the notebook's working directory).
data = pd.read_csv('../data/raw/combined_df.csv')

In [3]:
# Work on a copy so the frame as loaded stays available, unmodified, in `data`.
df = data.copy()

In [4]:
# Summarise columns, non-null counts and dtypes to see which fields are sparsely populated.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 315 entries, 0 to 314
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   name             315 non-null    object 
 1   url              315 non-null    object 
 2   start_time       252 non-null    object 
 3   start_date       315 non-null    object 
 4   end_time         20 non-null     object 
 5   end_date         315 non-null    object 
 6   duration         20 non-null     object 
 7   venue_name       298 non-null    object 
 8   venue_address    294 non-null    object 
 9   longitude        298 non-null    float64
 10  latitude         298 non-null    float64
 11  summary          224 non-null    object 
 12  is_online_event  20 non-null     object 
 13  tickets_url      315 non-null    object 
 14  tags             315 non-null    object 
 15  event_id         315 non-null    int64  
 16  data_source      315 non-null    object 
 17  image           

In [5]:
# Raise the per-column display width to 500 characters so long cell values are not cut off in outputs.
pd.set_option('display.max_colwidth', 500)

In [6]:
import pandas as pd
import ast

# Step 1: Make sure 'tags' are proper lists (not strings)
def safe_parse_list(tag_str):
    """Parse a stringified Python literal (e.g. "['a', 'b']") into its value.

    Parameters
    ----------
    tag_str : object
        A cell from the 'tags' column. Strings are parsed with
        ``ast.literal_eval``; any other value is returned unchanged.

    Returns
    -------
    object
        The parsed literal for a string input, ``[]`` when the string is not
        a valid literal, or ``tag_str`` itself when it is not a string.
    """
    if not isinstance(tag_str, str):
        return tag_str
    try:
        return ast.literal_eval(tag_str)
    # Catch only the errors literal_eval documents for malformed input; a bare
    # ``except:`` would also swallow KeyboardInterrupt / SystemExit and hide
    # unrelated bugs.
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return []

# Convert every 'tags' cell from its stringified form into a real list.
df['tags'] = df['tags'].apply(safe_parse_list)

# Step 2: Keep only the rows that came from Eventbrite
eventbrite_df = df[df['data_source'] == 'eventbrite']

# Step 3: Collect every whitespace-stripped tag from those rows into one flat list
all_tags = [
    tag.strip()
    for tag_list in eventbrite_df['tags']
    for tag in tag_list
]

# Step 4: Drop duplicates and sort alphabetically
unique_tags = sorted(set(all_tags))

In [7]:
# English -> Spanish tag translations, written as explicit pairs so each tag
# sits next to its translation.
translation_dict = {
    'Audio': 'Audio',
    'Baby': 'Bebé',
    'Babygroup': 'Grupo de bebés',
    'Bookshop': 'Librería',
    'Children': 'Niños',
    'Children & Youth': 'Niños y jóvenes',
    'Childrenandfamilies': 'Niños y familias',
    'Childrens': 'Niños',
    'Class, Training, or Workshop': 'Clase, formación o taller',
    'Community': 'Comunidad',
    'Concert or Performance': 'Concierto o actuación',
    'Conference': 'Conferencia',
    'Event': 'Evento',
    'Family': 'Familia',
    'Family & Education': 'Familia y educación',
    'Familyworkshop': 'Taller familiar',
    'Gathering': 'Reunión',
    'Humour': 'Humor',
    'Kids': 'Niños',
    'Lettering': 'Lettering',
    'Parenting': 'Crianza',
    'Podcast': 'Podcast',
    'Radio': 'Radio',
    'Standup': 'Monólogo',
    'Standupcomedy': 'Comedia en Barcelona',
    'Storyteller': 'Narrador',
    'Storytelling': 'Narración de cuentos',
    'Workshop': 'Taller',
    'kids_events': 'eventos_infantiles',
}

# Parallel list views of the table, in the same order as the dict entries.
english_tags = list(translation_dict)
spanish_tags = list(translation_dict.values())

import ast

# Ensure 'tags' holds real lists. Reuse safe_parse_list (defined in an earlier
# cell) instead of a bare ast.literal_eval, so a malformed string yields []
# rather than aborting the cell, and the parsing rule lives in one place.
df['tags'] = df['tags'].apply(safe_parse_list)

# Function to translate individual tag lists
def translate_tag_list(tag_list, mapping=None):
    """Translate each tag in ``tag_list``, leaving unknown tags as they are.

    Parameters
    ----------
    tag_list : iterable of str
        Tags to translate; each one is whitespace-stripped before lookup.
    mapping : dict, optional
        Source-tag -> translated-tag lookup. Defaults to the notebook-level
        ``translation_dict``, so existing ``.apply(translate_tag_list)``
        calls keep working unchanged.

    Returns
    -------
    list of str
        One entry per input tag: the mapped value when the stripped tag is a
        key of ``mapping``, otherwise the stripped tag itself.
    """
    if mapping is None:
        mapping = translation_dict
    # Strip once per tag, then use the cleaned value as both key and fallback.
    cleaned = (tag.strip() for tag in tag_list)
    return [mapping.get(tag, tag) for tag in cleaned]

# Translate tags on Eventbrite rows only; every other row keeps its tags as-is.
is_eventbrite = df['data_source'] == 'eventbrite'
df.loc[is_eventbrite, 'tags'] = df.loc[is_eventbrite, 'tags'].apply(translate_tag_list)


In [8]:
# Sanity check: the tag translation should leave the row and column counts unchanged.
df.shape

(315, 18)

In [None]:
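# Export of the frame with translated tags is currently disabled (commented out).
# NOTE(review): DataFrame.to_csv writes the index as an extra column by default;
# consider index=False if this line is re-enabled.
# df.to_csv('../data/raw/combined_df_with_tags_spanish.csv')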

In [15]:
# Step 1: Make sure every 'tags' cell is a real list. Reuse safe_parse_list
# (defined in an earlier cell) instead of a bare ast.literal_eval, so a
# malformed string yields [] rather than aborting the cell.
df['tags'] = df['tags'].apply(safe_parse_list)

# Step 2: Flatten the whitespace-stripped tags from all rows (every data source)
all_tags = [tag.strip() for tag_list in df['tags'] for tag in tag_list]

# Step 3: Drop duplicates and sort alphabetically
unique_tags = sorted(set(all_tags))

# Step 4: Wrap the unique tags in a single-column DataFrame
unique_tags_df = pd.DataFrame(unique_tags)

In [16]:
# Display the sorted, de-duplicated tag list across all rows for a visual check.
unique_tags

['Actividades',
 'Alicante',
 'Arte',
 'Audio',
 'Barcelona',
 'Bebé',
 'Clase, formación o taller',
 'Comedia en Barcelona',
 'Comunicación',
 'Comunidad',
 'Concierto o actuación',
 'Conferencia',
 'Conferencias',
 'Consciência',
 'Crianza',
 'Cuentacuentos',
 'Cuentos',
 'Curso',
 'Educacion',
 'Estimulacion',
 'Evento',
 'Familia',
 'Familia y educación',
 'Familiar',
 'Familias',
 'Floral',
 'Gratis',
 'Grupo de bebés',
 'Humor',
 'Jovenes,Niños',
 'Jovenes,Niños,Familias',
 'Juego',
 'Juegos',
 'Lectura',
 'Lettering',
 'Libreria',
 'Librería',
 'Locutores',
 'Madrid',
 'Mama',
 'Maternidad',
 'Microfono',
 'Montessori',
 'Monólogo',
 'Movimiento',
 'Narración de cuentos',
 'Narrador',
 'Niños',
 'Niños y familias',
 'Niños y jóvenes',
 'Niños,Familias',
 'Ocio',
 'Octubre',
 'Podcast',
 'Radio',
 'Reunión',
 'Silvestre',
 'Taller',
 'Taller familiar',
 'Teatro',
 'abecedario',
 'affaires_de_famille',
 'barcelona_comedy',
 'certe_mathurin',
 'colmena',
 'dibujamos',
 'eventos_inf