# AI-Events Extractor

This notebook tests the code that extracts AI-related events from different sources via web scraping. The first source tested is [Unite.AI](https://www.unite.ai/conferences/).

In [3]:
import re
import requests
from bs4 import BeautifulSoup
import pandas as pd
from pandas.tseries.offsets import MonthBegin
from loguru import logger
from datetime import datetime
from openpyxl.worksheet.table import Table, TableStyleInfo
from openpyxl.utils import get_column_letter
import sys

sys.path.append("../.")

from ai_events_pipeline.config import DataSources

In [4]:
# Instantiate the project's data-source configuration; the scraping cell reads
# the conference page URL from its TOP_AI_CONFERENCES attribute.
data_sources = DataSources()

In [5]:
# Step 1: Fetch the page
url = data_sources.TOP_AI_CONFERENCES
# A browser-like User-Agent is sent to avoid receiving incomplete data from the website
headers = {"User-Agent": "Mozilla/5.0"}

logger.info("Sending request to fetch web data")
# An explicit timeout keeps the notebook from hanging forever if the server never answers
response = requests.get(url, headers=headers, timeout=30)

if response.status_code != 200:
    raise ValueError(f"The request to {url} failed. Status code: {response.status_code}, {response.text}")

logger.info("Extracting AI events...")
soup = BeautifulSoup(response.content, "html.parser")


# Step 2: Locate the table body that holds the conference rows
tbody = soup.find("tbody", class_="row-striping")
if tbody is None:
    # soup.find returns None when nothing matches; fail with a readable message
    # instead of an AttributeError on the next line
    raise ValueError(f"No <tbody class='row-striping'> element was found at {url}; the page layout may have changed")
rows = tbody.find_all("tr")

# Step 3: Extract the data (one dict per table row with at least three cells)
conferences = []
for row in rows:
    cols = row.find_all("td")
    if len(cols) >= 3:
        dates = cols[0].get_text(strip=True)
        title_tag = cols[1].find("a")
        # Prefer the anchor text; fall back to the plain cell text when there is no link
        title = title_tag.get_text(strip=True) if title_tag else cols[1].get_text(strip=True)
        link = title_tag["href"].strip() if title_tag and "href" in title_tag.attrs else None
        location = cols[2].get_text(strip=True)

        conferences.append({
            "Title": title,
            "Dates": dates,
            "Location": location,
            "Link": link
        })

if conferences:
    logger.info("Retrieval succeeded")
else:
    # An empty result is a failure condition, so it is logged above INFO level
    logger.warning("Retrieval failed. No data was fetched")

[32m2025-10-05 21:51:49.881[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [1mSending request to fetch web data[0m
[32m2025-10-05 21:51:50.774[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m11[0m - [1mExtracting AI events...[0m
[32m2025-10-05 21:51:50.794[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m38[0m - [1mRetrieval succeeded[0m


In [6]:
# Build the working table from the scraped records: one row per conference,
# with the columns Title, Dates, Location and Link.
ai_events = pd.DataFrame(conferences)

In [7]:
# Preview the last rows to sanity-check the scraped values
ai_events.tail()

Unnamed: 0,Title,Dates,Location,Link
46,European Chatbot & Conversational AI Summit 2026,"March 17 to 19, 2026","Edinburgh, Scotland",https://theeuropeanchatbot.com/
47,Big data and Data science Conference 2026,"April 13 to 14, 2026","Orlando, FL",https://datascience-machinelearning.averconfer...
48,MLcon San Diego,"June 1 to 5, 2026","San Diego, CA",https://mlconference.ai/san-diego/
49,4th Data Science & AI Summit,"June 16 to 17, 2025","London, UK",https://datascience.thepeopleevents.com/
50,2nd International Conference on Artificial Int...,"July 6 to 7, 2026","Singapore, SG",https://artificialintelligence.novelticsconfer...


In [8]:
def format_string_date(date_str: str) -> str:
    """
    Formats a single-day date string to the format 'YYYY-mm-dd'.

    Surrounding and repeated whitespace is ignored, a period right after the
    month ('Oct. 23, 2025') is dropped, and 'Sept' is accepted as an alias of 'Sep'.

    Args:
        date_str: str -> Date string with the format 'October 23, 2025', or 'Oct 23, 2025'

    Returns:
        str -> Date string in the format: '2025-10-23'

    Raises:
        ValueError -> If date_str is not a string or matches none of the supported formats
    """
    if not isinstance(date_str, str):
        raise ValueError("The date is not a string data type")

    # Scraped text may carry stray whitespace; collapse it before parsing
    normalized = " ".join(date_str.split())
    # 'Oct. 23, 2025' -> 'Oct 23, 2025'
    normalized = re.sub(r"^([A-Za-z]+)\.\s*", r"\1 ", normalized)
    # '%b' only knows the three-letter 'Sep'; \b leaves 'September' untouched
    normalized = re.sub(r"^Sept\b", "Sep", normalized, flags=re.IGNORECASE)

    # Try the full month name first, then the abbreviated one
    for fmt in (r"%B %d, %Y", r"%b %d, %Y"):
        try:
            return datetime.strptime(normalized, fmt).strftime(r"%Y-%m-%d")
        except ValueError:
            continue

    raise ValueError(f"Unrecognized date format: '{date_str}'")
        




def get_initial_and_final_dates(raw_date: str) -> tuple[str, str]:
    """
    Extracts the initial and final event's date from formats such as:
            - 'October 23 to 27, 2025'
            - 'Oct 23 to 27, 2025'
            - 'October 23 to November 1, 2025'
            - 'Oct 12-20, 2024' (hyphen, en dash or em dash as separator)
            - 'December 30, 2025 to January 2, 2026' (range crossing a year boundary)

    In case there's a single-day event, it needs to have the format 'October 23, 2025' or 'Oct 23, 2025'

    Args:
        raw_date: str -> Date with one of the formats listed above

    Returns:
        tuple[str, str] -> Initial and final dates, both in the format '%Y-%m-%d'

    Raises:
        ValueError -> If the text matches none of the supported formats
    """
    year_pattern = r"\b\d{4}\b"
    dash_pattern = r"[-–—]"

    if " to " in raw_date:
        parts = raw_date.split(" to ")  # spaces are necessary due to the "to" inside "October"

    elif re.search(dash_pattern, raw_date):  # In case there's dates like "Oct 12-20, 2024"
        parts = re.split(dash_pattern, raw_date)

    elif re.search(r"[A-Za-z]+ \d+, \d{4}", raw_date):  # In case there's only a single-day event
        single_day = format_string_date(raw_date)
        return single_day, single_day

    else:
        raise ValueError(f"Unknown date format: {raw_date}")

    start_part, end_part = parts[0].strip(), parts[1].strip()
    year_match = re.search(year_pattern, raw_date)
    if not start_part or not end_part or year_match is None:
        raise ValueError(f"Unknown date format: {raw_date}")

    # Year borrowed by whichever side of the range does not state its own
    fallback_year = year_match.group(0)

    # Only append the year when the start lacks one; appending it unconditionally
    # turned 'December 30, 2025' into 'December 30, 2025, 2025'
    if re.search(year_pattern, start_part):
        initial_date = start_part
    else:
        initial_date = f"{start_part}, {fallback_year}"

    final_date = end_part
    # If no month is defined in the final date, the month of the initial date is set
    if not re.search(r"[A-Za-z]", final_date):
        final_date = f"{start_part.split()[0]} {final_date}"
    if not re.search(year_pattern, final_date):
        final_date = f"{final_date}, {fallback_year}"

    # format_string_date handles both full ("October") and abbreviated ("Oct") month names
    return format_string_date(initial_date), format_string_date(final_date)
    

In [9]:
# Smoke-test the parser on a range whose start and end fall in different months
date_example = "Oct 17 to Nov 20, 2026"

dates = get_initial_and_final_dates(date_example)
print(f"{dates[0]=}, {dates[1]=}")
# Last expression of the cell: displays the type of the returned value
type(dates)

dates[0]='2026-10-17', dates[1]='2026-11-20'


tuple

In [10]:
# Parse the raw "Dates" text into separate initial_date / final_date columns.
# The guard makes the cell safe to re-run: once "Dates" has been replaced, a
# second execution is a no-op instead of raising KeyError: 'Dates'.
if "Dates" in ai_events.columns:
    # Building the frame from the list of tuples avoids the slow row-wise .apply(pd.Series)
    parsed_dates = pd.DataFrame(
        ai_events["Dates"].map(get_initial_and_final_dates).tolist(),
        columns=["initial_date", "final_date"],
        index=ai_events.index,
    )
    ai_events = ai_events.drop(columns="Dates").join(parsed_dates)

In [11]:
# Convert both date columns to datetime64 so they can be compared against
# Timestamp bounds in the filtering step
for date_column in ("initial_date", "final_date"):
    ai_events[date_column] = pd.to_datetime(ai_events[date_column])

# Show the resulting dtypes and non-null counts
ai_events.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Title         51 non-null     object        
 1   Location      51 non-null     object        
 2   Link          51 non-null     object        
 3   initial_date  51 non-null     datetime64[ns]
 4   final_date    51 non-null     datetime64[ns]
dtypes: datetime64[ns](2), object(3)
memory usage: 2.1+ KB


Filter the events that end in the current month or in the following three months

In [12]:
# Get today's date
today = pd.Timestamp.today()

# Get the first day of the current month
first_day_current_month = today.replace(day=1)

# Get the first day of the month, three months from now
first_day_plus_3_months = first_day_current_month + MonthBegin(4)

# Filter for events where final_date is lower than the first day of the current month plus 3 months
events_next_three_months = ai_events[(ai_events.final_date < first_day_plus_3_months) & (ai_events.final_date >= first_day_current_month)]

# Returning the datetime columns to string format to avoid issues when exporting to Excel
events_next_three_months.initial_date = events_next_three_months["initial_date"].dt.strftime('%Y-%m-%dT08:00:00Z')
events_next_three_months.final_date = events_next_three_months["final_date"].dt.strftime('%Y-%m-%dT18:00:00Z')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  events_next_three_months.initial_date = events_next_three_months["initial_date"].dt.strftime('%Y-%m-%dT08:00:00Z')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  events_next_three_months.final_date = events_next_three_months["final_date"].dt.strftime('%Y-%m-%dT18:00:00Z')


In [207]:
# Spot-check: list the filtered events whose title contains "AI Summit"
events_next_three_months[events_next_three_months.Title.str.contains("AI Summit")]

Unnamed: 0,Title,Location,Link,initial_date,final_date
15,"Agentic AI Summit,","Amsterdam, NL",https://www.aidataanalytics.network/events-age...,2025-10-27T08:00:00Z,2025-10-28T18:00:00Z
24,AI Summit Seoul & Expo (AIS),"Seoul, Korea",https://www.aisummitseoul.com/program2025,2025-11-10T08:00:00Z,2025-11-11T18:00:00Z
43,AI Summit New York 2025,"New York, NY",https://newyork.theaisummit.com/?_mc=cl_aisny_...,2025-12-10T08:00:00Z,2025-12-11T18:00:00Z


The following code generates an Excel file whose data is registered as an Excel table, which Power Automate requires in order to identify the data correctly

In [208]:
# Single source of truth for the sheet name used when writing and when looking the sheet up
SHEET_NAME = 'AI Events'

with pd.ExcelWriter('../data/ai_events_next_three_months.xlsx', engine='openpyxl') as writer:
    events_next_three_months.to_excel(writer, index=False, sheet_name=SHEET_NAME)
    workbook  = writer.book
    worksheet = writer.sheets[SHEET_NAME]
    (max_row, max_col) = events_next_three_months.shape

    # Compute the table range in Excel notation (for example, "A1:D10"); the +1
    # accounts for the header row written by to_excel.
    # max(max_row, 1) keeps at least one body row in the range when the frame is empty.
    # NOTE(review): Excel is expected to reject header-only tables - confirm.
    table_ref = f"A1:{get_column_letter(max_col)}{max(max_row, 1) + 1}"

    # Register the range as a named, styled table on the worksheet
    table = Table(displayName="AIEventsTable", ref=table_ref)
    style = TableStyleInfo(name="TableStyleMedium9", showFirstColumn=False,
                           showLastColumn=False, showRowStripes=True, showColumnStripes=False)
    table.tableStyleInfo = style
    worksheet.add_table(table)

In [209]:
# Show the event(s) with the largest initial_date in the filtered window.
# initial_date holds formatted strings at this point, so max() compares them as text.
max_date = events_next_three_months.initial_date.max()
events_next_three_months.query("initial_date == @max_date")

Unnamed: 0,Title,Location,Link,initial_date,final_date
43,AI Summit New York 2025,"New York, NY",https://newyork.theaisummit.com/?_mc=cl_aisny_...,2025-12-10T08:00:00Z,2025-12-11T18:00:00Z


In [210]:
# Smallest initial_date in the filtered window; it can precede the current month
# because the filter is applied on final_date
events_next_three_months.initial_date.min()

'2025-09-29T08:00:00Z'

In [211]:
# De-duplicate on (Title, initial_date) and display the rows whose title matches the given fragment
events_next_three_months.drop_duplicates(["Title", "initial_date"]).query("Title.str.contains('Generative AI for Marke')")

Unnamed: 0,Title,Location,Link,initial_date,final_date
35,Generative AI for Marketing Summit,"London, UK",https://www.aidataanalytics.network/events-gen...,2025-11-24T08:00:00Z,2025-11-26T18:00:00Z


In [212]:
# (rows, columns) of the filtered events table
events_next_three_months.shape

(44, 5)