In [51]:
import pandas as pd
import requests
import re
import html
import json

In [52]:
def find_matching_keywords(text, keywords):
    """Return the keywords that occur in ``text``, joined with ``'; '``.

    Matching is a case-insensitive substring test. Each keyword is reported
    at most once (first spelling wins), in the order it appears in
    ``keywords``.

    Args:
        text: The string to search. ``None`` or an empty string yields ``None``.
        keywords: Iterable of candidate keywords. Entries that are empty,
            whitespace-only, or not strings are ignored.

    Returns:
        A ``'; '``-separated string of the matching keywords, or ``None``
        when nothing matches.
    """
    if not text:
        return None

    # Lower-case the text once instead of once per keyword.
    haystack = text.lower()
    matches = []
    seen = set()
    for keyword in keywords:
        # Skip blanks and non-strings (e.g. null or numeric table cells);
        # a whitespace-only keyword would otherwise match almost any text.
        if not isinstance(keyword, str) or not keyword.strip():
            continue
        needle = keyword.lower()
        # The same keyword can appear in many rows of the keyword table;
        # report it only once.
        if needle in seen:
            continue
        if needle in haystack:
            seen.add(needle)
            matches.append(keyword)
    return '; '.join(matches) if matches else None

def clean_text(bad_text):
    """Turn an HTML fragment into plain text.

    Steps, in order:
      1. remove HTML tags,
      2. decode HTML entities (``&amp;`` -> ``&``, ``&lt;`` -> ``<``, ...),
      3. remove the literal characters ``.``, ``*`` and ``?``,
      4. trim leading/trailing whitespace.

    Args:
        bad_text: Raw string, or ``None``.

    Returns:
        The cleaned string; ``''`` when ``bad_text`` is ``None``.
    """
    if bad_text is None:
        return ''
    # 1. Strip tags. Requiring a tag-like opener (letter, '/', '!' or '?')
    #    keeps comparisons such as "<5 mm ... >" from being mistaken for
    #    markup, which also makes a second pass over cleaned text safe.
    without_tags = re.sub(r'<[A-Za-z/!?][^>]*>', '', bad_text)
    # 2. Decode entities only after the tags are gone, so an escaped "&lt;b&gt;"
    #    is not turned into a tag and then deleted within the same pass.
    decoded = html.unescape(without_tags)
    # 3. Remove the literal characters '.', '*' and '?'.
    #    NOTE(review): this also drops decimal points and sentence periods -
    #    confirm that is really wanted for every field.
    without_punctuation = re.sub(r'[.*?]', '', decoded)
    # 4. Trim surrounding whitespace (including the \xa0 produced by &nbsp;).
    return without_punctuation.strip()

def clean_dict(data_dict):
    """Build a new dict in which every string value has gone through ``clean_text``.

    Values of any other type (lists, ``None``, numbers, ...) are carried over
    as they are. Keys and their order are preserved, and ``data_dict`` itself
    is not modified.
    """
    return {
        field: clean_text(raw) if isinstance(raw, str) else raw
        for field, raw in data_dict.items()
    }

# --- Scrape configuration and shared state ---------------------------------

# Initial day id; the day-discovery loop assigns it again for each event type.
day_id = 2
# Programme types queried from the planner API.
types = ['week', 'industry']
# Session ids collected per programme type (filled by the day-discovery loop).
industry_ids = []
weeks_ids = []

# --- Keyword table ---------------------------------------------------------

# Read keywords from a local file. The encoding is given explicitly so that
# non-ASCII keywords load the same way on every platform.
with open('keywords.json', 'r', encoding='utf-8') as f:
    keywords_data = json.load(f)

# Each row holds five columns, in this order:
# indication, asset, company name, MoA, other keywords.
# The lists are built exactly once, one list per column.
keyword_rows = keywords_data['rows']
indication, asset, company_name, MoA, other_keywords = (
    [row[column] for row in keyword_rows] for column in range(5)
)

# --- Output layout ---------------------------------------------------------

# Fields to extract (also the column order of the exported sheet)
fields = [
    "Session Date", "Session Start Time", "Session End Time", "Session Timezone",
    "Session Location", "Session Number", "Session Pathways", "Session Format",
    "Session CME", "Session Type", "Session Group", "Session Title", "Session Description",
    "Presentation Date", "Presentation Location", "Presentation Start Time", "Presentation End Time",
    "Presentation ID", "Presentation Number", "Presentation Title", "Abstract Authors",
    "Abstract Affiliations", "Presentation Title (lowercase)", "Abstract",
    "Abstract Url", "Company", "Asset", "Indication", "MoA", "Other Keywords"
]

# One dict per scraped presentation (filled by the scraping loop).
combined_data = []
# Session detail URLs (filled once the session ids are known).
allEndpoints = []


In [53]:

# Discover the session ids of every programme type by walking the day
# endpoints upwards from FIRST_DAY_ID until a day comes back empty.

FIRST_DAY_ID = 2
DAYS_REQUEST_TIMEOUT = 30  # seconds; requests waits indefinitely without a timeout

# Empty the target lists first so that re-running this cell does not append
# the same ids a second time.
industry_ids.clear()
weeks_ids.clear()

ids_by_type = {'week': weeks_ids, 'industry': industry_ids}

for event_type in types:
    collected_ids = ids_by_type.get(event_type)
    day_id = FIRST_DAY_ID
    while True:
        # Construct the URL for a specific day
        days_url = f"https://ueg2024.planner.documedias.systems/api/program/days/{day_id}?program_type={event_type}&system=program_{event_type}_2024&program_mode=list&program_sort=date&filter_display_type=1,3&filter_group=is_not_child"

        # Fetch data for the day
        response = requests.get(days_url, timeout=DAYS_REQUEST_TIMEOUT)
        day_data = response.json()

        # An empty payload marks the end of this programme type.
        if not day_data:
            break

        # Types other than 'week'/'industry' have no target list and are ignored.
        if collected_ids is not None:
            collected_ids.extend(content.get('id') for content in day_data)

        day_id += 1


all_ids = [
    {"ids": industry_ids, "event_type": "industry"},
    {"ids": weeks_ids, "event_type": "week"}
]


In [54]:
# Build one session-detail URL per collected id.
# The list is emptied first so that re-running this cell does not add every
# URL a second time (which would duplicate rows in the export).
allEndpoints.clear()

for event_group in all_ids:
    event_type = event_group["event_type"]
    # `session_id` rather than `id`, which would shadow the builtin.
    for session_id in event_group["ids"]:
        print(f'Current event_type: {event_type}')
        print(f'Current id: {session_id}')
        sessionUrl = f"https://ueg2024.planner.documedias.systems/api/program/sessions/{session_id}?program_type={event_type}&system=program_{event_type}_2024"
        allEndpoints.append(sessionUrl)
        print(sessionUrl)


Current event_type: industry
Current id: 369
https://ueg2024.planner.documedias.systems/api/program/sessions/369?program_type=industry&system=program_industry_2024
Current event_type: industry
Current id: 370
https://ueg2024.planner.documedias.systems/api/program/sessions/370?program_type=industry&system=program_industry_2024
Current event_type: industry
Current id: 371
https://ueg2024.planner.documedias.systems/api/program/sessions/371?program_type=industry&system=program_industry_2024
Current event_type: industry
Current id: 372
https://ueg2024.planner.documedias.systems/api/program/sessions/372?program_type=industry&system=program_industry_2024
Current event_type: industry
Current id: 416
https://ueg2024.planner.documedias.systems/api/program/sessions/416?program_type=industry&system=program_industry_2024
Current event_type: industry
Current id: 376
https://ueg2024.planner.documedias.systems/api/program/sessions/376?program_type=industry&system=program_industry_2024
Current event_ty

In [None]:
# Fetch each session endpoint and turn every presentation into one output row


SESSION_REQUEST_TIMEOUT = 30  # seconds; requests waits indefinitely without a timeout


def _first_keyword_match(texts, keywords):
    """Return the keyword matches of the first text that has any, else None."""
    for text in texts:
        found = find_matching_keywords(text, keywords)
        if found:
            return found
    return None


def _joined_names(items):
    """Join the non-empty 'name' values of a list of dicts with ', '."""
    return ', '.join(item['name'] for item in (items or []) if item.get('name'))


# Start from an empty result list so that re-running this cell cannot append
# the same rows a second time.
combined_data.clear()

for endpoint in allEndpoints:
    response = requests.get(endpoint, timeout=SESSION_REQUEST_TIMEOUT)
    session_data = response.json()
    print(f'Current endpoint: {endpoint}')
    for session in session_data:
        # `or {}` also covers keys that are present but null in the JSON,
        # which a `.get(key, {})` default does not.
        day = session.get('day') or {}
        room = session.get('room') or {}
        content = session.get('content') or {}
        session_type = session.get('session_type') or {}
        session_group = "Other"

        session_common_data = {
            "Session Date": day.get("date"),
            "Session Start Time": session.get("start_time"),
            "Session End Time": session.get("end_time"),
            "Session Timezone": "CEST",
            "Session Location": room.get('short_name'),
            "Session Number": session.get("id"),
            "Session Pathways": _joined_names(session.get('pathways')),
            "Session Format": _joined_names(session.get('formats')),
            "Session CME": "",
            "Session Type": session_type.get('name'),
            "Session Group": session_group,
            "Session Title": session.get("title"),
            "Session Description": content.get("outline"),
        }

        # Session-level texts used for keyword matching; identical for every
        # presentation of the session, so computed once.
        session_title = clean_text(session.get("title"))
        session_description = clean_text(content.get("outline"))

        # Iterate through presentations
        for presentation in session.get('presentations') or []:
            details = presentation.get("presentation") or {}
            presentation_title = clean_text(details.get("title"))
            presentation_number = details.get("code")
            abstracts = ""
            Abstract_url = None
            Abstract_id = details.get("abstract_id")

            # Fetch Abstract Data if Available
            if Abstract_id:
                Abstract_url = f"https://programme.ueg.eu/week2024/#/details/presentations/{Abstract_id}"
                # NOTE(review): this endpoint always queries program_type=week,
                # even for sessions that came from the industry programme - confirm.
                Abstract_endpoint = f"https://ueg2024.abstract.documedias.systems/api/v1/manager/abstract/multi/html/id/{Abstract_id}/template/planner_preview?program_type=week&system=program_week_2024"
                responseAbstract = requests.get(Abstract_endpoint, timeout=SESSION_REQUEST_TIMEOUT)
                dataAbstract = responseAbstract.json()
                if isinstance(dataAbstract, dict):
                    # Keys of a parsed JSON object are always strings, so a
                    # numeric abstract id has to be looked up by its string form.
                    abstracts = dataAbstract.get(
                        str(Abstract_id), dataAbstract.get(Abstract_id, "")
                    )

            # Gather author and institution data. The lists are created per
            # presentation so that people from earlier presentations of the
            # same session do not leak into this row.
            authors = []
            institutions = []
            for person in details.get("persons") or []:
                person_info = person.get('person') or {}
                institution = person_info.get('institution')
                # Join only the name parts that exist, so a missing name is
                # not rendered as "None None".
                name_parts = (person_info.get('first_name'), person_info.get('last_name'))
                author_name = ' '.join(str(part) for part in name_parts if part)
                if institution:
                    institutions.append(institution)
                if author_name:
                    authors.append(author_name)

            # Find matching keywords: presentation title first, then session
            # title, then session description.
            searched_texts = (presentation_title, session_title, session_description)
            matched_indication = _first_keyword_match(searched_texts, indication)
            matched_asset = _first_keyword_match(searched_texts, asset)
            matched_company = _first_keyword_match(searched_texts, company_name)
            matched_MoA = _first_keyword_match(searched_texts, MoA)
            matched_other_keywords = _first_keyword_match(searched_texts, other_keywords)

            # Compile Presentation Data
            presentation_data = {
                **session_common_data,
                "Presentation Date": day.get("date"),
                "Presentation Location": room.get('short_name'),
                "Presentation Start Time": presentation.get("start_time"),
                "Presentation End Time": presentation.get("end_time"),
                "Presentation ID": presentation.get("id"),
                "Presentation Number": presentation_number,
                "Presentation Title": presentation_title,
                "Presentation Title (lowercase)": presentation_title.lower(),
                "Abstract Authors": '; '.join(authors),
                "Abstract Affiliations": '; '.join(institutions),
                "Abstract": abstracts,
                "Abstract Url": Abstract_url,
                "Company": matched_company,
                "Indication": matched_indication,
                "Asset": matched_asset,
                "MoA": matched_MoA,
                "Other Keywords": matched_other_keywords,
            }

            cleaned_presentation_data = clean_dict(presentation_data)
            # clean_text removes '.' characters, which would turn the link's
            # host name into an invalid one; keep the URL exactly as built.
            cleaned_presentation_data["Abstract Url"] = Abstract_url
            combined_data.append(cleaned_presentation_data)


Current endpoint: https://ueg2024.planner.documedias.systems/api/program/sessions/369?program_type=industry&system=program_industry_2024
Current endpoint: https://ueg2024.planner.documedias.systems/api/program/sessions/370?program_type=industry&system=program_industry_2024
Current endpoint: https://ueg2024.planner.documedias.systems/api/program/sessions/371?program_type=industry&system=program_industry_2024
Current endpoint: https://ueg2024.planner.documedias.systems/api/program/sessions/372?program_type=industry&system=program_industry_2024
Current endpoint: https://ueg2024.planner.documedias.systems/api/program/sessions/416?program_type=industry&system=program_industry_2024
Current endpoint: https://ueg2024.planner.documedias.systems/api/program/sessions/376?program_type=industry&system=program_industry_2024
Current endpoint: https://ueg2024.planner.documedias.systems/api/program/sessions/373?program_type=industry&system=program_industry_2024
Current endpoint: https://ueg2024.planner

In [48]:
# Assemble all scraped rows into one table whose columns follow `fields`,
# then write it to a single "Extraction" sheet without the index column.
df_combined = pd.DataFrame(data=combined_data, columns=fields)
df_combined.to_excel(
    "UEG_Scrape_Output.xlsx",
    sheet_name="Extraction",
    index=False,
)
print("Data scraping complete. Excel file saved.")


Data scraping complete. Excel file saved.
