In [39]:
import pandas as pd
import requests
import json

# Initialize day counter
day_id = 1
types = ['week', 'industry']
industry_ids = []
weeks_ids = []

# Read keywords from a local file
with open('keywords.json', 'r') as f:
    keywords_data = json.load(f)

# Keyword lists
indication = [row[0] for row in keywords_data['rows']]
asset = [row[1] for row in keywords_data['rows']]
company_name = [row[2] for row in keywords_data['rows']]
MoA = [row[3] for row in keywords_data['rows']]
other_keywords = [row[4] for row in keywords_data['rows']]

# Helper function to match keywords
def find_matching_keywords(text, keywords):
    if not text:
        return None
    matches = [keyword for keyword in keywords if keyword and keyword.lower() in text.lower()]
    return '; '.join(matches) if matches else None

# Fields to extract
fields = [
    "Session Date", "Session Start Time", "Session End Time", "Session Timezone",
    "Session Location", "Session Number", "Session Pathways", "Session Format",
    "Session CME", "Session Type", "Session Group", "Session Title", "Session Description",
    "Presentation Date", "Presentation Location", "Presentation Start Time", "Presentation End Time",
    "Presentation ID", "Presentation Number", "Presentation Title", "Abstract Authors", 
    "Abstract Affiliations", "Presentation Title (lowercase)", "Abstract", 
    "Abstract Url", "Company", "Asset", "Indication", "MoA", "Other Keywords"
]

# Combined data
combined_data = []

# Iterate over types
for event_type in types:
    day_id = 2
    while True:
        # Construct the URL for a specific day
        days_url = f"https://ueg2024.planner.documedias.systems/api/program/days/{day_id}?program_type={event_type}&system=program_{event_type}_2024&program_mode=list&filter_display_type=2"
                    
        
        
        # Fetch data for the day
        response = requests.get(days_url)
        day_data = response.json()
        idList = []
        indFinished = False
        weekFinished = False

        for content in day_data:
            if event_type == 'industry':
                idInd = content.get('id')
                industry_ids.append(idInd)
                if day_data == None:
                    indFinished = True
            if event_type == 'week':
                idWeek = content.get('id')
                weeks_ids.append(idWeek)
                if day_data == None:
                    weekFinished = True
            if weekFinished == True and indFinished == True:
                break
        day_id += 1
        print(day_id)
        print(indFinished)
        print(weekFinished)
# Stop if no data is returned
print('industry ids:', industry_ids)
print('weeks ids:', weeks_ids)


    

3
[{'id': 145, 'legacy_id': None, 'parent_id': None, 'order': None, 'display_type_id': 1, 'program_type_id': 1, 'day_id': 2, 'start_time': '09:00', 'end_time': '10:30', 'duration': 90, 'session_group_id': 7, 'session_type_id': 1, 'code': None, 'title': 'Metabolic dysfunction: Associated steatotic liver disease', 'content': {'outline': '·&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; Epidemiological concerns regarding NAFLD/NASH<br>·&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; How to differentiate from alcohol-related liver disease?<br>·&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; Relationship with the metabolic syndrome and cardiovascular diseases<br>·&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; Risk stratification and evaluation of fibrosis<br>·&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; Therapeutic management', 'flave_link': 'https://congressplatform.ueg.eu/exhibition/r/stream/uegw2024-c1/25254'}, 'room_id': 1, 'exceeds': False, 'mixed': False, 'private': False, 'publish': True, 'publish_app': True, 'wordly_link': None, 'st

KeyboardInterrupt: 

In [None]:
# Iterate through sessions
for session in day_data['sessions']:
    session_common_data = {
        "Session Date": session.get('day', {}).get("date"),
        "Session Start Time": session.get("start_time"),
        "Session End Time": session.get("end_time"),
        "Session Timezone": "CEST",
        "Session Location": session.get("room", {}).get('short_name'),
        "Session Number": session.get("id"),
        "Session Pathways": ', '.join([pathway.get('name') for pathway in session.get('pathways', [])]),
        "Session CME": "",
        "Session Type": session.get('session_type', {}).get('name'),
        "Session Group": session.get("session_group_id"),
        "Session Title": session.get("title"),
        "Session Description": session.get("content", {}).get("outline"),
    }

    # Iterate through presentations
    for presentation in session.get('presentations', []):
        presentation_title = presentation.get("presentation", {}).get("title", "")
        presentation_data = {
            **session_common_data,
            "Presentation Date": session.get('day', {}).get("date"),
            "Presentation Location": session.get("room", {}).get('short_name'),
            "Presentation Start Time": presentation.get("start_time"),
            "Presentation End Time": presentation.get("end_time"),
            "Presentation ID": presentation.get("id"),
            "Presentation Number": presentation.get("code"),
            "Presentation Title": presentation_title,
            "Presentation Title (lowercase)": presentation_title.lower(),
        }

        # Abstract data handling
        abstract_id = presentation.get("presentation", {}).get("abstract_id")
        if abstract_id:
            abstract_url = f"https://programme.ueg.eu/week2024/#/details/presentations/{abstract_id}"
            abstract_response = requests.get(f"https://ueg2024.abstract.documedias.systems/api/v1/manager/abstract/multi/html/id/{abstract_id}/template/planner_preview?program_type={event_type}&system=program_{event_type}_2024")
            abstract_data = abstract_response.json().get(str(abstract_id), {})
            presentation_data["Abstract"] = abstract_data
            presentation_data["Abstract Url"] = abstract_url

        # Keyword matching
        presentation_data["Indication"] = find_matching_keywords(presentation_title, indication)
        presentation_data["Asset"] = find_matching_keywords(presentation_title, asset)
        presentation_data["Company"] = find_matching_keywords(presentation_title, company_name)
        presentation_data["MoA"] = find_matching_keywords(presentation_title, MoA)
        presentation_data["Other Keywords"] = find_matching_keywords(presentation_title, other_keywords)

        # Append to combined data
        combined_data.append(presentation_data)


# Create DataFrame and export to Excel
df_combined = pd.DataFrame(combined_data, columns=fields)
df_combined.to_excel("UEG_Scrape_Combined.xlsx", sheet_name="Combined Data", index=False)
print("Data scraping complete. Excel file saved.")

