In [229]:
# %pip install requests 

In [230]:
import requests
from bs4 import BeautifulSoup

def content_scraper(soup, links, identifier):
    link_and_data = {}

    for link in links:  # Iterate over all links
        response = requests.get(link)  # Use the current link

        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')

            # Handle scraping logic based on the identifier
            match identifier:
                case 'jena':
                    title = soup.find_all('title')
                    content = soup.find_all('div', class_='flex-layout')

                    # Extract title and content if found
                    text_content = (
                        title[0].get_text(strip=True) + ': ' + content[0].get_text(strip=True)
                        if title and content else "No content found"
                    )
                case 'siegburg':
                    divs = soup.find_all('div', class_='flex-layout')
                    content = divs[1] if len(divs) > 1 else None

                    # Extract content if found
                    text_content = content.get_text(strip=True) if content else "No content found"
                case _:
                    text_content = "Wrong identifier"

            link_and_data[link] = text_content
        else:
            # Handle failed requests
            link_and_data[link] = f"Failed to retrieve the webpage. Status code: {response.status_code}"

    return link_and_data


In [231]:
def siegburg_data(soup):
    links = []

    a_tags = soup.find_all('a')

    for tag in a_tags:
        href = tag.get('href')
        
        if href and '/proposals/' in href and not 'new' in href:
            link = 'https://mitmachen.siegburg.de' + href
    
            if not link in links:
                links.append(link)
    
    link_and_content = content_scraper(soup, links, 'siegburg')

    return link_and_content


def jena_data(soup):
    links = []

    a_tags = soup.find_all('a',  class_='resource-item--title')

    for tag in a_tags:
        href = tag.get('href')
        
        if href:
            link = 'https://mitmachen.jena.de' + href
            
            if not link in links:
                links.append(link)
    
    link_and_content = content_scraper(soup, links, 'jena')

    return link_and_content


def wurzburg_data(soup):
    links = []

    # Find all resource links for Würzburg (similar to Jena)
    a_tags = soup.find_all('a', class_='resource-item--title')

    for tag in a_tags:
        href = tag.get('href')

        if href:
            link = 'https://wuerzburg-mitmachen.de' + href

            if link not in links:
                links.append(link)

    # Scrape content from each link
    link_and_content = content_scraper(soup, links, 'jena')  # Reusing 'jena' scraping logic as the structure is the same

    return link_and_content

In [232]:
urls = [
    'https://mitmachen.siegburg.de/angebotslandkarte',
    'https://mitmachen.jena.de/projekts',
    'https://wuerzburg-mitmachen.de/projekts'
]


def def_42(urls):
    urls_and_data = {}

    for url in urls:
        response = requests.get(url)
        if response.status_code != 200:
            print("Failed to retrieve the webpage. Status code:", response.status_code)

        soup = BeautifulSoup(response.content, 'html.parser')

        link_and_content = scraper(soup, url)
        urls_and_data[url] = link_and_content

    return urls_and_data


def scraper(soup, url):
    if 'siegburg' in url:
        return siegburg_data(soup)
    elif 'jena' in url:
        return jena_data(soup)
    elif 'wuerzburg' in url:  # Add handling for Würzburg
        return wurzburg_data(soup)

    return "No scraper defined for this URL"


link_and_data = def_42(urls)

In [233]:
link_and_data

{'https://mitmachen.siegburg.de/angebotslandkarte': {'https://mitmachen.siegburg.de/proposals/1090-senioiren-cafe-lichtblick-kaldauen': 'Senioiren Cafe Lichtblick Kaldauen18. September 20240 KommentareZugehöriges Projekt:\n            Angebotslandkarte60-7475+Geselliges BeisammenseinDas Cafe Lichtblick in Kaldauen lädt alle Senioren zum geselligen Beisammensein bei Kaffee und Kuchen ein. Es wird gesungen, gespielt, getanzt, gelacht und viel erzählt. Auch gemeinsame Ausflüge und Feiern stehen auf dem Programm.Haben Sie Interesse, dann kommen Sie gerne vorbei. Die Senioren und das Team freut sich auf Sie!Treffen : Alle 14 Tage Dienstags von 15.00 - 17.00 Uhr\xa0(immer in geraden Wochen)Ort : Pfarrheim der katholischen Kirche KaldauenAntoniusweg 1 ;\xa053721 SiegburgAnmeldungen bei Rita Quadt ; Tel.:\xa0015733725174:Kommentare(0)Um fortzufahren, müssen Sie sichanmeldenoderregistrieren.MCafé LichtblickBeigetreten am: 18.09.2024Konto verifiziertVorschläge1Unterstützer*innenUnterstützer*inne

In [234]:
import pandas as pd

# Separate URLs and content for Siegburg and Jena
siegburg_data = {k: v for k, v in link_and_data.items() if "siegburg" in k}
jena_data = {k: v for k, v in link_and_data.items() if "jena" in k}
wurzburg_data = {k: v for k, v in link_and_data.items() if "wuerzburg" in k}

# Create DataFrames
df_sieburg = pd.DataFrame(list(siegburg_data.items()), columns=["URL", "Content"])
df_jena = pd.DataFrame(list(jena_data.items()), columns=["URL", "Content"])
df_wurzburg = pd.DataFrame(list(wurzburg_data.items()), columns=["URL", "Content"])


In [235]:
df_jena = pd.DataFrame(df_jena['Content'][0].items(), columns=["URL", "Content"])
df_sieburg = pd.DataFrame(df_sieburg['Content'][0].items(), columns=["URL", "Content"])
df_wurzburg = pd.DataFrame(df_wurzburg['Content'][0].items(), columns=["URL", "Content"])

#### Cleaning and structurising Sieburg Dataset

In [238]:
import re

# Enhanced function to extract all logical parts, including "Unterstützer*innen"
def extract_full_data_with_supporters(content):
    # Extract title (everything before the first date)
    title_match = re.search(r'^(.*?)(\r|\d{1,2}\.\s\w+\s\d{4})', content)
    title = title_match.group(1).strip() if title_match else None

    # Extract date
    date_match = re.search(r'\d{1,2}\.\s\w+\s\d{4}', content)
    date = date_match.group(0) if date_match else None

    # Extract comments count
    comments_match = re.search(r'(\d+)\sKommentare', content)
    comments = int(comments_match.group(1)) if comments_match else 0

    # Extract tags (sections with numbers or + signs)
    tags_match = re.findall(r'(\d{1,2}[-+]\d{1,2}|\d{2}\+)', content)
    tags = ', '.join(tags_match) if tags_match else None

    # Extract description (everything after "Geselliges Beisammensein" or similar patterns)
    description_start = re.search(r'(Geselliges Beisammensein|Angebotslandkarte)', content)
    description = content[description_start.start():].strip() if description_start else None

    # Extract username
    username_match = re.search(r'(\w+\s\w+|Beigetreten am:.*?\d{4})', content)
    username = username_match.group(1).split('Beigetreten am:')[0].strip() if username_match else None

    # Extract Vorschläge count
    vorschlaege_match = re.search(r'Vorschläge(\d+)', content)
    vorschlaege = int(vorschlaege_match.group(1)) if vorschlaege_match else 0

    # Extract Konto verification status
    konto_match = re.search(r'(Konto\s(verifiziert|ist nicht verifiziert))', content)
    konto_status = konto_match.group(2) if konto_match else None

    # # Extract registration date
    # registration_match = re.search(r'Beigetreten am:\s(\d{1,2}\.\s\w+\s\d{4})', content)
    # registration_date = registration_match.group(1) if registration_match else None

    # Extract number of Unterstützer*innen
    supporters_match = re.search(r'(\d+)\sUnterstützer\*in', content)
    supporters = int(supporters_match.group(1)) if supporters_match else 0

    return title, date, comments, tags, description, username, vorschlaege, konto_status, supporters

# Apply the enhanced function to the DataFrame and create new columns
df_sieburg[['Title', 'Date', 'Comments', 'Tags', 'Description', 'Username', 'Vorschläge', 'Konto Status', 'Supporters']] = df_sieburg['Content'].apply(
    lambda x: pd.Series(extract_full_data_with_supporters(x))
)

In [239]:
# Function to clean description considering keywords, numeric patterns, and refined starting logic
def clean_description_advanced(content):
    # Define keywords that mark the beginning of the description
    keywords = [
        'Geselliges Beisammensein', 'Natur', 'Hilfe & Beratung', 'Bildung', 
        'Musik', 'Bewegung', 'Glaube', 'Kulinarisches', 'Kunst & Kultur', 'Sonstiges',
    ]
    
    # Check for keywords first
    for keyword in keywords:
        if keyword in content:
            start_idx = content.find(keyword) + len(keyword)
            description = content[start_idx:].strip()
            description = re.split(r'(Kommentare\(.*?\)|registrieren)', description)[0].strip()
            return description

    # If no keyword is found, check for numeric patterns like "18-24, 25-49, etc."
    numeric_pattern = re.search(r'(\d{1,2}[-+]\d{1,2}|\d{2}\+)', content)
    if numeric_pattern:
        start_idx = numeric_pattern.end()
        description = content[start_idx:].strip()
        description = re.split(r'(Kommentare\(.*?\)|registrieren)', description)[0].strip()
        return description

    # As a fallback, find the first capital letter, quote, or digit to mark the start
    fallback_match = re.search(r'[A-Z"0-9]', content)
    if fallback_match:
        start_idx = fallback_match.start()
        description = content[start_idx:].strip()
        description = re.split(r'(Kommentare\(.*?\)|registrieren)', description)[0].strip()
        return description

    # If nothing works, return the content as is
    return content

# Apply the advanced cleaning function to the Description column
df_sieburg['Description'] = df_sieburg['Content'].apply(clean_description_advanced)


#### Cleaning and structurising Jena Dataframe

In [240]:
# Generalized function to process all entries in df_jena['Content']
def process_all_jena_entries(df_jena):
    # Generalized function for extracting data
    def extract_jena_data(content, soup):
        # Extract Title
        title_match = re.search(r'^(.*?):', content)
        title = title_match.group(1).strip() if title_match else None

        # Extract Description
        description_match = re.search(r':\s*(.*?)\n⭐', content, re.DOTALL)
        description = description_match.group(1).strip() if description_match else None

        # Extract Project Start Date
        start_date_match = re.search(r'Projektstart\s*(\d{1,2}\.\s\w+\s\d{4})', content)
        start_date = start_date_match.group(1) if start_date_match else None

        # Extract Username
        username_match = re.search(r'@(\w+)', content)
        username = username_match.group(1) if username_match else None

        # Extract Tags
        tags_match = re.findall(r'#(\w+)', content)
        tags = ', '.join(tags_match) if tags_match else None

        # Extract Supporters Count
        supporters_match = re.search(r'(\d+)\sUnterstützer\*in', content)
        supporters = int(supporters_match.group(1)) if supporters_match else None

        # Extract Number of Discussions
        discussions_match = re.search(r'DiskussionenAbgeschlossen\sam\s(\d{1,2}\.\s\w+\s\d{4})', content)
        discussions = discussions_match.group(1) if discussions_match else None

        # Extract Number of Comments from the h4 tag
        comments_tag = soup.find('h4', text=re.compile(r'Kommentare'))
        comments_count = int(re.search(r'\((\d+)\)', comments_tag.get_text(strip=True)).group(1)) if comments_tag else 0

        return {
            "Title": title,
            "Description": description,
            "Project Start Date": start_date,
            "Username": username,
            "Tags": tags,
            "Supporters": supporters,
            "Discussions": discussions,
            "Comments": comments_count
        }


    extracted_data = []
    for _, row in df_jena.iterrows():
        response = requests.get(row["URL"])
        if response.status_code != 200:
            print(f"Failed to load URL: {row['URL']}")
            continue
        
        soup = BeautifulSoup(response.content, 'html.parser')
        data = extract_jena_data(row["Content"], soup)
        extracted_data.append(data)
    
    # Create the DataFrame
    df_jena_cleaned = pd.DataFrame([{
        "URL": row["URL"],
        "Title": data["Title"],
        # "Description": data["Description"],
        "Project Start Date": data["Project Start Date"],
        # "Username": data["Username"],
        # "Tags": data["Tags"],
        "Supporters": data["Supporters"],
        "Discussions": data["Discussions"],
        "Comments": data["Comments"]
    } for row, data in zip(df_jena.to_dict('records'), extracted_data)])

    return df_jena_cleaned


In [248]:
df_jena_cleaned = process_all_jena_entries(df_jena)
df_wurzburg_cleaned = process_all_jena_entries(df_wurzburg)

  comments_tag = soup.find('h4', text=re.compile(r'Kommentare'))
  comments_tag = soup.find('h4', text=re.compile(r'Kommentare'))


In [249]:
df_wurzburg_cleaned

Unnamed: 0,URL,Title,Project Start Date,Supporters,Discussions,Comments
0,https://wuerzburg-mitmachen.de/mobilitaetsplan,Mobilitätsplan 2040,,,,0
1,https://wuerzburg-mitmachen.de/zukunftsregion,Zukunftsregion Würzburg,,,,0
2,https://wuerzburg-mitmachen.de/zukunftskonzepte-fuer-die-innenstadt,Zukunftskonzepte für die Innenstadt,13. Mai 2024,,,0
3,https://wuerzburg-mitmachen.de/klimaanpassung,Klimaanpassung,,,,0
4,https://wuerzburg-mitmachen.de/europawahl-2024,Europawahl 2024,,,,0
5,https://wuerzburg-mitmachen.de/umfrage-zum-neuen-buerger-und-stadtteilzentrum-lindleinsmuehle,Bürgerbeteiligung in der Lindleinsmühle,,,,0
6,https://wuerzburg-mitmachen.de/freizeitgelaende-katzenbergtunnel,Freizeitgelände Katzenbergtunnel,,,,0
7,https://wuerzburg-mitmachen.de/laermaktionsplan,Lärmaktionsplan,,,,0
8,https://wuerzburg-mitmachen.de/neugestaltung-des-mainufers-heidingsfeld,Neugestaltung des Mainufers Heidingsfeld,,,,0
9,https://wuerzburg-mitmachen.de/vu-suedlicher-bischofshut,VU Südlicher Bischofshut,,,,0


#### BurgerBudgets in Jena (2024, 23, 22)

In [243]:
# URLs for the budgets
budget_urls = {
    2024: "https://mitmachen.jena.de/buergerbudget",
    2023: "https://mitmachen.jena.de/buergerbudget-2023",
    2022: "https://mitmachen.jena.de/buergerbudget-2022"
}

# Updated function to scrape and clean a budget table for a given year
def scrape_and_clean_budget_table(url, year):
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to load URL: {url}")
        return None
    
    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table', id='budget-investments-compatible')  # Locate the table by its ID
    
    if not table:
        print(f"No table found for URL: {url}")
        return None
    
    # Extract the total available budget for the year (last <th> in <thead>)
    available_budget_tag = table.find('thead').find_all('th')[-1]  # Find the last <th>
    available_budget = (
        float(re.sub(r'[^\d.]', '', available_budget_tag.get_text(strip=True))) * 1000
        if available_budget_tag else None
    )
    
    # Extract table headers
    headers = [th.get_text(strip=True) for th in table.find('thead').find_all('th')]
    
    # Extract table rows
    rows = []
    for tr in table.find('tbody').find_all('tr'):
        # Extract row cells
        cells = [td.get_text(strip=True) for td in tr.find_all('td')]
        
        # Check the class of the <tr> tag for "success" or "discarded"
        approved = 1 if 'success' in tr.get('class', []) else 0
        
        # Append cells and approval status
        rows.append(cells + [approved])
    
    # Add "Approved" column to the headers
    headers.append('Approved')
    
    # Create a DataFrame
    df = pd.DataFrame(rows, columns=headers)
    df['Year'] = year  # Add a 'Year' column
    df['Available Budget'] = available_budget  # Add the total budget for the year to every row
    return df

# Scrape and clean tables for all years
budget_dataframes = [
    scrape_and_clean_budget_table(url, year) for year, url in budget_urls.items()
]

# Combine all dataframes into one
budget_jena_df = pd.concat(budget_dataframes, ignore_index=True)

# Clean and transform the DataFrame
budget_jena_df['Preis'] = budget_jena_df['Preis'].str.extract(r'(\d+)').astype(float) * 1000
budget_jena_df['Stimmen'] = budget_jena_df['Stimmen'].str.extract(r'(\d+)').astype(int)

# Rename columns to English
budget_jena_df.rename(columns={
    'Vorschlag Titel': 'Proposal Title',
    'Stimmen': 'Votes',
    'Preis': 'Price',
    'Year': 'Year',
    'Available Budget': 'Budget for this year',
    'Approved': 'Approved'
}, inplace=True)

# Drop unnecessary columns if any remain
budget_jena_df = budget_jena_df.loc[:, ~budget_jena_df.columns.str.contains('VerfügbareBudgetmittel', na=False)]



## Scraping Comments from Jena Projects (could be probably scaled for other similar cities)

In [244]:
import requests
from bs4 import BeautifulSoup
import re

# Updated function to extract comments from a single page
def extract_comments_from_page(soup):
    comments_data = []
    comments_section = soup.find_all('div', class_='comment small-12')
    
    for comment in comments_section:
        # Extract comment text
        comment_text = comment.find('p').get_text(strip=True) if comment.find('p') else None
        
        # Extract username
        username_tag = comment.find('span', class_='user-name')
        username = username_tag.get_text(strip=True) if username_tag else None
        
        # Extract date
        date_tag = comment.find('div', class_='comment-info').find_all('a')[-1]
        date = date_tag.get_text(strip=True) if date_tag else None

        
        # Extract likes and dislikes (clean and convert to integer)
        likes_tag = comment.find('span', class_='in-favor')
        likes = int(re.sub(r'\D', '', likes_tag.get_text(strip=True))) if likes_tag else 0
        
        dislikes_tag = comment.find('span', class_='against')
        dislikes = int(re.sub(r'\D', '', dislikes_tag.get_text(strip=True))) if dislikes_tag else 0
        
        # Extract total votes (clean and convert to integer)
        total_votes = likes + dislikes
        
        comments_data.append({
            'Text': comment_text,
            'Username': username,
            'Date': date,
            'Likes': likes,
            'Dislikes': dislikes,
            'Total Votes': total_votes
        })
    return comments_data


# Scrape all comments across pages (pagination logic remains the same)
def scrape_all_comments(base_url):
    comments = []
    page = 1
    
    while True:
        paginated_url = f"{base_url}?page={page}" if page > 1 else base_url
        response = requests.get(paginated_url)
        
        if response.status_code != 200:
            print(f"Failed to load page {page} for URL: {base_url}")
            break
        
        soup = BeautifulSoup(response.content, 'html.parser')
        comments_on_page = extract_comments_from_page(soup)
        
        if not comments_on_page:  # Stop if no comments on the page
            break
        
        comments.extend(comments_on_page)
        page += 1

    return comments

# Function to scrape the main content and comments for each URL
def scrape_content_and_comments(urls):
    data = []
    
    for url in urls:
        response = requests.get(url)
        if response.status_code != 200:
            print(f"Failed to load URL: {url}")
            continue
        
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Scrape main content
        title = soup.find('title').get_text(strip=True) if soup.find('title') else None
        content_div = soup.find('div', class_='flex-layout')
        content = content_div.get_text(strip=True) if content_div else None
        
        # Scrape comments
        comments = scrape_all_comments(url)
        
        data.append({
            'URL': url,
            'Title': title,
            'Content': content,
            'Comments': comments
        })
    
    return data

# Scrape comments for all URLs
urls = df_jena['URL'].tolist()
scraped_data = scrape_content_and_comments(urls)

# Create structured DataFrame for comments
comments_data = []
for item in scraped_data:
    for comment in item['Comments']:
        comment['URL'] = item['URL']  # Link comment to the project URL
        comments_data.append(comment)

# Create the comments DataFrame
df_comments = pd.DataFrame(comments_data)

# Create a mapping from URL to Title
url_to_title = df_jena_cleaned.set_index('URL')['Title'].to_dict()

# Add a 'Project' column to df_comments using the mapping
df_comments['Project'] = df_comments['URL'].map(url_to_title)
df_comments = df_comments[['URL', 'Project'] + [col for col in df_comments.columns if col not in ['URL', 'Project']]]

In [245]:
pd.set_option('display.max_colwidth', 100)