# Web Scraper for 19 Cities

This scraper extracts and organizes data into three main DataFrames:
1. **`all_projects_df`**: Contains all projects from the websites.
   - Columns: `Project URL`, `Project Title`, `Project Description`, `Proposal Count`, `City`

2. **`all_proposals_df`**: Contains all proposals under projects.
   - Columns: `URL`, `Title`, `Proposed for Project`, `Description`, `Author`, `Comments`, `Supporters`, `City`

3. **`all_comments_df`**: Contains all comments under projects and proposals.
   - Columns: `URL`, `Project`, `Text`, `Author`, `Likes`, `Dislikes`, `Date`, `City`


In [115]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# Updated function to extract proposals from a project page
def extract_proposals(soup, base_url):
    proposals = []
    proposal_items = soup.find_all('div', class_='resource-item proposal-list-item')

    for proposal in proposal_items:
        # Extract title
        title_tag = proposal.find('a', class_='resource-item--title')
        title = title_tag.get_text(strip=True) if title_tag else None

        # Extract URL
        url = base_url + title_tag['href'] if title_tag and 'href' in title_tag.attrs else None

        # Extract description
        description_tag = proposal.find('div', class_='resource-item--description')
        description = description_tag.get_text(strip=True) if description_tag else None

        # Extract author/username
        author_tag = proposal.find('a', class_='resource-item--author')
        author = author_tag.get_text(strip=True) if author_tag else None

        # Extract number of comments
        comments_tag = proposal.find('span', class_='comments')
        comments = int(comments_tag.get_text(strip=True).split()[0]) if comments_tag else 0

        # Extract number of supporters
        supporters_tag = proposal.find('span', class_='total-supports')
        supporters = int(supporters_tag.get_text(strip=True).split()[0]) if supporters_tag else 0

        # Extract parent project
        project_tag = proposal.find('a', class_='breadcrumbs-item')
        proposed_for_project = project_tag.get_text(strip=True) if project_tag else None

        proposals.append({
            'URL': url,
            'Title': title,
            'Proposed for Project': proposed_for_project,
            'Description': description,
            'Author': author,
            'Comments': comments,
            'Supporters': supporters,
        })
    return proposals
        proposals.append({
            'URL': url,
            'Title': title,
            'Proposed for Project': proposed_for_project,
            'Description': description,
            'Author': author,
            'Comments': comments,
            'Supporters': supporters,
        })
    return proposals


# Function to extract city name from the base URL
def extract_city_name(base_url):
    # Words to remove from the city name
    remove_words = ['mitmachen', 'Mitmachen', 'mitwirken', 'Smarte', 'region', 'unser', 'mitgestalten', 'gestalten', 'machmit', 'dialog', 'consul', 'www', 'de', 'https', 'com']

    # Split the URL into parts (by '.' or '/')
    parts = base_url.replace('https://', '').replace('http://', '').split('.')
    all_parts = [part.split('/')[0] for part in parts]  # Handle cases where "/" exists after domain

    # Remove known unwanted words and empty strings
    filtered_parts = [part for part in all_parts if part.lower() not in remove_words and part]

    # Return the first relevant part (assumes city name is left after filtering)
    city = filtered_parts[0].replace('-', ' ').capitalize() if filtered_parts else "Unknown"

    # Remove unwanted words from city name
    for word in remove_words:
        city = city.replace(word, '')

    return city.strip().capitalize()
    return city.strip().capitalize()


# Update project scraping to exclude comments
def scrape_project_page_with_proposals(url, base_url, city):
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to load project page: {url}")
        return None, []
# Update project scraping to exclude comments
def scrape_project_page_with_proposals(url, base_url, city):
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to load project page: {url}")
        return None, []

    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract project title
    title_tag = soup.find('title')
    project_title = title_tag.get_text(strip=True) if title_tag else None

    # Extract project description
    content_div = soup.find('div', class_='flex-layout')
    description = content_div.get_text(strip=True) if content_div else None

    # Extract proposals
    proposals = extract_proposals(soup, base_url=base_url)

    return {
        'Project URL': url,
        'Project Title': project_title,
        'Project Description': description,
        'Proposal Count': len(proposals),
    }, proposals


# Modified function to scrape projects with proposals
def scrape_projects_with_proposals(main_url, base_url):
    response = requests.get(main_url)
    if response.status_code != 200:
        print(f"Failed to load main projects page: {main_url}")
        return pd.DataFrame(), pd.DataFrame()

    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all project links
    links = soup.find_all('a', class_='resource-item--title')
    project_links = [base_url + link['href'] for link in links if 'href' in link.attrs]

    projects = []
    all_proposals = []

    for project_url in project_links:
        try:
            project_data, proposals = scrape_project_page_with_proposals(
                project_url, base_url, extract_city_name(base_url)
            )
            if project_data:
                projects.append(project_data)
                all_proposals.extend(proposals)
        except Exception as e:
            print(f"Error scraping project at {project_url}: {e}")

    return pd.DataFrame(projects), pd.DataFrame(all_proposals)

# List of websites (fixed flensburg-mitmachen.de base_url)
websites = [
    {"main_url": "https://wuerzburg-mitmachen.de/projekts", "base_url": "https://wuerzburg-mitmachen.de"},
    {"main_url": "https://mitmachen.siegburg.de/projekts", "base_url": "https://mitmachen.siegburg.de"}, 
    {"main_url": "https://mitmachen.jena.de/projekts", "base_url": "https://mitmachen.jena.de"},
    {"main_url": "https://mitmachgemeinde.de/projekts", "base_url": "https://mitmachgemeinde.de"},
    {"main_url": "https://bamberg-gestalten.de/projekts", "base_url": "https://bamberg-gestalten.de"},
    {"main_url": "https://mitmachen-pforzheim.de/projekts", "base_url": "https://mitmachen-pforzheim.de"},
    {"main_url": "https://bochum-mitgestalten.de/projekts", "base_url": "https://bochum-mitgestalten.de"},
    {"main_url": "https://unser.muenchen.de/projekts", "base_url": "https://unser.muenchen.de"},
    {"main_url": "https://mitreden.ilzerland.bayern/projekts", "base_url": "https://mitreden.ilzerland.bayern"},
    {"main_url": "https://stutensee-mitwirken.de/projekts", "base_url": "https://stutensee-mitwirken.de"},
    {"main_url": "https://consul.unterschleissheim.de/projekts", "base_url": "https://consul.unterschleissheim.de"},
    {"main_url": "https://machmit.kempten.de/projekts", "base_url": "https://machmit.kempten.de"},
    {"main_url": "https://consul.detmold-mitgestalten.de/projekts", "base_url": "https://consul.detmold-mitgestalten.de"},
    {"main_url": "https://flensburg-mitmachen.de/projekts", "base_url": "https://flensburg-mitmachen.de"},  # Fixed URL
    {"main_url": "https://mitmachen.amberg.de/projekts", "base_url": "https://mitmachen.amberg.de"},
    {"main_url": "https://mitmachen.smarte-region-linz.de/projekts", "base_url": "https://mitmachen.smarte-region-linz.de"},
    {"main_url": "https://mitgestalten.trier.de/projekts", "base_url": "https://mitgestalten.trier.de"},
    {"main_url": "https://machmit.augsburg.de/projekts", "base_url": "https://machmit.augsburg.de"}
]


# Initialize empty DataFrames for all projects and proposals
all_projects_df = pd.DataFrame()
all_proposals_df = pd.DataFrame()

# Main loop to scrape all websites
for site in websites:
    main_url = site["main_url"]
    base_url = site["base_url"]

    city = extract_city_name(base_url)

    try:
        # Scrape projects and proposals
        projects_df, proposals_df = scrape_projects_with_proposals(main_url, base_url)

        # Add a 'City' column to all DataFrames
        projects_df['City'] = city
        proposals_df['City'] = city

        # Append results to the combined DataFrames
        all_projects_df = pd.concat([all_projects_df, projects_df], ignore_index=True)
        all_proposals_df = pd.concat([all_proposals_df, proposals_df], ignore_index=True)
    except Exception as e:
        print(f"Failed to scrape {main_url} - {e}")


Error scraping project at https://flensburg-mitmachen.dehttps://survey.lamapoll.de/Publikumspreis-Kommune-bewegt-Welt-2024: HTTPSConnectionPool(host='flensburg-mitmachen.dehttps', port=443): Max retries exceeded with url: /survey.lamapoll.de/Publikumspreis-Kommune-bewegt-Welt-2024 (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x0000018D29CE5AB0>: Failed to resolve 'flensburg-mitmachen.dehttps' ([Errno 11001] getaddrinfo failed)"))


In [131]:
all_proposals_df.head()

Unnamed: 0,URL,Title,Proposed for Project,Description,Author,Comments,Supporters,City
0,https://wuerzburg-mitmachen.de/proposals/110-autofreier-bischofshut,Autofreier Bischofshut,Zukunftskonzepte für die Innenstadt,"Wir fordern die Ausrufung des Klimanotstands, damit Belange unseres Klimas vor das wirtschaftlic...",Letzte Generation Würzburg,0.0,20.0,Wuerzburg
1,https://wuerzburg-mitmachen.de/proposals/109-e-scooter-verbieten,E Scooter verbieten,Zukunftskonzepte für die Innenstadt,E Scooter sollten (im Innenstadtbereich) verboten werden. Diese werden häufig willkürlich abgest...,Ccmuet,0.0,2.0,Wuerzburg
2,https://wuerzburg-mitmachen.de/proposals/108-barrierefrei-ins-nautiland-lgs,Barrierefrei ins Nautiland/LGS,Zukunftskonzepte für die Innenstadt,Nautiland - neu.\r\nUmweltstation - neu.\r\nZellertorauffahrt - neu.\r\nLeider fehlen die barrie...,AASeuffert,0.0,2.0,Wuerzburg
3,https://wuerzburg-mitmachen.de/proposals/107-kinderabenteuer-indoor-spielplatz-smaland,Kinderabenteuer / Indoor Spielplatz / Smaland,Zukunftskonzepte für die Innenstadt,Es gibt zwar schon den FunPark für Kinder mit Trampolinhalle etc. in der Nähe der Nürnberger Str...,ABlitz,0.0,0.0,Wuerzburg
4,https://wuerzburg-mitmachen.de/proposals/106-banke-und-grun-im-neu-gestalteten-bereich-karmelite...,"Bänke und ""Grün"" im neu gestalteten Bereich Karmelitenstraße/Vierröhr...",Zukunftskonzepte für die Innenstadt,Die Baustelle von der Karmelitenstraße zum Vierröhrenbrunnen wurde vor kurzem abgeschlossen. Die...,Ccmuet,0.0,12.0,Wuerzburg


In [132]:
all_projects_df.head()

Unnamed: 0,Project URL,Project Title,Project Description,Proposal Count,City
0,https://wuerzburg-mitmachen.de/grombuehl-zukunftssicher,Energetisches Quartierskonzept für Grombühl,"Grombühl 2040 - ein SzenarioDie Straßen Grombühls sind grüner, ruhiger und voller Leben. Das „Qu...",0,Wuerzburg
1,https://wuerzburg-mitmachen.de/mobilitaetsplan,Mobilitätsplan 2040,Mobilitätsplan 2040 für die Stadt Würzburg: Jetzt mitmachen!Aktuell erstellt die Stadt Würzburg ...,0,Wuerzburg
2,https://wuerzburg-mitmachen.de/zukunftsregion,Zukunftsregion Würzburg,Zukunftsregion Würzburg: Jetzt aktiv mitgestalten!Die Stadt und der Landkreis Würzburg wollen ih...,0,Wuerzburg
3,https://wuerzburg-mitmachen.de/zukunftskonzepte-fuer-die-innenstadt,Zukunftskonzepte für die Innenstadt,"Wie soll die Würzburger Innenstadt von morgen aussehen? Was wünschen sich Bürger:innen, Einzelhä...",24,Wuerzburg
4,https://wuerzburg-mitmachen.de/klimaanpassung,Klimaanpassung,Klimaanpassungsstrategie für die Stadt Würzburg: Jetzt mitmachen!Würzburg - Seit Anfang 2024 era...,14,Wuerzburg


#### BurgerBudgets in Jena (2024, 23, 22)

In [146]:
df.to_csv('siegburg_data.csv', index=False)

In [147]:
df_comments

Username
Lars Löw           34
klaus.kleiner77    32
Der,wo             31
PM                 18
Klaus.kleiner77    16
                   ..
Julius Kuhn         1
Juliane88           1
Juliane Fuchs       1
Julian Sing         1
🐙                   1
Name: count, Length: 387, dtype: int64

### Additional cleaaning and structuring for Sieburg (review if it's needed) !

In [133]:
# import re

# # Enhanced function to extract all logical parts, including "Unterstützer*innen"
# def extract_full_data_with_supporters(content):
#     # Extract title (everything before the first date)
#     title_match = re.search(r'^(.*?)(\r|\d{1,2}\.\s\w+\s\d{4})', content)
#     title = title_match.group(1).strip() if title_match else None

#     # Extract date
#     date_match = re.search(r'\d{1,2}\.\s\w+\s\d{4}', content)
#     date = date_match.group(0) if date_match else None

#     # Extract comments count
#     comments_match = re.search(r'(\d+)\sKommentare', content)
#     comments = int(comments_match.group(1)) if comments_match else 0

#     # Extract tags (sections with numbers or + signs)
#     tags_match = re.findall(r'(\d{1,2}[-+]\d{1,2}|\d{2}\+)', content)
#     tags = ', '.join(tags_match) if tags_match else None

#     # Extract description (everything after "Geselliges Beisammensein" or similar patterns)
#     description_start = re.search(r'(Geselliges Beisammensein|Angebotslandkarte)', content)
#     description = content[description_start.start():].strip() if description_start else None

#     # Extract username
#     username_match = re.search(r'(\w+\s\w+|Beigetreten am:.*?\d{4})', content)
#     username = username_match.group(1).split('Beigetreten am:')[0].strip() if username_match else None

#     # Extract Vorschläge count
#     vorschlaege_match = re.search(r'Vorschläge(\d+)', content)
#     vorschlaege = int(vorschlaege_match.group(1)) if vorschlaege_match else 0

#     # Extract Konto verification status
#     konto_match = re.search(r'(Konto\s(verifiziert|ist nicht verifiziert))', content)
#     konto_status = konto_match.group(2) if konto_match else None

#     # # Extract registration date
#     # registration_match = re.search(r'Beigetreten am:\s(\d{1,2}\.\s\w+\s\d{4})', content)
#     # registration_date = registration_match.group(1) if registration_match else None

#     # Extract number of Unterstützer*innen
#     supporters_match = re.search(r'(\d+)\sUnterstützer\*in', content)
#     supporters = int(supporters_match.group(1)) if supporters_match else 0

#     return title, date, comments, tags, description, username, vorschlaege, konto_status, supporters

# # Apply the enhanced function to the DataFrame and create new columns
# df_sieburg[['Title', 'Date', 'Comments', 'Tags', 'Description', 'Username', 'Vorschläge', 'Konto Status', 'Supporters']] = df_sieburg['Content'].apply(
#     lambda x: pd.Series(extract_full_data_with_supporters(x))
# )


# # Function to clean description considering keywords, numeric patterns, and refined starting logic
# def clean_description_advanced(content):
#     # Define keywords that mark the beginning of the description
#     keywords = [
#         'Geselliges Beisammensein', 'Natur', 'Hilfe & Beratung', 'Bildung', 
#         'Musik', 'Bewegung', 'Glaube', 'Kulinarisches', 'Kunst & Kultur', 'Sonstiges',
#     ]
    
#     # Check for keywords first
#     for keyword in keywords:
#         if keyword in content:
#             start_idx = content.find(keyword) + len(keyword)
#             description = content[start_idx:].strip()
#             description = re.split(r'(Kommentare\(.*?\)|registrieren)', description)[0].strip()
#             return description

#     # If no keyword is found, check for numeric patterns like "18-24, 25-49, etc."
#     numeric_pattern = re.search(r'(\d{1,2}[-+]\d{1,2}|\d{2}\+)', content)
#     if numeric_pattern:
#         start_idx = numeric_pattern.end()
#         description = content[start_idx:].strip()
#         description = re.split(r'(Kommentare\(.*?\)|registrieren)', description)[0].strip()
#         return description

#     # As a fallback, find the first capital letter, quote, or digit to mark the start
#     fallback_match = re.search(r'[A-Z"0-9]', content)
#     if fallback_match:
#         start_idx = fallback_match.start()
#         description = content[start_idx:].strip()
#         description = re.split(r'(Kommentare\(.*?\)|registrieren)', description)[0].strip()
#         return description

#     # If nothing works, return the content as is
#     return content

# # Apply the advanced cleaning function to the Description column
# df_sieburg['Description'] = df_sieburg['Content'].apply(clean_description_advanced)
