# Web Scraper for 19 Cities

This scraper extracts and organizes data into three main DataFrames:
1. **`all_projects_df`**: Contains all projects from the websites.
   - Columns: `Project URL`, `Project Title`, `Project Description`, `Proposal Count`, `City`

2. **`all_proposals_df`**: Contains all proposals under projects.
   - Columns: `URL`, `Title`, `Proposed for Project`, `Description`, `Author`, `Comments`, `Supporters`, `City`

3. **`all_comments_df`**: Contains all comments under projects and proposals.
   - Columns: `URL`, `Project`, `Text`, `Author`, `Likes`, `Dislikes`, `Date`, `City`


In [18]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# # Updated function to extract proposals from a project page
# def extract_proposals(soup, base_url):
#     proposals = []
#     proposal_items = soup.find_all('div', class_='resource-item proposal-list-item')
#     # here we are looking for all the proposals in the page with the class 'resource-item proposal-list-item'

#     # For each proposal item, we loop through and extract the relevant information
#     for proposal in proposal_items:
#         # Extract title by finding the class 'resource-item--title' and getting the text
#         title_tag = proposal.find('a', class_='resource-item--title')
#         title = title_tag.get_text(strip=True) if title_tag else None

#         # Extract URL by finding the class 'resource-item--title' and getting the 'href' attribute (because title is a link)
#         url = base_url + title_tag['href'] if title_tag and 'href' in title_tag.attrs else None

#         # Extract description by finding the class 'resource-item--description' and getting the text
#         description_tag = proposal.find('div', class_='resource-item--description')
#         description = description_tag.get_text(strip=True) if description_tag else None

#         # Extract author/username by finding the class 'resource-item--author' and getting the text
#         author_tag = proposal.find('a', class_='resource-item--author')
#         author = author_tag.get_text(strip=True) if author_tag else None

#         # Extract number of comments by finding the class 'comments' and getting the text (span is almost like a div, but inline)
#         comments_tag = proposal.find('span', class_='comments')
#         comments = int(comments_tag.get_text(strip=True).split()[0]) if comments_tag else 0

#         # Extract number of supporters by finding the class 'total-supports' and getting the text
#         supporters_tag = proposal.find('span', class_='total-supports')
#         supporters = int(supporters_tag.get_text(strip=True).split()[0]) if supporters_tag else 0

#         # Extract parent project name by finding the class 'breadcrumbs-item' and getting the text
#         project_tag = proposal.find('a', class_='breadcrumbs-item')
#         proposed_for_project = project_tag.get_text(strip=True) if project_tag else None

#         # Append the extracted information to the proposals list. Proposals are dictionaries with keys and values
#         proposals.append({
#             'URL': url,
#             'Title': title,
#             'Proposed for Project': proposed_for_project,
#             'Description': description,
#             'Author': author,
#             'Comments': comments,
#             'Supporters': supporters,
#         })
#     return proposals


# # Function to extract city name from the base URL
# def extract_city_name(base_url):
#     # Words to remove from the city name, because URL is not always clean city name and may contain extra words (e.g. mitmachen)
#     remove_words = ['mitmachen', 'Mitmachen', 'mitwirken', 'Smarte', 'region', 'unser', 'mitgestalten', 'gestalten', 'machmit', 'dialog', 'consul', 'www', 'de', 'https', 'com']

#     # Split the URL into parts (by '.' or '/'), because city name is usually the first relevant part
#     parts = base_url.replace('https://', '').replace('http://', '').split('.')
#     all_parts = [part.split('/')[0] for part in parts]  # Handle cases where "/" exists after domain

#     # Remove known unwanted words and empty strings 
#     filtered_parts = [part for part in all_parts if part.lower() not in remove_words and part]

#     # Return the first relevant part (assumes city name is left after filtering) 
#     city = filtered_parts[0].replace('-', ' ').capitalize() if filtered_parts else "Unknown"

#     # Because city name is not always first part, we get it by removing all other words (they are usually similar in all URLs)
#     for word in remove_words:
#         city = city.replace(word, '')

#     return city.strip().capitalize()


# # Here is the function that scrapes the project page and extracts the project title, description and proposals 
# def scrape_project_page_with_proposals(url, base_url, city):

#     # In that block we request the page and parse it with BeautifulSoup, getting soup object, which contains all the HTML content from the given URL
#     response = requests.get(url)
#     if response.status_code != 200:
#         print(f"Failed to load project page: {url}")
#         return None, []

#     soup = BeautifulSoup(response.content, 'html.parser')

    

#     # Extract project title by finding the 'title' tag and getting the text
#     title_tag = soup.find('title')
#     project_title = title_tag.get_text(strip=True) if title_tag else None

#     # Extract project description by finding the 'div' tag with class 'flex-layout' and getting the text
#     content_div = soup.find('div', class_='flex-layout')
#     description = content_div.get_text(strip=True) if content_div else None

#     # Extract proposals by calling the extract_proposals function, which we defined earlier
#     proposals = extract_proposals(soup, base_url=base_url)

#     # Return a dictionary with the extracted information and the list of proposals
#     return {
#         'Project URL': url,
#         'Project Title': project_title,
#         'Project Description': description,
#         'Proposal Count': len(proposals),
#     }, proposals


# # Here is the function that scrapes the main projects page and extracts the project URLs. This function is "main" because it calls the scrape_project_page_with_proposals function, which calls the extract_proposals function.
# # So the scheme is: scrape_projects_with_proposals(to get project URLs) -> scrape_project_page_with_proposals(to get project data and proposals) -> extract_proposals(to get proposals)
# def scrape_projects_with_proposals(main_url, base_url):

#     # Here we again request the page and parse it with BeautifulSoup, getting soup object, which contains all the HTML content from the given URL
#     response = requests.get(main_url)
#     if response.status_code != 200:
#         print(f"Failed to load main projects page: {main_url}")
#         return pd.DataFrame(), pd.DataFrame()

#     soup = BeautifulSoup(response.content, 'html.parser')

#     # Find all project links by finding the 'a' tag with class 'resource-item--title' and getting the 'href' attribute
#     links = soup.find_all('a', class_='resource-item--title')
#     project_links = [base_url + link['href'] for link in links if 'href' in link.attrs]

#     projects = []
#     all_proposals = []

#     # For each project link, call the scrape_project_page_with_proposals function to get the project data and proposals. Try/except block is used to catch any errors that may occur during scraping

#     for project_url in project_links:
#         try:
#             project_data, proposals = scrape_project_page_with_proposals(
#                 project_url, base_url, extract_city_name(base_url)
#             )
#             if project_data:
#                 projects.append(project_data)
#                 all_proposals.extend(proposals)
#         except Exception as e:
#             print(f"Error scraping project at {project_url}: {e}")

#     return pd.DataFrame(projects), pd.DataFrame(all_proposals)

# # List of websites to scrape (19 websites with similar structure)
# websites = [
#     {"main_url": "https://wuerzburg-mitmachen.de/projekts", "base_url": "https://wuerzburg-mitmachen.de"},
#     {"main_url": "https://mitmachen.siegburg.de/projekts", "base_url": "https://mitmachen.siegburg.de"}, 
#     {"main_url": "https://mitmachen.jena.de/projekts", "base_url": "https://mitmachen.jena.de"},
#     {"main_url": "https://mitmachgemeinde.de/projekts", "base_url": "https://mitmachgemeinde.de"},
#     {"main_url": "https://bamberg-gestalten.de/projekts", "base_url": "https://bamberg-gestalten.de"},
#     {"main_url": "https://mitmachen-pforzheim.de/projekts", "base_url": "https://mitmachen-pforzheim.de"},
#     {"main_url": "https://bochum-mitgestalten.de/projekts", "base_url": "https://bochum-mitgestalten.de"},
#     {"main_url": "https://unser.muenchen.de/projekts", "base_url": "https://unser.muenchen.de"},
#     {"main_url": "https://mitreden.ilzerland.bayern/projekts", "base_url": "https://mitreden.ilzerland.bayern"},
#     {"main_url": "https://stutensee-mitwirken.de/projekts", "base_url": "https://stutensee-mitwirken.de"},
#     {"main_url": "https://consul.unterschleissheim.de/projekts", "base_url": "https://consul.unterschleissheim.de"},
#     {"main_url": "https://machmit.kempten.de/projekts", "base_url": "https://machmit.kempten.de"},
#     {"main_url": "https://consul.detmold-mitgestalten.de/projekts", "base_url": "https://consul.detmold-mitgestalten.de"},
#     {"main_url": "https://flensburg-mitmachen.de/projekts", "base_url": "https://flensburg-mitmachen.de"},  # Fixed URL
#     {"main_url": "https://mitmachen.amberg.de/projekts", "base_url": "https://mitmachen.amberg.de"},
#     {"main_url": "https://mitmachen.smarte-region-linz.de/projekts", "base_url": "https://mitmachen.smarte-region-linz.de"},
#     {"main_url": "https://mitgestalten.trier.de/projekts", "base_url": "https://mitgestalten.trier.de"},
#     {"main_url": "https://machmit.augsburg.de/projekts", "base_url": "https://machmit.augsburg.de"}
# ]


# # Initialize empty DataFrames for all projects and proposals 
# all_projects_df = pd.DataFrame()
# all_proposals_df = pd.DataFrame()

# # Main loop to scrape all websites in the list
# for site in websites:
#     main_url = site["main_url"]
#     base_url = site["base_url"]

#     city = extract_city_name(base_url)

#     try:
#         # Scrape projects and proposals
#         projects_df, proposals_df = scrape_projects_with_proposals(main_url, base_url)

#         # Add a 'City' column to all DataFrames
#         projects_df['City'] = city
#         proposals_df['City'] = city

#         # Append results to the combined DataFrames
#         all_projects_df = pd.concat([all_projects_df, projects_df], ignore_index=True)
#         all_proposals_df = pd.concat([all_proposals_df, proposals_df], ignore_index=True)
#     except Exception as e:
#         print(f"Failed to scrape {main_url} - {e}")


# # Save the results to CSV files
# all_projects_df.to_csv('data/all_projects.csv', index=False)
# all_proposals_df.to_csv('data/all_proposals.csv', index=False)
# 

In [19]:
all_projects_df = pd.read_csv('all_projects.csv')
all_proposals_df = pd.read_csv('all_proposals.csv')
all_comments_df = pd.read_csv('all_comments.csv')

In [20]:
all_proposals_df['City'].value_counts()

City
Muenchen             120
Siegburg              96
Amberg                79
Detmold               76
Kempten               69
Mitren                55
Wuerzburg             39
Pforzheim             35
Bamberg               32
Unterschleissheim     27
Trier                 20
Augsburg              17
Name: count, dtype: int64

In [21]:
all_projects_df.head()

Unnamed: 0,Project URL,Project Title,Project Description,Proposal Count,City
0,https://wuerzburg-mitmachen.de/grombuehl-zukun...,Energetisches Quartierskonzept für Grombühl,Grombühl 2040 - ein SzenarioDie Straßen Grombü...,0,Wuerzburg
1,https://wuerzburg-mitmachen.de/mobilitaetsplan,Mobilitätsplan 2040,Mobilitätsplan 2040 für die Stadt Würzburg: Je...,0,Wuerzburg
2,https://wuerzburg-mitmachen.de/zukunftsregion,Zukunftsregion Würzburg,Zukunftsregion Würzburg: Jetzt aktiv mitgestal...,0,Wuerzburg
3,https://wuerzburg-mitmachen.de/zukunftskonzept...,Zukunftskonzepte für die Innenstadt,Wie soll die Würzburger Innenstadt von morgen ...,24,Wuerzburg
4,https://wuerzburg-mitmachen.de/klimaanpassung,Klimaanpassung,Klimaanpassungsstrategie für die Stadt Würzbur...,14,Wuerzburg


#### BurgerBudgets in Jena (2024, 23, 22)

In [22]:
# # URLs for the budgets
# budget_urls = {
#     2024: "https://mitmachen.jena.de/buergerbudget",
#     2023: "https://mitmachen.jena.de/buergerbudget-2023",
#     2022: "https://mitmachen.jena.de/buergerbudget-2022"
# }

# # Updated function to scrape and clean a budget table for a given year
# def scrape_and_clean_budget_table(url, year):
#     response = requests.get(url)
#     if response.status_code != 200:
#         print(f"Failed to load URL: {url}")
#         return None
    
#     soup = BeautifulSoup(response.content, 'html.parser')
#     table = soup.find('table', id='budget-investments-compatible')  # Locate the table by its ID
    
#     if not table:
#         print(f"No table found for URL: {url}")
#         return None
    
#     # Extract the total available budget for the year (last <th> in <thead>)
#     available_budget_tag = table.find('thead').find_all('th')[-1]  # Find the last <th>
#     available_budget = (
#         float(re.sub(r'[^\d.]', '', available_budget_tag.get_text(strip=True))) * 1000
#         if available_budget_tag else None
#     )
    
#     # Extract table headers
#     headers = [th.get_text(strip=True) for th in table.find('thead').find_all('th')]
    
#     # Extract table rows
#     rows = []
#     for tr in table.find('tbody').find_all('tr'):
#         # Extract row cells
#         cells = [td.get_text(strip=True) for td in tr.find_all('td')]
        
#         # Check the class of the <tr> tag for "success" or "discarded"
#         approved = 1 if 'success' in tr.get('class', []) else 0
        
#         # Append cells and approval status
#         rows.append(cells + [approved])
    
#     # Add "Approved" column to the headers
#     headers.append('Approved')
    
#     # Create a DataFrame
#     df = pd.DataFrame(rows, columns=headers)
#     df['Year'] = year  # Add a 'Year' column
#     df['Available Budget'] = available_budget  # Add the total budget for the year to every row
#     return df

# # Scrape and clean tables for all years
# budget_dataframes = [
#     scrape_and_clean_budget_table(url, year) for year, url in budget_urls.items()
# ]

# # Combine all dataframes into one
# budget_jena_df = pd.concat(budget_dataframes, ignore_index=True)

# # Clean and transform the DataFrame
# budget_jena_df['Preis'] = budget_jena_df['Preis'].str.extract(r'(\d+)').astype(float) * 1000
# budget_jena_df['Stimmen'] = budget_jena_df['Stimmen'].str.extract(r'(\d+)').astype(int)

# # Rename columns to English
# budget_jena_df.rename(columns={
#     'Vorschlag Titel': 'Proposal Title',
#     'Stimmen': 'Votes',
#     'Preis': 'Price',
#     'Year': 'Year',
#     'Available Budget': 'Budget for this year',
#     'Approved': 'Approved'
# }, inplace=True)

# # Drop unnecessary columns if any remain
# budget_jena_df = budget_jena_df.loc[:, ~budget_jena_df.columns.str.contains('VerfügbareBudgetmittel', na=False)]



# Comments Scraper

This scraper extracts comments for all projects from the `all_projects_df` DataFrame and organizes them into a structured DataFrame:

1. **`df_comments`**: Contains all comments associated with projects.
   - Columns:
     - `URL`: The URL of the project the comment is associated with.
     - `Project`: The title of the project the comment is associated with.
     - `City`: The city the project belongs to (extracted from the URL).
     - `Text`: The content of the comment.
     - `Username`: The name of the user who posted the comment.
     - `Date`: The date the comment was posted.
     - `Likes`: The number of likes the comment received.
     - `Dislikes`: The number of dislikes the comment received.
     - `Total Votes`: The total votes (likes + dislikes) the comment received.


In [23]:
# # Good scraper for comments (748 entries comments from 19 cities)

import requests
from bs4 import BeautifulSoup
import re

# # Here is the function to extract comments from a single page
# def extract_comments_from_page(soup):

#     # Initialize an empty list to store the comments and look for the comments section div with class 'comment small-12' (all the comment blocks in similar websites have this class)
#     comments_data = []
#     comments_section = soup.find_all('div', class_='comment small-12')
    
#     for comment in comments_section:
#         # Extract comment text (clean and remove extra whitespace) by finding the first 'p' tag inside the comment div
#         comment_text = comment.find('p').get_text(strip=True) if comment.find('p') else None
        
#         # Extract username by finding the 'span' tag with class 'user-name' inside the comment div
#         username_tag = comment.find('span', class_='user-name')
#         username = username_tag.get_text(strip=True) if username_tag else None
        
#         # Extract date by finding the last 'a' tag inside the 'div' tag with class 'comment-info' (last link is the date)
#         date_tag = comment.find('div', class_='comment-info').find_all('a')[-1]
#         date = date_tag.get_text(strip=True) if date_tag else None

        
#         # Extract likes and dislikes (clean and convert to integer) if they exist by finding the 'span' tags with class 'in-favor' and 'against'
#         likes_tag = comment.find('span', class_='in-favor')
#         likes = int(re.sub(r'\D', '', likes_tag.get_text(strip=True))) if likes_tag else 0
        
#         dislikes_tag = comment.find('span', class_='against')
#         dislikes = int(re.sub(r'\D', '', dislikes_tag.get_text(strip=True))) if dislikes_tag else 0
        
#         # Extract total votes (clean and convert to integer), it was the easiest way to get the total votes
#         total_votes = likes + dislikes
        
#         # Append the extracted information to the comments_data list
#         comments_data.append({
#             'Text': comment_text,
#             'Username': username,
#             'Date': date,
#             'Likes': likes,
#             'Dislikes': dislikes,
#             'Total Votes': total_votes
#         })
#     return comments_data


# # Function to extract city name from the base URL to add column 'City' to the comments DataFrame
# def extract_city_name(base_url):
#     # Words to remove from the city name (most URLs have similar structure and contain these words)
#     remove_words = ['mitmachen', 'Mitmachen', 'mitwirken', 'Smarte', 'region', 'unser', 'mitgestalten', 'gestalten', 'machmit', 'dialog', 'consul', 'www', 'de', 'https', 'com']

#     # Split the URL into parts (by '.' or '/')
#     parts = base_url.replace('https://', '').replace('http://', '').split('.')
#     all_parts = [part.split('/')[0] for part in parts]  # Handle cases where "/" exists after domain

#     # Remove known unwanted words and empty strings
#     filtered_parts = [part for part in all_parts if part.lower() not in remove_words and part]

#     # Return the first relevant part (assumes city name is left after filtering)
#     city = filtered_parts[0].replace('-', ' ').capitalize() if filtered_parts else "Unknown"

#     # Remove unwanted words from city name
#     for word in remove_words:
#         city = city.replace(word, '')
    

#     return city.strip().capitalize()


# # Scrape all comments from a paginated URL (e.g., https://example.com/comments?page=1), stop when no comments are found. So basically, this function scrapes all comments from all pages of a project
# def scrape_all_comments(base_url):
#     comments = []
#     page = 1
    
#     # This while loop will continue until there are no comments on the page (extract_comments_from_page returns an empty list)
#     while True:
#         paginated_url = f"{base_url}?page={page}" if page > 1 else base_url
#         response = requests.get(paginated_url)
        
#         if response.status_code != 200:
#             print(f"Failed to load page {page} for URL: {base_url}")
#             break
        
#         soup = BeautifulSoup(response.content, 'html.parser')
#         comments_on_page = extract_comments_from_page(soup)
        
#         if not comments_on_page:  # Stop if no comments on the page (extract_comments_from_page returns an empty list)
#             break
        
#         # Extend the comments list with the comments from the current page and increment the page number
#         comments.extend(comments_on_page)
#         page += 1

#     return comments

# # Function to scrape the main content and comments for each URL 
# # This function is mainly used to scrape the main content and comments for all project URLs and call the scrape_all_comments function to get all comments for each project
# # So usage scheme is: 
# # scrape_content_and_comments(to get main content and comments) -> scrape_all_comments(to get all comments for each project) 
# # -> extract_comments_from_page(to get comments from a single page) 
# # -> extract_city_name(to get city name from URL) 
# # -> form the final DataFrame with comments

# def scrape_content_and_comments(urls):
#     data = []
    
#     for url in urls:
#         response = requests.get(url)
#         if response.status_code != 200:
#             print(f"Failed to load URL: {url}")
#             continue
        
#         soup = BeautifulSoup(response.content, 'html.parser')
        
#         # Scrape main content (title and description) by finding the 'title' tag and the 'div' tag with class 'flex-layout'
#         title = soup.find('title').get_text(strip=True) if soup.find('title') else None
#         content_div = soup.find('div', class_='flex-layout')
#         content = content_div.get_text(strip=True) if content_div else None
        
#         # Scrape comments by calling the scrape_all_comments function
#         comments = scrape_all_comments(url)
        
#         # Append the extracted information to the data list
#         data.append({
#             'URL': url,
#             'Title': title,
#             'Content': content,
#             'Comments': comments
#         })
    
#     return data

# # Scrape comments for all project URLs from all_projects_df (created in the previous step)
# urls = all_projects_df['Project URL'].tolist()  # Use the 'Project URL' column from all_projects_df
# scraped_data = scrape_content_and_comments(urls)

# # Create structured DataFrame for comments
# comments_data = []

# # Loop through the scraped data and extract comments, link them to the project URL and extract the city name
# for item in scraped_data:
#     for comment in item['Comments']:
#         comment['URL'] = item['URL']  # Link comment to the project URL
#         # Extract city name from URL
#         city = extract_city_name(item['URL'])
#         comment['City'] = city
#         comments_data.append(comment)

# # Create the comments DataFrame
# df_comments = pd.DataFrame(comments_data)

# # Create a mapping from URL to Project Title (mapping means that we can use the URL to get the Project Title)
# url_to_title = all_projects_df.set_index('Project URL')['Project Title'].to_dict()

# # Add a 'Project' column to df_comments using the mapping (again method map is working like that: it takes the URL and returns the Project Title)
# df_comments['Project'] = df_comments['URL'].map(url_to_title)
# df_comments = df_comments[['URL', 'Project', 'City'] + [col for col in df_comments.columns if col not in ['URL', 'Project', 'City']]]

# # Save the comments DataFrame to a CSV file

# df_comments.to_csv('data/all_comments.csv', index=False)



In [24]:
all_comments_df['Total Votes'].max()

63

### Additional cleaaning and structuring for Sieburg (review if it's needed) !

In [25]:
# import re

# # Enhanced function to extract all logical parts, including "Unterstützer*innen"
# def extract_full_data_with_supporters(content):
#     # Extract title (everything before the first date)
#     title_match = re.search(r'^(.*?)(\r|\d{1,2}\.\s\w+\s\d{4})', content)
#     title = title_match.group(1).strip() if title_match else None

#     # Extract date
#     date_match = re.search(r'\d{1,2}\.\s\w+\s\d{4}', content)
#     date = date_match.group(0) if date_match else None

#     # Extract comments count
#     comments_match = re.search(r'(\d+)\sKommentare', content)
#     comments = int(comments_match.group(1)) if comments_match else 0

#     # Extract tags (sections with numbers or + signs)
#     tags_match = re.findall(r'(\d{1,2}[-+]\d{1,2}|\d{2}\+)', content)
#     tags = ', '.join(tags_match) if tags_match else None

#     # Extract description (everything after "Geselliges Beisammensein" or similar patterns)
#     description_start = re.search(r'(Geselliges Beisammensein|Angebotslandkarte)', content)
#     description = content[description_start.start():].strip() if description_start else None

#     # Extract username
#     username_match = re.search(r'(\w+\s\w+|Beigetreten am:.*?\d{4})', content)
#     username = username_match.group(1).split('Beigetreten am:')[0].strip() if username_match else None

#     # Extract Vorschläge count
#     vorschlaege_match = re.search(r'Vorschläge(\d+)', content)
#     vorschlaege = int(vorschlaege_match.group(1)) if vorschlaege_match else 0

#     # Extract Konto verification status
#     konto_match = re.search(r'(Konto\s(verifiziert|ist nicht verifiziert))', content)
#     konto_status = konto_match.group(2) if konto_match else None

#     # # Extract registration date
#     # registration_match = re.search(r'Beigetreten am:\s(\d{1,2}\.\s\w+\s\d{4})', content)
#     # registration_date = registration_match.group(1) if registration_match else None

#     # Extract number of Unterstützer*innen
#     supporters_match = re.search(r'(\d+)\sUnterstützer\*in', content)
#     supporters = int(supporters_match.group(1)) if supporters_match else 0

#     return title, date, comments, tags, description, username, vorschlaege, konto_status, supporters

# # Apply the enhanced function to the DataFrame and create new columns
# df_sieburg[['Title', 'Date', 'Comments', 'Tags', 'Description', 'Username', 'Vorschläge', 'Konto Status', 'Supporters']] = df_sieburg['Content'].apply(
#     lambda x: pd.Series(extract_full_data_with_supporters(x))
# )


# # Function to clean description considering keywords, numeric patterns, and refined starting logic
# def clean_description_advanced(content):
#     # Define keywords that mark the beginning of the description
#     keywords = [
#         'Geselliges Beisammensein', 'Natur', 'Hilfe & Beratung', 'Bildung', 
#         'Musik', 'Bewegung', 'Glaube', 'Kulinarisches', 'Kunst & Kultur', 'Sonstiges',
#     ]
    
#     # Check for keywords first
#     for keyword in keywords:
#         if keyword in content:
#             start_idx = content.find(keyword) + len(keyword)
#             description = content[start_idx:].strip()
#             description = re.split(r'(Kommentare\(.*?\)|registrieren)', description)[0].strip()
#             return description

#     # If no keyword is found, check for numeric patterns like "18-24, 25-49, etc."
#     numeric_pattern = re.search(r'(\d{1,2}[-+]\d{1,2}|\d{2}\+)', content)
#     if numeric_pattern:
#         start_idx = numeric_pattern.end()
#         description = content[start_idx:].strip()
#         description = re.split(r'(Kommentare\(.*?\)|registrieren)', description)[0].strip()
#         return description

#     # As a fallback, find the first capital letter, quote, or digit to mark the start
#     fallback_match = re.search(r'[A-Z"0-9]', content)
#     if fallback_match:
#         start_idx = fallback_match.start()
#         description = content[start_idx:].strip()
#         description = re.split(r'(Kommentare\(.*?\)|registrieren)', description)[0].strip()
#         return description

#     # If nothing works, return the content as is
#     return content

# # Apply the advanced cleaning function to the Description column
# df_sieburg['Description'] = df_sieburg['Content'].apply(clean_description_advanced)


# Generating a report

### 📂 Load Dependencies & Data 📊

In [26]:
# 📌 Import necessary libraries
import os
import requests
import pandas as pd
import json
import spacy
import matplotlib.pyplot as plt
from collections import Counter
from wordcloud import WordCloud
from textblob import TextBlob
from fpdf import FPDF

# 🔑 Set your Hugging Face API key
HF_MODEL = "tiiuae/falcon-7b-instruct"  # Alternative: "mistralai/Mistral-7B-Instruct"

# Load German NLP model
nlp = spacy.load("de_core_news_sm")  # Ensure this model is installed on your machine

# 📌 Load datasets
comments_df = pd.read_csv("all_comments.csv")
projects_df = pd.read_csv("all_projects.csv")
proposals_df = pd.read_csv("all_proposals.csv")

# 📌 Filter data for Jena
jena_comments = comments_df[comments_df["City"] == "Jena"].copy()
jena_projects = projects_df[projects_df["City"] == "Jena"].copy()
jena_proposals = proposals_df[proposals_df["City"] == "Jena"].copy()

print(f"✅ Data Loaded Successfully! Jena Comments: {len(jena_comments)}, Projects: {len(jena_projects)}, Proposals: {len(jena_proposals)}")


✅ Data Loaded Successfully! Jena Comments: 314, Projects: 19, Proposals: 0


### 📊 Data Analysis & Insights 🔍

In [27]:
# 🔹 1. Most Active Users
most_active_users = jena_comments["Username"].value_counts().head(10)

# 🔹 2. Most Commented Projects
most_commented_projects = jena_comments["Project"].value_counts().head(5)

# 🔹 3. Sentiment Analysis
jena_comments.loc[:, "Sentiment"] = jena_comments["Text"].apply(lambda x: TextBlob(str(x)).sentiment.polarity)
jena_comments.loc[:, "Sentiment Category"] = jena_comments["Sentiment"].apply(lambda x: "Positive" if x > 0 else ("Negative" if x < 0 else "Neutral"))
sentiment_counts = jena_comments["Sentiment Category"].value_counts()

print("✅ Data Analysis Completed!")
print("\nMost Active Users:\n", most_active_users)
print("\nMost Commented Projects:\n", most_commented_projects)
print("\nSentiment Distribution:\n", sentiment_counts)

# 🔹 1. Most Discussed Topics Using spaCy (More accurate German processing)
def extract_keywords(text):
    doc = nlp(str(text))
    keywords = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]
    return keywords

jena_comments["Keywords"] = jena_comments["Text"].apply(extract_keywords)
all_keywords = [keyword for keywords in jena_comments["Keywords"] for keyword in keywords]
keyword_counts = Counter(all_keywords).most_common(20)

# 🔹 2. Most Upvoted & Downvoted Comments
most_liked_comments = jena_comments.nlargest(10, "Likes")[["Username", "Text", "Likes"]]
most_disliked_comments = jena_comments.nlargest(10, "Dislikes")[["Username", "Text", "Dislikes"]]

# 🔹 3. Peak Commenting Time (Hourly & Daily Trends)
jena_comments["Date"] = pd.to_datetime(jena_comments["Date"], errors="coerce")
jena_comments["Hour"] = jena_comments["Date"].dt.hour
jena_comments["Day"] = jena_comments["Date"].dt.dayofweek

hourly_distribution = jena_comments["Hour"].value_counts().sort_index()
daily_distribution = jena_comments["Day"].value_counts().sort_index()

# 🔹 5. Most Supported Proposals
most_supported_proposals = proposals_df.nlargest(10, "Supporters")[["Title", "Supporters", "City"]]

# 🔹 6. Projects with the Most Proposals
most_proposals_per_project = projects_df.nlargest(10, "Proposal Count")[["Project Title", "Proposal Count", "City"]]

# 🔹 7. Most Controversial Topics (High disagreement: likes vs. dislikes ratio)
jena_comments["Like-Dislike Ratio"] = jena_comments["Likes"] / (jena_comments["Dislikes"] + 1)
most_controversial_comments = jena_comments.nlargest(10, "Like-Dislike Ratio")[["Username", "Text", "Likes", "Dislikes", "Like-Dislike Ratio"]]

# 🔹 8. Users with the Most Repeated Engagement (Users commenting on multiple projects)
user_project_counts = jena_comments.groupby("Username")["Project"].nunique().sort_values(ascending=False).head(10)

# 🔹 10. Average Engagement Per Project (Number of comments per project)
average_comments_per_project = jena_comments.groupby("Project").size().mean()

# 🔹 Display Insights
insights = {
    "Most Discussed Topics": keyword_counts,
    "Most Liked Comments": most_liked_comments.to_dict(orient="records"),
    "Most Disliked Comments": most_disliked_comments.to_dict(orient="records"),
    "Peak Commenting Hours": hourly_distribution.to_dict(),
    "Peak Commenting Days": daily_distribution.to_dict(),
    "Most Supported Proposals": most_supported_proposals.to_dict(orient="records"),
    "Projects with Most Proposals": most_proposals_per_project.to_dict(orient="records"),
    "Most Controversial Comments": most_controversial_comments.to_dict(orient="records"),
    "Users with Most Repeated Engagement": user_project_counts.to_dict(),
    "Average Comments Per Project": [average_comments_per_project]  # Convert float to list
}

# Print key insights
print("\n✅ Enhanced Insights Generated!")
print("📌 Most Discussed Topics (Top 10):", keyword_counts[:10])
print("\n📌 Most Liked Comment:", most_liked_comments.iloc[0] if not most_liked_comments.empty else "No data")
print("\n📌 Most Controversial Comment:", most_controversial_comments.iloc[0] if not most_controversial_comments.empty else "No data")

# Save insights as a CSV file
insights_df = pd.DataFrame.from_dict(insights, orient='index')
insights_df.to_csv("jena_insights.csv")

print("\n📂 Insights saved as 'jena_insights.csv'")

import ast  # To safely evaluate strings as Python objects

# Function to clean and convert stringified data
def clean_data(value):
    if pd.isna(value) or value == "nan":
        return "Keine Daten verfügbar"  # German for "No data available"
    try:
        parsed_value = ast.literal_eval(value)  # Convert string to tuple/dict if applicable
        if isinstance(parsed_value, tuple):
            return f"{parsed_value[0]} ({parsed_value[1]})"  # Format tuple nicely
        elif isinstance(parsed_value, dict):
            return f"{parsed_value.get('Username', 'Unknown')}: {parsed_value.get('Text', '')}"
        return str(parsed_value)  # Convert any other format to string
    except (ValueError, SyntaxError):
        return str(value)  # Return as-is if it can't be parsed

# Apply cleaning function to all values in the DataFrame
insights_df = insights_df.applymap(clean_data)

# Drop columns with too many missing values (threshold: 80% empty)
insights_df.dropna(axis=1, thresh=int(len(insights_df) * 0.2), inplace=True)

# Convert dataframe to readable string format
insights_summary = insights_df.to_string(index=False)



✅ Data Analysis Completed!

Most Active Users:
 Username
klaus.kleiner77     32
PM                  18
Klaus.kleiner77     16
pfingstochse78      15
Brabax              12
Klaus.Kleiner       12
Ronon               11
Klaus.kleiner.77     8
r_luen               7
AnneK                6
Name: count, dtype: int64

Most Commented Projects:
 Project
Stufe I - Kurzfristige Entwickelbarkeit           91
Westbahnhofstraße                                 50
Szenario 3 „langfristige Flächenverfügbarkeit“    50
Stufe III – Perspektivische Entwickelbarkeit      42
Szenario 1 „Kurzfristige Flächenverfügbarkeit“    32
Name: count, dtype: int64

Sentiment Distribution:
 Sentiment Category
Neutral     294
Positive     18
Negative      2
Name: count, dtype: int64

✅ Enhanced Insights Generated!
📌 Most Discussed Topics (Top 10): [('Jena', 78), ('Straßenbahn', 75), ('Bahnhof', 69), ('Stadt', 55), ('finden', 53), ('Westbahnhof', 44), ('Gleis', 44), ('Planung', 43), ('Platz', 40), ('Straße', 38)]

📌 Most 

  insights_df = insights_df.applymap(clean_data)


In [28]:
insights_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
Most Discussed Topics,"('Jena', 78)","('Straßenbahn', 75)","('Bahnhof', 69)","('Stadt', 55)","('finden', 53)","('Westbahnhof', 44)","('Gleis', 44)","('Planung', 43)","('Platz', 40)","('Straße', 38)","('Unterführung', 35)","('direkt', 34)","('Richtung', 33)","('Szenario', 33)","('Brücke', 32)","('Haltestelle', 31)","('sehen', 31)","('Rahmenplan', 31)","('Radfahrer', 30)","('fahren', 29)"
Most Liked Comments,"{'Username': 'PM', 'Text': 'In diesem Szenario...","{'Username': 'PM', 'Text': 'Wenn Umbau, dann r...","{'Username': 'PM', 'Text': 'Die Aufweitung und...","{'Username': 'PM', 'Text': 'Die Brücke muss la...","{'Username': 'r_luen', 'Text': '1. Zuerst einm...","{'Username': 'AnneK', 'Text': 'Dem ist absolut...","{'Username': 'r_luen', 'Text': 'Ich kann nicht...","{'Username': 'Brabax', 'Text': 'Der Neubau der...","{'Username': 'Patzak', 'Text': 'Wenn man schon...","{'Username': 'r_luen', 'Text': 'Für eine zukun...",Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar
Most Disliked Comments,"{'Username': 'klaus.kleiner3', 'Text': 'Die Pl...","{'Username': 'Dominique', 'Text': 'Den zusätzl...","{'Username': 'klaus.kleiner3', 'Text': 'Ich ha...","{'Username': 'Astrid Lindner', 'Text': 'Ich bi...","{'Username': 'klaus.kleiner77', 'Text': 'Also ...","{'Username': 'klaus.kleiner77', 'Text': 'Wie w...","{'Username': 'klaus.kleiner', 'Text': '@PMGut,...","{'Username': 'klaus.kleiner', 'Text': '@PMGut,...","{'Username': 'Bastian', 'Text': 'Ich kann mir ...","{'Username': 'pfingstochse78', 'Text': 'Wenn d...",Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar
Peak Commenting Hours,0.0,5.0,6.0,7.0,9.0,10.0,11.0,12.0,14.0,15.0,16.0,17.0,18.0,19.0,20.0,21.0,22.0,23.0,Keine Daten verfügbar,Keine Daten verfügbar
Peak Commenting Days,0.0,1.0,2.0,3.0,4.0,5.0,6.0,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar
Most Supported Proposals,{'Title': 'Kulturelle Belebung durch Tanzfläch...,"{'Title': 'Ich bin mit Variante 2 zufrieden.',...",{'Title': 'Besuche des Bürgerbüros überflüssig...,{'Title': 'Wolkenstimmung über der Luitpoldhöh...,"{'Title': 'Luftaufnahme Luitpoldhöhe', 'Suppor...","{'Title': 'Ich bin mit Variante 1 zufrieden.',...",{'Title': 'Digital optimierte Nutzung privater...,"{'Title': 'Spielplatz 2', 'Supporters': 50.0, ...","{'Title': 'Spielplatz 1', 'Supporters': 47.0, ...","{'Title': 'Spielplatz in der Luitpoldhöhe', 'S...",Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar
Projects with Most Proposals,{'Project Title': 'Zukunftskonzepte für die In...,"{'Project Title': 'Angebotslandkarte', 'Propos...",{'Project Title': 'Verkehrskonzept Hauptstraße...,"{'Project Title': 'Verkehrsentwicklungsplan', ...","{'Project Title': 'MoveRegioM', 'Proposal Coun...","{'Project Title': 'Standortvorschläge', 'Propo...",{'Project Title': 'Digitalisierung im Münchner...,{'Project Title': 'Der neue Ilzer Land-Anhänge...,"{'Project Title': 'Energiesparen', 'Proposal C...","{'Project Title': 'Spielplätze Halde Nord', 'P...",Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar
Most Controversial Comments,"{'Username': 'PM', 'Text': 'In diesem Szenario...","{'Username': 'PM', 'Text': 'Wenn Umbau, dann r...","{'Username': 'PM', 'Text': 'Die Aufweitung und...","{'Username': 'PM', 'Text': 'Die Brücke muss la...","{'Username': 'r_luen', 'Text': 'Ich kann nicht...","{'Username': 'Brabax', 'Text': 'Der Neubau der...","{'Username': 'r_luen', 'Text': 'Für eine zukun...","{'Username': 'pfingstochse78', 'Text': 'Ich ma...","{'Username': 'PM', 'Text': 'Der Rahmenplan hat...","{'Username': 'Radfahrergast', 'Text': 'Gemeins...",Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar
Users with Most Repeated Engagement,Brabax,JoSch,pfingstochse78,PM,Stig Ludwig,JessicaR,Daniela Köhler,Mr.Moto,Inga Glökler,Patzak,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar
Average Comments Per Project,44.857142857142854,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar,Keine Daten verfügbar


### 🧠 NLP Processing & Word Cloud ☁️


In [29]:
# 🔹 4. NLP Analysis with spaCy (Topic Extraction)
def extract_keywords(text):
    doc = nlp(str(text))
    keywords = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]
    return keywords

jena_comments.loc[:, "Keywords"] = jena_comments["Text"].apply(extract_keywords)
all_keywords = [keyword for keywords in jena_comments["Keywords"] for keyword in keywords]
keyword_counts = Counter(all_keywords).most_common(20)

# 🔹 5. Generate Word Cloud
wordcloud = WordCloud(width=800, height=400, background_color="white").generate(" ".join(all_keywords))
wordcloud_path = "jena_wordcloud.png"
wordcloud.to_file(wordcloud_path)

# 🔹 6. Generate Activity Charts
plt.figure(figsize=(8, 5))
most_active_users.plot(kind="bar", color="blue")
plt.title("Top 10 Most Active Users in Jena")
plt.xlabel("Username")
plt.ylabel("Number of Comments")
plt.xticks(rotation=45)
plt.grid(axis="y", linestyle="--", alpha=0.7)
active_users_plot_path = "jena_active_users.png"
plt.savefig(active_users_plot_path)
plt.close()

plt.figure(figsize=(6, 4))
sentiment_counts.plot(kind="bar", color=["green", "gray", "red"])
plt.title("Sentiment Distribution of Comments in Jena")
plt.xlabel("Sentiment Category")
plt.ylabel("Number of Comments")
plt.xticks(rotation=0)
plt.grid(axis="y", linestyle="--", alpha=0.7)
sentiment_plot_path = "jena_sentiment_distribution.png"
plt.savefig(sentiment_plot_path)
plt.close()

print("✅ NLP Analysis & Visualizations Completed!")
print("\nTop Discussion Topics:\n", keyword_counts)


✅ NLP Analysis & Visualizations Completed!

Top Discussion Topics:
 [('Jena', 78), ('Straßenbahn', 75), ('Bahnhof', 69), ('Stadt', 55), ('finden', 53), ('Westbahnhof', 44), ('Gleis', 44), ('Planung', 43), ('Platz', 40), ('Straße', 38), ('Unterführung', 35), ('direkt', 34), ('Richtung', 33), ('Szenario', 33), ('Brücke', 32), ('Haltestelle', 31), ('sehen', 31), ('Rahmenplan', 31), ('Radfahrer', 30), ('fahren', 29)]


### 🤖 AI-Powered Summary 📝


In [30]:
# 🔹 7. Generate AI-Powered Summary via Hugging Face API
jena_data_summary = f"""
City: Jena
Total Projects: {len(jena_projects)}
Total Proposals: {len(jena_proposals)}
Total Comments: {len(jena_comments)}

Top 5 Most Commented Projects:
{most_commented_projects.to_string()}

Top 10 Most Active Users:
{most_active_users.to_string()}

Sentiment Analysis:
- Neutral Comments: {sentiment_counts.get('Neutral', 0)}
- Positive Comments: {sentiment_counts.get('Positive', 0)}
- Negative Comments: {sentiment_counts.get('Negative', 0)}

Most Common Discussion Topics:
{', '.join([word for word, count in keyword_counts])}
"""

ai_prompt = f"""
Generate a structured analytical report on civic engagement in Jena based on the following insights:

{jena_data_summary}

The report should include:
- A professional introduction about civic engagement in Jena.
- Key trends and insights from the provided data.
- Observations on public sentiment and discussion topics.
- Suggestions for improving citizen engagement.

Ensure the report is structured, formal, and insightful.
"""

def generate_report(text):
    payload = {"inputs": text, "parameters": {"max_new_tokens": 1000}}

    
    try:
        result = response.json()
        if "error" in result:
            return f"❌ Error from Hugging Face API: {result['error']}"
        elif isinstance(result, list) and "generated_text" in result[0]:
            return result[0]["generated_text"]
        else:
            return "❌ Unexpected API response format."
    
    except Exception as e:
        return f"❌ Exception occurred: {str(e)}"

# Call AI for Summary
# Call AI for Summary (Ensure unique output)
ai_generated_report = generate_report(ai_prompt).strip()

# Print only key insights from the AI-generated content
print("\n✅ AI Summary Generated!")
print(ai_generated_report[:500])  # Print only first 500 characters for preview




✅ AI Summary Generated!
Generate a structured analytical report on civic engagement in Jena based on the following insights:


City: Jena
Total Projects: 19
Total Proposals: 0
Total Comments: 314

Top 5 Most Commented Projects:
Project
Stufe I - Kurzfristige Entwickelbarkeit           91
Westbahnhofstraße                                 50
Szenario 3 „langfristige Flächenverfügbarkeit“    50
Stufe III – Perspektivische Entwickelbarkeit      42
Szenario 1 „Kurzfristige Flächenverfügbarkeit“    32

Top 10 Most Active Use


### 📄 Generate AI-Powered PDF Report 📊


In [31]:
# # Create PDF Report
# pdf = FPDF()
# pdf.set_auto_page_break(auto=True, margin=15)
# pdf.add_page()

# # ✅ Load all font styles
# pdf.add_font("DejaVu", "", "DejaVuSans.ttf", uni=True)  # Regular
# pdf.add_font("DejaVu", "B", "DejaVuSans-Bold.ttf", uni=True)  # Bold
# pdf.add_font("DejaVu", "I", "DejaVuSans-Oblique.ttf", uni=True)  # Italic
# pdf.add_font("DejaVu", "BI", "DejaVuSans-BoldOblique.ttf", uni=True)  # Bold Italic

# pdf.set_font("DejaVu", "", 12)  # Use regular font

# # ✅ Ensure text fields are not empty
# def safe_text(text):
#     return text if text.strip() else "No data available"

# jena_data_summary = safe_text(jena_data_summary)
# ai_generated_report = safe_text(ai_generated_report)

# # 📌 Report Title
# pdf.set_font("DejaVu", "B", 16)
# pdf.cell(200, 10, "AI-Generated Analytical Report for Jena", ln=True, align="C")

# # 📌 Summary Section
# pdf.set_font("DejaVu", "B", 12)
# pdf.cell(200, 10, "Engagement Summary", ln=True)
# pdf.set_font("DejaVu", "", 12)
# pdf.multi_cell(190, 10, jena_data_summary)

# # 📌 AI-Generated Analysis
# pdf.set_font("DejaVu", "B", 12)
# pdf.cell(200, 10, "AI-Generated Insights", ln=True)
# pdf.set_font("DejaVu", "", 12)
# pdf.multi_cell(190, 10, ai_generated_report)

# # 📌 Additional Insights
# pdf.set_font("DejaVu", "B", 12)
# pdf.cell(200, 10, "Detailed Data Analysis", ln=True)
# pdf.set_font("DejaVu", "", 12)
# pdf.multi_cell(190, 10, f"📌 Most Discussed Topics: {', '.join([word for word, count in keyword_counts[:10]])}")
# pdf.multi_cell(190, 10, f"📌 Most Liked Comments: {most_liked_comments.to_string()}")
# pdf.multi_cell(190, 10, f"📌 Peak Commenting Hours: {hourly_distribution.to_string()}")
# pdf.multi_cell(190, 10, f"📌 Peak Commenting Days: {daily_distribution.to_string()}")
# pdf.multi_cell(190, 10, f"📌 Most Supported Proposals: {most_supported_proposals.to_string()}")
# pdf.multi_cell(190, 10, f"📌 Most Controversial Comments: {most_controversial_comments.to_string()}")

# # 📌 Insert Charts (Ensure files exist before adding)
# def safe_add_image(pdf, path, x=50, w=100):
#     if os.path.exists(path):
#         pdf.image(path, x=x, w=w)
#     else:
#         print(f"⚠️ Warning: Image not found - {path}")

# safe_add_image(pdf, "jena_wordcloud.png")
# safe_add_image(pdf, "jena_active_users.png")
# safe_add_image(pdf, "jena_sentiment_distribution.png")

# # ✅ Save PDF
# pdf_path = "jena_ai_report.pdf"
# pdf.output(pdf_path, "F")

# print(f"✅ AI Report Successfully Generated: {pdf_path}")


In [32]:
# import os
# from jinja2 import Environment, FileSystemLoader
# from weasyprint import HTML
# from datetime import datetime

# # Data for the report (replace with your actual data)
# city = "Jena"
# data = {
#     "city": city,
#     "date": datetime.now().strftime("%Y-%m-%d"),
#     "summary": "This is a summary of civic engagement in Jena.",
#     "ai_insights": "AI-generated insights about civic engagement in Jena.",
#     "most_discussed_topics": ", ".join(["Jena", "Straßenbahn", "Bahnhof", "Stadt", "finden"]),
#     "most_liked_comments": "Most liked comments data here.",
#     "peak_hours": "Peak commenting hours data here.",
#     "peak_days": "Peak commenting days data here.",
#     "most_supported_proposals": "Most supported proposals data here.",
#     "most_controversial_comments": "Most controversial comments data here.",
#     "wordcloud_path": "jena_wordcloud.png",
#     "active_users_plot_path": "jena_active_users.png",
#     "sentiment_plot_path": "jena_sentiment_distribution.png",
# }

# # Load the Jinja2 template
# env = Environment(loader=FileSystemLoader('.'))
# template = env.get_template('report_template.html')

# # Render the HTML
# html_out = template.render(data)

# # Save HTML to a file (optional)
# html_file_path = f"{city}_report.html"
# with open(html_file_path, "w", encoding="utf-8") as f:
#     f.write(html_out)

# # Convert HTML to PDF using WeasyPrint
# pdf_file_path = f"{city}_report.pdf"
# HTML(string=html_out).write_pdf(pdf_file_path)

# print(f"✅ Report generated: {pdf_file_path}")


-----

WeasyPrint could not import some external libraries. Please carefully follow the installation steps before reporting an issue:
https://doc.courtbouillon.org/weasyprint/stable/first_steps.html#installation
https://doc.courtbouillon.org/weasyprint/stable/first_steps.html#troubleshooting 

-----



OSError: cannot load library 'libgobject-2.0-0': error 0x7e.  Additionally, ctypes.util.find_library() did not manage to locate a library called 'libgobject-2.0-0'

In [33]:
import os
from jinja2 import Environment, FileSystemLoader
import pdfkit
from datetime import datetime

# Specify the path to wkhtmltopdf
wkhtmltopdf_path = r'C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe'  # Update this path

# Data for the report (replace with your actual data)
city = "Jena"
data = {
    "city": city,
    "date": datetime.now().strftime("%Y-%m-%d"),
    "summary": "This is a summary of civic engagement in Jena.",
    "ai_insights": "AI-generated insights about civic engagement in Jena.",
    "most_discussed_topics": ", ".join(["Jena", "Straßenbahn", "Bahnhof", "Stadt", "finden"]),
    "most_liked_comments": "Most liked comments data here.",
    "peak_hours": "Peak commenting hours data here.",
    "peak_days": "Peak commenting days data here.",
    "most_supported_proposals": "Most supported proposals data here.",
    "most_controversial_comments": "Most controversial comments data here.",
    "wordcloud_path": "jena_wordcloud.png",  # Relative path
    "active_users_plot_path": "jena_active_users.png",  # Relative path
    "sentiment_plot_path": "jena_sentiment_distribution.png",  # Relative path
}

# Convert relative paths to absolute paths
data["wordcloud_path"] = os.path.abspath(data["wordcloud_path"])
data["active_users_plot_path"] = os.path.abspath(data["active_users_plot_path"])
data["sentiment_plot_path"] = os.path.abspath(data["sentiment_plot_path"])

# Load the Jinja2 template
env = Environment(loader=FileSystemLoader('.'))
template = env.get_template('report_template.html')

# Render the HTML
html_out = template.render(data)

# Save HTML to a file (optional)
html_file_path = f"{city}_report.html"
with open(html_file_path, "w", encoding="utf-8") as f:
    f.write(html_out)

# PDF options to disable printer interaction and suppress warnings
options = {
    'quiet': '',  # Suppress warnings and errors
    'disable-local-file-access': '',  # Prevent access to local files
    'no-pdf-compression': '',  # Disable PDF compression (avoids printer interaction)
}

# Convert HTML to PDF
pdf_file_path = f"{city}_report.pdf"
pdfkit.from_string(html_out, pdf_file_path, configuration=pdfkit.configuration(wkhtmltopdf=wkhtmltopdf_path), options=options)

print(f"✅ Report generated: {pdf_file_path}")

OSError: wkhtmltopdf reported an error:
Exit with code 1 due to network error: ProtocolUnknownError


In [35]:
projects_df['City'].value_counts()

City
Augsburg             24
Bochum               24
Detmold              24
Siegburg             24
Flensburg            23
Wuerzburg            22
Muenchen             19
Jena                 19
Unterschleissheim    18
Mitren               14
Bamberg              14
Pforzheim            12
Kempten              10
Amberg                7
Linz                  6
Trier                 6
Mitmachgemein         4
Stutensee             2
Name: count, dtype: int64

In [38]:
all_proposals_df

Unnamed: 0,URL,Title,Proposed for Project,Description,Author,Comments,Supporters,City
0,https://wuerzburg-mitmachen.de/proposals/110-a...,Autofreier Bischofshut,Zukunftskonzepte für die Innenstadt,"Wir fordern die Ausrufung des Klimanotstands, ...",Letzte Generation Würzburg,0.0,20.0,Wuerzburg
1,https://wuerzburg-mitmachen.de/proposals/109-e...,E Scooter verbieten,Zukunftskonzepte für die Innenstadt,E Scooter sollten (im Innenstadtbereich) verbo...,Ccmuet,0.0,2.0,Wuerzburg
2,https://wuerzburg-mitmachen.de/proposals/108-b...,Barrierefrei ins Nautiland/LGS,Zukunftskonzepte für die Innenstadt,Nautiland - neu.\r\nUmweltstation - neu.\r\nZe...,AASeuffert,0.0,2.0,Wuerzburg
3,https://wuerzburg-mitmachen.de/proposals/107-k...,Kinderabenteuer / Indoor Spielplatz / Smaland,Zukunftskonzepte für die Innenstadt,Es gibt zwar schon den FunPark für Kinder mit ...,ABlitz,0.0,0.0,Wuerzburg
4,https://wuerzburg-mitmachen.de/proposals/106-b...,"Bänke und ""Grün"" im neu gestalteten Bereich Ka...",Zukunftskonzepte für die Innenstadt,Die Baustelle von der Karmelitenstraße zum Vie...,Ccmuet,0.0,12.0,Wuerzburg
...,...,...,...,...,...,...,...,...
660,https://machmit.augsburg.de/proposals/8-famili...,Familiennetz\r\nWe are family,Wie soll die digitale Plattform für Familien h...,2 Ideen,SSonja Poland - FSP Bärenkeller,0.0,0.0,Augsburg
661,https://machmit.augsburg.de/proposals/7-famili...,Familien Hub,Wie soll die digitale Plattform für Familien h...,Ein Hub für Familien,CCPM,0.0,0.0,Augsburg
662,https://machmit.augsburg.de/proposals/21-windp...,Windprechtpark,Wie war der Sommer?,"Der Winprechtpark ist ein kühler Ort, hat aber...",Gguest_7cc4b24b-c70a-48f7-bb5e-ad399221e79a,0.0,0.0,Augsburg
663,https://machmit.augsburg.de/proposals/13-witte...,Wittelsbacher Park,Wie war der Sommer?,Sehr schöne Park mit viel Schatten. Etwas mehr...,UUli,0.0,1.0,Augsburg
