<a href="https://colab.research.google.com/github/gspinaci/Vita-e-morte-DH-projects/blob/main/giorgia_crawlerDHprojects.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

# URL to the spreadsheet file (e.g., Google Sheets in .xlsx format)
url = "https://docs.google.com/spreadsheets/d/1G3WiRMoopP8Y2FlVJfwCRqZVLilw-aMQRBxF_SoMF20/edit?gid=647909856#gid=647909856"

# Extract the spreadsheet ID from the URL
spreadsheet_id = url.split("/d/")[1].split("/")[0]

# Construct the download URL for Google Sheets
download_url = f"https://docs.google.com/spreadsheets/d/{spreadsheet_id}/export?format=xlsx"

# Load the spreadsheet using the download URL and specifying the engine
df = pd.read_excel(download_url, sheet_name="Centri DH", engine="openpyxl")

# Save as CSV
df.to_csv("output.csv", index=False)

In [6]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

def create_dh_projects_scraper():
    def is_valid_url(url):
        """Check if the URL is valid, adding schema if missing."""
        if not url:
            return None
        url = url.strip()
        if not url.startswith(('http://', 'https://')):
            url = 'https://' + url

        try:
            result = urlparse(url)
            return url if all([result.scheme, result.netloc]) else None
        except:
            return None

    def get_page_content(url):
        """Fetch and parse webpage content with error handling."""
        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            }
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            return BeautifulSoup(response.text, 'html.parser')
        except Exception as e:
            print(f"Error fetching {url}: {str(e)}")
            return None

    def find_project_links(soup, base_url):
        """Find project-related links on the page, including those in modals."""
        keywords = [
            'project', 'progetti',
            'digital humanities', 'umanistica digitale',
            'digital', 'digitale',
            'research', 'ricerca',
            'laboratorio', 'laboratory',
            'archivio', 'archive'
        ]

        exclude_domains = [
            'facebook.com', 'twitter.com', 'linkedin.com', 'instagram.com',
            'youtube.com', 'blogspot.com', 'wordpress.com', 'tumblr.com'
        ]

        projects = []
        if soup:
            # Find all potential modal containers
            modal_elements = soup.find_all([
                'div',  # Common modal container
                'section',  # Sometimes used for modals
                'aside'  # Off-canvas modals
            ], class_=lambda x: x and any(modal_term in x.lower()
                for modal_term in ['modal', 'popup', 'dialog', 'overlay', 'lightbox']))

            # Combine regular page content and modal content
            search_areas = [soup] + modal_elements

            for area in search_areas:
                links = area.find_all('a', href=True)
                for link in links:
                    href = link.get('href', '')
                    text = link.text.lower().strip()

                    # Skip empty, javascript, email, or phone links
                    if not href or href.startswith(('javascript:', '#', 'mailto:', 'tel:')):
                        continue

                    # Make URL absolute
                    href = urljoin(base_url, href)

                    # Check if the URL should be excluded
                    if any(domain in href for domain in exclude_domains):
                        continue

                    # Check if any keyword is in the link text or URL
                    if any(keyword in text or keyword in href.lower() for keyword in keywords):
                        # Check for hidden elements
                        is_hidden = any(parent.get('style', '').lower().find('display: none') != -1
                                      or parent.get('style', '').lower().find('visibility: hidden') != -1
                                      for parent in link.parents)

                        if not is_hidden and not any(p['link'] == href for p in projects):
                            projects.append({
                                'link': href,
                                'name': text or href,
                            })

        return projects

    def process_institution(row):
        """Process a single institution's URLs."""
        projects_list = []
        if pd.notna(row['URL']):
            # Split URLs by semicolon and handle potential whitespace
            urls = [url.strip() for url in str(row['URL']).split(';')]

            # Check if this is one of the special institutions
            special_institutions = ['Torino', 'Catania', 'Tatti', 'Hertziana']
            is_special = any(inst in str(row['Nome']) for inst in special_institutions)

            # For special institutions, just add the URLs as project links
            if is_special:
                for url in urls:
                    valid_url = is_valid_url(url)
                    if valid_url:
                        projects_list.append({
                            'categoria': row['Categoria'],
                            'institution': row['Nome'],
                            'location': row['Luogo'],
                            'institution_url': valid_url,
                            'project_name': f"Project at {row['Nome']}",
                            'project_link': valid_url
                        })
            else:
                # Normal processing for other institutions
                for url in urls:
                    valid_url = is_valid_url(url)
                    if valid_url:
                        print(f"  Processing URL: {valid_url}")
                        soup = get_page_content(valid_url)
                        if soup:
                            projects = find_project_links(soup, valid_url)

                            for project in projects:
                                projects_list.append({
                                    'categoria': row['Categoria'],
                                    'institution': row['Nome'],
                                    'location': row['Luogo'],
                                    'institution_url': valid_url,
                                    'project_name': project['name'],
                                    'project_link': project['link']
                                })

        return projects_list

    def process_dh_centers(csv_path):
        """Main function to process the CSV file and extract project links."""
        try:
            df = pd.read_csv(csv_path, encoding='utf-8')

            # Check if required columns are present
            required_columns = ['Categoria', 'Nome', 'Luogo', 'URL']
            if not all(col in df.columns for col in required_columns):
                raise ValueError(f"CSV must contain columns: {', '.join(required_columns)}")

            # Process all institutions, including special ones
            all_projects = []
            for idx, row in df.iterrows():
                print(f"Processing {row['Nome']}...")
                projects = process_institution(row)
                all_projects.extend(projects)

            results_df = pd.DataFrame(all_projects)

            # Remove duplicates
            results_df = results_df.drop_duplicates(subset=['institution', 'project_link'])

            return results_df

        except Exception as e:
            print(f"Error processing CSV: {str(e)}")
            return pd.DataFrame()

    return process_dh_centers

# Usage example:
if __name__ == "__main__":
    scraper = create_dh_projects_scraper()
    results = scraper('output.csv')

    # Save results to CSV
    if not results.empty:
        results.to_csv('dh_projects_results.csv', index=False, encoding='utf-8')
        print(f"Found {len(results)} projects across {results['institution'].nunique()} institutions")


Processing Digital Humanities Advanced Research Centre (DH.ARC) - Unibo...
  Processing URL: https://centri.unibo.it/dharc/en/research/projects-at-dh-arc
Processing Venice Centre for Digital and Public Humanities (VeDPH)...
  Processing URL: https://www.unive.it/pag/47701/
  Processing URL: https://www.unive.it/pag/47702/
  Processing URL: https://www.unive.it/pag/47703/
  Processing URL: https://www.unive.it/pag/47704/
Processing Digital Culture Laboratory - Pisa...
  Processing URL: https://www.labcd.unipi.it/progetti/
Processing DigiLab – Interdepartmental Center for research and services - la sapienza...
  Processing URL: https://digilab.uniroma1.it/ricerca/progetti-corso
Processing Interdepartmental center FiTMU – DH Section - unisalerno...
Processing Laboratory Vast-Lab - Prato ...
  Processing URL: https://vast-lab.org/progetti/
Processing Centro interdipartimentale di ricerca in Digital Humanities - Università del Salento...
Processing CRR-MM - unibo...
  Processing URL: https:

In [3]:
import requests
import pandas as pd
import time
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter

def create_session_with_retries():
    """Create a requests session with retry strategy"""
    session = requests.Session()
    retries = Retry(
        total=3,
        backoff_factor=0.5,
        status_forcelist=[500, 502, 503, 504]
    )
    adapter = HTTPAdapter(max_retries=retries)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session

def check_links_status(csv_path):
    """Check status of project links and create a clean status report"""
    try:
        df = pd.read_csv(csv_path, encoding='utf-8')
    except Exception as e:
        print(f"Error reading CSV file: {str(e)}")
        return None

    # Create a session with retry strategy
    session = create_session_with_retries()
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    # Create lists to store results
    results = []

    # Check each link
    for index, row in df.iterrows():
        url = row['project_link']
        print(f"Checking {index + 1}/{len(df)}: {url}")

        try:
            response = session.get(url, headers=headers, timeout=10, allow_redirects=True)
            status_code = response.status_code

            # Determine website status
            if 200 <= status_code < 300:
                status = "Active"
            elif status_code == 404:
                status = "Not Found"
            elif 300 <= status_code < 400:
                status = f"Redirect ({status_code})"
            elif 400 <= status_code < 500:
                status = f"Client Error ({status_code})"
            elif 500 <= status_code < 600:
                status = f"Server Error ({status_code})"
            else:
                status = f"Unknown ({status_code})"

            results.append({
                'institution': row['institution'],
                'project_name': row['project_name'],
                'url': url,
                'status_code': status_code,
                'status': status,
                'response': "Success" if 200 <= status_code < 300 else "Failed"
            })

        except requests.exceptions.ConnectionError:
            results.append({
                'institution': row['institution'],
                'project_name': row['project_name'],
                'url': url,
                'status_code': None,
                'status': "Connection Error",
                'response': "Failed"
            })
        except requests.exceptions.Timeout:
            results.append({
                'institution': row['institution'],
                'project_name': row['project_name'],
                'url': url,
                'status_code': None,
                'status': "Timeout",
                'response': "Failed"
            })
        except requests.exceptions.RequestException as e:
            results.append({
                'institution': row['institution'],
                'project_name': row['project_name'],
                'url': url,
                'status_code': None,
                'status': f"Error: {str(e)[:100]}...",
                'response': "Failed"
            })

        time.sleep(0.5)

    # Create DataFrame from results
    results_df = pd.DataFrame(results)

    # Calculate statistics
    total_links = len(results_df)
    active_links = len(results_df[results_df['response'] == "Success"])
    failed_links = total_links - active_links

    print("\nSummary:")
    print(f"Total links checked: {total_links}")
    print(f"Active links: {active_links}")
    print(f"Failed links: {failed_links}")

    # Save to CSV
    results_df.to_csv('website_status_report.csv', index=False, encoding='utf-8')
    print("\nDetailed results saved to: website_status_report.csv")

    return results_df

if __name__ == "__main__":
    results_df = check_links_status('dh_projects_results.csv')

    if results_df is not None:
        # Display a sample of the results
        print("\nSample of results:")
        print(results_df[['institution', 'project_name', 'url', 'status', 'response']].head())

        # Display breakdown of different status types
        print("\nStatus breakdown:")
        print(results_df['status'].value_counts())

Checking 1/312: https://centri.unibo.it/dharc/en
Checking 2/312: https://centri.unibo.it/dharc/en/research
Checking 3/312: https://centri.unibo.it/dharc/en/research/topics
Checking 4/312: https://centri.unibo.it/dharc/en/research/projects-at-dh-arc
Checking 5/312: https://centri.unibo.it/dharc/en/research/partner
Checking 6/312: https://centri.unibo.it/dharc/en/research/visiting-fellow
Checking 7/312: http://www.iccd.beniculturali.it/it/progetti/4597/arco-architettura-della-conoscenza-ontologie-per-la-descrizione-del-patrimonio-culturale
Checking 8/312: http://artchives.fondazionezeri.unibo.it/
Checking 9/312: https://polifonia-project.github.io/clef/
Checking 10/312: https://github.com/polifonia-project/registry_app
Checking 11/312: https://projects.dharc.unibo.it/dhdkey/
Checking 12/312: https://dl.ficlit.unibo.it
Checking 13/312: https://aldomorodigitale.unibo.it/
Checking 14/312: http://www.mario-project.eu/portal/
Checking 15/312: https://www.iks-project.eu/
Checking 16/312: https

KeyboardInterrupt: 

In [None]:
import requests
import pandas as pd
from datetime import datetime
import time

def get_last_accessible_date(url):
    """
    Uses the Wayback Machine CDX API to find the most recent snapshot date when the URL was accessible.
    """
    if not url or pd.isna(url):  # Check for empty or NaN values
        return None

    api_url = f"http://web.archive.org/cdx/search/cdx?url={url}&output=json&filter=statuscode:200"
    try:
        response = requests.get(api_url, timeout=10)
        response.raise_for_status()

        data = response.json()

        # Skip header and find the most recent snapshot with a 200 status code
        if len(data) > 1:
            last_snapshot = data[-1]  # Last entry
            timestamp = last_snapshot[1]  # CDX API timestamp format is YYYYMMDDhhmmss
            # Convert timestamp to a human-readable date
            last_accessible_date = datetime.strptime(timestamp, "%Y%m%d%H%M%S").date()
            return last_accessible_date
        else:
            return None  # No accessible snapshot found
    except requests.exceptions.RequestException as e:
        print(f"Error checking {url}: {str(e)}")
        return None  # Return None instead of error message for consistency

def check_non200_links_in_wayback(status_report_csv):
    """
    Process only non-200 status links from the status report and check their last accessible date in Wayback Machine.
    """
    try:
        # Read the status report CSV file
        df = pd.read_csv(status_report_csv)

        required_columns = ['url', 'status_code']
        if not all(col in df.columns for col in required_columns):
            raise ValueError(f"CSV file must contain columns: {', '.join(required_columns)}")

        # Filter for non-200 status codes
        non200_df = df[df['status_code'] != 200].copy()

        if len(non200_df) == 0:
            print("No non-200 status links found to process.")
            return None

        print(f"Processing {len(non200_df)} non-200 status links...")

        # New column to store the last accessible date from Wayback Machine
        non200_df['wayback_last_accessible'] = None

        # Add a counter for progress tracking
        total = len(non200_df)
        for index, row in non200_df.iterrows():
            if (index + 1) % 5 == 0:  # Show progress every 5 items
                print(f"Processing {index + 1} of {total} URLs...")

            url = row['url']
            last_accessible_date = get_last_accessible_date(url)
            non200_df.at[index, 'wayback_last_accessible'] = last_accessible_date

            # Add a small delay to avoid overwhelming the API
            time.sleep(1)

        # Merge results back with original DataFrame
        df = df.merge(
            non200_df[['url', 'wayback_last_accessible']],
            on='url',
            how='left'
        )

        # Save results to a new CSV file
        output_file = 'website_status_with_wayback.csv'
        df.to_csv(output_file, index=False)
        print(f"\nResults saved to {output_file}")

        # Print summary statistics
        total_non200 = len(non200_df)
        archived_links = non200_df['wayback_last_accessible'].notna().sum()
        print(f"\nSummary:")
        print(f"Total non-200 links processed: {total_non200}")
        print(f"Links found in Wayback Machine: {archived_links}")
        print(f"Links not found: {total_non200 - archived_links}")

        # Display results for archived links
        if archived_links > 0:
            print("\nArchived links details:")
            archived_df = non200_df[non200_df['wayback_last_accessible'].notna()]
            for _, row in archived_df.iterrows():
                print(f"URL: {row['url']}")
                print(f"Original Status: {row['status_code']}")
                print(f"Last Accessible: {row['wayback_last_accessible']}")
                print("-" * 50)

        return df

    except Exception as e:
        print(f"Error processing CSV: {str(e)}")
        return None

if __name__ == "__main__":
    # Usage example
    results = check_non200_links_in_wayback('website_status_report.csv')

Processing 13 non-200 status links...
Error checking http://artchives.fondazionezeri.unibo.it/: HTTPConnectionPool(host='web.archive.org', port=80): Read timed out. (read timeout=10)
Processing 15 of 13 URLs...
Error checking https://www.iks-project.eu/: HTTPConnectionPool(host='web.archive.org', port=80): Read timed out. (read timeout=10)
Processing 20 of 13 URLs...
Error checking http://projects.dharc.unibo.it/mauth/search: HTTPConnectionPool(host='web.archive.org', port=80): Read timed out. (read timeout=10)
Error checking http://it/264/laboratorio-di-epigrafia-greca: HTTPConnectionPool(host='web.archive.org', port=80): Read timed out. (read timeout=10)
Error checking http://vbd.humnet.unipi.it/beta2/: HTTPConnectionPool(host='web.archive.org', port=80): Read timed out. (read timeout=10)
Error checking https://italianacademy.columbia.edu/content/frida: HTTPConnectionPool(host='web.archive.org', port=80): Read timed out. (read timeout=10)
Error checking https://web.unica.it/unica/it/

In [1]:
import pandas as pd
import requests

# Load the Excel file
file_path = 'filtering_DH_institutions_from_links.xlsx'
df = pd.read_excel(file_path)

# Set column names correctly and remove header row if needed
df.columns = df.iloc[0]
df = df[1:].reset_index(drop=True)
df.columns = ["category", "institution", "location", "institution_url", "project_name", "project_link"]

# Add a new column for Digital Edition detection
df["Digital Edition"] = ""

# Define the function to check for digital edition keywords
def check_digital_edition(url):
    try:
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            page_content = response.text.lower()
            # Look for keywords related to digital editions
            if "edizioni digitali" in page_content or "digital edition" in page_content or "edizione digitale" in page_content:
                return "yes"
    except requests.RequestException:
        pass
    return ""  # Leave blank if no keywords found or if there's an error

# Apply the function to each URL
df["Digital Edition"] = df["project_link"].apply(check_digital_edition)

# Save the updated DataFrame back to a new Excel file
output_path = 'filtered_DH_institutions_with_digital_editions.xlsx'
df.to_excel(output_path, index=False)
print(f"Processed file saved as {output_path}")


Processed file saved as filtered_DH_institutions_with_digital_editions.xlsx
