In [None]:
def order_urls_alphabetically(input_file, output_file):
    """
    Read URLs from a text file, sort them, and write them to another file.

    Each input line is treated as one URL. Surrounding whitespace is stripped
    and blank (or whitespace-only) lines are discarded, so the output contains
    exactly one URL per line and no empty lines. Sorting uses Python's default
    string ordering, which is case-sensitive.

    Failures are reported with ``print`` instead of being raised, so no
    exception propagates to the caller.

    Args:
        input_file (str): Path to the input file containing one URL per line.
        output_file (str): Path to the output file where sorted URLs will be written.

    Returns:
        None
    """
    try:
        # Strip every line and drop the empty ones; otherwise blank lines
        # would sort to the top and be written out as empty rows.
        with open(input_file, 'r') as file:
            stripped_lines = (line.strip() for line in file)
            urls = sorted(url for url in stripped_lines if url)

        # Write one URL per line, always newline-terminated
        with open(output_file, 'w') as file:
            file.writelines(url + '\n' for url in urls)

        print(f"Sorted URLs have been written to {output_file}")

    except FileNotFoundError:
        print(f"Input file {input_file} not found.")
    except Exception as e:
        print(f"An error occurred: {e}")

# Example usage: sort the URL list from the crawler directory into sorted_urls.txt
input_file, output_file = 'ulusofona_crawler/ulusofona_crawler/urls.txt', 'sorted_urls.txt'
order_urls_alphabetically(input_file=input_file, output_file=output_file)

In [1]:
def remove_lines_with_extensions(input_file, output_file, extensions):
    """
    Copy a text file, dropping every line that ends with one of the given extensions.

    The comparison is done on the line with surrounding whitespace stripped and
    is case-insensitive, so ``.pdf`` also removes lines ending in ``.PDF``.
    Lines that are kept are written back unchanged (including their original
    line ending).

    Failures are reported with ``print`` instead of being raised, so no
    exception propagates to the caller.

    Args:
        input_file (str): Path to the input file.
        output_file (str): Path to the output file.
        extensions (list): List of file extensions to remove (e.g. ``['.pdf', '.jpg']``).

    Returns:
        None
    """
    try:
        # str.endswith accepts a tuple of suffixes; lower-casing both sides
        # makes the match case-insensitive. Built inside the try block so a
        # bad `extensions` argument is reported like any other error.
        suffixes = tuple(ext.lower() for ext in extensions)

        # Read lines from the input file
        with open(input_file, 'r') as file:
            lines = file.readlines()

        # Keep only the lines that do not end with any of the suffixes
        filtered_lines = [
            line for line in lines
            if not line.strip().lower().endswith(suffixes)
        ]

        # Write the surviving lines exactly as they were read
        with open(output_file, 'w') as file:
            file.writelines(filtered_lines)

        print(f"Lines with specified extensions have been removed and written to {output_file}")

    except FileNotFoundError:
        print(f"Input file {input_file} not found.")
    except Exception as e:
        print(f"An error occurred: {e}")

# Example usage: drop document/image links from sorted_urls.txt
input_file, output_file = 'sorted_urls.txt', 'filtered_urls.txt'
extensions = [
    '.pdf',
    '.jpg',
    '.jpeg',
    '.png',
    '.docx',
    '.xlsx',
]
remove_lines_with_extensions(
    input_file=input_file,
    output_file=output_file,
    extensions=extensions,
)

Lines with specified extensions have been removed and written to filtered_urls.txt


In [3]:
import requests
import urllib3

# Silence urllib3's InsecureRequestWarning: the HEAD requests sent by
# filter_urls_by_status_code use verify=False, which would otherwise trigger
# this warning for unverified HTTPS requests.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

def filter_urls_by_status_code(input_file, output_file):
    """
    Keep only the URLs that answer an HTTP HEAD request with status 200.

    Each non-blank line of the input file is treated as one URL. A HEAD request
    is sent with a 5 second timeout; URLs whose response status is exactly 200
    are written to the output file, one per line. Redirects are not followed
    (the requests default for HEAD), so a URL that answers with a 3xx status is
    dropped. URLs whose request raises a ``requests`` exception are reported
    with ``print`` and dropped.

    Failures outside the per-URL request (e.g. unreadable files) are reported
    with ``print`` instead of being raised.

    Args:
        input_file (str): Path to the input file containing URLs.
        output_file (str): Path to the output file where filtered URLs will be written.

    Returns:
        None
    """
    try:
        # Read URLs from the input file, without surrounding whitespace
        with open(input_file, 'r') as file:
            urls = [line.strip() for line in file]

        filtered_urls = []
        # A single Session lets connections to the same host be reused
        # instead of opening a new one for every URL.
        with requests.Session() as session:
            for url in urls:
                # Blank lines are not URLs; requesting them would only
                # produce an error message.
                if not url:
                    continue
                try:
                    # NOTE(review): verify=False turns off TLS certificate
                    # validation. Kept for backward compatibility; confirm
                    # this is acceptable for the hosts being checked.
                    response = session.head(url, timeout=5, verify=False)
                    if response.status_code == 200:
                        filtered_urls.append(url)
                except requests.exceptions.RequestException as e:
                    print(f"Error occurred while testing {url}: {e}")

        # Write one URL per line, always newline-terminated
        with open(output_file, 'w') as file:
            file.writelines(url + '\n' for url in filtered_urls)

        print(f"Filtered URLs have been written to {output_file}")

    except FileNotFoundError:
        print(f"Input file {input_file} not found.")
    except Exception as e:
        print(f"An error occurred: {e}")

# Example usage: keep only the URLs from filtered_urls.txt that respond with HTTP 200
input_file, output_file = 'filtered_urls.txt', 'valid_urls.txt'
filter_urls_by_status_code(input_file=input_file, output_file=output_file)

Filtered URLs have been written to valid_urls.txt


In [4]:
def remove_duplicate_share_modal_urls(input_file, output_file):
    """
    Drop "#share-modal" URLs that duplicate a page already present in the list.

    Each non-blank line of the input file is treated as one URL. A URL that
    contains "#share-modal" is removed when its base (the text before
    "#share-modal") either appears as a plain URL anywhere in the file or has
    already been kept via an earlier "#share-modal" URL. The result does not
    depend on whether the plain URL comes before or after its "#share-modal"
    variant. URLs without "#share-modal" are always kept, in their original
    order.

    Failures are reported with ``print`` instead of being raised, so no
    exception propagates to the caller.

    Args:
        input_file (str): Path to the input file containing URLs.
        output_file (str): Path to the output file where filtered URLs will be written.

    Returns:
        None
    """
    share_suffix = "#share-modal"

    try:
        # Read URLs from the input file, ignoring blank lines
        with open(input_file, 'r') as file:
            urls = [url for url in (line.strip() for line in file) if url]

        # Seed the set with every plain URL up front so that a "#share-modal"
        # variant is recognised as a duplicate even when it precedes its base.
        seen_bases = {url for url in urls if share_suffix not in url}

        filtered_urls = []
        for url in urls:
            if share_suffix in url:
                base_url = url.split(share_suffix)[0]
                if base_url in seen_bases:
                    # The page is already represented; skip this variant
                    continue
                # First time this page is seen: keep the variant as written
                seen_bases.add(base_url)
            filtered_urls.append(url)

        # Write filtered URLs to the output file, one per line
        with open(output_file, 'w') as file:
            file.writelines(url + "\n" for url in filtered_urls)

        print(f"Filtered URLs have been written to {output_file}")

    except FileNotFoundError:
        print(f"Input file {input_file} not found.")
    except Exception as e:
        print(f"An error occurred: {e}")

# Example usage: remove redundant "#share-modal" entries from valid_urls.txt
input_file, output_file = 'valid_urls.txt', 'unique_urls.txt'
remove_duplicate_share_modal_urls(input_file=input_file, output_file=output_file)

Filtered URLs have been written to unique_urls.txt
