<a href="https://colab.research.google.com/github/ericyoc/gather_cyber_topics_poc/blob/main/gather_interesting_links.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
from google.colab import drive
import os
import re
from datetime import datetime
import requests
from urllib.parse import urlparse, parse_qs

def mount_drive():
    """Mount Google Drive in the Colab runtime.

    Returns:
        The path of the ``Interesting_Links`` folder inside the mounted drive.

    Raises:
        Exception: whatever ``drive.mount`` raised; it is reported on stdout
            and then re-raised unchanged.
    """
    links_root = '/content/drive/My Drive/Interesting_Links'  # only this folder is scanned
    try:
        drive.mount('/content/drive', force_remount=True)
    except Exception as e:
        print(f"Error mounting drive: {str(e)}")
        raise
    else:
        print("Successfully mounted Google Drive")
        return links_root

def get_url_description(url):
    """Generate a brief description for the URL.

    The description is derived from the URL's host name only; no network
    request is made.

    Args:
        url: The URL to describe.

    Returns:
        A label for a handful of well-known sites (for example
        ``"YouTube video"``), ``"Resource from <host>"`` for any other host
        (a leading ``www.`` is dropped; port and credentials are omitted),
        or ``""`` when *url* is not a string, cannot be parsed, or has no
        host component.
    """
    if not isinstance(url, str):
        return ""
    try:
        # .hostname is lower-cased and excludes any "user:pass@" / ":port" part.
        host = urlparse(url).hostname
    except ValueError:
        # urlparse rejects malformed input such as an unterminated IPv6 literal.
        return ""
    if not host:
        # e.g. "example.com/page" without a scheme has no network location.
        return ""

    known_sites = (
        (('youtube.com', 'youtu.be'), "YouTube video"),
        (('github.com',), "GitHub repository"),
        (('medium.com',), "Medium article"),
        (('stackoverflow.com',), "Stack Overflow post"),
        (('docs.google.com',), "Google Docs document"),
    )
    for domains, label in known_sites:
        # Match the domain itself or one of its subdomains, never a mere
        # substring (so "notgithub.com.evil.org" is not treated as GitHub).
        if any(host == d or host.endswith('.' + d) for d in domains):
            return label

    domain = host[4:] if host.startswith('www.') else host
    return f"Resource from {domain}"

def get_shortcut_info(file_path):
    """Extract information from an Internet shortcut (``.url``) file.

    The shortcut's target is read from its ``URL=`` line. The description is
    taken from a ``.txt`` file with the same base name next to the shortcut;
    when that file is missing or empty, ``get_url_description`` supplies a
    generic one.

    Args:
        file_path: Path of the ``.url`` file.

    Returns:
        A ``(name, url, description)`` tuple, where *name* is the file name
        without its extension, or ``None`` when the file has no non-empty
        ``URL=`` entry or cannot be read (the error is printed).
    """
    try:
        # utf-8-sig transparently drops a BOM if the file was saved with one.
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            content = f.read()

        # Anchor to the start of a line so keys such as "BASEURL=" or
        # "IconURL=" are not mistaken for the shortcut target.
        url_match = re.search(
            r'^[ \t]*URL[ \t]*=(.*)$', content, re.MULTILINE | re.IGNORECASE
        )
        url = url_match.group(1).strip() if url_match else ""
        if not url:
            return None

        name = os.path.splitext(os.path.basename(file_path))[0]

        # Try to get description from .txt file with same name
        description = ""
        desc_file = os.path.splitext(file_path)[0] + '.txt'
        if os.path.exists(desc_file):
            with open(desc_file, 'r', encoding='utf-8-sig') as f:
                description = f.read().strip()

        # If no description file exists, generate a basic description from URL
        if not description:
            description = get_url_description(url)

        return (name, url, description)
    except (OSError, UnicodeError) as e:
        # Best effort: one unreadable shortcut must not abort the whole scan.
        print(f"Error processing file {file_path}: {str(e)}")
        return None

def scan_directories(base_path):
    """Recursively scan directories and collect shortcut information.

    Args:
        base_path: Directory to walk.

    Returns:
        A dictionary mapping each directory that contains at least one
        readable shortcut to its list of ``(name, url, description)`` tuples.
        Keys are paths relative to *base_path*; shortcuts located directly in
        *base_path* are filed under ``'Root'``. The dictionary is empty when
        *base_path* does not exist (a warning is printed).
    """
    directory_contents = {}

    if not os.path.exists(base_path):
        print(f"Warning: Path {base_path} does not exist")
        return directory_contents

    for root, _dirs, files in os.walk(base_path):
        shortcuts = []

        # os.walk yields files in arbitrary order; sort for reproducible output.
        for file in sorted(files):
            # Extension check is case-insensitive so "LINK.URL" is picked up too.
            if not file.lower().endswith('.url'):
                continue
            shortcut_info = get_shortcut_info(os.path.join(root, file))
            if shortcut_info:
                shortcuts.append(shortcut_info)

        if shortcuts:
            # Get relative path from the scanned base directory
            dir_name = os.path.relpath(root, base_path)
            if dir_name == '.':  # For files directly in the base directory
                dir_name = 'Root'
            directory_contents[dir_name] = shortcuts

    return directory_contents

def _rtf_escape(text):
    """Return *text* encoded so it can be embedded as literal RTF text."""
    pieces = []
    for ch in str(text):
        if ch in '\\{}':
            # These three characters are RTF syntax and must be backslash-escaped.
            pieces.append('\\' + ch)
        elif ch == '\n':
            pieces.append('\\line ')
        elif ch == '\r':
            continue
        elif ch == '\t':
            pieces.append('\\tab ')
        elif ord(ch) < 128:
            pieces.append(ch)
        else:
            # \uN takes a signed 16-bit UTF-16 code unit followed by one
            # fallback character ("?") for readers without Unicode support.
            data = ch.encode('utf-16-le', 'surrogatepass')
            for i in range(0, len(data), 2):
                unit = int.from_bytes(data[i:i + 2], 'little')
                if unit > 32767:
                    unit -= 65536
                pieces.append(f"\\u{unit}?")
    return ''.join(pieces)


def generate_rtf_content(directory_contents):
    """Generate RTF formatted content using directories as sections.

    Args:
        directory_contents: Mapping of section name to a list of
            ``(name, url, description)`` tuples.

    Returns:
        The complete RTF document as a string. Sections appear in sorted
        order; within a section the entries keep their given order. All
        section names, names, URLs and descriptions are RTF-escaped, so the
        result is pure ASCII and stays well-formed even when the text contains
        backslashes, braces or non-ASCII characters.
    """
    rtf_header = """{\\rtf1\\ansi\\deff0
{\\colortbl;\\red0\\green0\\blue0;\\red0\\green0\\blue255;}
{\\fonttbl{\\f0\\fswiss\\fcharset0 Arial;}{\\f1\\froman\\fcharset0 Times New Roman;}}
"""
    # The trailing space explicitly terminates the control word, so text that
    # follows (notably the "-" rule) cannot be read as a numeric parameter.
    line_break = "\\line "
    parts = [rtf_header]

    # Add title and date (month names can be non-ASCII in some locales)
    current_date = _rtf_escape(datetime.now().strftime("%B %d, %Y"))
    parts.append("\\f0\\fs32\\b Resource Collection\\b0" + line_break)
    parts.append(f"\\fs24\\i Generated on {current_date}\\i0" + line_break * 2)

    # Sort directories to ensure consistent ordering
    for directory in sorted(directory_contents):
        # Directory name as a bold section header; bold is switched off again
        # so the rule below it is rendered in normal weight.
        parts.append(f"\\fs28\\b {_rtf_escape(directory)}\\b0" + line_break)
        parts.append("\\fs24 " + "=" * 50 + line_break * 2)

        for name, url, description in directory_contents[directory]:
            parts.append(f"\\b Name:\\b0 {_rtf_escape(name)}" + line_break)
            parts.append(
                f"\\b URL:\\b0 \\cf2\\ul {_rtf_escape(url)}\\ulnone\\cf0" + line_break
            )
            if description:
                parts.append(
                    f"\\b Description:\\b0 {_rtf_escape(description)}" + line_break
                )
            parts.append("-" * 30 + line_break * 2)

        parts.append(line_break)  # Extra space between sections

    parts.append("}")
    return "".join(parts)

def save_rtf_document(content, output_path='/content/Resource_Collection.rtf'):
    """Save the RTF document to disk.

    Args:
        content: The RTF document text to write.
        output_path: Destination file; defaults to the Colab content
            directory. An existing file is overwritten.

    Prints a confirmation that says whether the file was created or updated,
    or an error message if the file could not be written. Nothing is returned.
    """
    # Must be checked before opening: opening in 'w' mode creates the file,
    # so a check made afterwards would always report "updated".
    already_existed = os.path.exists(output_path)
    try:
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(content)
    except (OSError, UnicodeError) as e:
        print(f"Error saving document: {str(e)}")
    else:
        print(f"Successfully {'updated' if already_existed else 'created'} RTF document at: {output_path}")

def main():
    """Run the whole pipeline: mount, scan, render, save.

    Any exception raised along the way is reported on stdout instead of
    propagating to the caller.
    """
    try:
        links_root = mount_drive()
        found = scan_directories(links_root)

        if found:
            # Render the collected shortcuts and write them to the Colab
            # content directory.
            save_rtf_document(generate_rtf_content(found))
        else:
            print("No shortcuts found in Interesting_Links directory")

    except Exception as e:
        print(f"An error occurred in main: {str(e)}")

# Entry point: run the pipeline when this code is executed directly rather
# than imported as a module.
if __name__ == "__main__":
    main()

Mounted at /content/drive
Successfully mounted Google Drive
Successfully updated RTF document at: /content/Resource_Collection.rtf
