In [1]:
import requests
import base64
import json
import bs4
import pandas as pd
import pprint as pp
from azure.devops.connection import Connection
from msrest.authentication import BasicAuthentication
from requests_oauthlib import OAuth2Session
import os

In [9]:
PAT = os.environ["PAT"]
# URL for listing all wikis in the project
organization = "o365exchange"  # replace with your organization name
project = "IP%20Engineering"  # replace with your project name
list_wikis_url = f"https://dev.azure.com/{organization}/{project}/_apis/wiki/wikis?api-version=7.1-preview.2"

# Encode the PAT for Basic Authentication
token = base64.b64encode(f":{PAT}".encode("utf-8")).decode("utf-8")

# Set up the headers with the Base64 encoded PAT for authentication
headers = {
    'Authorization': f'Basic {token}', 
    "Content-Type": "application/json"
}

In [None]:
# Make GET request to Azure DevOps REST API to list all wikis
response = requests.get(list_wikis_url, headers=headers)

In [11]:
# Set up for listing all pages in the 'IP Engineering.wiki' wikis
wiki_id = "db2ead7a-d607-492b-ad78-8dc55a059bfa"
pages_url = f"https://dev.azure.com/{organization}/{project}/_apis/wiki/wikis/{wiki_id}/pagesbatch?api-version=7.2-preview.1"
all_pages = []


data = {
    "top": 100,  # Set to maximum valid value
    "continuationToken": None
    #"pageViewsForDays": 30
}
response = requests.post(pages_url, headers=headers, data=json.dumps(data))

In [None]:
print(response)

In [None]:
response.json()

In [5]:
# Function to list all wikis
def list_wikis():
    response = requests.get(list_wikis_url, headers=headers)
    if response.status_code == 200:
        return response.json().get('value', [])
    else:
        print(f"Failed to retrieve wikis. Status code: {response.status_code}")
        print(f"Error message: {response.text}")
        return []

In [6]:
# Function to list pages in a specific wiki
def list_pages(wiki_id):
    pages_url = f"https://dev.azure.com/{organization}/{project}/_apis/wiki/wikis/{wiki_id}/pagesbatch?api-version=7.1-preview.1"
    data = {
        "top": 100,  # You can adjust this number based on how many pages you expect
        "continuationToken": None
    }
    response = requests.post(pages_url, headers=headers, data=json.dumps(data))
    if response.status_code == 200:
        return response.json().get('value', [])
    else:
        print(f"Failed to retrieve wiki pages. Status code: {response.status_code}")
        print(f"Error message: {response.text}")
        return []

In [7]:
# Main script
wikis = list_wikis()

if wikis:
    print("Found wikis:")
    pp.pprint(wikis)
    
    first_wiki = wikis[0]
    wiki_id = first_wiki['id']
    print(f"\nFetching pages for the first wiki: {first_wiki['name']}")
    pages = list_pages(wiki_id)
    if pages:
        print(f"Found {len(pages)} pages for wiki {first_wiki['name']}")
        for page in pages:
            print(f"Page ID: {page['id']}")
            print(f"Page Path: {page['path']}")
            print("-" * 50)
    else:
        print("No pages found in the first wiki.")
else:
    print("No wikis found in the project.")
 

Found wikis:
[{'id': 'db2ead7a-d607-492b-ad78-8dc55a059bfa',
  'mappedPath': '/',
  'name': 'IP Engineering.wiki',
  'projectId': 'f2b55896-e832-438d-9220-cbc08c545713',
  'remoteUrl': 'https://dev.azure.com/O365Exchange/f2b55896-e832-438d-9220-cbc08c545713/_wiki/wikis/db2ead7a-d607-492b-ad78-8dc55a059bfa',
  'repositoryId': 'db2ead7a-d607-492b-ad78-8dc55a059bfa',
  'type': 'projectWiki',
  'url': 'https://dev.azure.com/O365Exchange/f2b55896-e832-438d-9220-cbc08c545713/_apis/wiki/wikis/db2ead7a-d607-492b-ad78-8dc55a059bfa',
  'versions': [{'version': 'wikiMaster'}]},
 {'id': 'c0690210-75b0-4dd6-8fd5-9df00951f223',
  'mappedPath': '/SCC-Engineering',
  'name': 'M365SCC',
  'projectId': 'f2b55896-e832-438d-9220-cbc08c545713',
  'remoteUrl': 'https://dev.azure.com/O365Exchange/f2b55896-e832-438d-9220-cbc08c545713/_wiki/wikis/c0690210-75b0-4dd6-8fd5-9df00951f223',
  'repositoryId': '6201cb83-d7d0-4c17-ad3f-7723fb6efb8f',
  'type': 'codeWiki',
  'url': 'https://dev.azure.com/O365Exchange/f2

In [None]:
''' James working Code'''

In [31]:
token = base64.b64encode(f":{PAT}".encode("utf-8")).decode("utf-8")
# Define the endpoint URL
url = f"https://dev.azure.com/o365exchange/IP%20Engineering/_apis/wiki/wikis/IP%20Engineering.wiki/pagesbatch?api-version=7.1-preview.1"

# Set up headers with authentication
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Basic {token}"
}

# Function to get all pages
def get_all_wiki_pages(url, headers):
    all_pages = []
    continuation_token = "1"
    
    while True:
        # Add continuationToken to the query if it exists
        if continuation_token:
            data = {
                "continuationToken": continuation_token,
            }
        else:
            data = {}
        
        response = requests.post(url, headers=headers, data=json.dumps(data))
        
        response_data = response.json()
        
        
        # Append the pages to the list
        if 'value' in response_data:
            all_pages.extend(response_data['value'])
        
        # Check for continuationToken
        if 'X-MS-ContinuationToken' in response.headers:
            continuation_token = response.headers['X-MS-ContinuationToken']
        else:
            break
    
    return all_pages

In [33]:
# Function to save each page's content to a text file
def scrape_wiki_pages_to_txt(all_pages):
    for page in all_pages:
        page_id = page['id']
        page_path = page['path']
        page_title = page['path'].split('/')[-1]
        
        # Extract last part of the path
        last_page_path = page_path.rsplit('/', 1)[-1].replace('/', '-')  # Replace '/' with '-' in the last part
        
        # Construct the URL for the wiki page
        wiki_url = f"https://o365exchange.visualstudio.com/IP%20Engineering/_wiki/wikis/IP%20Engineering.wiki/{page_id}/{page_title}"
        
        # Make a GET request to fetch the content of the wiki page
        response = requests.get(wiki_url)
        
        if response.status_code == 200:
            # Parse HTML content using BeautifulSoup
            wiki = bs4.BeautifulSoup(response.text, "html.parser")
            
            # Extract paragraphs and write them to a text file
            with open(f"{page_id}_{last_page_path}.txt", "w", encoding="utf-8") as f:
                for paragraph in wiki.select("p"):
                    f.write(paragraph.getText() + "\n")
        else:
            print(f"Failed to fetch page {page_id}_{last_page_path}. Status code: {response.status_code}")

In [27]:
# Construct the URL for the wiki page
wiki_url = f"https://o365exchange.visualstudio.com/IP%20Engineering/_wiki/wikis/IP%20Engineering.wiki/{page_id}/{page_path}"

# Function to save each page's content to a text file
def scrape_wiki_pages_to_txt(all_pages):
        # Make a GET request to fetch the content of the wiki page
    response = requests.get(url, headers = headers)
    if response.status_code == 200:
        # Parse HTML content using BeautifulSoup
        all_pages = response.json()['content']
        for page in all_pages:
            page_id = page['id']
            page_path = page['path']
            page_title = page['path'].split('/')[-1]
            page_content = page['content']
        
            # Extract paragraphs and write them to a text file
            with open(f"{page_id}_{page_title}.txt", "w", encoding="utf-8") as file:
                for paragraph in wiki.select("p"):
                    file.write(paragraph.getText() + "\n")
    else:
        print(f"Failed to fetch page {page_id}_{page_title}. Status code: {response.status_code}")

In [34]:
# Fetch all pages
all_pages = get_all_wiki_pages(url, headers)
#print(f"Total pages retrieved: {len(all_pages)}")
#print(all_pages)

In [35]:
scrape_wiki_pages_to_txt(all_pages)

Failed to fetch page 187_Analysts Onboarding. Status code: 203
Failed to fetch page 188_On Call Process & Responsbilities. Status code: 203
Failed to fetch page 189_V-Team Updates. Status code: 203
Failed to fetch page 190_Org Security Team Wiki. Status code: 203
Failed to fetch page 191_Abuse Types. Status code: 203
Failed to fetch page 192_Outbound Spam. Status code: 203
Failed to fetch page 193_Enterprise outbound spam. Status code: 203
Failed to fetch page 194_Single Sender Compromise. Status code: 203
Failed to fetch page 195_Tenant. Status code: 203
Failed to fetch page 196_Malicious Tenant. Status code: 203


KeyboardInterrupt: 

In [32]:
token = base64.b64encode(f":{PAT}".encode("utf-8")).decode("utf-8")

# The API endpoint for the Azure REST API
api_url = "https://dev.azure.com/o365exchange/IP%20Engineering/_apis/wiki/wikis/IP%20Engineering.wiki/pagesbatch?api-version=7.1-preview.1"

# Headers for authentication
headers = {
    'Content-Type': 'application/json',
    'Authorization': f'Basic {token}'
}

def download_wiki_pages(api_url, headers):
    response = requests.get(api_url, headers=headers)
    if response.status_code == 200:
        pages = response.json()['value']
        for page in pages:
            page_id = page['id']
            page_title = page['path'].split('/')[-1]
            page_content = page['content']
            
            # Save the page content to a text file
            with open(f"{page_title}.txt", 'w', encoding='utf-8') as file:
                file.write(page_content)
            print(f"Downloaded {page_title}.txt")
    else:
        print(f"Failed to retrieve wiki pages: {response.status_code}")

# Call the function to start downloading wiki pages
download_wiki_pages(api_url, headers)

Failed to retrieve wiki pages: 405
