In [1]:
import requests
from bs4 import BeautifulSoup
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time
import pandas as pd
import re
import pickle
from tqdm import tqdm


In [2]:


def nzta_oia_crawler_preparation(soup):
 
    # Parse the HTML content
    # Define the regular expression pattern to match the desired link format
    # The pattern will now match any URL that starts with 'oia-####-response' and ends with '.pdf'
    pattern = re.compile(r'oia-\d+-response.*\.pdf', re.IGNORECASE)

    # List to store the results with links and their preceding <p>, <h3>, <i>, and <h4> (year) elements
    results_2024_2023 = []

    # Find all links that match the pattern and capture their preceding <p>, <h3>, <i> (month), and <h4> (year)
    for link in soup.find_all('a', href=True):
        if pattern.search(link['href']):
            # Initialize variables to store the preceding <p>, <h3>, month, and year elements
            preceding_p = None
            preceding_h3 = None
            month = None
            year = None
            
            # Traverse backward in the document from the link's position
            prev_element = link.find_previous(['p', 'h3', 'i', 'h4'])
            while prev_element:
                if prev_element.name == 'p' and not preceding_p:
                    preceding_p = prev_element.get_text(strip=True)
                elif prev_element.name == 'h3' and not preceding_h3:
                    preceding_h3 = prev_element.get_text(strip=True)
                elif prev_element.name == 'i' and 'i-caret-down' in prev_element.get('class', []) and not month:
                    # Find the associated month from the parent <a> tag
                    month = prev_element.find_parent('a').get_text(strip=True)
                elif prev_element.name == 'h4' and not year:
                    # Find the year from the <h4> tag
                    year = prev_element.get_text(strip=True)
                # Break the loop if all elements are found
                if preceding_p and preceding_h3 and month and year:
                    break
                prev_element = prev_element.find_previous(['p', 'h3', 'i', 'h4'])
            
            # Store the result with the link and its associated <p>, <h3>, month, and year
            results_2024_2023.append({
                'url': link['href'],
                'response_letter': link.get_text(strip=True),
                'preceding_p': preceding_p,
                'preceding_h3': preceding_h3,
                'month': month,
                'year': year
            })

    return results_2024_2023


# Function to find related attachments based on the response file name
def find_related_attachments_2024_2023(response_filename):
    # Extract the ID number from the response filename (assuming the pattern is oia-####-response*.pdf)
    request_id_match = re.search(r'oia-(\d+)-response.*\.pdf', response_filename, re.IGNORECASE)

    if not request_id_match:
        print(f"No valid ID found in the filename: {response_filename}")
        return []  # No valid ID found, return an empty list

    request_id = request_id_match.group(1)

    # Create a pattern to identify attachment URLs with the same ID
    # The pattern allows for any characters between "attachment" and any file extension
    attachment_pattern = re.compile(r'oia-' + request_id + r'(-\d+)?-attachment.*\..+', re.IGNORECASE)




    # List to store found attachments
    attachments = []

    # Find all <a> tags that match the attachment pattern
    for link in soup.find_all('a', href=True):
        if attachment_pattern.search(link['href']):
            attachment_info = link['href']  # Store only the URL
            attachments.append(attachment_info)

    return attachments





def nzta_oia_non_media_crawler_preparation(soup):
    response_pattern = re.compile(r'^(?!.*attachment).*\.pdf', re.IGNORECASE)

    # List to store the results with links and their preceding <p>, <h3>, <i>, and <h4> (year) elements
    results_non_media = []

    # Find all links that match the pattern and exclude 'attachment'
    for link in soup.find_all('a', href=True):
        if response_pattern.search(link['href']):
            # Initialize variables to store the preceding <p>, <h3>, month, and year elements
            preceding_p = None
            preceding_h3 = None
            year = None
            
            # Traverse backward in the document from the link's position
            prev_element = link.find_previous(['p', 'h2', 'i'])
            while prev_element:
                if prev_element.name == 'p' and not preceding_p:
                    preceding_p = prev_element.get_text(strip=True)
                elif prev_element.name == 'h2' and not preceding_h3:
                    preceding_h3 = prev_element.get_text(strip=True)
                elif prev_element.name == 'i' and 'i-caret-down' in prev_element.get('class', []) and not year:
                    # Find the associated year from the parent <a> tag
                    year = prev_element.find_parent('a').get_text(strip=True)
            
                # Break the loop if all elements are found
                if preceding_p and preceding_h3 and year:
                    break
                prev_element = prev_element.find_previous(['p', 'h2', 'i'])
            
            # Store the result with the link and its associated <p>, <h3>, month, and year
            results_non_media.append({
                'url': link['href'],
                'response_letter': link.get_text(strip=True),
                'preceding_p': preceding_p,
                'preceding_h3': preceding_h3,
                'year': year
            })
    return results_non_media

def find_related_attachments_non_media(response_filename):
    # Extract the ID number from the response filename (assuming the pattern is oia-####-response*.pdf)
    request_id_match = re.search(r'oia-(\d+)-.*\.pdf', response_filename, re.IGNORECASE)

    
    if not request_id_match:
        return []  # No valid ID found, return an empty list

    request_id = request_id_match.group(1)

    # Create a pattern to identify attachment URLs with the same ID
    # The pattern allows for any characters between "attachment" and any file extension
    attachment_pattern = re.compile(r'oia-' + request_id + r'(-\d+)?-attachment.*\..+', re.IGNORECASE)


    # List to store found attachments
    attachments = []

    # Find all <a> tags that match the attachment pattern
    for link in soup.find_all('a', href=True):
        if attachment_pattern.search(link['href']):
            attachment_info = link['href']  # Store only the URL
            attachments.append(attachment_info)

    return attachments


def nzta_oia_media_crawler_preparation(soup):
    # Modify the initial pattern to exclude 'attachment' URLs
    response_pattern = re.compile(r'^(?!.*attachment).*\.pdf', re.IGNORECASE)

    # List to store the results with links and their preceding <p>, <h3>, <i>, and <h4> (year) elements
    results_media = []

    # Find all links that match the pattern and exclude 'attachment'
    for link in soup.find_all('a', href=True):
        if response_pattern.search(link['href']):
            # Initialize variables to store the preceding <p>, <h3>, month, and year elements
            preceding_p = None
            preceding_h3 = None
            year = None
            
            # Traverse backward in the document from the link's position
            prev_element = link.find_previous(['p', 'h2', 'i'])
            while prev_element:
                if prev_element.name == 'p' and not preceding_p:
                    preceding_p = prev_element.get_text(strip=True)
                elif prev_element.name == 'h2' and not preceding_h3:
                    preceding_h3 = prev_element.get_text(strip=True)
                elif prev_element.name == 'i' and 'i-caret-down' in prev_element.get('class', []) and not year:
                    # Find the associated year from the parent <a> tag
                    year = prev_element.find_parent('a').get_text(strip=True)
            
                # Break the loop if all elements are found
                if preceding_p and preceding_h3 and year:
                    break
                prev_element = prev_element.find_previous(['p', 'h2', 'i'])
            
            # Store the result with the link and its associated <p>, <h3>, month, and year
            results_media.append({
                'url': link['href'],
                'response_letter': link.get_text(strip=True),
                'preceding_p': preceding_p,
                'preceding_h3': preceding_h3,
                'year': year
            })
    return results_media


# Function to find related attachments based on the response file name
def find_related_attachments_media(response_filename):
    # Extract the ID number from the response filename (assuming the pattern is oia-####-response*.pdf)
    request_id_match = re.search(r'oia-(\d+)-.*\.pdf', response_filename)
    
    if not request_id_match:
        return []  # No valid ID found, return an empty list

    request_id = request_id_match.group(1)

    # Create a pattern to identify attachment URLs with the same ID
    # The pattern allows for any characters between "attachment" and any file extension
    attachment_pattern = re.compile(r'oia-' + request_id + r'(-\d+)?-attachment.*\..+', re.IGNORECASE)



    # List to store found attachments
    attachments = []

    # Find all <a> tags that match the attachment pattern
    for link in soup.find_all('a', href=True):
        if attachment_pattern.search(link['href']):
            attachment_info = link['href']  # Store only the URL
            attachments.append(attachment_info)

    return attachments


# # Output the results
# for result in results_2024_2023:
#     print(f"URL: {result['url']}")
#     print(f"Response Letter: {result['response_letter']}")
#     print(f"Preceding <p>: {result['preceding_p']}")
#     print(f"Preceding <h3>: {result['preceding_h3']}")
#     print(f"Month: {result['month']}")
#     print(f"Year: {result['year']}")
#     print("=" * 40)


In [3]:



response = requests.get('https://www.nzta.govt.nz/about-us/official-information-act/official-information-act-responses/')
soup = BeautifulSoup(response.content, 'html.parser')

results_2024_2023 = nzta_oia_crawler_preparation(soup)


# Loop over each response in the results list to find and attach related attachments
for result in results_2024_2023:
    response_filename = result['url']
    related_attachments = find_related_attachments_2024_2023(response_filename)
    result['attachments'] = related_attachments

# Output the results with attached attachments
for result in results_2024_2023:
    print(f"URL: {result['url']}")
    print(f"Response Letter: {result['response_letter']}")
    print(f"Preceding <p>: {result['preceding_p']}")
    print(f"Preceding <h3>: {result['preceding_h3']}")
    print(f"Month: {result['month']}")
    print(f"Year: {result['year']}")
    print("Attachments:")
    for attachment in result['attachments']:
        print(f"  - {attachment}")
    print("=" * 40)





URL: /assets/About-us/docs/oia-2024/oia-16747-response-letter.pdf
Response Letter: Response letter[PDF, 347 KB]
Preceding <p>: Responded: 20 November 2024Requested by: Member of the public
Preceding <h3>: Advice to the minister on Hawke’s Bay Expressway
Month: November
Year: 2024
Attachments:
  - /assets/About-us/docs/oia-2024/oia-16747-attachment-1.pdf
  - /assets/About-us/docs/oia-2024/oia-16747-attachment-2.pdf
URL: /assets/About-us/docs/oia-2024/oia-16809-response-letter.pdf
Response Letter: Response letter[PDF, 123 KB]
Preceding <p>: Responded: 19 November 2024Requested by: Member of the public
Preceding <h3>: Traffic management plan for Ironman 70.3 world championship – Taupo, 14–15 December 2024
Month: November
Year: 2024
Attachments:
  - /assets/About-us/docs/oia-2024/oia-16809-attachment-1.pdf
URL: /assets/About-us/docs/oia-2024/oia-16706-response-letter.pdf
Response Letter: Response letter[PDF, 261 KB]
Preceding <p>: Responded: 19 November 2024Requested by: Member of the publ

In [4]:


# Loop over each response in the results list to find and attach related attachments

response = requests.get('https://www.nzta.govt.nz/about-us/official-information-act/official-information-act-responses/non-media-official-information-act-oia-responses/')
soup = BeautifulSoup(response.content, 'html.parser')

results_non_media = nzta_oia_non_media_crawler_preparation(soup)
for result in results_non_media:
    response_filename = result['url']
    related_attachments = find_related_attachments_non_media(response_filename)
    result['attachments'] = related_attachments

# Output the results with attached attachments
for result in results_non_media:
    print(f"URL: {result['url']}")
    print(f"Response Letter: {result['response_letter']}")
    print(f"Preceding <p>: {result['preceding_p']}")
    print(f"Preceding <h3>: {result['preceding_h3']}")
    print(f"Year: {result['year']}")
    print("Attachments:")
    for attachment in result['attachments']:
        print(f"  - {attachment}")
    print("=" * 40)



URL: /assets/About-us/docs/oia2-2022/oia-11576-response.pdf
Response Letter: Response letter[PDF, 234 KB]
Preceding <p>: Responded: 23 December 2022
Preceding <h3>: Transport projects funded by Waka Kotahi where spending commenced for construction between 2009–2022
Year: 2022
Attachments:
  - /assets/About-us/docs/oia2-2022/oia-11576-attachment.xlsx
URL: /assets/About-us/docs/oia2-2022/oia-11570-response.pdf
Response Letter: Response letter[PDF, 326 KB]
Preceding <p>: Responded: 22 December 2022
Preceding <h3>: SH2 speed review and safety improvements – public engagement and safety assessments
Year: 2022
Attachments:
  - /assets/About-us/docs/oia2-2022/oia-11570-attachment-1.pdf
  - /assets/About-us/docs/oia2-2022/oia-11570-attachment-2.pdf
URL: /assets/About-us/docs/oia2-2022/oia-11521-response.pdf
Response Letter: Response letter[PDF, 261 KB]
Preceding <p>: Responded: 22 December 2022
Preceding <h3>: SH2 Hebden Crescent and Liverton Road crash stats / feasibility design report
Year: 

Finally the same for media requests


In [5]:


response = requests.get('https://www.nzta.govt.nz/about-us/official-information-act/official-information-act-responses/media-official-information-act-oia-responses/')
# Parse the HTML content
soup = BeautifulSoup(response.content, 'html.parser')

results_media = nzta_oia_media_crawler_preparation(soup)



# Loop over each response in the results list to find and attach related attachments
for result in results_media:
    response_filename = result['url']
    related_attachments = find_related_attachments_media(response_filename)
    result['attachments'] = related_attachments

# Output the results with attached attachments
for result in results_media:
    print(f"URL: {result['url']}")
    print(f"Response Letter: {result['response_letter']}")
    print(f"Preceding <p>: {result['preceding_p']}")
    print(f"Preceding <h3>: {result['preceding_h3']}")
    print(f"Year: {result['year']}")
    print("Attachments:")
    for attachment in result['attachments']:
        print(f"  - {attachment}")
    print("=" * 40)


URL: /assets/About-us/docs/oia-2022/oia-11277-response-phil-pennington.pdf
Response Letter: Response letter[PDF, 94 KB]
Preceding <p>: Responded: 19 December 2022Requested by: Radio New Zealand
Preceding <h3>: National Ticketing Solution business case and other reports
Year: 2022
Attachments:
  - /assets/About-us/docs/oia-2022/oia-11277-attachment-1.pdf
  - /assets/About-us/docs/oia-2022/oia-11277-attachment-2.pdf
  - /assets/About-us/docs/oia-2022/oia-11277-attachment-3.pdf
  - /assets/About-us/docs/oia-2022/oia-11277-attachment-4.pdf
URL: /assets/About-us/docs/oia-2022/OIA-11558-response-letter.pdf
Response Letter: Response letter[PDF, 67 KB]
Preceding <p>: Responded: 15 December 2022Requested by: Wairarapa Age Times
Preceding <h3>: Limited Licence processing times
Year: 2022
Attachments:
URL: /assets/About-us/docs/oia-2022/OIA-11491-response-caroline-williams.pdf
Response Letter: Response letter[PDF, 122 KB]
Preceding <p>: Responded: 14 December 2022Requested by: Stuff
Preceding <h3

In [6]:
all_results = results_2024_2023 + results_non_media + results_media

In [7]:

# Save to a file
with open('file_metadata.pkl', 'wb') as file:
    pickle.dump(all_results, file)


In [8]:
# List of month names for extraction
MONTHS = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December'
]

def extract_month(text):
    """
    Extracts the first occurrence of a month name from the given text.

    Parameters:
        text (str): The text to search for a month.

    Returns:
        str: The found month name or 'Unknown' if no month is found.
    """
    for month in MONTHS:
        if month.lower() in text.lower():
            return month
    return 'Unknown'

def download_file(url, save_path):
    """
    Downloads a file from the given URL and saves it to the specified path.

    Parameters:
        url (str): The URL of the file to download.
        save_path (str): The local file path to save the downloaded file.
    """
    try:
        response = requests.get(url, stream=True)
        response.raise_for_status()  # Raise an error for bad status codes
        with open(save_path, 'wb') as file:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:
                    file.write(chunk)
    except requests.exceptions.RequestException as e:
        print(f"Failed to download {url}. Error: {e}")

def main(all_results):
    """
    Processes the results to download files organized by year and month.

    Parameters:
        all_results (list): A list of result dictionaries containing file information.
    """
    base_url = 'https://www.nzta.govt.nz'
    base_dir = "transport_data/nzta"
    printed_years = set()

    for result in tqdm(all_results, desc="Processing Results", unit="file"):
        # Extract and validate the year
        year_str = result.get('year', '').strip()
        try:
            year = int(year_str)
        except ValueError:
            print(f"Invalid year '{year_str}' in result. Skipping entry.")
            continue

        # Determine the month based on the year
        if year <= 2022:
            preceding_p = result.get('preceding_p', '')
            month = extract_month(preceding_p)
        else:
            month = result.get('month', 'Unknown')

        # Print the year being downloaded only once
        if year not in printed_years:
            print(f"📂 Downloading files for year {year}")
            printed_years.add(year)

        # Define the directory path
        dir_path = os.path.join(base_dir, str(year), month)
        os.makedirs(dir_path, exist_ok=True)

        # Download the main response letter PDF
        response_relative_url = result.get('url', '')
        if not response_relative_url:
            print("No URL found for response letter. Skipping.")
            continue
        response_url = base_url + response_relative_url
        response_filename = os.path.basename(response_relative_url)
        response_save_path = os.path.join(dir_path, response_filename)
        download_file(response_url, response_save_path)

        # Download all attachments
        attachments = result.get('attachments', [])
        for attachment_relative_url in attachments:
            attachment_url = base_url + attachment_relative_url
            attachment_filename = os.path.basename(attachment_relative_url)
            attachment_save_path = os.path.join(dir_path, attachment_filename)
            download_file(attachment_url, attachment_save_path)

if __name__ == "__main__":
    # Example usage:
    # Replace the following list with your actual `all_results` data

    main(all_results)


Processing Results:   0%|          | 0/1070 [00:00<?, ?file/s]

📂 Downloading files for year 2024


Processing Results:  19%|█▉        | 201/1070 [01:27<02:28,  5.84file/s]

📂 Downloading files for year 2023


Processing Results:  43%|████▎     | 461/1070 [02:54<02:24,  4.22file/s]

📂 Downloading files for year 2022


Processing Results:  52%|█████▏    | 553/1070 [03:42<01:35,  5.41file/s]

📂 Downloading files for year 2021


Processing Results:  57%|█████▋    | 607/1070 [03:55<01:05,  7.07file/s]

📂 Downloading files for year 2020


Processing Results:  58%|█████▊    | 622/1070 [04:08<01:31,  4.92file/s]

📂 Downloading files for year 2019


Processing Results:  81%|████████  | 862/1070 [06:21<00:48,  4.29file/s]

📂 Downloading files for year 2018


Processing Results:  86%|████████▌ | 918/1070 [06:27<00:16,  9.25file/s]

📂 Downloading files for year 2017


Processing Results:  88%|████████▊ | 939/1070 [06:37<02:05,  1.04file/s]

📂 Downloading files for year 2016


Processing Results:  89%|████████▉ | 953/1070 [06:41<00:26,  4.48file/s]

📂 Downloading files for year 2015


Processing Results:  91%|█████████▏| 979/1070 [06:47<00:12,  7.15file/s]

📂 Downloading files for year 2014


Processing Results:  94%|█████████▍| 1006/1070 [06:50<00:07,  8.96file/s]

📂 Downloading files for year 2013


Processing Results:  98%|█████████▊| 1051/1070 [06:57<00:02,  7.74file/s]

📂 Downloading files for year 2012


Processing Results: 100%|██████████| 1070/1070 [06:59<00:00,  2.55file/s]


In [14]:
all_results
experiment_files = pd.read_csv('reference_tables/oia_files_and_questions.csv')

experiment_files.head()

Unnamed: 0,File Name,Question,Answer
0,oia-7339-nigel-parry.pdf,What is the NZTA planning in terms of road net...,NZTA is considering changes as part of the Mar...
1,oia-7339-nigel-parry.pdf,When is the draft Strategic Case for Picton Po...,The draft Strategic Case for Picton Port Acces...
2,oia-12662-response-letter.pdf,What is the crash history for the SH2/SH53 int...,"According to the Crash Analysis System (CAS), ..."
3,oia-12662-response-letter.pdf,Are there any future upgrades planned to addre...,The speed limit in Featherston was reduced to ...
4,oia-13817-response-letter.pdf,How many MR1 and MR1B transactions were comple...,"A total of 3,927,179 MR1 and MR1B transactions..."


In [None]:
import os
import requests
import pandas as pd

# Load the experiment files with file names
experiment_files = pd.read_csv('reference_tables/oia_files_and_questions.csv')

# Base URL for constructing full links
base_url = "https://www.nzta.govt.nz"

# Directory to save the downloaded files
output_dir = 'transport_data/sample_files/nzta/simple_files/'
os.makedirs(output_dir, exist_ok=True)

# Loop through the rows in the experiment files DataFrame
for index, row in experiment_files.iterrows():
    file_name = row['File Name']
    
    # Find the matching entry in all_results
    for result in all_results:
        # Check if the file name matches the end of the 'url' field in all_results
        if file_name in result['url']:
            # Download the main file
            full_url = base_url + result['url']
            print(f"Downloading main file: {file_name} from {full_url}")
            response = requests.get(full_url)
            file_path = os.path.join(output_dir, file_name)
            with open(file_path, 'wb') as f:
                f.write(response.content)
            
            # Download attachments, if any
            for attachment in result.get('attachments', []):
                attachment_url = base_url + attachment
                attachment_name = attachment.split('/')[-1]
                print(f"Downloading attachment: {attachment_name} from {attachment_url}")
                response = requests.get(attachment_url)
                attachment_path = os.path.join(output_dir, attachment_name)
                with open(attachment_path, 'wb') as f:
                    f.write(response.content)


In [9]:
# Download simple files
simple_files = pd.read_csv('reference_tables/simple_files.csv')

# loop over the rows in simple files and select the Link column
# find the ntry in all_results that matches the link and download that link and the attatchments if any into testing_data/simple_files


for index, row in simple_files.iterrows():
    link = row['Link']
    #print(link)
    for result in all_results:
        if 'https://www.nzta.govt.nz' + result['url'] == link:
            
            # Make the directory if it doesn't exist
            os.makedirs('transport_data/sample_files/nzta/simple_files/', exist_ok=True)
            print(f'transport_data/sample_files/nzta/simple_files/{link.split("/")[-1]}')
            response = requests.get(link)
            with open(f'transport_data/sample_files/nzta/simple_files/{link.split("/")[-1]}', 'wb') as f:
                f.write(response.content)
            for attachment in result['attachments']:
                response = requests.get(attachment)
                with open(f'testing_data/simple_files/{attachment.split("/")[-1]}', 'wb') as f:
                    f.write(response.content)

transport_data/sample_files/nzta/simple_files/oia-16040-response-letter.pdf
transport_data/sample_files/nzta/simple_files/oia-15868-response-letter.pdf
transport_data/sample_files/nzta/simple_files/oia-15866-response-letter.pdf
transport_data/sample_files/nzta/simple_files/OIA-15786-response-letter.pdf
transport_data/sample_files/nzta/simple_files/oia-15726-response-letter.pdf
transport_data/sample_files/nzta/simple_files/oia-15454-response-letter.pdf
transport_data/sample_files/nzta/simple_files/oia-15602-response-letter.pdf
transport_data/sample_files/nzta/simple_files/oia-15503-response-letter.pdf
transport_data/sample_files/nzta/simple_files/oia-15628-response-letter.pdf
transport_data/sample_files/nzta/simple_files/oia-15406-response-letter.pdf
transport_data/sample_files/nzta/simple_files/oia-15483-response-letter.pdf
transport_data/sample_files/nzta/simple_files/oia-15366-response-letter.pdf
transport_data/sample_files/nzta/simple_files/oia-15239-response-letter.pdf
transport_da

In [None]:
# Download simple tabled files
simple_tabled_files = pd.read_csv('reference_tables/simple_tabled_files.csv')

# loop over the rows in simple tabled files and select the Link column
# find the entry in all_results that matches the link and download that link and the attatchments if any into testing_data/simple_tabled_files
for index, row in simple_tabled_files.iterrows():
    link = row['Link']
    #print(link)
    for result in all_results:
        if 'https://www.nzta.govt.nz' + result['url'] == link:
            print('yes')
            print(f'transport_data/simple_table_files/{link.split("/")[-1]}')
            os.makedirs('transport_data/sample_files/nzta/simple_tabled_files', exist_ok=True)
            response = requests.get(link)
            with open(f'transport_data/sample_files/nzta/simple_tabled_files/{link.split("/")[-1]}', 'wb') as f:
                f.write(response.content)
            for attachment in result['attachments']:
                response = requests.get(attachment)
                with open(f'testing_data/simple_table_files/{attachment.split("/")[-1]}', 'wb') as f:
                    f.write(response.content)

yes
transport_data/simple_table_files/oia-15911-response-letter.pdf
yes
transport_data/simple_table_files/oia-15470-response-letter.pdf
yes
transport_data/simple_table_files/OIA-15348-response-letter.pdf
yes
transport_data/simple_table_files/OIA-15005-response-letter.pdf
yes
transport_data/simple_table_files/OIA-14738-response-letter.pdf
yes
transport_data/simple_table_files/oia-14203-response-letter.pdf
yes
transport_data/simple_table_files/oia-13918-response-letter.pdf
yes
transport_data/simple_table_files/oia-13816-response-letter.pdf
yes
transport_data/simple_table_files/oia-13671-response-letter.pdf
yes
transport_data/simple_table_files/oia-13759-response-letter.pdf
yes
transport_data/simple_table_files/oia-13817-response-letter.pdf
yes
transport_data/simple_table_files/oia-13654-response-letter.pdf
yes
transport_data/simple_table_files/oia-13673-response-letter.pdf
yes
transport_data/simple_table_files/oia-13480-response-letter.pdf
yes
transport_data/simple_table_files/oia-13351-

In [12]:
all_results

[{'url': '/assets/About-us/docs/oia-2024/oia-16747-response-letter.pdf',
  'response_letter': 'Response letter[PDF, 347 KB]',
  'preceding_p': 'Responded: 20 November 2024Requested by: Member of the public',
  'preceding_h3': 'Advice to the minister on Hawke’s Bay Expressway',
  'month': 'November',
  'year': '2024',
  'attachments': ['/assets/About-us/docs/oia-2024/oia-16747-attachment-1.pdf',
   '/assets/About-us/docs/oia-2024/oia-16747-attachment-2.pdf']},
 {'url': '/assets/About-us/docs/oia-2024/oia-16809-response-letter.pdf',
  'response_letter': 'Response letter[PDF, 123 KB]',
  'preceding_p': 'Responded: 19 November 2024Requested by: Member of the public',
  'preceding_h3': 'Traffic management plan for Ironman 70.3 world championship – Taupo, 14–15 December 2024',
  'month': 'November',
  'year': '2024',
  'attachments': ['/assets/About-us/docs/oia-2024/oia-16809-attachment-1.pdf']},
 {'url': '/assets/About-us/docs/oia-2024/oia-16706-response-letter.pdf',
  'response_letter': 'R

In [46]:
# Download annexed files
annexed_files = pd.read_csv('reference_tables/annexed_files.csv')
# select only half of the annexed files rows
# loop over the rows in annexed files and select the Link column
# find the ntry in all_results that matches the link and download that link and the attatchments if any into testing_data/annexed_files
for index, row in annexed_files.iterrows():
    link = row['Link']
    #print(link)
    for result in all_results:
        if 'https://www.nzta.govt.nz' + result['url'] == link:
            
            print(f"file found")
            response = requests.get(link)
            # make dir transport_data/sample_files/nzta/annexed_files

            os.makedirs('transport_data/sample_files/nzta/annexed_files', exist_ok=True)
            with open(f'transport_data/sample_files/nzta/annexed_files/{link.split("/")[-1]}', 'wb') as f:
                f.write(response.content)
            for attachment in result['attachments']:
                print(f"found attachment")
                response = requests.get('https://www.nzta.govt.nz' + attachment)
                with open(f'transport_data/sample_files/nzta/annexed_files/{attachment.split("/")[-1]}', 'wb') as f:
                    f.write(response.content)

file found
file found
file found
file found
file found
file found
file found
file found
file found
file found
file found
file found
file found
file found
file found


In [33]:
# Download annexed tabled files
annexed_tabled_files = pd.read_csv('reference_tables/annexed_tabled_files.csv')

# loop over the rows in annexed tabled files and select the Link column
# find the ntry in all_results that matches the link and download that link and the attatchments if any into testing_data/annexed_tabled_files

for index, row in annexed_tabled_files.iterrows():
    link = row['Link']
    #print(link)
    for result in all_results:
        if 'https://www.nzta.govt.nz' + result['url'] == link:
           

           # makde directory if it doesn't exist
            os.makedirs('transport_data/sample_files/nzta/annexed_table_files', exist_ok=True)

            response = requests.get(link)
            with open(f'transport_data/sample_files/nzta/annexed_table_files/{link.split("/")[-1]}', 'wb') as f:
                f.write(response.content)
            for attachment in result['attachments']:
                response = requests.get('https://www.nzta.govt.nz' + attachment)
                with open(f'transport_data/sample_files/annexed_table)files/{attachment.split("/")[-1]}', 'wb') as f:
                    f.write(response.content)

In [36]:
# # Look for .zip files in all of the testing data directories and extract them
# import zipfile
# for directory in os.listdir('transport_data/'):
#     for file in os.listdir(f'transport_data/{directory}'):
#         if file.endswith('.zip'):
#             print(f'testing_data/{directory}/{file}')
#             with zipfile.ZipFile(f'testing_data/{directory}/{file}', 'r') as zip_ref:
#                 zip_ref.extractall(f'testing_data/{directory}/')
#             os.remove(f'testing_data/{directory}/{file}')


In [None]:
# import random
# import requests
# import os

# # Set the random seed and sample 500 results
# random.seed(42)
# sample = random.sample(all_results, 500)

# # Directory to save the files
# output_dir = 'testing_data/noise_files'
# os.makedirs(output_dir, exist_ok=True)

# # Initialize the count of downloaded files
# count = 0

# # Download the sample
# for result in sample:
#     # Download the main file
#     link = 'https://www.nzta.govt.nz' + result['url']
#     print(f"Downloading main file: {link}")
#     response = requests.get(link)
#     with open(os.path.join(output_dir, link.split("/")[-1]), 'wb') as f:
#         f.write(response.content)
    
#     # Increment count for the main file
#     count += 1
    
#     # Download the attachments
#     for attachment in result['attachments']:
#         attachment_link = 'https://www.nzta.govt.nz' + attachment
#         print(f"Downloading attachment: {attachment_link}")
#         response = requests.get(attachment_link)
#         with open(os.path.join(output_dir, attachment.split("/")[-1]), 'wb') as f:
#             f.write(response.content)
        
#         # Increment count for each attachment
#         count += 1
        
#     # If the count has reached or exceeded 500, stop downloading more files
#     if count >= 500:
#         break  # Stop the main loop if the limit is reached


In [37]:
# # delete any mp4 and xlsx files
# for directory in os.listdir('transport_data/'):
#     for file in os.listdir(f'transport_data/{directory}'):
#         if file.endswith('.mp4') or file.endswith('.xlsx') or file.endswith('.wav'):
#             print(f'testing_data/{directory}/{file}')
#             os.remove(f'testing_data/{directory}/{file}')

In [19]:
# # unzip all .zip files in the noise_files directory
# import zipfile
# for file in os.listdir('testing_data/noise_files'):
#     if file.endswith('.zip'):
#         print(f'testing_data/noise_files/{file}')
#         with zipfile.ZipFile(f'testing_data/noise_files/{file}', 'r') as zip_ref:
#             zip_ref.extractall(f'testing_data/noise_files/')
#         os.remove(f'testing_data/noise_files/{file}')

testing_data/noise_files/oia-10666-attachments.zip
testing_data/noise_files/oia-10925-attachments-1-10.zip
testing_data/noise_files/oia-10925-attachments-11-20.zip
testing_data/noise_files/oia-10925-attachments-21-30.zip
testing_data/noise_files/oia-10925-attachments-31-40.zip
testing_data/noise_files/oia-10925-attachments-41-50a.zip
testing_data/noise_files/oia-12093-Attachments-x-30.zip
testing_data/noise_files/oia-13081-attachments-7-files.zip
testing_data/noise_files/OIA-14259-attachments.zip
testing_data/noise_files/oia-15739-attachments-6-files.zip
