In [61]:
!pip install requests



In [62]:
import os
import requests

def download_files(base_url, start_pattern):
    # Ensure the save directory exists
    save_directory = "../inputs/yml_files/"
    if not os.path.exists(save_directory):
        os.makedirs(save_directory)
    
    # List contents of directory
    contents = requests.get(base_url).json()
    
    for item in contents:
        # Check if the item is a file and starts with the pattern
        if item['type'] == 'file' and item['name'].startswith(start_pattern):
            file_url = item['download_url']
            # Download the file
            file_content = requests.get(file_url).content
            file_name = item['name']
            # Save the file with the full path
            with open(os.path.join(save_directory, file_name), 'wb') as file:
                file.write(file_content)
        elif item['type'] == 'dir':
            # Recursively check in directories
            download_files(item['url'], start_pattern)

# GitHub API URL for the contents of the directory
api_url = "https://api.github.com/repos/github/codeql/contents/java/ql/lib/ext"
# Pattern to match filenames
pattern = "org.apache.commons"

# Call the function with the API URL and the filename pattern
download_files(api_url, pattern)


In [63]:
from bs4 import BeautifulSoup

def extract_all_href_data_from_file(html_file_path):
    # Load HTML file content
    with open(html_file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()

    # Create a BeautifulSoup object
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find all <a> tags that have an 'href' attribute
    a_tags = soup.find_all('a', href=lambda href: href and href.startswith('#'))
    
    results = []

    # Extract text and any related information for each <a> tag
    for a_tag in a_tags:
        href = a_tag['href']
        # Try to find a related description in a sibling <div>
        parent_div = a_tag.find_parent('div')
        description_text = "No description found."
        if parent_div:
            next_div = parent_div.find_next_sibling('div')
            if next_div and next_div.find('div', class_='block'):
                description_text = next_div.find('div', class_='block').text.strip()
                results.append({'href': href[1:], 'description': description_text})
    return results

In [64]:
import requests

def scrape_all_html_data(class_name, function):
    # Output CSV file path
    html_file_path = f"../inputs/html_files/{class_name}_{function}.html"
    
    # ["org.apache.commons.io", "FileUtils", False, "forceMkdir",
    # URL of the webpage you want to download
    sub_class = class_name.split('.')
    index_of_commons = sub_class.index('commons')
    if len(sub_class) - index_of_commons > 2:
        url = f"https://commons.apache.org/proper/commons-{sub_class[index_of_commons + 1]}/apidocs/org/apache/commons/{sub_class[index_of_commons + 1]}/{sub_class[index_of_commons + 2]}/{function}.html"
    else:
        url = f"https://commons.apache.org/proper/commons-{sub_class[index_of_commons + 1]}/apidocs/org/apache/commons/{sub_class[index_of_commons + 1]}/{function}.html"
    print(url)
    # Send a GET request to the URL
    response = requests.get(url)
    
    # Check if the request was successful
    if response.status_code == 200:
        # Save the HTML content to a local file
        with open(html_file_path, "w", encoding="utf-8") as file:
            file.write(response.text)
        print("File downloaded successfully!")
    else:
        print("Failed to retrieve the webpage. Status code:", response.status_code)

In [67]:
import yaml
import csv
import os

# Directory containing the YAML files
directory_path = '../inputs/yml_files'

# Output CSV file path
csv_file_path = '../inputs/new_API.csv'

# Failed CSV file path
error_log_path = '../inputs/error_log.csv'

# Prepare the header for the CSV file
header = ['keys', 'docs', 'real']

# List to hold all CSV data from multiple YAML files
csv_data = []

# List to hold errors
error_data = []

# Function to process each YAML file
def process_yaml_file(file_path, csv_data):
    with open(file_path, 'r') as file:
        data = yaml.safe_load(file)
        for ext in data.get('extensions', []):
            extensible_type = ext['addsTo'].get('extensible', '')
            if extensible_type in ['sourceModel', 'sinkModel']:
                model_type = 'source' if extensible_type == 'sourceModel' else 'sink'
                prev_class = ""
                for item in ext.get('data', []):
                    class_name, function, method, parameters = item[0], item[1], item[3], item[4]
                    # New package that needed to be scraped
                    sub_class_name = class_name.split('.')[-1]
                    if sub_class_name != prev_class:
                        prev_class = sub_class_name
                        scrape_all_html_data(class_name, function)
    
                    # scrape_all_html_data(class_name, function)
                    # Generate a unique key using class, method, and parameters
                    key = f"{class_name}_{function}_{method}_{parameters.replace(',', '_')}"
                    print(key)
                    # find the description
                    des = ""
                    html_file_path = f"../inputs/html_files/{class_name}_{function}.html"  
                    try:
                        all_href_data = extract_all_href_data_from_file(html_file_path)
                        for data in all_href_data:
                            if '(' in data['href']:
                                method_name, param_section = data['href'].split('(')
                                param_section = param_section.strip(')')
    
                            else:
                                method_name = data['href']
                                param_section = ""
                                
                            params = param_section.split(',')
                            abbreviated_params = [param.strip().split('.')[-1] for param in params]
                            if '(' in parameters:
                                parameters = parameters[1:-1]
                                
                            if method_name == method and parameters.split(',') == abbreviated_params:
                                des = data['description']
                                break
                    except Exception as e:
                        error_data.append([class_name, function, str(e)])
                    csv_data.append([key, des, model_type])

# Loop through each file in the directory
for filename in os.listdir(directory_path):
    if filename.endswith('.yml'):
        file_path = os.path.join(directory_path, filename)
        process_yaml_file(file_path, csv_data)

# Write all the data to a single CSV file
with open(csv_file_path, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(header)
    writer.writerows(csv_data)
    
# Write error data to error log file
with open(error_log_path, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Class Name', 'Function Name', 'Error Message'])
    writer.writerows(error_data)

https://commons.apache.org/proper/commons-exec/apidocs/org/apache/commons/exec/environment/EnvironmentUtils.html
File downloaded successfully!
org.apache.commons.exec.environment_EnvironmentUtils_addVariableToEnvironment_(Map_String)
https://commons.apache.org/proper/commons-exec/apidocs/org/apache/commons/exec/launcher/CommandLauncher.html
File downloaded successfully!
org.apache.commons.exec.launcher_CommandLauncher_exec_
https://commons.apache.org/proper/commons-exec/apidocs/org/apache/commons/exec/CommandLine.html
File downloaded successfully!
org.apache.commons.exec_CommandLine_parse_(String)
org.apache.commons.exec_CommandLine_parse_(String_Map)
org.apache.commons.exec_CommandLine_addArguments_(String)
org.apache.commons.exec_CommandLine_addArguments_(String_boolean)
org.apache.commons.exec_CommandLine_addArguments_(String[])
org.apache.commons.exec_CommandLine_addArguments_(String[]_boolean)
org.apache.commons.exec_Executor_execute_(CommandLine_Map)
org.apache.commons.exec_Execu

In [None]:
new website: https://commons.apache.org/proper/commons-lang/javadocs
old website:https://commons.apache.org/proper/commons-lang/javadocs/api-3.0/
old website: https://commons.apache.org/proper/commons-lang/javadocs/api-3.1/org/apache/commons/lang3/builder/ToStringBuilder.html