**PyGithub Documentation Link:** https://pygithub.readthedocs.io/en/stable/index.html

**PyGithub -> GitHub REST API Documentation:**  https://pygithub.readthedocs.io/en/stable/github.html

**GitHub Search Query:** https://docs.github.com/en/search-github/searching-on-github/searching-for-repositories

In [1]:
from github import Github, UnknownObjectException, Auth, GithubException
from datetime import datetime, timezone
from langchain_community.document_loaders import GithubFileLoader
import sys
import re
import os
import configparser
import tiktoken
import openai
import csv
import json
import os
import time
import requests



config = configparser.ConfigParser()
config.read('config.ini')

# GitHub API setup
github_token = config['github']['token']
auth = Auth.Token(github_token)
g = Github(auth=auth)
g = Github()

# OpenAI Setup
# Set the base URL and api_key for the RDSec One AI Endpoint API (Production)
# Use the python-dotenv to load variables from env
openai.base_url = config['openai']['api_endpoint']
openai.api_key = config['openai']['api_key']
openai.api_type = "openai"

base_url = config['openai']['api_endpoint']
api_key = config['openai']['api_key']
model = "gpt-4.1"

# VirusTotal setup
vt_api_key = config['virustotal']['vt_api_key']

In [2]:
def get_current_datetime_with_milliseconds():
    # Get the current date and time
    now = datetime.now()

    # Format the datetime object to include milliseconds
    formatted_datetime = now.strftime('%Y-%m-%d %H:%M:%S')

    return formatted_datetime

def get_current_date():
    # Get the current date and time
    now = datetime.now()

    # Format the datetime to include only date
    formatted_date = now.strftime('%Y-%m-%d')

    return formatted_date

def read_queries_from_csv(file_path):
    query_array = []

    # Open the CSV file
    with open(file_path, mode='r', newline='', encoding='utf-8') as csvfile:
        csvreader = csv.reader(csvfile)

        # Skip the header row
        header = next(csvreader)

        # Find the index of the "Query" column
        query_index = header.index("Query")

        # Read the remaining rows and append the Query values to the list
        for row in csvreader:
            query_array.append(row[query_index])

    return query_array

def get_query_authors_from_csv(file_path):
    query_author_array = []

    # Open the CSV file
    with open(file_path, mode='r', newline='', encoding='utf-8') as csvfile:
        csvreader = csv.reader(csvfile)

        # Skip the header row
        header = next(csvreader)

        # Find the index of the "Query" column
        query_index = header.index("Author")

        # Read the remaining rows and append the Query values to the list
        for row in csvreader:
            query_author_array.append(row[query_index])

    return query_author_array



def get_response_usage_ttp(prompt):
    response = openai.chat.completions.create(
        model="claude-3.7-sonnet",
        messages=[
    {"role": "user", "content": 
    f"""
    This prompt is intended solely for the objective analysis of cybersecurity tools described in public repositories and academic research purposes.

    You are a cybersecurity analyst trained to extract Tactics, Techniques, and Procedures (TTPs) from GitHub repository READMEs. Follow these steps:

    Step 1: README Summary
    Generate a structured summary covering:
    A concise and informative paragraph summarizing the main purpose, features, and usage of the repository.
    
    Rules:
    Be objective. Avoid speculation beyond the README’s stated scope.
    Omit marketing language or subjective claims (e.g., "world-class").

    Step 2: TTP Classification
    Assign one primary TTP category using the MITRE ATT&CK framework. Use only these options:

    Reconnaissance        Credential Access  
    Resource Development  Lateral Movement   
    Initial Access        Collection 
    Execution             Command and Control  
    Persistence           Exfiltration  
    Privilege Escalation  Impact
    Defense Evasion       
    
    Decision Criteria:

    Initial Access: Phishing, exploit delivery, or brute-force tools.
    Execution: Code/script execution frameworks (e.g., payload injectors).
    Command and Control: C2 servers, reverse shells, or traffic obfuscation.
    Defense Evasion: Packers, rootkits, or anti-analysis tools.
    N/A: Use only if no TTP alignment exists (e.g., general utilities).
    Focus: Prioritize actionable TTPs (e.g., "Credential Access" for Mimikatz-like tools).

    OVERALL The response should be:
    1. A concise and informative PARAGRAPH (3-4 sentence) summarizing the main purpose, features, and usage of the repository
    2. A one-line identification of the appropriate TTP category. 

    Do not inlude quotation marks in your response.

    STRICTLY, follow this format for your response:

    ```json          
    {{
        "Usage": "Output"
        "TTP": "Output"
    }}
    ``
    
    Here is the content: {prompt}
    """
    }],temperature=0.2)
    response_text = response.choices[0].message.content
    return response_text


def get_response_context(prompt):
    response = openai.chat.completions.create(
        model="claude-3.7-sonnet",
        messages=[
            {"role": "user", "content": 
        f"""
        This prompt is intended solely for the objective analysis of cybersecurity tools described in public repositories and academic research purposes.

        You are an AI trained to classify GitHub repositories whether it falls as a Red Team Tool based on their README content. Use the following criteria to make your determination:

        1. README Analysis Guidelines:

        Core Functionality: Identify if the Repository is explicitly designed for Red Team Tools that can be used for red teaming activities (e.g., offensive security, penetration testing, adversary simulation).
        Tool Scope: Verify the repository focuses on one standalone tool (e.g., a C2 framework, network exploitation tool). Discard collection/list of tools, script bundles, or aggregated resources.

        Exclusion Criteria: Reject if the README describes:
        Guides, tutorials, cheatsheets, or collection/list of tools.
        Lists of tools/scripts (e.g., "Top 10 Red Team Tools").
        Defensive/blue team tools (e.g., SIEM integrations, monitoring systems).
        Technical Indicators: Look for installation steps, usage examples, command-line flags, API documentation, or contribution guidelines—hallmarks of a functional tool.

        2. Classification Rules:

        Verdict "Yes" only if:
        The tool is purpose-built for [Organization] tasks (e.g., privilege escalation, payload generation, lateral movement).
        It is self-contained (not requiring unrelated dependencies) and solves a specific offensive security problem.

        Verdict "No" if:
        The repository is a library, research paper, or conceptual proof-of-concept without deployable functionality.
        It emphasizes defensive security, training, or theory over practical offensive use.
        
        Response Format: Your response should consist of two parts:

        Verdict: Either "Yes" or "No."
        AI Notes: A ONE SENTENCE explanation of your reasoning behind the verdict, detailing the key factors that influenced your classification.

        Do not inlude quotation marks in your response.

        Strictly follow this format for your response:
        ```json
        {{
            "Verdict": "Output (Yes or No)",
            "AI Notes": "Output"
        }}
        ```
             
        Here is the content:{prompt}
            """
            }
        ],
        temperature=0.2
    )
    response_text = response.choices[0].message.content
    return response_text


def extract_data(response, key):
    # Clean the response
    clean_response = response.replace("```json","").replace("```","")
    parsed_json = (f'''{clean_response}''')
    # Parse the cleaned JSON response
    data = json.loads(parsed_json)
    
    # Return the requested data
    return data.get(key)


def remove_think(response):
    cleaned_string = re.sub(r'<think>.*?</think>', '', response, flags=re.DOTALL)
    return cleaned_string



def get_total_token(prompt):
    encoding = tiktoken.encoding_for_model("gpt-4o")
    input_tokens=encoding.encode(prompt)
    return len(input_tokens)

def check_assets_for_exe(repo_name):
    try:
        asset_container = []
        repo = g.get_repo(repo_name)
        latest_release = repo.get_latest_release()

        for asset in latest_release.get_assets():
            asset_container.append(asset)

        # Check if asset_container has any data
        if asset_container:
            return True
        else:
            return False
    except Exception as e:
        return False
    

def string_split_add_exe(repo_name):
    part_after_slash = repo_name.split('/')[1]
    append_exe = part_after_slash + ".exe"
    return append_exe


def find_exe_files(repo_name, path=""):
    get_name_repo = repo_name.full_name
    repo_exe_to_check = string_split_add_exe(get_name_repo)
    exe_files = []
    contents = repo_name.get_contents(path)
    for content in contents:
        if content.type == "dir":
            # Recursively search in the subdirectory
            if find_exe_files(repo_name, content.path):
                return True
        elif content.type == "file" and content.path.endswith(".exe"):
            exe_files.append(content.path.lower())

    repo_exe_to_check_lower = repo_exe_to_check.lower()

    # Check if the filename of the repo appended with exe exists in the scraped exe files in the content
    if repo_exe_to_check_lower in exe_files:
        return True

    return False

'''
def summarize_maxim_token(readme_doc):
    llm = ChatOpenAI(api_key=api_key, base_url=base_url, model=model)
    char_text_splitter = RecursiveCharacterTextSplitter(chunk_size=5000, chunk_overlap=0)
    readme_content = char_text_splitter.split_documents(readme_doc)
    chain = load_summarize_chain(llm=llm, chain_type="map_reduce")
    initial_contents = chain.invoke(readme_content[0:10])
    return initial_contents.get('output_text','')  
'''

    


def populate_repo_data(repo_name, query, query_author):
    #col_names = ["Date", "Repository Name", "Stars", "Creation Date", "URL", "TTP", "Usage","Query", "Query Author"]
    has_ioc = ""
    init_repo = g.get_repo(repo_name)
    repo_fullname = init_repo.full_name
    # Populate List
    #date_time = get_current_datetime_with_milliseconds()
    #repo_obj = g.get_repo(repo)
    #print(repo_obj)
    #Check for exe in repo dir and assets
    #has_exe_file_repo = find_exe_files(init_repo)
    has_releases_assets = check_assets_for_exe(repo_name)
    if has_releases_assets:
        has_ioc = "Yes"
    else:
        has_ioc = "No"
    
    decoded_content = get_readme_contents(repo_name)
    repo_formatted_date = init_repo.created_at.strftime('%Y-%m-%d')
    repo_star = init_repo.stargazers_count
    repo_url = f"https://www.github.com/{repo_fullname}"
    repo_query = query
    repo_query_author = query_author
    #last_commit_date = get_latest_commit_date(repo_fullname)
    exe_name = string_split_add_exe(repo_fullname)
    #is_available_in_vt =  query_reponame_in_vt(exe_name)

    prompt_ini = decoded_content
    #Calculate Token from Prompt
    input_token_total = get_total_token(prompt_ini)
    #input_usage_token_total = get_total_token(prompt_for_usage)
    #print(input_usage_token_total)

    if input_token_total >= 128000:
        return "README exceeded the token limit"
    else:
        #repo_ttp_ai = get_response(prompt_for_ttp)
        #repo_usage_ai = get_response(prompt_for_usage)
        response_ttp_usage = get_response_usage_ttp(prompt_ini)
        response_context = get_response_context(prompt_ini)
        removed_think_response = remove_think(response_context)
        #print (removed_think_response)

        extracted_ttp = extract_data(response_ttp_usage, "TTP")
        extracted_usage = extract_data(response_ttp_usage,"Usage")
        extracted_verdict = extract_data(removed_think_response, "Verdict")
        extracted_notes = extract_data(removed_think_response, "AI Notes")
        
        repo_data = {
        "Repository Name": repo_fullname,
        "Creation Date": repo_formatted_date,
        "Verdict": extracted_verdict,
        "AI Notes": extracted_notes,
        "Stars": repo_star,
        "URL": repo_url,
        "TTP": extracted_ttp,
        "Usage": extracted_usage,
        "Has IOC": has_ioc,
        #"Is VT available": is_available_in_vt,
        "Query": repo_query,
        "Query Author": repo_query_author
        }

        
    return repo_data

def get_readme_contents(repo_name):
    # Regex pattern to match files containing "README" in their filename, case-insensitive
    readme_pattern = re.compile(r'readme\.(txt|md|rst|org)$', re.IGNORECASE)

    # Initialize GitHub API client
    repo = g.get_repo(repo_name)

    # Get the default branch
    default_branch = repo.default_branch

    try:
        loader = GithubFileLoader(
            repo=repo_name,  # the repo name
            branch=default_branch,  # the defaultzqy1 branch name
            access_token=github_token,
            github_api_url="https://api.github.com",
            file_filter=lambda file_path: readme_pattern.search(file_path) is not None
        )
        readme_content = loader.load()
        if not readme_content:
            return "No README files found in the repository."
        return readme_content[0].page_content
    except Exception as e:
        raise ValueError(f"Unable to Load README files: {e}")
            

def load_existing_repos(json_file_path):
    existing_repos = []
    try:
        with open(json_file_path, 'r') as file:
            data = json.load(file)
            existing_repos.extend(data)
    except FileNotFoundError:
        pass  # If the file does not exist, we just skip it
    return existing_repos

def load_existing_repo_names(*json_file_path):
    existing_repo_names = set()
    for json_file_path in json_file_path:
        try:
            with open(json_file_path, 'r') as file:
                data = json.load(file)
                for item in data:
                    existing_repo_names.add(item["Repository Name"].split('/')[-1])
        except FileNotFoundError:
            pass  # If the file does not exist, we just skip it
    return existing_repo_names

def append_data_to_json_with_date(json_file_path, repo_data):
    # Check if the file exists
    if os.path.exists(json_file_path):
        # If it exists, read the existing data
        with open(json_file_path, 'r') as f:
            existing_data = json.load(f)
    else:
        existing_data = []

    # Append new data to existing data
    existing_data.extend(repo_data)

    # Write back to the JSON file
    with open(json_file_path, 'w') as f:
        json.dump(existing_data, f, indent=4)

def append_data_to_json_archive(json_file_path, repo_data):
   # Check if the file exists
    if os.path.exists(json_file_path):
        # If it exists, read the existing data
        with open(json_file_path, 'r') as f:
            existing_data = json.load(f)
    else:
        existing_data = []

    # Append new data to existing data
    existing_data.extend(repo_data)

    # Write back to the JSON file
    with open(json_file_path, 'w') as f:
        json.dump(existing_data, f, indent=4)


def is_api_key_valid():
    prompt = "This is a test"
    try:
        response = openai.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "user", "content": prompt}
        ],
    )
    except Exception as e:
        print(f"An error occured: {e}")
        return False
    else:
        return True
    
def query_reponame_in_vt(repo_fullname):
    query = (f'name:{repo_fullname}')
    url = 'https://www.virustotal.com/api/v3/intelligence/search'
    headers = {
        'x-apikey': vt_api_key
    }
    params = {
        'query': query
    }
    response = requests.get(url, headers=headers, params=params)
 
    if response.status_code == 200:
        results = response.json()
        if results.get('data'):
            return f"https://www.virustotal.com/gui/search/name%253A{repo_fullname}/files"
        else:
            return None
    else:
        return f"Error: {response.status_code}, {response.text}"
    

def save_updated_repos(json_file_path, data):
    with open(json_file_path, 'w') as file:
        json.dump(data, file, indent=4)

def get_query_and_author_by_repo_name(json_data, repo_name):
    for repo in json_data:
        if repo["Repository Name"] == repo_name:
            return repo["Query"], repo["Query Author"]
    return None, None

def check_rate_limit():
    rate_limit = g.get_rate_limit()
    core_rate = rate_limit.core
    search_rate = rate_limit.search
    
    if core_rate.remaining < 10 or search_rate.remaining < 5:
        reset_time = max(core_rate.reset, search_rate.reset)
        # Use timezone-aware current time
        current_time = datetime.now(timezone.utc)
        sleep_time = (reset_time - current_time).total_seconds()
        if sleep_time > 0:
            print(f"Rate limit nearly exhausted. Sleeping for {sleep_time} seconds.")
            time.sleep(sleep_time + 5)  # Add 5 seconds buffer
def main():
    print("Starting the script...")

# Get Date
    date_today = get_current_date()
    print("Date today: " + date_today)

# Initialization of Variables and Queries
    
    query_array = read_queries_from_csv("github_queries.csv")
    query_author_array = get_query_authors_from_csv("github_queries.csv")
    json_file_path_based_on_date = "github_repositories_"+ date_today +".json"
    json_file_path_archive = "github_repositories_archive.json"
    final_repo_name_list = []
    new_repositories = []
    output_directory_yes = 'json_output/Yes'
    output_directory_no = 'json_output/No'

# Check OpenAI API Connection
    print("Checking connection to OpenAI")
    openai_status = is_api_key_valid()
    if openai_status:
        print("Successfully connected to OpenAI")
    else:
        print("Cannot connect to OpenAI. The Script is exiting....")
        sys.exit(1)

# Load Existing Repositories from Archive CSV   
    existing_repo_names = load_existing_repo_names(json_file_path_archive)
    

    for query in query_array:
        
        print(f"Querying GitHub for repositories with query: {query}")
        repos = g.search_repositories(query=query, stars='<1000')
        print("Removing Duplicates")
        query_repo_count = 0
        for repo in repos:
            if repo.full_name.split('/')[-1] not in existing_repo_names and query_repo_count < 10 :
                new_repositories.append({
                    "Repository Name": repo.full_name,
                    "Query": query,
                    "Query Author": query_author_array[query_array.index(query)]
                })
            
                #new_repositories.append(repo.full_name)
                existing_repo_names.add(repo.full_name)
                query_repo_count += 1
                time.sleep(5)

    print(f"Total Repositories:{len(new_repositories)}")
    for new_repo in new_repositories:
            repo_name = new_repo['Repository Name']
            query = new_repo['Query']
            query_author = new_repo['Query Author']
            if new_repo is None:
                continue
            try:
                print(f"Processing repository: {repo_name}")
                check_rate_limit()
                repo_data = populate_repo_data(repo_name, query, query_author)
                final_repo_name_list.append({
                "Repository Name": repo_name,
                "Query": query,
                "Query Author": query_author
                })
                try:
                    append_data_to_json_archive(json_file_path_archive, final_repo_name_list)
                except Exception as e:
                    print(f"An error occured while appending data to JSON file: {e}")
                final_repo_name_list = []
            
                if repo_data == "README exceeded the token limit":
                    print("Repository exceeded the token limit. Skipping...")
                    time.sleep(5)
                    continue

                else:
                    if repo_data.get("Verdict","") == "Yes":
                        # Define the JSON file name based on the repository name
                        json_file_name = os.path.join(output_directory_yes, f"{repo_name.replace('/', '_')}.json")

                        # Write the repository data to a JSON file
                        with open(json_file_name, 'w') as json_file:
                            json.dump(repo_data, json_file, indent=4)
                        print(f"Data for repository {repo_name} has been written to {json_file_name}")
                        time.sleep(5)
                    else:
                         # Define the JSON file name based on the repository name
                        json_file_name = os.path.join(output_directory_no, f"{repo_name.replace('/', '_')}.json")

                        # Write the repository data to a JSON file
                        with open(json_file_name, 'w') as json_file:
                            json.dump(repo_data, json_file, indent=4)
                        print(f"Data for repository {repo_name} has been written to {json_file_name}")
                        time.sleep(3)
            except Exception as e:
                #print(repo_data)
                print(f"An error occurred: {e}")
                time.sleep(5)
                continue

    print("All repositories have been processed.")

    #print(f"Appending repositories to JSON file: {json_file_path_archive}")
    #Append existing Repository to github_repositories_archive.json
    try:
        append_data_to_json_archive(json_file_path_archive, final_repo_name_list)
    except Exception as e:
        print(f"An error occured while appending data to JSON file: {e}")
        sys.exit(1)

    '''
     for repo_data in final_repo_data_list:
        try:
            append_data_to_json_with_date(json_file_path_based_on_date, repo_data)
        except Exception as e:
            print(f"An error occured while appending data to JSON file: {e}")
            sys.exit(1)
    '''

    print("The script finished successfully")
    g.close()
    
main()


Starting the script...
Date today: 2025-06-11
Checking connection to OpenAI
Successfully connected to OpenAI
Querying GitHub for repositories with query: repo:SaadAhla/dark-kill
Removing Duplicates
Total Repositories:1
Processing repository: SaadAhla/dark-kill
Data for repository SaadAhla/dark-kill has been written to json_output/Yes\SaadAhla_dark-kill.json
All repositories have been processed.
The script finished successfully


# FOR TESTING PURPOSES ONLY

### TEST CONTENT PROMPT

In [1]:
config = configparser.ConfigParser()
config.read('config.ini')
# GitHub API setup
github_token = config['github']['token']
auth = Auth.Token(github_token)
g = Github(auth=auth)

loader = GithubFileLoader(
    repo="gussieIsASuccessfulWarlock/Cyber-Logic-Dataset",  # the repo name
    branch="main",  # the branch name
    access_token=github_token,
    github_api_url="https://api.github.com",
    file_filter=lambda file_path: file_path.endswith(
        ".md"
    ),  # load all markdowns files.
)
documents = loader.load()
print(documents)

NameError: name 'configparser' is not defined

In [47]:
config = configparser.ConfigParser()
config.read('config.ini')
# GitHub API setup
github_token = config['github']['token']
auth = Auth.Token(github_token)
g = Github(auth=auth)



def get_readme_contents(repo_name):
    common_dirs = [
        '',  # root directory
        '.github'
    ]
    
    # Regex pattern to match files containing "README" in their filename, case-insensitive
    readme_pattern = re.compile(r'readme\.(txt|md|rst|org|adoc|asciidoc|html|pod|creole|mediawiki)$', re.IGNORECASE)
    
    # Initialize Github API client
    repo = g.get_repo(repo_name)
    
    for directory in common_dirs:
        try:
            contents = repo.get_contents(directory)
            for content_file in contents:
                if readme_pattern.match(content_file.name):
                    readme_content = repo.get_contents(content_file.path)
                    if readme_content.encoding == 'none':
                        return "README not found"
                    else:
                        return readme_content.decoded_content.decode()
        except UnknownObjectException:
            continue
        except GithubException as e:
            print(f"Error accessing directory '{directory}': {e}")
            continue

    return "README not found"
print(get_readme_contents("gussieIsASuccessfulWarlock/Cyber-Logic-Dataset"))

README not found


In [31]:
from langchain_community.document_loaders import GithubFileLoader
from langchain.chains.summarize import load_summarize_chain
from langchain_openai import ChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
import re




config = configparser.ConfigParser()
config.read('config.ini')

base_url = config['openai']['api_endpoint']
api_key = config['openai']['api_key']
model = "gpt-4o"

# GitHub API setup
github_token = config['github']['token']
auth = Auth.Token(github_token)
g = Github(auth=auth)

def get_readme_contents(repo_name):
    # Regex pattern to match files containing "README" in their filename, case-insensitive
    readme_pattern = re.compile(r'readme\.(txt|md|rst|org|adoc|asciidoc|html|pod|creole|mediawiki)$', re.IGNORECASE)
    # Initialize GitHub API client
    repo = g.get_repo(repo_name)

    # Get the default branch
    default_branch = repo.default_branch

    try:
        loader = GithubFileLoader(
            repo=repo_name,  # the repo name
            branch=default_branch,  # the defaultzqy1 branch name
            access_token=github_token,
            github_api_url="https://api.github.com",
            file_filter=lambda file_path: readme_pattern.search(file_path) is not None or file_path == "README.md"
        )
        readme_content = loader.load()
        print(readme_content)
        return readme_content
    except Exception as e:
        raise ValueError(f"Error loading README files: {e}")
    
    raise ValueError("No README files found in the default branch of the repository.")



readme = get_readme_contents("gussieIsASuccessfulWarlock/Cyber-Logic-Dataset")

llm = ChatOpenAI(api_key=api_key, base_url=base_url, model=model)
char_text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
readme_content = char_text_splitter.split_documents(readme)
chain = load_summarize_chain(llm=llm, chain_type="map_reduce")
initial_contents = chain.invoke(readme_content[0:10])
print(initial_contents.get('output_text',''))


[]
Certainly! Could you please provide the text you would like summarized?


In [None]:
import re
from github import Github
from langchain_community.document_loaders import GithubFileLoader
from langchain.chains.summarize import load_summarize_chain

config = configparser.ConfigParser()
config.read('config.ini')

# GitHub API setup
github_token = config['github']['token']
auth = Auth.Token(github_token)
g = Github(auth=auth)

def get_readme_contents(repo_name):
    # Regex pattern to match files containing "README" in their filename, case-insensitive
    readme_pattern = re.compile(r'readme\.(txt|md|rst)$', re.IGNORECASE)

    # Initialize GitHub API client
    repo = g.get_repo(repo_name)

    # Get the default branch
    default_branch = repo.default_branch

    try:
        loader = GithubFileLoader(
            repo=repo_name,  # the repo name
            branch=default_branch,  # the defaultzqy1 branch name
            access_token=github_token,
            github_api_url="https://api.github.com",
            file_filter=lambda file_path: readme_pattern.search(file_path) is not None
        )
        readme_content = loader.load()
        if readme_content:
            return readme_content[0].page_content
    except Exception as e:
        raise ValueError(f"Error loading README files: {e}")

    raise ValueError("No README files found in the default branch of the repository.")

'''
import json

# Specify the path to the JSON file
file_path = 'github_repositories_archive_2025.json'

# Load the JSON data from the file
with open(file_path, 'r') as file:
    repositories = json.load(file)

# Loop through each repository and print the repository name
'''


def get_response_context(prompt):
    response = openai.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": 
            """
        Simulate three brilliant, logical experts in penetration testing collaboratively answering a question. Each one verbosely explains their thought process in real-time, 
        onsidering the prior explanations of others and openly acknowledging mistakes. At each step, whenever possible, each expert refines and builds 
        upon the thoughts of others, acknowledging their contributions. They continue until there is a definitive answer to the question. 
        The idea is that they are a professional pentester and will verdict if the repository given is a tool that they can use to in their pentesting methods.


        You are responsible for determining whether a given GitHub repository qualifies as a Red Team Tool or is merely a collection of guides, cheatsheets, tips, and collections of tools. Follow these guidelines:

        README Analysis: Carefully read and analyze the README file of the GitHub repository to gather insights about its purpose and functionality.
        To verify if it is a tool, think of it if it can be classified as one of MITRE TTPS when used or can be used as a tool to use to pentest a system.

        Classification Criteria:

        If the repository is specifically designed as a Red Team Tool, respond with "Yes."
        If the repository is not a Red Team Tool or is primarily a guide, cheatsheet, or collection, respond with "No."
        Also add a confidence level whether you think the repository is a tool (100 being the high confidence while 0 being the low)
        Response Format: Your response should consist of three parts:

        The verdict: either "Yes" or "No."
        The AI notes: A brief explanation of your reasoning behind the verdict, detailing the key factors that influenced your classification.
        The Confidence Level: A percentage whether you think the repository is a tool that can be use by pentesters
        
        Strictly, your output should be in a JSON Format like this:          
            {
                "Verdict": "Output",
                "AI Notes": "Output"
                "Confidence Level": "Output"
            }

        Example Output:
            {
                "Verdict": No,
                "AI Notes": The repository 'js-cookie-monitor-debugger-hook' primarily serves as a script for monitoring and debugging JavaScript operations related to cookies. It provides functionalities to track changes to cookies and set breakpoints when specific cookie events occur. While it is a useful tool for developers and security researchers, it does not fit the criteria of a Red Team Tool as defined by MITRE TTPs. Instead, it functions more as a debugging aid rather than a tool designed for offensive security operations.
                "Confidence Level": 0% 
            }

            """
            },
            {"role": "user", "content": prompt}
        ],
        temperature=0.1
    )
    response_text = response.choices[0].message.content
    return response_text

'''

decoded_content = get_readme_contents(init_repo) 
prompt = f"Here is the link: {decoded_content}"

print(get_response(prompt))

'''

'''
for repo in repositories:
    link = f"https://github.com/{repo['Repository Name']}"
    prompt = f"Here is the link: {link}"
    response = get_response(prompt)
    print(f"Repository Name: {repo['Repository Name']}, Response: {response}")

'''


# Assuming the CSV file is named 'repositories.csv' and contains one URL per line
file_path = 'test_repo.csv'

# Open the CSV file and read the URLs
with open(file_path, 'r') as file:
    # Read each line and strip any whitespace (like newline characters)
    repositories = [line.strip() for line in file.readlines()]

# Loop through each URL and perform actions
for repo in repositories:
    # Print the URL (or perform any other desired action)
    readme_content = get_readme_contents(repo)
    prompt = f"Here is the content: {readme_content}"
    response = get_response_context(prompt)
    print(f"Repository Name: {repo}, Response: {response}")
    # You can add more actions here, such as calling a function

 


### Test recursive search in repository

In [94]:
def check_assets_for_exe(repo_name):
    try:
        asset_container = []
        repo = g.get_repo(repo_name)
        latest_release = repo.get_latest_release()

        for asset in latest_release.get_assets():
            asset_container.append(asset)

        # Check if asset_container has any data
        if asset_container:
            return True
        else:
            return False
    except Exception as e:
        return False

status = check_assets_for_exe("jekil/awesome-hacking")

print (status)

False


### Test commit date

In [182]:
json_file_path_archive = "github_repositories_archive.json"
new_repositories = []


def get_current_date():
    # Get the current date and time
    now = datetime.now()

    # Format the datetime to include only date
    formatted_date = now.strftime('%Y-%m-%d')

    return formatted_date

def load_existing_repos(json_file_path):
    existing_repos = []
    try:
        with open(json_file_path, 'r') as file:
            data = json.load(file)
            existing_repos.extend(data)
    except FileNotFoundError:
        pass  # If the file does not exist, we just skip it
    return existing_repos

def get_repositories_with_updated_commit(json_file_path_archive):
    existing_repos = load_existing_repos(json_file_path_archive)
    date_today = get_current_date()
    updated_repos = []

    for repo in existing_repos:
        # Compare the "Last Commit Date" to the current date
        if repo["Last Commit Date"] == date_today:
            # Append the repository to the list if the dates match
            updated_repos.append(repo["Repository Name"])
    
    return updated_repos

matching_repos = get_repositories_with_updated_commit(json_file_path_archive)
new_repositories.append(matching_repos)
print(new_repositories)


[[]]


### Test content fetching

In [42]:
def check_assets_for_repository(repo_name):
    asset_container = []
    repo = g.get_repo(repo_name)
    latest_release = repo.get_latest_release()

    for asset in latest_release.get_assets():
        asset_container.append(asset)

    # Check if asset_container has any data
    if asset_container:
        return True
    else:
        return False

False


### TEST TOKEN OF PROMPTS HERE**

In [55]:
def get_readme_contents(repo_name):
    common_dirs = [
    '',  # root directory
    '.github'
    ]
    
    ini_repo = g.get_repo(repo_name)
    found = False
    for directory in common_dirs:
        try:
            readme_path = f"{directory}/README.md".strip('/')
            readme_content = ini_repo.get_contents(readme_path)
            decoded_content = readme_content.decoded_content.decode()
            break
        except UnknownObjectException:
            continue

    
    if decoded_content is None:
        decoded_content = "README.md not found"

    return decoded_content   

def get_total_token(prompt):
    encoding = tiktoken.encoding_for_model("gpt-4o")
    input_tokens=encoding.encode(prompt)
    return len(input_tokens)


 
decoded_content = get_readme_contents("loseys/BlackMamba")

prompt = "Your task is to summarize the README.md of the given GitHub repository. Here is the content: " + decoded_content + "." + " Your output should be a concise and informative paragraph that summarizes the main purpose, features, and usage of the repository."



total_token = get_total_token(prompt)

print(total_token)

1400


### PROMPT TESTING

In [4]:
def get_readme_contents(repo_name):
    common_dirs = [
    '',  # root directory
    '.github'
    ]

    readme_files = ['README.md', 'Readme.md', 'README.rst', 'Readme.rst']
    found = False
    for directory in common_dirs:
        for readme_file in readme_files:
            try:
                readme_path = f"{directory}/{readme_file}".strip('/')
                readme_content = repo_name.get_contents(readme_path)
                return readme_content.decoded_content.decode()
            except UnknownObjectException:
                continue

    return "README.md not found"   

def get_response_test(prompt):
 try:
    response = openai.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a helpful assistant"},
            {"role": "user", "content": prompt}
        ],
    )
    response_text = response.choices[0].message.content
    return response_text
 except Exception as e:
    print(f"General Error: {e}")
    sys.exit(1)

ini_repo = g.get_repo("quasar/Quasar")

content = get_readme_contents(ini_repo)

prompt = f"""Your task is to summarize the README of the given GitHub repository content, and give the appropriate TTP usage of the tool. Here is the content:" {content} "Your response for the readme should be a concise and informative paragraph that summarizes the main purpose, features, and usage of the repository. Your response for the TTP should be one-line. The TTP are any of this:(Reconnaissance, Resource Development, Initial Access, Execution, Persistence, Privilege Escalation, Defense Evasion, Credential Access, Discovery, Lateral Movement, Collection, Command and Control, Exfiltration, Impact. Your output should also be in a 2d string array format like this 
[
    ["TTP", "Your response here"],
    ["Usage", "Your response here"]
]
"""
response = get_response(prompt)

cleaned_response = response.replace("```plaintext\n", "").replace("\n```", "")

# Parse the cleaned JSON response
data_cleaned = json.loads(cleaned_response)

# Extract TTP and Usage
ttp = data_cleaned[0][1]
usage = data_cleaned[1][1]

# Create a dictionary to hold the extracted information
result = {
    "TTP": ttp,
    "Usage": usage
}

# Convert the dictionary to a JSON string
result_json = json.dumps(result, indent=4)

# Print or save the JSON string
#print(result_json)

json_data = (f'''{result_json}''')

data = json.loads(json_data)

# Extract the content from the "TTP" and "Usage" fields
ttp = data.get("TTP")
usage = data.get("Usage")

# Print the extracted content
print("HAHAHA:", ttp)
print("HAHAHAH:", usage)




#Dummy function

def extract_data(response, key):
    # Clean the response
    cleaned_response = response.replace("```plaintext\n", "").replace("\n```", "")
    
    # Parse the cleaned JSON response
    data = json.loads(cleaned_response)
    
    # Extract TTP and Usage
    ttp = data[0][1]
    usage = data[1][1]
    
    # Create a dictionary to hold the extracted information
    result = {
        "TTP": ttp,
        "Usage": usage
    }
    
    # Return the requested data
    return result.get(key)
        
        

HAHAHA: Command and Control
HAHAHAH: Quasar is a free, open-source remote administration tool for Windows that facilitates user support, administrative tasks, and employee monitoring with features like remote desktop, keylogging, file management, encrypted communication, and task management, making it a comprehensive solution for remote administration.


### TEST README CONTENT HERE

In [None]:
def get_readme_contents(repo_name):
    common_dirs = [
    '',  # root directory
    '.github'
    ]

    readme_files = ['README.md', 'Readme.md', 'README.rst', 'Readme.rst']
    found = False
    for directory in common_dirs:
        for readme_file in readme_files:
            try:
                readme_path = f"{directory}/{readme_file}".strip('/')
                readme_content = repo_name.get_contents(readme_path)
                return readme_content.decoded_content.decode()
            except UnknownObjectException:
                continue

    return "README.md not found"    


ini_repo = g.get_repo("quasar/Quasar")

content = get_readme_contents(ini_repo)

print(content)

## SPECIFIC CONTENT

In [38]:
def remove_content_specific(response):
    start_index = response.find("# Brutal")

    if start_index != -1:
        response = response[start_index:]
    print(response)

response_generated = """
# Now Teensy can be use for
Teensy like a rubber ducky , why im choose teensy ? because the price very cheap for me . t’s extremely useful for executing scripts on a target machine without the need for human-to-keyboard interaction ( HID -ATTACK ) .When you insert the device, it will be detected as a keyboard, and using the microprocessor and onboard flash memory storage, you can send a very fast set of keystrokes to the target’s machine and completely compromise it, regardless of autorun. I’ve used it in my security testing to run recon or enumeration scripts, execute reverse shells, exploit local DLL hijack/privilege escalation vulnerabilities, and get all password . 
Now im develop new tools the name is  Brutal 

# Brutal

Brutal is a toolkit to quickly create various payload,powershell attack , virus attack and launch listener for a Human Interface Device

Version Version Stage Build



Donate
If this project very help you to penetration testing and u want support me , you can give me a cup of coffee :)
Donation
Screenshoot
  

Video
Check this video https://www.youtube.com/watch?v=WaqY-pQpuV0

Do you want like a mr robot hacking scene when Angela moss plug usb into computer for get credential information ? you can choose payload in brutal ( optional 3 or 4 )

The Goal
Generate various payload and powershell attack without coding

To help breaking computer very fast and agile :p

The Payloads Compatibility > target Windows machines only

Requirements
Arduino Software ( I used v1.6.7 )

TeensyDuino

Linux udev rules

How install all requirements ? Visit This Wiki

Supported Hardware
The following hardware has been tested and is known to work.

Teensy 3.x

Usb Cable

📜 Changelog
Be sure to check out the [Changelog] and Read CHANGELOG.md

Getting Started
Copy and paste the PaensyLib folder inside your Arduino\libraries
git clone https://github.com/Screetsec/Brutal.git
cd Brutal
chmod +x Brutal.sh 
sudo ./Brutal.sh or sudo su ./Brutal.sh 
BUG ?
Submit new issue
Contact me
Hey sup ? do you want ask about all my tools ? you can join me in telegram.me/offscreetsec
Donations
Donation: Send to bitcoin

Addres Bitcoin : 1NuNTXo7Aato7XguFkvwYnTAFV2immXmjS



:octocat: Credits
Thanks to allah and Screetsec [ Edo -maland- ]
Dracos Linux from Scratch Indonesia ( Awesome Penetration os ), you can see in http://dracos-linux.org/
Offensive Security for the awesome OS ( http://www.offensive-security.com/ )
http://www.kali.org/
Jack Wilder admin in http://www.linuxsec.org
And another open sources tool in github
Uptodate new tools hacking visit http://www.kitploit.com
Disclaimer
Note: modifications, changes, or alterations to this sourcecode is acceptable, however,any public releases utilizing this code must be approved by writen this tool ( Edo -m- ).
"""

removed_content= remove_content_specific(response_generated)

print(removed_content)


# Brutal

Brutal is a toolkit to quickly create various payload,powershell attack , virus attack and launch listener for a Human Interface Device

Version Version Stage Build



Donate
If this project very help you to penetration testing and u want support me , you can give me a cup of coffee :)
Donation
Screenshoot
  

Video
Check this video https://www.youtube.com/watch?v=WaqY-pQpuV0

Do you want like a mr robot hacking scene when Angela moss plug usb into computer for get credential information ? you can choose payload in brutal ( optional 3 or 4 )

The Goal
Generate various payload and powershell attack without coding

To help breaking computer very fast and agile :p

The Payloads Compatibility > target Windows machines only

Requirements
Arduino Software ( I used v1.6.7 )

TeensyDuino

Linux udev rules

How install all requirements ? Visit This Wiki

Supported Hardware
The following hardware has been tested and is known to work.

Teensy 3.x

Usb Cable

📜 Changelog
Be sure to check

# Convert JSON to CSV

In [3]:
import json
import csv
import os

def json_to_csv(json_file_path, csv_file_path):
    # Read JSON data
    with open(json_file_path, 'r', encoding='utf-8') as json_file:
        data = json.load(json_file)

    # Check if data is a list of dictionaries
    if not isinstance(data, list) or not all(isinstance(item, dict) for item in data):
        raise ValueError("JSON file must contain an array of objects")

    # Extract column names from the first item
    col_names = data[0].keys()

    # Write to CSV file
    with open(csv_file_path, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=col_names)
        writer.writeheader()
        writer.writerows(data)

def main():
  json_file_name = input("Please enter the JSON file name (with extension): ")

  current_directory = os.getcwd()
  csv_file_name = os.path.splitext(json_file_name)[0] + '.csv'
  csv_file_path = os.path.join(current_directory, csv_file_name)

  try:
        json_to_csv(json_file_name, csv_file_path)
        print(f"Data has been successfully converted from {json_file_name} to {csv_file_name}")
  except Exception as e:
        print(f"An error occurred: {e}")

if __name__ == "__main__":
    main()

Data has been successfully converted from github_repositories_archive.json to github_repositories_archive.csv
