In [2]:
from openai import OpenAI
import os
import re

def summarize_code_files(source_dir):
    """Load every ``content_<N>.txt`` file found in ``source_dir``.

    Only directory entries whose name ends in ``.txt`` and contains
    ``content_<digits>`` are read; anything else (for example ``.DS_Store``
    or unrelated notes) is ignored instead of crashing the numeric sort.

    NOTE: a later cell in this notebook defines a function with the same
    name but a different signature; once that cell has run, this version
    is shadowed.

    Args:
        source_dir: Directory containing the snippet files.

    Returns:
        dict mapping filename -> file contents (decoded as UTF-8), inserted
        in ascending order of the number embedded in the filename.

    Raises:
        OSError: If ``source_dir`` cannot be listed or a file cannot be read.
    """
    number_pattern = re.compile(r'content_(\d+)')

    # Filter BEFORE sorting: the sort key needs a number, and not every
    # directory entry is guaranteed to carry one.
    numbered = []
    for name in os.listdir(source_dir):
        match = number_pattern.search(name)
        if match and name.endswith(".txt"):
            numbered.append((int(match.group(1)), name))
    # Sort on the number only so ties keep their directory-listing order.
    numbered.sort(key=lambda pair: pair[0])

    code_dic = {}
    for _, name in numbered:
        with open(os.path.join(source_dir, name), 'r', encoding='utf-8') as f:
            code_dic[name] = f.read()
    return code_dic

# Read all snippet files into memory, keyed by filename.
code_dic = summarize_code_files('/Users/vuh/Documents/chatgpt_code_summary/data/bigcode_the-stack-smol/content')

In [3]:
# Display one loaded snippet; raises KeyError if 'content_10.txt' was not loaded.
code_dic['content_10.txt']



In [8]:
# Smoke test: request a summary for a single code snippet.

# Create the ChatGPT client; the key is read from the API_KEY environment variable.
client = OpenAI(api_key=os.getenv("API_KEY"))

# Prompt texts, kept separate so the request below stays readable.
system_prompt = "You are an expert in programming, particularly in Python. Your task is to explain complex code snippets succinctly. Focus on providing clear, concise instructions that describe what the code does and how it works, suitable for use in training a code generation model."
summary_request = "Please summarize the following Python code in about 100 words. The summary should serve as an instructional guide that clearly explains the purpose and functionality of the code to someone familiar with programming but not with this specific script."

completion = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": summary_request},
        # The snippet itself goes in its own user message.
        {"role": "user", "content": f"{code_dic['content_10.txt']}"},
    ],
)

# Show the full message object (content plus role and tool metadata).
print(completion.choices[0].message)

ChatCompletionMessage(content="This Python script defines a command-line interface (CLI) program to create a CSS/LESS/SASS style guide. It utilizes the argparse library for parsing command-line arguments and the logging module for handling logging operations. The script allows users to specify the settings file path and log level as arguments. If no settings file is provided, it defaults to 'vitalstyles.json'. The script sets up logging based on the specified log level and creates a style guide using the provided settings. When executed as a standalone script, it triggers the CLI function to generate the style guide.", role='assistant', function_call=None, tool_calls=None)


In [11]:
# Print only the text of the reply from the previous request.
# NOTE(review): the output saved with this cell is "<class 'str'>", which is
# what printing type(...) of this value would show, not the summary text —
# the stored output looks stale relative to this code; re-run to confirm.
print(completion.choices[0].message.content)

<class 'str'>


In [25]:
import os
import re
import time
from openai import OpenAI
# File numbers that are skipped by summarize_code_files below.
invalid_file = [84, 99, 113, 124, 157, 164, 165, 166, 174, 178, 179, 203, 209, 223]


def summarize_code_files(source_dir, output_dir, client, log_file_path):
    """Summarize each ``content_<N>.txt`` snippet via the chat API, resumably.

    Files are processed in ascending order of ``N``. For every processed file
    the summary is written to ``output_dir/response_<N>.txt`` and ``N`` is
    written to ``log_file_path``, so a later call resumes after the last
    successfully written summary. Numbers listed in the module-level
    ``invalid_file`` are skipped.

    Directory entries that do not end in ``.txt`` or carry no
    ``content_<digits>`` marker (e.g. ``.DS_Store``) are ignored rather than
    crashing the numeric sort.

    Args:
        source_dir: Directory holding the ``content_<N>.txt`` inputs.
        output_dir: Directory for the ``response_<N>.txt`` outputs; created
            if it does not exist.
        client: Object exposing ``chat.completions.create(model=, messages=)``
            whose result provides ``choices[0].message.content``.
        log_file_path: Path of the text file storing the last processed number.

    Returns:
        None. Results are written to disk.

    Raises:
        OSError: On filesystem errors. Exceptions raised by ``client`` are
            not caught; the log then still points at the last completed file.
    """
    number_pattern = re.compile(r'content_(\d+)')
    request_limit_per_minute = 10  # Adjust based on your API plan limits
    # NOTE: the pause has always been 15 s (an earlier comment claimed 60 s);
    # raise it if the API plan enforces a true per-minute quota.
    pause_seconds = 15

    # -1 means "nothing processed yet", so content_0.txt is included on a
    # fresh run. A missing, empty, or non-numeric log all resolve to -1.
    last_processed = -1
    if os.path.exists(log_file_path):
        with open(log_file_path, 'r') as log_file:
            logged = log_file.read().strip()
        if logged.isdigit():
            last_processed = int(logged)

    # Select the work list first (filter, then sort) so that stray files can
    # never reach the numeric sort key.
    pending = []
    for name in os.listdir(source_dir):
        match = number_pattern.search(name)
        if not (match and name.endswith(".txt")):
            continue
        file_number = int(match.group(1))
        if file_number > last_processed and file_number not in invalid_file:
            pending.append((file_number, name))
    pending.sort(key=lambda item: item[0])

    # An empty output_dir means "current directory"; nothing to create then.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    request_count = 0
    for file_number, name in pending:
        # Read and close the input before the (slow) network call.
        with open(os.path.join(source_dir, name), 'r', encoding='utf-8') as f:
            code_snippet = f.read()

        # Prepare the message for the API request
        messages = [
            {
                "role": "system",
                "content": "You are an expert in programming, particularly in Python. Your task is to explain complex code snippets succinctly. Focus on providing clear, concise instructions that describe what the code does and how it works, suitable for use in training a code generation model."
            },
            {
                "role": "user",
                "content": "Please summarize the following Python code in about 100 words. The summary should serve as an instructional guide that clearly explains the purpose and functionality of the code to someone familiar with programming but not with this specific script."
            },
            {
                "role": "user",
                "content": code_snippet
            }
        ]

        # Crude throttle: after every batch of requests, pause once and
        # start counting again.
        if request_count >= request_limit_per_minute:
            time.sleep(pause_seconds)
            request_count = 0

        completion = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=messages
        )
        summary = completion.choices[0].message.content
        request_count += 1

        # Save the response to a corresponding output file
        output_filename = f'response_{file_number}.txt'
        with open(os.path.join(output_dir, output_filename), 'w', encoding='utf-8') as out_file:
            out_file.write(summary)

        # Record progress only after the summary is safely on disk.
        with open(log_file_path, 'w') as log_file:
            log_file.write(str(file_number))

# Locations: input snippets, output summaries, and the resume log
source_dir = '/Users/vuh/Documents/chatgpt_code_summary/data/bigcode_the-stack-smol/content_test'
output_dir = '/Users/vuh/Documents/chatgpt_code_summary/data/bigcode_the-stack-smol/response'  # Make sure to adjust this to a real path
log_file_path = 'log_file.txt'  # Relative path, so it resolves against the kernel's working directory

# Build the API client from the API_KEY environment variable
api_key = os.getenv("API_KEY")
client = OpenAI(api_key=api_key)

# Run the batch summarization
summarize_code_files(
    source_dir=source_dir,
    output_dir=output_dir,
    client=client,
    log_file_path=log_file_path,
)
