In [None]:
import json
import os
import time

import numpy as np
from dotenv import load_dotenv
from git import NULL_TREE
from git import Repo
from numpy.linalg import norm
from openai import OpenAI

In [None]:
# Load environment variables (python-dotenv) before the API key is read below.
load_dotenv()

# The key is taken from the OPENAI_API_KEY environment variable.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Quick round-trip through the Responses API to confirm the client works.
response = client.responses.create(
    input="How do I check if a Python object is an instance of a class?",
    instructions="You are a coding assistant that talks like a pirate.",
    model="gpt-4o",
)
print(response.output_text)

In [None]:
# Smoke test: embed a single sentence and inspect the resulting vector.
response = client.embeddings.create(
    input="What is the capital of France?",
    model="text-embedding-3-large"
)

embedding = response.data[0].embedding
# Show the size and a short preview only; printing the full vector floods the cell output.
print(f"Embedding length: {len(embedding)}")
print(embedding[:5])

In [None]:
def extract_diff_lines(diff_text):
    """Describe the added and removed lines of a diff as plain sentences.

    Args:
        diff_text: Diff text to scan, one diff line per text line. ``None``
            or an empty string yields an empty list.

    Returns:
        list[str]: One sentence per changed line, in the order the lines
        appear in ``diff_text``. Lines starting with ``+`` are reported as
        added and lines starting with ``-`` as removed; the leading marker
        character is stripped from the reported text.
    """
    if not diff_text:
        return []

    diff_lines = []
    for line in diff_text.splitlines():
        # Lines beginning with '+++' / '---' are excluded so that file-header
        # lines are not reported as content changes.
        if line.startswith('+') and not line.startswith('+++'):
            diff_lines.append(f"The following lines have been added: {line[1:]}")
        elif line.startswith('-') and not line.startswith('---'):
            diff_lines.append(f"The following lines have been removed: {line[1:]}")
    return diff_lines

In [None]:
# Export the repository history: one record per commit with the patch of every changed file.
repo = Repo('../commit-messages-guide')  # or path to your repo
commits_data = []

for commit in repo.iter_commits():
    commit_info = {
        "commit": commit.hexsha,
        "author": str(commit.author),
        "date": str(commit.committed_datetime),
        "files_changed": []
    }

    # Diff against the first parent so the patch shows what THIS commit changed.
    # (Passing None as the other side compares the commit with the working tree instead.)
    # A root commit has no parent, so it is compared with the empty tree.
    if commit.parents:
        commit_diffs = commit.parents[0].diff(commit, create_patch=True)
    else:
        commit_diffs = commit.diff(NULL_TREE, create_patch=True)

    for diff in commit_diffs:
        if diff.a_path and diff.diff:
            diff_raw = diff.diff.decode(errors="ignore", encoding="utf-8")

            commit_info["files_changed"].append({
                "file": diff.a_path,
                "diff": diff_raw,
                "diff_processed": extract_diff_lines(diff_raw)
            })

    commits_data.append(commit_info)

# Save as JSON; the explicit encoding keeps the output independent of the platform default.
with open('sample_git_log_clean.json', 'w', encoding='utf-8') as f:
    json.dump(commits_data, f, indent=2)


In [None]:
# Group the sentences produced by extract_diff_lines for the most recently processed diff.
# The prefixes must match the sentences that extract_diff_lines builds, otherwise nothing is selected.
ADDED_PREFIX = 'The following lines have been added: '
REMOVED_PREFIX = 'The following lines have been removed: '

# Process the diff once and reuse the result for both groups.
processed_lines = extract_diff_lines(diff_raw)
{
 'Added': [line.removeprefix(ADDED_PREFIX) for line in processed_lines if line.startswith(ADDED_PREFIX)],
 'Removed': [line.removeprefix(REMOVED_PREFIX) for line in processed_lines if line.startswith(REMOVED_PREFIX)]
 }


In [None]:
# Preview the first few commit records rather than rendering the whole history.
commits_data[:3]

In [None]:
# Read a previously exported commit log back into memory.
# NOTE(review): this reads sample_git_log.json, whereas the export cell earlier in the
# notebook writes sample_git_log_clean.json — confirm which file is intended.
with open("sample_git_log.json", "r", encoding="utf-8") as f:
    data = json.loads(f.read())

# Inspect a single commit record.
data[10]

In [None]:
# Show the processed diff of the first changed file in the commit at index 10.
data[10]['files_changed'][0]['diff_processed']

In [None]:
# Embed the processed diff of a single file as a sanity check.
diff_processed = data[10]['files_changed'][0]['diff_processed']

# A list input yields one embedding per element, so data[0] would cover only the first
# diff line. Join the lines so the diff is embedded as a whole.
diff_text = diff_processed if isinstance(diff_processed, str) else "\n".join(diff_processed or [])

if diff_text:
    response = client.embeddings.create(
        input=diff_text,
        model="text-embedding-3-large"
    )

    embedding = response.data[0].embedding
    # Show the size and a short preview only; printing the full vector floods the cell output.
    print(f"Embedding length: {len(embedding)}")
    print(embedding[:5])
else:
    # Nothing to embed: skip the request instead of sending empty input.
    embedding = None
    print("No processed diff lines to embed for this file")

In [None]:
# Walk the first ten commits and bind each changed file's processed diff and path.
# Nothing is stored or printed here, so only the values from the last file visited
# remain in `diff` and `filename` afterwards.
for i, commit in enumerate(data[:10]):
    files_changed = commit.get("files_changed")
    for file in files_changed:
        diff = file['diff_processed']
        filename = file['file']

In [None]:
def commit_diff_text(commit):
    """Join the processed diffs of every changed file in ``commit`` into one string.

    Args:
        commit: Commit record (dict) with an optional ``files_changed`` list whose
            entries may carry ``diff_processed`` as a string or a list of strings.

    Returns:
        str: All non-empty processed diffs separated by newlines, or ``""`` when the
        commit has no files or no processed diff lines.
    """
    parts = []
    for file in commit.get("files_changed") or []:
        diff = file.get('diff_processed')
        if not diff:
            continue
        parts.append(diff if isinstance(diff, str) else "\n".join(diff))
    return "\n".join(parts)


for i, commit in enumerate(data):
    # Default to None so commits without a diff, or with a failed request, are marked explicitly.
    commit["embedding"] = None

    try:
        # Embed the whole commit as ONE string: a list input would yield one embedding
        # per element, and only the first of those would be kept.
        text = commit_diff_text(commit)

        if text:
            # Get embedding
            response = client.embeddings.create(
                input=text,
                model="text-embedding-3-large"
            )
            commit["embedding"] = response.data[0].embedding

            time.sleep(0.1)  # Avoid rate limits

    except Exception as e:
        # Best effort: report the failure and leave this commit's embedding as None.
        print(f"Error at index {i}: {e}")

    if i % 100 == 0:
        print(f"Processed {i} commits")

with open("commits_with_embeddings.json", "w", encoding="utf-8") as f:
    json.dump(data, f, indent=2)

In [None]:
# Embed the natural-language question so it can be compared against the commit embeddings.
query = "When did someone add a PDF?"
response = client.embeddings.create(model="text-embedding-3-large", input=query)
query_embedding = response.data[0].embedding

In [None]:
# Check which fields a loaded commit record carries.
data[1].keys()

In [None]:
# Keep only commits that actually have an embedding vector. The embedding loop stores
# None for commits it could not embed, so checking for the key alone would let those
# records through and break the similarity ranking.
data_with_embeddings = [d for d in data if d.get('embedding') is not None]

with open("commits_with_embeddings_filtered_for_difference.json", "w", encoding="utf-8") as f:
    json.dump(data_with_embeddings, f, indent=2)

In [None]:
def cosine_sim(a, b):
    """Return the cosine similarity of two vectors.

    Args:
        a: First vector (sequence of numbers or NumPy array).
        b: Second vector, same length as ``a``.

    Returns:
        float: ``dot(a, b) / (|a| * |b|)``. When either vector has zero norm the
        similarity is undefined; ``0.0`` is returned instead of ``nan`` so the value
        can still be used as a sort key.
    """
    denominator = norm(a) * norm(b)
    if denominator == 0:
        return 0.0
    return float(np.dot(a, b) / denominator)

# Rank every embedded commit by similarity to the query, best match first,
# then keep only the ten highest-scoring commits.
top_matches = sorted(
    data_with_embeddings,
    key=lambda record: cosine_sim(query_embedding, record['embedding']),
    reverse=True,
)
del top_matches[10:]

In [None]:
# Number of best-matching commits handed to the chat model.
TOP_N = 5


def describe_commit(commit):
    """Format one commit record (metadata plus processed diffs) for the prompt.

    Args:
        commit: Commit record (dict). ``commit``, ``author``, ``date`` and
            ``files_changed`` are read when present; each file entry may carry
            ``file`` and ``diff_processed`` (string or list of strings).

    Returns:
        str: A header line with hash, author and date, followed by one section
        per changed file.
    """
    sections = [f"commit {commit.get('commit')} by {commit.get('author')} on {commit.get('date')}"]
    for file in commit.get('files_changed') or []:
        diff = file.get('diff_processed')
        diff_text = diff if isinstance(diff, str) else "\n".join(diff or [])
        sections.append(f"File: {file.get('file')}\n{diff_text}")
    return "\n".join(sections)


# Build the listing from however many matches exist, so the prompt never indexes past
# the end of top_matches or into an empty files_changed list.
selected_commits = top_matches[:TOP_N]
commit_listing = "\n\n".join(
    f"{rank}. {describe_commit(match)}"
    for rank, match in enumerate(selected_commits, start=1)
)

# Date, author and hash are included because the question asks *when* something
# happened, which the diff text alone cannot answer.
prompt = f"""
You are analyzing commit history.

Given the following top {len(selected_commits)} commit diffs, answer the question:
"{query}"

Commits:
{commit_listing}

Only return a helpful answer — do not list all commits unless necessary.
"""

response = client.chat.completions.create(
    model="gpt-4.1-mini",
    messages=[{"role": "user", "content": prompt}]
)

print(response.choices[0].message.content)

In [None]:
# Toy structure for experimenting with adding an 'embedding' field to each list entry.
sample_json = [
    {
        'commit_1': {'A': "a", 'B': "b", 'C': ['c', 'cc', 'ccc']},
        'commit_2': {'A': "a", 'B': "bb", 'C': ['c', 'cc', 'ccc']},
    }
]

# Tag every top-level entry with its position in the list.
for i in range(len(sample_json)):
    commit = sample_json[i]
    commit['embedding'] = i
sample_json