/
chat_to_files.py
238 lines (194 loc) · 8.69 KB
/
chat_to_files.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
"""
This Python script provides functionalities for parsing chat transcripts that contain file paths and code blocks,
applying diffs to these files, and parsing unified git diff format strings. The script is designed to work within
a larger system that involves processing and manipulating code files based on chat inputs and diff information.
Key Components:
- chat_to_files_dict: Parses a chat transcript, extracting file paths and associated code blocks, and organizes
them into a FilesDict object, which is a custom dictionary format designed to hold file contents keyed by their paths.
- apply_diffs: Takes a dictionary of Diff objects (which represent changes to be made to files) and a FilesDict
object containing the current state of files. It applies the changes described by the Diff objects to the
corresponding files in the FilesDict, updating the file contents as specified by the diffs.
- parse_diffs: Parses a string containing diffs in the unified git diff format, extracting the changes described
in the diffs and organizing them into a dictionary of Diff objects, keyed by the filename to which each diff applies.
- parse_diff_block: Parses a single block of text from a diff string, translating it into a Diff object that
represents the changes described in that block of text.
This script is intended for use in environments where code collaboration or review is conducted through chat interfaces,
allowing for the dynamic application of changes to code bases and the efficient handling of file and diff information in chat transcripts.
"""
import logging
import re
from typing import Dict, Tuple
from regex import regex
from gpt_engineer.core.diff import ADD, REMOVE, RETAIN, Diff, Hunk
from gpt_engineer.core.files_dict import FilesDict, file_to_lines_dict
# Initialize a logger for this module
logger = logging.getLogger(__name__)
def chat_to_files_dict(chat: str) -> FilesDict:
    """
    Collects the "file path + fenced code block" pairs found in a chat string into a FilesDict.

    A pair is a run of non-whitespace characters followed by a newline, optional
    whitespace, an opening code fence (with any info string on the same line),
    the block body, and a closing fence. The run of non-whitespace characters is
    used as the file path and the block body as the file content.

    Args:
    - chat (str): The chat string containing file paths and code blocks.

    Returns:
    - FilesDict: Entries keyed by the cleaned, stripped path, each holding the stripped code block.
    """
    # Path token, newline, opening fence line, lazily-matched body, closing fence
    block_pattern = r"(\S+)\n\s*```[^\n]*\n(.+?)```"
    # Path clean-up substitutions; order matters because each one sees the
    # result of the previous one:
    #   1. drop characters : < > " | ? *
    #   2. unwrap a path fully enclosed in [ ]
    #   3. unwrap a path fully enclosed in backticks
    #   4. drop a single trailing ] or :
    path_cleanups = (
        (r'[\:<>"|?*]', ""),
        (r"^\[(.*)\]$", r"\1"),
        (r"^`(.*)`$", r"\1"),
        (r"[\]\:]$", ""),
    )
    collected = FilesDict()
    # re.DOTALL lets the body group span multiple lines
    for found in re.finditer(block_pattern, chat, re.DOTALL):
        raw_path, code = found.groups()
        for cleanup_pattern, replacement in path_cleanups:
            raw_path = re.sub(cleanup_pattern, replacement, raw_path)
        collected[raw_path.strip()] = code.strip()
    return collected
def apply_diffs(diffs: Dict[str, Diff], files: FilesDict) -> FilesDict:
    """
    Applies diffs to the provided files.

    The input `files` object is copied first; the copy is modified and returned.
    For a diff that reports itself as a new file, the content is the text of
    every line of every hunk joined with newlines. For any other diff, the
    pre-edit file is converted to a line dictionary, each hunk is replayed
    against it, and the result is stored under the post-edit filename.

    Args:
    - diffs (Dict[str, Diff]): A dictionary of diffs to apply, keyed by filename.
    - files (FilesDict): The original files to which diffs will be applied.

    Returns:
    - FilesDict: The updated files after applying diffs.

    Raises:
    - KeyError: Presumably, if a non-new diff names a pre-edit file that is not
      in `files` (depends on FilesDict lookup behaviour - verify).
    """
    files = FilesDict(files.copy())
    # Unique sentinel marking removed lines. It is compared by identity, so it
    # can never collide with real file text. The previous string marker
    # ("<REMOVE_LINE>", checked by substring) silently dropped any genuine line
    # that happened to contain that text.
    REMOVE_FLAG = object()
    for diff in diffs.values():
        if diff.is_new_file():
            # If it's a new file, create it with the content from the diff
            files[diff.filename_post] = "\n".join(
                line[1] for hunk in diff.hunks for line in hunk.lines
            )
            continue

        # Convert the file content to a dictionary of lines
        line_dict = file_to_lines_dict(files[diff.filename_pre])
        for hunk in diff.hunks:
            # Cursor into the pre-edit file; only RETAIN and REMOVE advance it
            current_line = hunk.start_line_pre_edit
            for line in hunk.lines:
                # Each hunk line is a (kind, text) pair
                kind, text = line[0], line[1]
                if kind == RETAIN:
                    current_line += 1
                elif kind == ADD:
                    # Added text is attached to the entry just before the cursor
                    anchor = current_line - 1
                    if anchor in line_dict and line_dict[anchor] is not REMOVE_FLAG:
                        # Keep the existing line and append the new one after it
                        line_dict[anchor] += "\n" + text
                    else:
                        # Entry is missing or was removed: the new text takes the slot.
                        # NOTE(review): a missing key is inserted at the end of the
                        # dict, so if file_to_lines_dict is 1-indexed an addition
                        # before the first line would end up last - confirm.
                        line_dict[anchor] = text
                elif kind == REMOVE:
                    # Mark the line for removal; it is filtered out below
                    line_dict[current_line] = REMOVE_FLAG
                    current_line += 1

        # Reassemble the file content, skipping entries marked for removal
        files[diff.filename_post] = "\n".join(
            line_content
            for line_content in line_dict.values()
            if line_content is not REMOVE_FLAG
        )
    return files
def parse_diffs(diff_string: str) -> dict:
    """
    Extracts every fenced unified-diff block from a string and parses each one.

    Each block found is handed to parse_diff_block and the resulting entries are
    merged into one dictionary. A message is printed if the search times out,
    and another if no diffs were collected.

    Args:
    - diff_string (str): The diff string to parse.

    Returns:
    - dict: A dictionary of Diff objects keyed by filename (empty if nothing was parsed).
    """
    # One fenced block: opening fence line, a "--- " line, a "+++ " line, then
    # any number of "@@ ... @@" hunk headers each followed by lines starting
    # with '-', '+' or a space, and finally the closing fence.
    fenced_diff = regex.compile(
        r"```.*?\n\s*?--- .*?\n\s*?\+\+\+ .*?\n(?:@@ .*? @@\n(?:[-+ ].*?\n)*?)*?```",
        re.DOTALL,
    )
    parsed = {}
    try:
        # timeout=1 bounds the pattern search; the handler below reports expiry
        for found in fenced_diff.finditer(diff_string, timeout=1):
            parsed.update(parse_diff_block(found.group()))
    except TimeoutError:
        print("gpt-engineer timed out while parsing git diff")

    if not parsed:
        print(
            "GPT did not provide any proposed changes. Please try to reselect the files for uploading and edit your prompt file."
        )
    return parsed
def _append_hunk(diff, header, hunk_lines) -> None:
    """Records a finished hunk on `diff`; does nothing without a diff, a header, or any lines."""
    if diff is not None and header is not None and hunk_lines:
        diff.hunks.append(Hunk(*header, hunk_lines))


def parse_diff_block(diff_block: str) -> dict:
    """
    Parses a fenced block of diff text into Diff objects.

    The first and last lines of the stripped block (the opening and closing
    fences) are discarded. A "--- " line sets the pre-edit filename, a "+++ "
    line starts a new Diff, an "@@ " line starts a new hunk, and lines inside
    a hunk are recorded as added ("+"), removed ("-") or retained (anything
    else), each with its first character dropped.

    A hunk is recorded only when it has a header and at least one line. Lines
    that appear before the first hunk header of a file are ignored, and the
    "\\ No newline at end of file" marker is skipped rather than stored as content.

    Args:
    - diff_block (str): A single block of diff text, including its fence lines.

    Returns:
    - dict: A dictionary of Diff objects keyed by the post-edit filename.
    """
    lines = diff_block.strip().split("\n")[1:-1]  # Exclude the opening and closing ```
    diffs = {}
    current_diff = None
    hunk_lines = []
    filename_pre = None
    hunk_header = None
    for line in lines:
        if line.startswith("--- "):
            # Pre-edit filename. A pending hunk is deliberately left open here;
            # it is closed when the matching "+++ " line arrives.
            # NOTE(review): a removed line whose text starts with "-- " also
            # looks like this header; the two cannot be told apart here.
            filename_pre = line[4:]
        elif line.startswith("+++ "):
            # Close the previous file's last hunk, then start a new Diff
            _append_hunk(current_diff, hunk_header, hunk_lines)
            hunk_lines = []
            # Reset the header so the previous file's header can never be
            # attached to lines belonging to the new file
            hunk_header = None
            filename_post = line[4:]
            current_diff = Diff(filename_pre, filename_post)
            diffs[filename_post] = current_diff
        elif line.startswith("@@ "):
            # Start of a new hunk: close the one in progress, if any
            _append_hunk(current_diff, hunk_header, hunk_lines)
            hunk_lines = []
            hunk_header = parse_hunk_header(line)
        elif hunk_header is None:
            # Not inside a hunk yet (e.g. blank or metadata lines before the
            # first "@@"): these are not file content, so do not collect them
            continue
        elif line.startswith("\\ No newline"):
            # Unified-diff marker about the preceding line, not a content line
            continue
        elif line.startswith("+"):
            # Added line
            hunk_lines.append((ADD, line[1:]))
        elif line.startswith("-"):
            # Removed line
            hunk_lines.append((REMOVE, line[1:]))
        else:
            # Retained line (first character, normally the leading space, is dropped)
            hunk_lines.append((RETAIN, line[1:]))
    # Append the last hunk if any
    _append_hunk(current_diff, hunk_header, hunk_lines)
    return diffs
def parse_hunk_header(header_line) -> Tuple[int, int, int, int]:
    """
    Parses the header of a hunk from a diff.

    Accepts the unified-diff form "@@ -start,len +start,len @@". A length may
    be omitted, in which case it is taken as 1 (e.g. "@@ -5 +5 @@"), and any
    text after the closing "@@" (such as the section heading git appends) is
    ignored.

    Args:
    - header_line (str): The header line of a hunk.

    Returns:
    - tuple: (start_line_pre_edit, hunk_len_pre_edit, start_line_post_edit,
      hunk_len_post_edit), or (0, 0, 0, 0) if the line is not a valid hunk header.
    """
    match = re.match(
        r"^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@(?:\s.*)?$",
        header_line,
    )
    if match is None:
        # Return a default value if the header does not match the expected format
        return 0, 0, 0, 0
    start_pre, len_pre, start_post, len_post = match.groups()
    return (
        int(start_pre),
        # An omitted length means the range covers exactly one line
        1 if len_pre is None else int(len_pre),
        int(start_post),
        1 if len_post is None else int(len_post),
    )