diff --git a/.gptinclude b/.gptinclude
new file mode 100644
index 0000000..8431972
--- /dev/null
+++ b/.gptinclude
@@ -0,0 +1 @@
+prompt/
\ No newline at end of file
diff --git a/README.md b/README.md
index a370b13..a88abf0 100644
--- a/README.md
+++ b/README.md
@@ -24,11 +24,46 @@ To use the git2gpt utility, run the following command:
git2gpt [flags] /path/to/git/repository
```
-### Ignoring Files
+### Including and Ignoring Files
-By default, your `.git` directory and your `.gitignore` files are ignored. Any files in your `.gitignore` are also skipped. If you want to change this behavior, you should add a `.gptignore` file to your repository. The `.gptignore` file should contain a list of files and directories to ignore, one per line. The `.gptignore` file should be in the same directory as your `.gitignore` file. Please note that this overwrites the default ignore list, so you should include the default ignore list in your `.gptignore` file if you want to keep it.
+By default, your `.git` directory and your `.gitignore` files are ignored. Any files in your `.gitignore` are also skipped. You can customize the files to include or ignore in several ways:
-### Flags
+### Including Only Specific Files (.gptinclude)
+
+Add a `.gptinclude` file to your repository to specify which files should be included in the output. Each line in the file should contain a glob pattern of files or directories to include. If a `.gptinclude` file is present, only files that match these patterns will be included.
+
+Example `.gptinclude` file:
+```
+# Include only these file types
+*.go
+*.js
+*.html
+*.css
+
+# Include specific directories
+src/**
+docs/api/**
+```
+
+### Ignoring Specific Files (.gptignore)
+
+Add a `.gptignore` file to your repository to specify which files should be ignored. This works similarly to `.gitignore`, but is specific to git2gpt. The `.gptignore` file should contain a list of files and directories to ignore, one per line.
+
+Example `.gptignore` file:
+```
+# Ignore these file types
+*.log
+*.tmp
+*.bak
+
+# Ignore specific directories
+node_modules/**
+build/**
+```
+
+**Note**: When both `.gptinclude` and `.gptignore` files exist, git2gpt will first include files matching the `.gptinclude` patterns, and then exclude any of those files that also match `.gptignore` patterns.
+
+## Command Line Options
* `-p`, `--preamble`: Path to a text file containing a preamble to include at the beginning of the output file.
* `-o`, `--output`: Path to the output file. If not specified, will print to standard output.
@@ -36,6 +71,7 @@ By default, your `.git` directory and your `.gitignore` files are ignored. Any f
* `-j`, `--json`: Output to JSON rather than plain text. Use with `-o` to specify the output file.
* `-x`, `--xml`: Output to XML rather than plain text. Use with `-o` to specify the output file.
* `-i`, `--ignore`: Path to the `.gptignore` file. If not specified, will look for a `.gptignore` file in the same directory as the `.gitignore` file.
+* `-I`, `--include`: Path to the `.gptinclude` file. If not specified, will look for a `.gptinclude` file in the repository root.
* `-g`, `--ignore-gitignore`: Ignore the `.gitignore` file.
* `-s`, `--scrub-comments`: Remove comments from the output file to save tokens.
diff --git a/cmd/root.go b/cmd/root.go
index 840fd8b..d438984 100644
--- a/cmd/root.go
+++ b/cmd/root.go
@@ -1,49 +1,40 @@
package cmd
-
import (
"fmt"
"os"
-
"github.com/chand1012/git2gpt/prompt"
"github.com/spf13/cobra"
)
-
var repoPath string // repository path currently being processed; reassigned for each positional argument in Run
var preambleFile string // -p/--preamble
var outputFile string // -o/--output
var estimateTokens bool // -e/--estimate
var ignoreFilePath string // -i/--ignore
+var includeFilePath string // -I/--include; when empty, GenerateIncludeList looks for <repo>/.gptinclude
var ignoreGitignore bool // -g/--ignore-gitignore
var outputJSON bool // -j/--json
var outputXML bool // -x/--xml
var debug bool // -d/--debug
var scrubComments bool // -s/--scrub-comments
-
var rootCmd = &cobra.Command{
Use: "git2gpt [flags] /path/to/git/repository [/path/to/another/repository ...]",
Short: "git2gpt is a utility to convert one or more Git repositories to a text file for input into an LLM",
Args: cobra.MinimumNArgs(1),
Run: func(cmd *cobra.Command, args []string) {
- // Create a combined repository to hold all files
combinedRepo := &prompt.GitRepo{
Files: []prompt.GitFile{},
}
-
- // Process each repository path
for _, path := range args {
repoPath = path
ignoreList := prompt.GenerateIgnoreList(repoPath, ignoreFilePath, !ignoreGitignore)
- repo, err := prompt.ProcessGitRepo(repoPath, ignoreList)
+ includeList := prompt.GenerateIncludeList(repoPath, includeFilePath) // New: Generate include list
+ repo, err := prompt.ProcessGitRepo(repoPath, includeList, ignoreList) // Modified: Pass includeList
if err != nil {
fmt.Printf("Error processing %s: %s\n", repoPath, err)
os.Exit(1)
}
-
- // Add files from this repo to the combined repo
combinedRepo.Files = append(combinedRepo.Files, repo.Files...)
}
-
- // Update the file count
combinedRepo.FileCount = len(combinedRepo.Files)
if outputJSON {
output, err := prompt.MarshalRepo(combinedRepo, scrubComments)
@@ -52,7 +43,6 @@ var rootCmd = &cobra.Command{
os.Exit(1)
}
if outputFile != "" {
- // if output file exists, throw error
if _, err := os.Stat(outputFile); err == nil {
fmt.Printf("Error: output file %s already exists\n", outputFile)
os.Exit(1)
@@ -75,15 +65,11 @@ var rootCmd = &cobra.Command{
fmt.Printf("Error: %s\n", err)
os.Exit(1)
}
-
- // Validate the XML output
if err := prompt.ValidateXML(output); err != nil {
fmt.Printf("Error: %s\n", err)
os.Exit(1)
}
-
if outputFile != "" {
- // if output file exists, throw error
if _, err := os.Stat(outputFile); err == nil {
fmt.Printf("Error: output file %s already exists\n", outputFile)
os.Exit(1)
@@ -106,7 +92,6 @@ var rootCmd = &cobra.Command{
os.Exit(1)
}
if outputFile != "" {
- // if output file exists, throw error
if _, err := os.Stat(outputFile); err == nil {
fmt.Printf("Error: output file %s already exists\n", outputFile)
os.Exit(1)
@@ -126,33 +111,22 @@ var rootCmd = &cobra.Command{
}
},
}
-
// init registers the command-line flags on rootCmd and sets its usage example.
func init() {
rootCmd.Flags().StringVarP(&preambleFile, "preamble", "p", "", "path to preamble text file")
- // output to file flag. Should be a string
rootCmd.Flags().StringVarP(&outputFile, "output", "o", "", "path to output file")
- // estimate tokens. Should be a bool
rootCmd.Flags().BoolVarP(&estimateTokens, "estimate", "e", false, "estimate the number of tokens in the output")
- // ignore file path. Should be a string
rootCmd.Flags().StringVarP(&ignoreFilePath, "ignore", "i", "", "path to .gptignore file")
- // ignore gitignore. Should be a bool
+ rootCmd.Flags().StringVarP(&includeFilePath, "include", "I", "", "path to .gptinclude file") // overrides the default <repo>/.gptinclude location
rootCmd.Flags().BoolVarP(&ignoreGitignore, "ignore-gitignore", "g", false, "ignore .gitignore file")
- // output JSON. Should be a bool
rootCmd.Flags().BoolVarP(&outputJSON, "json", "j", false, "output JSON")
- // output XML. Should be a bool
rootCmd.Flags().BoolVarP(&outputXML, "xml", "x", false, "output XML")
- // debug. Should be a bool
rootCmd.Flags().BoolVarP(&debug, "debug", "d", false, "debug mode. Do not output to standard output")
- // scrub comments. Should be a bool
rootCmd.Flags().BoolVarP(&scrubComments, "scrub-comments", "s", false, "scrub comments from the output. Decreases token count")
-
- // Update the example usage to show multiple paths
rootCmd.Example = " git2gpt /path/to/repo1 /path/to/repo2\n git2gpt -o output.txt /path/to/repo1 /path/to/repo2"
}
-
// Execute runs rootCmd; if it returns an error, the error is printed and the process exits with status 1.
func Execute() {
if err := rootCmd.Execute(); err != nil {
fmt.Println(err)
os.Exit(1)
}
-}
+}
\ No newline at end of file
diff --git a/prompt/gptinclude_test.go b/prompt/gptinclude_test.go
new file mode 100644
index 0000000..fe17d68
--- /dev/null
+++ b/prompt/gptinclude_test.go
@@ -0,0 +1,140 @@
+package prompt
+
+import (
+ "os"
+ "path/filepath"
+ "testing"
+)
+
+func TestGptIncludeAndIgnore(t *testing.T) {
+ // Build a throwaway directory tree that stands in for a repository; it is removed when the test returns
+ tempDir, err := os.MkdirTemp("", "git2gpt-test")
+ if err != nil {
+ t.Fatalf("Failed to create temp directory: %v", err)
+ }
+ defer os.RemoveAll(tempDir)
+
+ // Create test files
+ testFiles := []struct {
+ path string
+ contents string
+ }{
+ {"file1.txt", "Content of file1"},
+ {"file2.txt", "Content of file2"},
+ {"file3.txt", "Content of file3"},
+ {"src/main.go", "package main\nfunc main() {}"},
+ {"src/lib/util.go", "package lib\nfunc Util() {}"},
+ {"docs/README.md", "# Documentation"},
+ }
+
+ for _, tf := range testFiles {
+ fullPath := filepath.Join(tempDir, tf.path)
+ // Create directory if it doesn't exist
+ dir := filepath.Dir(fullPath)
+ if err := os.MkdirAll(dir, 0755); err != nil {
+ t.Fatalf("Failed to create directory %s: %v", dir, err)
+ }
+ // Write the file
+ if err := os.WriteFile(fullPath, []byte(tf.contents), 0644); err != nil {
+ t.Fatalf("Failed to write file %s: %v", fullPath, err)
+ }
+ }
+
+ // Each case supplies .gptinclude/.gptignore contents (empty means the file must not exist) plus the paths that must and must not be returned
+ testCases := []struct {
+ name string
+ includeContent string
+ ignoreContent string
+ expectedFiles []string
+ unexpectedFiles []string
+ }{
+ {
+ name: "Only include src directory",
+ includeContent: "src/**",
+ ignoreContent: "",
+ expectedFiles: []string{"src/main.go", "src/lib/util.go"},
+ unexpectedFiles: []string{"file1.txt", "file2.txt", "file3.txt", "docs/README.md"},
+ },
+ {
+ name: "Include all, but ignore .txt files",
+ includeContent: "**",
+ ignoreContent: "*.txt",
+ expectedFiles: []string{"src/main.go", "src/lib/util.go", "docs/README.md"},
+ unexpectedFiles: []string{"file1.txt", "file2.txt", "file3.txt"},
+ },
+ {
+ name: "Include src and docs, but ignore lib directory",
+ includeContent: "src/**\ndocs/**",
+ ignoreContent: "src/lib/**",
+ expectedFiles: []string{"src/main.go", "docs/README.md"},
+ unexpectedFiles: []string{"file1.txt", "file2.txt", "file3.txt", "src/lib/util.go"},
+ },
+ {
+ name: "No include file (should include all), ignore .txt files",
+ includeContent: "",
+ ignoreContent: "*.txt",
+ expectedFiles: []string{"src/main.go", "src/lib/util.go", "docs/README.md"},
+ unexpectedFiles: []string{"file1.txt", "file2.txt", "file3.txt"},
+ },
+ }
+
+ for _, tc := range testCases {
+ t.Run(tc.name, func(t *testing.T) {
+ // Create .gptinclude file if needed
+ includeFilePath := filepath.Join(tempDir, ".gptinclude")
+ if tc.includeContent != "" {
+ if err := os.WriteFile(includeFilePath, []byte(tc.includeContent), 0644); err != nil {
+ t.Fatalf("Failed to write .gptinclude file: %v", err)
+ }
+ } else {
+ // tempDir is shared by every subtest, so remove any .gptinclude written by an earlier case
+ os.Remove(includeFilePath)
+ }
+
+ // Create .gptignore file if needed
+ ignoreFilePath := filepath.Join(tempDir, ".gptignore")
+ if tc.ignoreContent != "" {
+ if err := os.WriteFile(ignoreFilePath, []byte(tc.ignoreContent), 0644); err != nil {
+ t.Fatalf("Failed to write .gptignore file: %v", err)
+ }
+ } else {
+ // tempDir is shared by every subtest, so remove any .gptignore written by an earlier case
+ os.Remove(ignoreFilePath)
+ }
+
+ // Empty path arguments make both helpers look for tempDir/.gptinclude and tempDir/.gptignore; false disables .gitignore handling
+ includeList := GenerateIncludeList(tempDir, "")
+ ignoreList := GenerateIgnoreList(tempDir, "", false)
+
+ // Process the repository
+ repo, err := ProcessGitRepo(tempDir, includeList, ignoreList)
+ if err != nil {
+ t.Fatalf("Failed to process repository: %v", err)
+ }
+
+ // Check if expected files are included
+ for _, expectedFile := range tc.expectedFiles {
+ found := false
+ for _, file := range repo.Files {
+ if file.Path == expectedFile {
+ found = true
+ break
+ }
+ }
+ if !found {
+ t.Errorf("Expected file %s to be included, but it wasn't", expectedFile)
+ }
+ }
+
+ // Check if unexpected files are excluded
+ for _, unexpectedFile := range tc.unexpectedFiles {
+ for _, file := range repo.Files {
+ if file.Path == unexpectedFile {
+ t.Errorf("File %s should have been excluded, but it was included", unexpectedFile)
+ break
+ }
+ }
+ }
+ })
+ }
+}
\ No newline at end of file
diff --git a/prompt/prompt.go b/prompt/prompt.go
index 1c0a462..7f969e6 100644
--- a/prompt/prompt.go
+++ b/prompt/prompt.go
@@ -10,27 +10,23 @@ import (
"path/filepath"
"strings"
"unicode/utf8"
-
"github.com/chand1012/git2gpt/utils"
"github.com/gobwas/glob"
"github.com/pkoukk/tiktoken-go"
)
-// GitFile is a file in a Git repository
// GitFile is a single file collected from a repository, serializable to JSON and XML.
type GitFile struct {
Path string `json:"path" xml:"path"` // path to the file relative to the repository root
Tokens int64 `json:"tokens" xml:"tokens"` // number of tokens in the file
Contents string `json:"contents" xml:"contents"` // contents of the file
}
-// GitRepo is a Git repository
// GitRepo is the collected set of files together with aggregate counts, serializable to JSON and XML.
type GitRepo struct {
TotalTokens int64 `json:"total_tokens" xml:"total_tokens"` // token estimate of the rendered output; assigned by OutputGitRepo and OutputGitRepoXML
Files []GitFile `json:"files" xml:"files>file"` // the collected files
FileCount int `json:"file_count" xml:"file_count"` // kept equal to len(Files) by the code that fills Files
}
-// contains checks if a string is in a slice of strings
func contains(s []string, e string) bool {
for _, a := range s {
if a == e {
@@ -47,30 +43,50 @@ func getIgnoreList(ignoreFilePath string) ([]string, error) {
return ignoreList, err
}
defer file.Close()
-
scanner := bufio.NewScanner(file)
for scanner.Scan() {
line := strings.TrimSpace(scanner.Text())
if line == "" || strings.HasPrefix(line, "#") {
continue
}
- // if the line ends with a slash, add a globstar to the end
if strings.HasSuffix(line, "/") {
line = line + "**"
}
- // remove all preceding slashes
line = strings.TrimPrefix(line, "/")
- // line = filepath.FromSlash(line)
ignoreList = append(ignoreList, line)
}
return ignoreList, scanner.Err()
}
+// Similar to getIgnoreList, but for .gptinclude files
+func getIncludeList(includeFilePath string) ([]string, error) {
+ var includeList []string
+ file, err := os.Open(includeFilePath)
+ if err != nil {
+ return includeList, err
+ }
+ defer file.Close()
+ scanner := bufio.NewScanner(file)
+ for scanner.Scan() {
+ line := strings.TrimSpace(scanner.Text())
+ if line == "" || strings.HasPrefix(line, "#") {
+ continue
+ }
+ if strings.HasSuffix(line, "/") {
+ line = line + "**"
+ }
+ line = strings.TrimPrefix(line, "/")
+ includeList = append(includeList, line)
+ }
+ return includeList, scanner.Err()
+}
+
// windowsToUnixPath returns windowsPath with every backslash replaced by a
// forward slash; all other bytes are left untouched.
func windowsToUnixPath(windowsPath string) string {
	segments := strings.Split(windowsPath, "\\")
	return strings.Join(segments, "/")
}
+// This function is kept for backward compatibility
func shouldIgnore(filePath string, ignoreList []string) bool {
for _, pattern := range ignoreList {
g := glob.MustCompile(pattern, '/')
@@ -81,34 +97,55 @@ func shouldIgnore(filePath string, ignoreList []string) bool {
return false
}
-// GenerateIgnoreList generates a list of ignore patterns from the .gptignore file and the .gitignore file. Returns a slice of strings. Will return an empty slice if no ignore files exist.
+// shouldProcess reports whether filePath should appear in the output.
+// A non-empty includeList requires filePath to match at least one of its patterns;
+// any match against ignoreList then rejects the path, even if it was included.
+func shouldProcess(filePath string, includeList, ignoreList []string) bool {
+ // An empty includeList places no restriction on which files qualify
+ if len(includeList) > 0 {
+ included := false
+ for _, pattern := range includeList {
+ g := glob.MustCompile(pattern, '/')
+ if g.Match(windowsToUnixPath(filePath)) {
+ included = true
+ break
+ }
+ }
+ if !included {
+ return false // If not in the include list, skip it
+ }
+ }
+
+ // Ignore patterns are checked last, so they take precedence over includes
+ for _, pattern := range ignoreList {
+ g := glob.MustCompile(pattern, '/')
+ if g.Match(windowsToUnixPath(filePath)) {
+ return false // If in the ignore list, skip it
+ }
+ }
+
+ return true // Process this file
+}
+
func GenerateIgnoreList(repoPath, ignoreFilePath string, useGitignore bool) []string {
if ignoreFilePath == "" {
ignoreFilePath = filepath.Join(repoPath, ".gptignore")
}
-
var ignoreList []string
if _, err := os.Stat(ignoreFilePath); err == nil {
- // .gptignore file exists
ignoreList, _ = getIgnoreList(ignoreFilePath)
}
- ignoreList = append(ignoreList, ".git/**", ".gitignore", ".gptignore")
-
+ ignoreList = append(ignoreList, ".git/**", ".gitignore", ".gptignore", ".gptinclude")
if useGitignore {
gitignorePath := filepath.Join(repoPath, ".gitignore")
if _, err := os.Stat(gitignorePath); err == nil {
- // .gitignore file exists
gitignoreList, _ := getIgnoreList(gitignorePath)
ignoreList = append(ignoreList, gitignoreList...)
}
}
-
var finalIgnoreList []string
- // loop through the ignore list and remove any duplicates
- // also check if any pattern is a directory and add a globstar to the end
for _, pattern := range ignoreList {
if !contains(finalIgnoreList, pattern) {
- // check if the pattern is a directory
info, err := os.Stat(filepath.Join(repoPath, pattern))
if err == nil && info.IsDir() {
pattern = filepath.Join(pattern, "**")
@@ -116,27 +153,44 @@ func GenerateIgnoreList(repoPath, ignoreFilePath string, useGitignore bool) []st
finalIgnoreList = append(finalIgnoreList, pattern)
}
}
-
return finalIgnoreList
}
-// ProcessGitRepo processes a Git repository and returns a GitRepo object
-func ProcessGitRepo(repoPath string, ignoreList []string) (*GitRepo, error) {
+// GenerateIncludeList returns the de-duplicated patterns read from includeFilePath (default: repoPath/.gptinclude); a pattern naming an existing directory under repoPath gets "**" joined onto it, and a missing file or read error yields whatever was collected, possibly nothing.
+func GenerateIncludeList(repoPath, includeFilePath string) []string {
+ if includeFilePath == "" {
+ includeFilePath = filepath.Join(repoPath, ".gptinclude")
+ }
+ var includeList []string
+ if _, err := os.Stat(includeFilePath); err == nil {
+ includeList, _ = getIncludeList(includeFilePath)
+ }
+
+ var finalIncludeList []string
+ for _, pattern := range includeList {
+ if !contains(finalIncludeList, pattern) {
+ info, err := os.Stat(filepath.Join(repoPath, pattern))
+ if err == nil && info.IsDir() {
+ pattern = filepath.Join(pattern, "**")
+ }
+ finalIncludeList = append(finalIncludeList, pattern)
+ }
+ }
+ return finalIncludeList
+}
+// ProcessGitRepo builds a GitRepo for repoPath by delegating to processRepository with the given include and ignore lists; any error is wrapped and returned with a nil repo.
+func ProcessGitRepo(repoPath string, includeList, ignoreList []string) (*GitRepo, error) {
var repo GitRepo
-
- err := processRepository(repoPath, ignoreList, &repo)
+ err := processRepository(repoPath, includeList, ignoreList, &repo)
if err != nil {
return nil, fmt.Errorf("error processing repository: %w", err)
}
-
return &repo, nil
}
-// OutputGitRepo outputs a Git repository to a text file
func OutputGitRepo(repo *GitRepo, preambleFile string, scrubComments bool) (string, error) {
var repoBuilder strings.Builder
-
if preambleFile != "" {
preambleText, err := os.ReadFile(preambleFile)
if err != nil {
@@ -146,8 +200,6 @@ func OutputGitRepo(repo *GitRepo, preambleFile string, scrubComments bool) (stri
} else {
repoBuilder.WriteString("The following text is a Git repository with code. The structure of the text are sections that begin with ----, followed by a single line containing the file path and file name, followed by a variable amount of lines containing the file contents. The text representing the Git repository ends when the symbols --END-- are encounted. Any further text beyond --END-- are meant to be interpreted as instructions using the aforementioned Git repository as context.\n")
}
-
- // write the files to the repoBuilder here
for _, file := range repo.Files {
repoBuilder.WriteString("----\n")
repoBuilder.WriteString(fmt.Sprintf("%s\n", file.Path))
@@ -156,74 +208,89 @@ func OutputGitRepo(repo *GitRepo, preambleFile string, scrubComments bool) (stri
}
repoBuilder.WriteString(fmt.Sprintf("%s\n", file.Contents))
}
-
repoBuilder.WriteString("--END--")
-
output := repoBuilder.String()
-
repo.TotalTokens = EstimateTokens(output)
-
return output, nil
}
+
// OutputGitRepoXML assembles a string for repo with a strings.Builder and returns it. When scrubComments is set, each file's Contents is first overwritten with utils.RemoveCodeComments. The token estimate is taken while the PLACEHOLDER text is still present, stored in repo.TotalTokens, and then substituted for the first PLACEHOLDER.
func OutputGitRepoXML(repo *GitRepo, scrubComments bool) (string, error) {
- // Prepare XML content
- if scrubComments {
- for i, file := range repo.Files {
- repo.Files[i].Contents = utils.RemoveCodeComments(file.Contents)
- }
- }
-
- // Add XML header
- var result strings.Builder
- result.WriteString("\n")
-
- // Use custom marshaling with proper CDATA for code contents
- result.WriteString("\n")
-
- // Skip the tokens for now
- result.WriteString(" PLACEHOLDER\n")
- result.WriteString(fmt.Sprintf(" %d\n", repo.FileCount))
- result.WriteString(" \n")
-
- for _, file := range repo.Files {
- result.WriteString(" \n")
- result.WriteString(fmt.Sprintf(" %s\n", escapeXML(file.Path)))
- result.WriteString(fmt.Sprintf(" %d\n", file.Tokens))
- result.WriteString(" \n")
- result.WriteString(" \n")
- }
-
- result.WriteString(" \n")
- result.WriteString("")
-
- // Get the output string
- outputStr := result.String()
-
- // Calculate tokens
- tokenCount := EstimateTokens(outputStr)
- repo.TotalTokens = tokenCount
-
- // Replace the placeholder with the actual token count
- outputStr = strings.Replace(outputStr, "PLACEHOLDER",
- fmt.Sprintf("%d", tokenCount), 1)
-
- return outputStr, nil
+ if scrubComments {
+ for i, file := range repo.Files {
+ repo.Files[i].Contents = utils.RemoveCodeComments(file.Contents)
+ }
+ }
+ var result strings.Builder
+ result.WriteString("\n")
+ result.WriteString("\n")
+
+ result.WriteString(" PLACEHOLDER\n")
+ result.WriteString(fmt.Sprintf(" %d\n", repo.FileCount))
+ result.WriteString(" \n")
+
+ for _, file := range repo.Files {
+ result.WriteString(" \n")
+ result.WriteString(fmt.Sprintf(" %s\n", escapeXML(file.Path)))
+ result.WriteString(fmt.Sprintf(" %d\n", file.Tokens))
+
+ // Intended to split contents at each CDATA terminator ("]]>") so it can be emitted as several CDATA sections. NOTE(review): the string literals written throughout this function appear to have lost their XML markup (tags and CDATA delimiters), so as shown no file contents are written — confirm against the original source.
+ contents := file.Contents
+ result.WriteString(" ")
+
+ for {
+ idx := strings.Index(contents, "]]>")
+ if idx == -1 {
+ // No more CDATA end markers, write remaining content in one CDATA section
+ result.WriteString("")
+ break
+ }
+
+ // Write content up to the CDATA end marker
+ result.WriteString("") // Close this CDATA section
+
+ // Start a new CDATA section with the ">" character
+ result.WriteString("")
+
+ // Move past the "]]>" in the original content
+ contents = contents[idx+3:]
+ }
+
+ result.WriteString("\n")
+ result.WriteString(" \n")
+ }
+
+ result.WriteString(" \n")
+ result.WriteString("\n")
+
+ outputStr := result.String()
+
+ tokenCount := EstimateTokens(outputStr)
+ repo.TotalTokens = tokenCount
+
+ outputStr = strings.Replace(
+ outputStr,
+ "PLACEHOLDER",
+ fmt.Sprintf("%d", tokenCount),
+ 1,
+ )
+
+ return outputStr, nil
}
-// escapeXML escapes XML special characters in a string
// escapeXML returns s with the five XML special characters replaced by their
// predefined entity references (&amp;, &lt;, &gt;, &quot;, &apos;), making the
// result safe to place in element content or attribute values.
func escapeXML(s string) string {
	// "&" must be handled first; otherwise the ampersands introduced by the
	// later replacements would themselves be escaped a second time.
	s = strings.ReplaceAll(s, "&", "&amp;")
	s = strings.ReplaceAll(s, "<", "&lt;")
	s = strings.ReplaceAll(s, ">", "&gt;")
	s = strings.ReplaceAll(s, "\"", "&quot;")
	s = strings.ReplaceAll(s, "'", "&apos;")
	return s
}
-// ValidateXML checks if the given XML string is well-formed
func ValidateXML(xmlString string) error {
decoder := xml.NewDecoder(strings.NewReader(xmlString))
for {
@@ -238,10 +305,7 @@ func ValidateXML(xmlString string) error {
return nil
}
-
-
func MarshalRepo(repo *GitRepo, scrubComments bool) ([]byte, error) {
- // run the output function to get the total tokens
_, err := OutputGitRepo(repo, "", scrubComments)
if err != nil {
return nil, fmt.Errorf("error marshalling repo: %w", err)
@@ -249,18 +313,17 @@ func MarshalRepo(repo *GitRepo, scrubComments bool) ([]byte, error) {
return json.Marshal(repo)
}
-func processRepository(repoPath string, ignoreList []string, repo *GitRepo) error {
+// processRepository walks repoPath and, for each non-directory entry, calls shouldProcess on its path relative to repoPath to decide whether the file's contents are read.
+func processRepository(repoPath string, includeList, ignoreList []string, repo *GitRepo) error {
err := filepath.Walk(repoPath, func(path string, info os.FileInfo, err error) error {
if err != nil {
return err
}
if !info.IsDir() {
relativeFilePath, _ := filepath.Rel(repoPath, path)
- ignore := shouldIgnore(relativeFilePath, ignoreList)
- // fmt.Println(relativeFilePath, ignore)
- if !ignore {
+ process := shouldProcess(relativeFilePath, includeList, ignoreList)
+ if process {
contents, err := os.ReadFile(path)
- // if the file is not valid UTF-8, skip it
if !utf8.Valid(contents) {
return nil
}
@@ -276,24 +339,19 @@ func processRepository(repoPath string, ignoreList []string, repo *GitRepo) erro
}
return nil
})
-
repo.FileCount = len(repo.Files)
-
if err != nil {
return fmt.Errorf("error walking the path %q: %w", repoPath, err)
}
-
return nil
}
-// EstimateTokens estimates the number of tokens in a string
// EstimateTokens returns the number of tokens produced by encoding output with the "cl100k_base" encoding; if that encoding cannot be loaded, the error is printed and 0 is returned.
func EstimateTokens(output string) int64 {
tke, err := tiktoken.GetEncoding("cl100k_base")
if err != nil {
fmt.Println("Error getting encoding:", err)
return 0
}
-
tokens := tke.Encode(output, nil, nil)
return int64(len(tokens))
-}
+}
\ No newline at end of file