Skip to content
This repository was archived by the owner on Oct 27, 2024. It is now read-only.
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 38 additions & 0 deletions check_duplicates.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import hashlib
import os

def hash_file(file_path):
"""Returns the SHA-1 hash of the file."""
hash_sha1 = hashlib.sha1()
with open(file_path, "rb") as f:
for chunk in iter(lambda: f.read(4096), b""):
hash_sha1.update(chunk)
return hash_sha1.hexdigest()

def check_duplicates(base_dir=".", extensions=(".cpp", ".java", ".py")):
"""Checks for duplicate files by content hash in the given directory."""
hashes = {}
duplicates = []

for root, _, files in os.walk(base_dir):
for file in files:
if file.endswith(extensions):
file_path = os.path.join(root, file)
file_hash = hash_file(file_path)

if file_hash in hashes:
duplicates.append((file_path, hashes[file_hash]))
else:
hashes[file_hash] = file_path

return duplicates

if __name__ == "__main__":
duplicates = check_duplicates()
if duplicates:
print("Duplicate files found:")
for dup in duplicates:
print(f"{dup[0]} is a duplicate of {dup[1]}")
exit(1) # Exit with error to flag as duplicate in GitHub Actions
else:
print("No duplicates found.")