Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Script: Add script to detect duplicate files by MD5 sum - refs BT#19673
- Loading branch information
Showing
1 changed file
with
215 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,215 @@ | ||
<?php | ||
/* For licensing terms, see /license.txt */ | ||
/** | ||
* This script finds duplicated documents to save disk space. | ||
* | ||
* It identifies duplicate documents by building an array of MD5 hashes | ||
* from the app/courses/[CODE]/document/ folders on your system | ||
* and checking for similar hashes everywhere in the other documents folders. | ||
* | ||
* This script should be located inside the tests/scripts/ folder to work | ||
* on the command line. To run it in a browser, move it to main/inc/ and | ||
* load it from there. | ||
* It can be run more than one time as it will only ever affect duplicate | ||
* documents. | ||
* | ||
* @author Yannick Warnier <yannick.warnier@beeznest.com> | ||
*/ | ||
//exit; //remove this line to execute from the command line | ||
|
||
use ChamiloSession as Session; | ||
|
||
ini_set('memory_limit', '512M'); | ||
|
||
require_once __DIR__.'/../../main/inc/global.inc.php'; | ||
|
||
$html = false; | ||
$htmlEOL = ''; | ||
if (PHP_SAPI !== 'cli') { | ||
$html = true; | ||
$htmlEOL = '<br />'; | ||
api_protect_admin_script(); | ||
//die('This script can only be executed from the command line'); | ||
} | ||
|
||
// Debug shows more output | ||
$debug = false; | ||
// Ignore template files | ||
$ignoreTemplates = true; | ||
$webCode = api_get_path(WEB_CODE_PATH).'document/document.php?cidReq='; | ||
|
||
if ($html) { | ||
echo "<html><body>".$htmlEOL; | ||
} | ||
echo "[".time()."] Querying courses$htmlEOL\n"; | ||
$sql = "SELECT id, code, directory FROM course order by id"; | ||
|
||
$resCourse = Database::query($sql); | ||
if ($resCourse === false) { | ||
exit('Could not find any course'.PHP_EOL); | ||
} | ||
$countCourses = Database::num_rows($resCourse); | ||
echo "[".time()."] Found $countCourses courses$htmlEOL".PHP_EOL; | ||
|
||
$md5Hashes = []; | ||
$md5Sizes = []; | ||
$totalDocs = 0; | ||
$totalDocsSize = 0; | ||
$uniqueDocs = 0; | ||
$duplicateDocsCount = 0; | ||
$totalDuplicateDocs = 0; | ||
$courseCodeDirMatch = []; | ||
$sysCoursePath = api_get_path(SYS_COURSE_PATH); | ||
|
||
// Search for duplicate tests, by looking for tests that have the exact same | ||
// title in the same course | ||
if ($debug) { | ||
echo "[".time()."] Iterating on courses...$htmlEOL".PHP_EOL; | ||
} | ||
while ($course = Database::fetch_assoc($resCourse)) { | ||
if ($debug) { | ||
echo "Course ".$course['id'].' ('.$course['code'].')..'.$htmlEOL.PHP_EOL; | ||
} | ||
$courseCodeDirMatch[$course['code']] = $course['directory']; | ||
$courseDir = $course['directory'].'/document'; | ||
$baseWorkDir = $sysCoursePath.$courseDir; | ||
$totalDocs += _scanSubDirs($baseWorkDir, $md5Hashes, $md5Sizes, $course['code'], $ignoreTemplates); | ||
} // end while on course | ||
|
||
// Sort array by sizes | ||
arsort($md5Sizes, SORT_NUMERIC); | ||
|
||
echo "[".time()."] Here is a list of duplicate files, joined together$htmlEOL".PHP_EOL; | ||
echo " with a clickable link to each document:$htmlEOL".PHP_EOL; | ||
foreach ($md5Sizes as $hash => $size) { | ||
$files = $md5Hashes[$hash]; | ||
$countFiles = $realFilesForThisHash = count($files); | ||
$realFilePath = ''; | ||
$i = 0; | ||
if ($countFiles > 1) { | ||
$kSize = floor((int) $files[0]['size']/1024); | ||
echo ($html ? '<b>' : '').$hash.", size: ".$kSize."KB ($countFiles copies): ".($html ? '</b>' : 0).$htmlEOL.PHP_EOL; | ||
if ($html) { | ||
echo "<ul>"; | ||
} | ||
} else { | ||
continue; | ||
} | ||
foreach ($files as $file) { | ||
if ($file['link']) { | ||
// This is a link. Discount from real files | ||
// and do not count as potential space savings | ||
$realFilesForThisHash--; | ||
$totalLinks++; | ||
} else { | ||
if ($i != 0) { | ||
// We don't count the first file as a duplicate | ||
// nor as potential space savings | ||
$totalDuplicateDocs++; | ||
$totalDocsSize += $file['size']; | ||
} | ||
if ($html) { | ||
echo "<li><a target='_blank' href='".$webCode.$file['course']."'>".$file['subpath']."(".$file['course'].")</a></li>".PHP_EOL; | ||
} else { | ||
echo $webCode.$file['course'].", as ".$file['path'].$htmlEOL.PHP_EOL; | ||
} | ||
$i++; | ||
} | ||
} | ||
if ($html) { | ||
echo "</ul>"; | ||
} | ||
if ($realFilesForThisHash > 0) { | ||
// if at least one of those was really a file, count it as unique | ||
if ($debug) { | ||
echo "$hash (".$file['path'].") has $realFilesForThisHash duplicates".PHP_EOL; | ||
} | ||
$uniqueDocs++; | ||
} | ||
} | ||
$duplicateDocsCount = $totalDocs - $uniqueDocs; | ||
|
||
$sizeInMB = floor((int) $totalDocsSize / (1024*1024)); | ||
echo "[".time()."] Found $totalDocs docs in total (including links).".PHP_EOL; | ||
echo " Only $uniqueDocs were original files with duplicates.".PHP_EOL; | ||
echo " $totalDuplicateDocs files remain, as duplicates versions of some of those $uniqueDocs.".PHP_EOL; | ||
echo " Potential savings: ~$sizeInMB MB.".PHP_EOL; | ||
|
||
echo "</body></html>"; | ||
|
||
/** | ||
* Scans the given directory and its subdirectories and returns a number of | ||
* files found | ||
* @param string $path | ||
* @param array &$md5Hashes | ||
* @param string $courseCode | ||
* @param bool $ignoreTemplates | ||
* @return int | ||
*/ | ||
function _scanSubDirs(string $path, array &$md5Hashes, array &$md5Sizes, string $courseCode, bool $ignoreTemplates = false): int | ||
{ | ||
global $debug; | ||
$count = 0; | ||
if (!is_dir($path)) { | ||
return 0; | ||
} | ||
if (substr(basename($path), 0, 1) === '.') { | ||
// If last path component starts with a '.', ignore | ||
return 0; | ||
} | ||
if ($ignoreTemplates) { | ||
if (preg_match('#/document/(audio|flash|images|video)#', $path)) { | ||
if ($debug) { | ||
echo $path." is part of the template, skipping".PHP_EOL; | ||
} | ||
return 0; | ||
} | ||
} | ||
$list = scandir($path); | ||
foreach ($list as $entry) { | ||
if (substr($entry, 0, 1) === '.') { | ||
// If the entry starts with a '.', ignore | ||
continue; | ||
} | ||
$subPath = $path.'/'.$entry; | ||
if ($ignoreTemplates && preg_match('#/document/index.html$#', $subPath)) { | ||
continue; | ||
} | ||
if (is_dir($subPath)) { | ||
$count += _scanSubDirs($subPath, $md5Hashes, $md5Sizes, $courseCode, $ignoreTemplates); | ||
} else { | ||
$fileMd5 = md5_file($subPath); | ||
if ($fileMd5 === false) { | ||
if ($debug) { | ||
echo 'Skipping: There was an error calculating MD5 of '.$subPath.PHP_EOL; | ||
} | ||
continue; | ||
} | ||
$fileSize = filesize($subPath); | ||
if ($fileSize > 0) { | ||
if (!isset($md5Hashes[$fileMd5])) { | ||
$md5Hashes[$fileMd5] = []; | ||
} | ||
$matches = []; | ||
preg_match('#/document/(.*)#', $subPath, $matches); | ||
$isLink = is_link($subPath); | ||
$md5Hashes[$fileMd5][] = [ | ||
'course' => $courseCode, | ||
'link' => $isLink, | ||
'path' => $subPath, | ||
'size' => $fileSize, | ||
'subpath' => $matches[1], | ||
]; | ||
$md5Sizes[$fileMd5] = $fileSize; | ||
} else { | ||
if ($debug) { | ||
echo 'Skipping: Filesize 0 for '.$subPath.PHP_EOL; | ||
} | ||
continue; | ||
} | ||
$count++; | ||
} | ||
} | ||
|
||
return $count; | ||
} |