Skip to content

Commit

Permalink
Script: Add script to massively clean duplicated documents - refs BT#…
Browse files Browse the repository at this point in the history
…19219
  • Loading branch information
ywarnier committed Nov 9, 2022
1 parent 631ab02 commit 74cd0cd
Showing 1 changed file with 185 additions and 0 deletions.
185 changes: 185 additions & 0 deletions tests/scripts/delete_duplicate_documents.php
@@ -0,0 +1,185 @@
<?php
/* For licensing terms, see /license.txt */
/**
* This script removes duplicated tests and questions created
* through something gone wrong in the course backup/copy process.
* It identifies duplicate documents by title, path and size, and
* makes sure no usage is associated with the duplicated document, and
* that the duplicated document is not used in a learning path.
* A duplicated document will generally match the following criteria:
* - same size as the original
* - same path and title with the addition of a _%d (int) suffix to the basename (before the file extension) (unlikely to ever be > 9, so one digit is enough)
* - same course, same session (otherwise considered a different file, a voluntary copy)
* - each have entries in c_item_property because it was created legitimately
* Possible duplicates can be found with a query like:
* SELECT id, size, title, path FROM c_document WHERE c_id = 470 AND path like '%\__.%' ORDER BY path, title;
* This script should be located inside the tests/scripts/ folder to work.
* It can be run more than one time as it will only ever affect duplicate
* documents.
* If you have a very large number of documents, we recommend you temporarily
* comment out the api_item_property_update() calls in
* DocumentManager::deleteDocumentFromDb.
* Chances are there is not even a registry of those documents there in the
* first place (they were probably duplicated through a short process) and
* this is where most of the time is spent during deletion.
* @author Yannick Warnier <yannick.warnier@beeznest.com>
*/
exit; //remove this line to execute from the command line
use ChamiloSession as Session;

ini_set('memory_limit', '256M');

if (PHP_SAPI !== 'cli') {
die('This script can only be executed from the command line');
}

require_once __DIR__.'/../../main/inc/global.inc.php';

$tests = [];

// Debug shows more output and only does a fake run
$debug = false;
$_user['user_id'] = 1;
Session::write('_user', $_user);

echo "[".time()."] Querying courses\n";
$sql = "SELECT id, code FROM course order by id";

$resCourse = Database::query($sql);
if ($resCourse === false) {
exit('Could not find any course'.PHP_EOL);
}
$countCourses = Database::num_rows($resCourse);
echo "[".time()."] Found $countCourses courses".PHP_EOL;

// Check all c_document.id = c_document.iid, otherwise cancel
$sql = "SELECT iid FROM c_document WHERE id != iid";
$res = Database::query($sql);
if (Database::num_rows($res) > 0) {
echo "We have detected that some c_document.id do not match c_document.iid.".PHP_EOL;
echo "This can lead to serious inconsistencies in the execution of this script.".PHP_EOL;
echo "Please fix this issue first, then try this script again.".PHP_EOL;
exit;
}

$duplicateDocsCount = 0;
$originalDocsCount = 0;
$deletedDocsCount = 0;
$docsWithTracking = 0;
$docsInLP = 0;
$deletedDocsSize = 0;

// Search for duplicate tests, by looking for tests that have the exact same
// title in the same course
echo "[".time()."] Iterating on courses: ";
while ($course = Database::fetch_assoc($resCourse)) {
$course['real_id'] = $course['id'];
if ($debug) {
echo PHP_EOL."Course ".$course['id'].'..'.PHP_EOL;
}
$_course = api_get_course_info_by_id($course['id']);
$courseDir = $_course['directory'].'/document';
$sysCoursePath = api_get_path(SYS_COURSE_PATH);
$baseWorkDir = $sysCoursePath.$courseDir;
// We consider duplicates in sessions to be highly improbable, as course
// copies that could have been broken are essentially made on base courses.
$sql2 = "SELECT iid, title, path, size FROM c_document
WHERE c_id = ".$course['id']."
AND (session_id = 0 OR session_id IS NULL)
AND filetype = 'file'
ORDER BY path desc, title, iid";
$res2 = Database::query($sql2);
if ($res2 === false) {
die("Error querying docs in course code ".$course['code'].": ".Database::error($res2)."\n");
}

// Extract the root filename, which is not always the one without _%d at the end.
// Sometimes, the original has been deleted but there are still replicates.
$lastOriginalDocPath = '';
$lastOriginalDocId = 0;
$lastOriginalDocSize = 0;
if (Database::num_rows($res2) > 0) {
while ($doc = Database::fetch_assoc($res2)) {
if ($debug) {
echo $doc['path'].PHP_EOL;
}
$matches = [];
$guessedOriginal = '';
$notOriginal = preg_match('/(.*)_\d(\.[a-zA-Z0-9]{1,4})$/', $doc['path'], $matches);

if ($notOriginal) {
if ($debug) {
echo "This looks like a copy".PHP_EOL;
}
$guessedOriginal = $matches[1].$matches[2];
if ($debug) {
echo "The original would be ".$guessedOriginal.PHP_EOL;
}
} else {
if ($debug) {
echo "This looks like an original. Recording and moving on...".PHP_EOL;
}
$lastOriginalDocPath = $doc['path'];
$lastOriginalDocId = $doc['iid'];
$lastOriginalDocSize = $doc['size'];
$originalDocsCount++;
// Move directly to the next item
continue;
}

if ($lastOriginalDocPath != $guessedOriginal) {
if ($debug) {
echo "The guessed original filename is different from the original, or the original could not be found. Skipping...".PHP_EOL;
}
// The title is different -> moving on to another doc, but
// recording new doc's details just in case
$lastOriginalDocPath = $doc['path'];
$lastOriginalDocId = $doc['iid'];
$lastOriginalDocSize = $doc['size'];
$originalDocsCount++;
} else {
// A likely duplicate...
// Only bother if the doc's internal ID is higher than the
// last original doc ID, which means this (duplicate) test
// has been created *after* the original.
if ($lastOriginalDocId < $doc['iid'] && $lastOriginalDocSize == $doc['size']) {
if ($debug) {
echo "This doc has been created after the original and has the same size. Good.".PHP_EOL;
}
// This duplicate document could have been seen or downloaded already,
// but this is not considered critical when deciding whether to clean
// it or not.
// It is, however, essential to make sure this duplicate document is
// not used from inside a learning path.
$sql4 = "SELECT lp_id FROM c_lp_item
WHERE c_id = ".$course['id']."
AND item_type = 'document' AND ref = ".$doc['iid'];
$res4 = Database::query($sql4);
if (0 === Database::num_rows($res4)) {
if ($debug) {
echo "The file is not used in any LP".PHP_EOL;
} else {
DocumentManager::delete_document($_course, $doc['path'], $baseWorkDir, null, $doc['iid']);
DocumentManager::purgeDocument($doc['iid'], $_course);
}
if ($debug) {
echo $doc['iid'].' deleted.'.PHP_EOL;
}
$deletedDocsCount++;
$deletedDocsSize += $doc['size'];
} else {
if ($debug) {
echo "This document is used from a learning path. Deletion cancelled.".PHP_EOL;
}
}
}
$duplicateDocsCount++;
}
} // end while on c_document
}
} // end while on course

$sizeInMB = (int) $deletedDocsSize / (1024*1024);
echo "[".time()."] Found $originalDocsCount original docs and $duplicateDocsCount duplicate docs...".PHP_EOL;
echo "Of these duplicates, $docsInLP were included in learning paths.".PHP_EOL;
echo "Deleted $deletedDocsCount ($duplicateDocsCount - $docsInLP) docs for a total of $sizeInMB MB.".PHP_EOL;

0 comments on commit 74cd0cd

Please sign in to comment.