Script: Add script to massively clean duplicated documents - refs BT#…

…19219
chamilo · Nov 9, 2022 · 74cd0cd · 74cd0cd
1 parent 631ab02
commit 74cd0cd
Showing 1 changed file with 185 additions and 0 deletions.
diff --git a/tests/scripts/delete_duplicate_documents.php b/tests/scripts/delete_duplicate_documents.php
@@ -0,0 +1,185 @@
+<?php
+/* For licensing terms, see /license.txt */
+/**
+ * This script removes duplicated documents created
+ * through something gone wrong in the course backup/copy process.
+ * It identifies duplicate documents by title, path and size, and
+ * makes sure no usage is associated with the duplicated document, and
+ * that the duplicated document is not used in a learning path.
+ * A duplicated document will generally match the following criteria:
+ * - same size as the original
+ * - same path and title with the addition of a _%d (int) suffix to the basename (before the file extension) (unlikely to ever be > 9, so one digit is enough)
+ * - same course, same session (otherwise considered a different file, a voluntary copy)
+ * - each has entries in c_item_property because each was created legitimately
+ * Possible duplicates can be found with a query like:
+ * SELECT id, size, title, path FROM c_document WHERE c_id = 470 AND path like '%\__.%' ORDER BY path, title;
+ * This script should be located inside the tests/scripts/ folder to work.
+ * It can be run more than one time as it will only ever affect duplicate
+ * documents.
+ * If you have a very large number of documents, we recommend you temporarily
+ * comment out the api_item_property_update() calls in
+ * DocumentManager::deleteDocumentFromDb.
+ * Chances are there is not even a registry of those documents there in the
+ * first place (they were probably duplicated through a short process) and
+ * this is where most of the time is spent during deletion.
+ * @author Yannick Warnier <yannick.warnier@beeznest.com>
+ */
+exit; //remove this line to execute from the command line
+use ChamiloSession as Session;
+
+ini_set('memory_limit', '256M');
+
+if (PHP_SAPI !== 'cli') {
+    die('This script can only be executed from the command line');
+}
+
+require_once __DIR__.'/../../main/inc/global.inc.php';
+
+// Debug shows more output and only does a fake run
+$debug = false;
+// Store user ID 1 as the current user in the session
+// NOTE(review): presumably the deletion API reads the acting user from the session - confirm
+$_user['user_id'] = 1;
+Session::write('_user', $_user);
+
+echo "[".time()."] Querying courses\n";
+$sql = "SELECT id, code FROM course order by id";
+
+$resCourse = Database::query($sql);
+if ($resCourse === false) {
+    // Abort with a non-zero status so a calling shell can detect the failure
+    echo 'Could not find any course'.PHP_EOL;
+    exit(1);
+}
+$countCourses = Database::num_rows($resCourse);
+echo "[".time()."] Found $countCourses courses".PHP_EOL;
+
+// Check all c_document.id = c_document.iid, otherwise cancel
+$sql = "SELECT iid FROM c_document WHERE id != iid";
+$res = Database::query($sql);
+if (Database::num_rows($res) > 0) {
+    echo "We have detected that some c_document.id do not match c_document.iid.".PHP_EOL;
+    echo "This can lead to serious inconsistencies in the execution of this script.".PHP_EOL;
+    echo "Please fix this issue first, then try this script again.".PHP_EOL;
+    // Non-zero status: the script refused to run
+    exit(1);
+}
+
+// Counters reported in the summary at the end of the script
+$duplicateDocsCount = 0;
+$originalDocsCount = 0;
+$deletedDocsCount = 0;
+$docsInLP = 0;
+$deletedDocsSize = 0;
+
+// Search for duplicate documents: inside each course, look for files whose
+// path only differs from a previously seen path by a _%d suffix before the
+// file extension
+echo "[".time()."] Iterating on courses: ";
+// The base system path of the courses directory is the same for every course
+$sysCoursePath = api_get_path(SYS_COURSE_PATH);
+while ($course = Database::fetch_assoc($resCourse)) {
+    $course['real_id'] = $course['id'];
+    // IDs are cast to int before being concatenated into SQL queries
+    $courseId = (int) $course['id'];
+    if ($debug) {
+        echo PHP_EOL."Course ".$course['id'].'..'.PHP_EOL;
+    }
+    $_course = api_get_course_info_by_id($course['id']);
+    $courseDir = $_course['directory'].'/document';
+    $baseWorkDir = $sysCoursePath.$courseDir;
+    // We consider duplicates in sessions to be highly improbable, as course
+    // copies that could have been broken are essentially made on base courses.
+    $sql2 = "SELECT iid, title, path, size FROM c_document
+            WHERE c_id = ".$courseId."
+            AND (session_id = 0 OR session_id IS NULL)
+            AND filetype = 'file'
+            ORDER BY path desc, title, iid";
+    $res2 = Database::query($sql2);
+    if ($res2 === false) {
+        die("Error querying docs in course code ".$course['code'].": ".Database::error($res2)."\n");
+    }
+
+    // Extract the root filename, which is not always the one without _%d at the end.
+    // Sometimes, the original has been deleted but there are still replicates.
+    $lastOriginalDocPath = '';
+    $lastOriginalDocId = 0;
+    $lastOriginalDocSize = 0;
+    if (Database::num_rows($res2) > 0) {
+        while ($doc = Database::fetch_assoc($res2)) {
+            if ($debug) {
+                echo $doc['path'].PHP_EOL;
+            }
+            $matches = [];
+            $guessedOriginal = '';
+            // A copy has a single-digit _%d suffix right before its extension
+            $notOriginal = preg_match('/(.*)_\d(\.[a-zA-Z0-9]{1,4})$/', $doc['path'], $matches);
+
+            if ($notOriginal) {
+                if ($debug) {
+                    echo "This looks like a copy".PHP_EOL;
+                }
+                // Rebuild the path the original would have: prefix + extension
+                $guessedOriginal = $matches[1].$matches[2];
+                if ($debug) {
+                    echo "The original would be ".$guessedOriginal.PHP_EOL;
+                }
+            } else {
+                if ($debug) {
+                    echo "This looks like an original. Recording and moving on...".PHP_EOL;
+                }
+                $lastOriginalDocPath = $doc['path'];
+                $lastOriginalDocId = $doc['iid'];
+                $lastOriginalDocSize = $doc['size'];
+                $originalDocsCount++;
+                // Move directly to the next item
+                continue;
+            }
+
+            if ($lastOriginalDocPath != $guessedOriginal) {
+                if ($debug) {
+                    echo "The guessed original filename is different from the original, or the original could not be found. Skipping...".PHP_EOL;
+                }
+                // The path is different -> moving on to another doc, but
+                // recording new doc's details just in case
+                $lastOriginalDocPath = $doc['path'];
+                $lastOriginalDocId = $doc['iid'];
+                $lastOriginalDocSize = $doc['size'];
+                $originalDocsCount++;
+            } else {
+                // A likely duplicate...
+                // Only bother if the doc's internal ID is higher than the
+                // last original doc ID, which means this (duplicate) document
+                // has been created *after* the original.
+                if ($lastOriginalDocId < $doc['iid'] && $lastOriginalDocSize == $doc['size']) {
+                    if ($debug) {
+                        echo "This doc has been created after the original and has the same size. Good.".PHP_EOL;
+                    }
+                    // This duplicate document could have been seen or downloaded already,
+                    // but this is not considered critical when deciding whether to clean
+                    // it or not.
+                    // It is, however, essential to make sure this duplicate document is
+                    // not used from inside a learning path.
+                    $sql4 = "SELECT lp_id FROM c_lp_item
+                    WHERE c_id = ".$courseId."
+                    AND item_type = 'document' AND ref = ".(int) $doc['iid'];
+                    $res4 = Database::query($sql4);
+                    if (0 === Database::num_rows($res4)) {
+                        if ($debug) {
+                            // Fake run: only report, nothing is removed
+                            echo "The file is not used in any LP".PHP_EOL;
+                            echo $doc['iid'].' deleted.'.PHP_EOL;
+                        } else {
+                            DocumentManager::delete_document($_course, $doc['path'], $baseWorkDir, null, $doc['iid']);
+                            DocumentManager::purgeDocument($doc['iid'], $_course);
+                        }
+                        $deletedDocsCount++;
+                        $deletedDocsSize += $doc['size'];
+                    } else {
+                        // Count duplicates kept because a learning path uses them,
+                        // so the final summary reports a real number
+                        $docsInLP++;
+                        if ($debug) {
+                            echo "This document is used from a learning path. Deletion cancelled.".PHP_EOL;
+                        }
+                    }
+                }
+                $duplicateDocsCount++;
+            }
+        } // end while on c_document
+    }
+} // end while on course
+
+// Divide the accumulated size by 1024*1024 to report it in MB. The division
+// is done first and the result rounded: a leading (int) cast would bind to
+// $deletedDocsSize alone and leave an unrounded float.
+$sizeInMB = round($deletedDocsSize / (1024 * 1024), 2);
+echo "[".time()."] Found $originalDocsCount original docs and $duplicateDocsCount duplicate docs...".PHP_EOL;
+echo "Of these duplicates, $docsInLP were included in learning paths.".PHP_EOL;
+echo "Deleted $deletedDocsCount ($duplicateDocsCount - $docsInLP) docs for a total of $sizeInMB MB.".PHP_EOL;