Skip to content

Commit

Permalink
feat(scanner): ignore files from scanning using mimetype
Browse files Browse the repository at this point in the history
Signed-off-by: Anupam Ghosh <anupam.ghosh@siemens.com>
  • Loading branch information
ag4ums committed Apr 3, 2020
1 parent 8a17b95 commit 112e616
Show file tree
Hide file tree
Showing 36 changed files with 519 additions and 111 deletions.
4 changes: 2 additions & 2 deletions src/copyright/agent/copyright.cc
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ int main(int argc, char** argv)
}

bool json = cliOptions.doJsonOutput();
bool ignoreFilesWithMimeType = cliOptions.doignoreFilesWithMimeType();
CopyrightState state = getState(std::move(cliOptions));

if (!fileNames.empty())
Expand Down Expand Up @@ -144,7 +145,7 @@ int main(int argc, char** argv)
if (arsId <= 0)
return_sched(5);

if (!processUploadId(state, agentId, uploadId, copyrightDatabaseHandler))
if (!processUploadId(state, agentId, uploadId, copyrightDatabaseHandler, ignoreFilesWithMimeType))
return_sched(2);

fo_scheduler_heart(0);
Expand All @@ -154,6 +155,5 @@ int main(int argc, char** argv)
/* do not use bail, as it would prevent the destructors from running */
return_sched(0);
}

}

13 changes: 12 additions & 1 deletion src/copyright/agent/copyrightState.cc
Original file line number Diff line number Diff line change
Expand Up @@ -53,11 +53,13 @@ const std::list<unptr::shared_ptr<scanner>>& CopyrightState::getScanners() const
* \param verbosity Verbosity set by CLI
* \param type Type set by CLI
* \param json True to get output in JSON format
* \param ignoreFilesWithMimeType True to ignore files with particular mimetype
*/
CliOptions::CliOptions(int verbosity, unsigned int type, bool json) :
CliOptions::CliOptions(int verbosity, unsigned int type, bool json, bool ignoreFilesWithMimeType) :
verbosity(verbosity),
optType(type),
json(json),
ignoreFilesWithMimeType(ignoreFilesWithMimeType),
cliScanners()
{
}
Expand All @@ -68,6 +70,7 @@ CliOptions::CliOptions(int verbosity, unsigned int type, bool json) :
CliOptions::CliOptions() :
  verbosity(0),
  optType(ALL_TYPES),
  // json used to be left out of this list, so a default-constructed object
  // carried an indeterminate value that doJsonOutput() would hand back.
  json(false),
  ignoreFilesWithMimeType(false),
  cliScanners()
{
}
Expand Down Expand Up @@ -126,3 +129,11 @@ bool CliOptions::doJsonOutput() const
return json;
}

/**
* \brief Check to ignore files with particular mimetype
* \return True if required, else false
*/
bool CliOptions::doignoreFilesWithMimeType() const
{
return ignoreFilesWithMimeType;
}
4 changes: 3 additions & 1 deletion src/copyright/agent/copyrightState.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ class CliOptions
int verbosity; /**< The verbosity level */
unsigned int optType; /**< Scan type (2 => url, 4 => email, 8 => author, 16 => ecc) */
bool json; /**< Whether to generate JSON output */
bool ignoreFilesWithMimeType; /**< Whether to ignore files with particular mimetype */
std::list<unptr::shared_ptr<scanner>> cliScanners; /**< List of available scanners */

public:
Expand All @@ -45,11 +46,12 @@ class CliOptions
unsigned int getOptType() const;

bool doJsonOutput() const;
bool doignoreFilesWithMimeType() const;

void addScanner(scanner* regexDesc);
std::list<unptr::shared_ptr<scanner>> extractScanners();

CliOptions(int verbosity, unsigned int type, bool json);
CliOptions(int verbosity, unsigned int type, bool json, bool ignoreFilesWithMimeType);
CliOptions();
};

Expand Down
11 changes: 8 additions & 3 deletions src/copyright/agent/copyrightUtils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,9 @@ bool parseCliOptions(int argc, char** argv, CliOptions& dest,
(
"json,J", "output JSON"
)
(
"ignoreFilesWithMimeType,I", "ignoreFilesWithMimeType"
)
(
"config,c", boost::program_options::value<string>(), "path to the sysconfigdir"
)
Expand Down Expand Up @@ -161,8 +164,9 @@ bool parseCliOptions(int argc, char** argv, CliOptions& dest,

unsigned long verbosity = vm.count("verbose");
bool json = vm.count("json") > 0 ? true : false;
bool ignoreFilesWithMimeType = vm.count("ignoreFilesWithMimeType") > 0 ? true : false;

dest = CliOptions(verbosity, type, json);
dest = CliOptions(verbosity, type, json, ignoreFilesWithMimeType);

if (vm.count("regex"))
{
Expand Down Expand Up @@ -400,11 +404,12 @@ void matchPFileWithLicenses(CopyrightState const& state, int agentId, unsigned l
* \param agentId Agent id
* \param uploadId Upload id to be processed
* \param databaseHandler Database handler object
* \param ignoreFilesWithMimeType To ignore files with particular mimetype
* \return True when upload is processed
*/
bool processUploadId(const CopyrightState& state, int agentId, int uploadId, CopyrightDatabaseHandler& databaseHandler)
bool processUploadId(const CopyrightState& state, int agentId, int uploadId, CopyrightDatabaseHandler& databaseHandler, bool ignoreFilesWithMimeType)
{
vector<unsigned long> fileIds = databaseHandler.queryFileIdsForUpload(agentId, uploadId);
vector<unsigned long> fileIds = databaseHandler.queryFileIdsForUpload(agentId, uploadId, ignoreFilesWithMimeType);

#pragma omp parallel
{
Expand Down
2 changes: 1 addition & 1 deletion src/copyright/agent/copyrightUtils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ std::vector<CopyrightMatch> matchStringToRegexes(const std::string& content, std
*/
void normalizeContent(std::string& content);

bool processUploadId(const CopyrightState& state, int agentId, int uploadId, CopyrightDatabaseHandler& handler);
bool processUploadId(const CopyrightState& state, int agentId, int uploadId, CopyrightDatabaseHandler& handler, bool ignoreFilesWithMimeType);

std::pair<std::string, std::list<match>> processSingleFile(const CopyrightState& state,
const std::string fileName);
Expand Down
37 changes: 34 additions & 3 deletions src/copyright/agent/database.cc
Original file line number Diff line number Diff line change
Expand Up @@ -311,13 +311,42 @@ bool CopyrightDatabaseHandler::createTableClearing() const
* \brief Get the list of pfile ids on which the given agent has no findings for a given upload
* \param agentId Agent id to be removed from result
* \param uploadId Upload id to scan for files
* \param ignoreFilesWithMimeType to exclude filetypes with particular mimetype
* \return List of pfiles on which the given agent has no findings
*/
std::vector<unsigned long> CopyrightDatabaseHandler::queryFileIdsForUpload(int agentId, int uploadId)
std::vector<unsigned long> CopyrightDatabaseHandler::queryFileIdsForUpload(int agentId, int uploadId, bool ignoreFilesWithMimeType)
{
std::string uploadTreeTableName = queryUploadTreeTableName(uploadId);

fo_dbManager_PreparedStatement* preparedStatement =
fo_dbManager_PreparedStatement* preparedStatement ;
if (ignoreFilesWithMimeType)
{
preparedStatement =
fo_dbManager_PrepareStamement(dbManager.getStruct_dbManager(),
("queryFileIdsForUpload:" IDENTITY "Agent" + uploadTreeTableName + "WithMimetype").c_str(),
("SELECT pfile_pk"
" FROM ("
" SELECT distinct(pfile_fk) AS PF"
" FROM " + uploadTreeTableName +
" WHERE upload_fk = $1 and (ufile_mode&x'3C000000'::int)=0"
" ) AS SS "
"LEFT OUTER JOIN " IDENTITY " ON (PF = pfile_fk AND agent_fk = $2) "
#ifdef IDENTITY_COPYRIGHT
"LEFT OUTER JOIN author AS au ON (PF = au.pfile_fk AND au.agent_fk = $2) "
#endif
"INNER JOIN pfile ON (PF = pfile_pk) "
#ifdef IDENTITY_COPYRIGHT
"WHERE copyright.copyright_pk IS NULL AND au.author_pk IS NULL "
#else
"WHERE (" IDENTITY "_pk IS NULL OR agent_fk <> $2) "
#endif
"AND (pfile_mimetypefk not in ( "
"SELECT mimetype_pk from mimetype where mimetype_name=any(string_to_array(( "
"SELECT conf_value from sysconfig where variablename='SkipFiles'),','))));").c_str(),
int, int);
}
else
{
preparedStatement =
fo_dbManager_PrepareStamement(dbManager.getStruct_dbManager(),
("queryFileIdsForUpload:" IDENTITY "Agent" + uploadTreeTableName).c_str(),
("SELECT pfile_pk"
Expand All @@ -337,10 +366,12 @@ std::vector<unsigned long> CopyrightDatabaseHandler::queryFileIdsForUpload(int a
"WHERE " IDENTITY "_pk IS NULL OR agent_fk <> $2;").c_str(),
#endif
int, int);
}
QueryResult queryResult = dbManager.execPrepared(preparedStatement,
uploadId, agentId);

return queryResult.getSimpleResults<unsigned long>(0, fo::stringToUnsignedLong);

}

/**
Expand Down
2 changes: 1 addition & 1 deletion src/copyright/agent/database.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ class CopyrightDatabaseHandler : public fo::AgentDatabaseHandler
bool createTables() const;
bool insertInDatabase(DatabaseEntry& entry) const;
bool insertNoResultInDatabase(long agentId, long pFileId) const;
std::vector<unsigned long> queryFileIdsForUpload(int agentId, int uploadId);
std::vector<unsigned long> queryFileIdsForUpload(int agentId, int uploadId, bool ignoreFilesWithMimeType);

private:
/**
Expand Down
43 changes: 43 additions & 0 deletions src/copyright/ui/agent-copyright.php
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,49 @@ function AgentHasResults($uploadId=0)
{
return CheckARS($uploadId, $this->AgentName, "copyright scanner", "copyright_ars");
}

/**
 * @copydoc Fossology\Lib\Plugin\AgentPlugin::AgentAdd()
 * @see \Fossology\Lib\Plugin\AgentPlugin::AgentAdd()
 *
 * Returns 0 when the agent already has results for the upload, the existing
 * job queue id when the agent is already scheduled, and otherwise the value
 * returned by doAgentAdd(). When the posted 'scm' field equals 1 the agent is
 * queued with the '-I' argument and depends on agent_mimetype; otherwise it
 * depends on agent_adj2nest.
 */
public function AgentAdd($jobId, $uploadId, &$errorMsg, $dependencies=array(), $arguments=null)
{
  // 'scm' may be missing from the request; reading it unguarded raises an
  // undefined-index notice, so a missing field is treated as "not set".
  $ignoreByMimeType = isset($_POST['scm']) && intval($_POST['scm']) == 1;

  if ($this->AgentHasResults($uploadId) == 1) {
    return 0;
  }

  $jobQueueId = \IsAlreadyScheduled($jobId, $this->AgentName, $uploadId);
  if ($jobQueueId != 0) {
    return $jobQueueId;
  }

  // NOTE(review): $dependencies and $arguments supplied by the caller are not
  // forwarded; the dependency list is fixed per branch - confirm this is intended.
  if ($ignoreByMimeType) {
    return $this->doAgentAdd($jobId, $uploadId, $errorMsg, array("agent_mimetype"), $uploadId, '-I');
  }
  return $this->doAgentAdd($jobId, $uploadId, $errorMsg, array("agent_adj2nest"), $uploadId);
}

/**
 * Tell whether an agent is already part of a dependency list.
 *
 * An entry matches when it loosely equals the agent name, or when it is an
 * array whose 'name' element loosely equals the agent name.
 * @param mixed $dependencies Array of job dependencies
 * @param string $agentName Name of the agent to be checked for
 * @return boolean true if agent already in dependency list else false
 */
protected function isAgentIncluded($dependencies, $agentName)
{
  foreach ($dependencies as $entry) {
    $matches = ($entry == $agentName)
      || (is_array($entry) && $agentName == $entry['name']);
    if ($matches) {
      return true;
    }
  }
  return false;
}
}

register_plugin(new CopyrightAgentPlugin());
43 changes: 43 additions & 0 deletions src/copyright/ui/agent-ecc.php
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,49 @@ function AgentHasResults($uploadId=0)
{
return CheckARS($uploadId, $this->AgentName, "ecc scanner", "ecc_ars");
}

/**
 * @copydoc Fossology\Lib\Plugin\AgentPlugin::AgentAdd()
 * @see \Fossology\Lib\Plugin\AgentPlugin::AgentAdd()
 *
 * Returns 0 when the agent already has results for the upload, the existing
 * job queue id when the agent is already scheduled, and otherwise the value
 * returned by doAgentAdd(). When the posted 'scm' field equals 1 the agent is
 * queued with the '-I' argument and depends on agent_mimetype; otherwise it
 * depends on agent_adj2nest.
 */
public function AgentAdd($jobId, $uploadId, &$errorMsg, $dependencies=array(), $arguments=null)
{
  // 'scm' may be missing from the request; reading it unguarded raises an
  // undefined-index notice, so a missing field is treated as "not set".
  $ignoreByMimeType = isset($_POST['scm']) && intval($_POST['scm']) == 1;

  if ($this->AgentHasResults($uploadId) == 1) {
    return 0;
  }

  $jobQueueId = \IsAlreadyScheduled($jobId, $this->AgentName, $uploadId);
  if ($jobQueueId != 0) {
    return $jobQueueId;
  }

  // NOTE(review): $dependencies and $arguments supplied by the caller are not
  // forwarded; the dependency list is fixed per branch - confirm this is intended.
  if ($ignoreByMimeType) {
    return $this->doAgentAdd($jobId, $uploadId, $errorMsg, array("agent_mimetype"), $uploadId, '-I');
  }
  return $this->doAgentAdd($jobId, $uploadId, $errorMsg, array("agent_adj2nest"), $uploadId);
}

/**
 * Tell whether an agent is already part of a dependency list.
 *
 * An entry matches when it loosely equals the agent name, or when it is an
 * array whose 'name' element loosely equals the agent name.
 * @param mixed $dependencies Array of job dependencies
 * @param string $agentName Name of the agent to be checked for
 * @return boolean true if agent already in dependency list else false
 */
protected function isAgentIncluded($dependencies, $agentName)
{
  foreach ($dependencies as $entry) {
    $matches = ($entry == $agentName)
      || (is_array($entry) && $agentName == $entry['name']);
    if ($matches) {
      return true;
    }
  }
  return false;
}
}

register_plugin(new EccAgentPlugin());
48 changes: 48 additions & 0 deletions src/copyright/ui/agent-keyword.php
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,58 @@ public function __construct() {
parent::__construct();
}

/**
 * @copydoc Fossology::Lib::Plugin::AgentPlugin::AgentHasResults()
 * @see Fossology::Lib::Plugin::AgentPlugin::AgentHasResults()
 *
 * Delegates to CheckARS() for this agent against the keyword_ars table.
 */
function AgentHasResults($uploadId=0)
{
  $arsStatus = CheckARS($uploadId, $this->AgentName, "keyword scanner", "keyword_ars");
  return $arsStatus;
}


/**
 * @copydoc Fossology\Lib\Plugin\AgentPlugin::AgentAdd()
 * @see \Fossology\Lib\Plugin\AgentPlugin::AgentAdd()
 *
 * Returns 0 when the agent already has results for the upload, the existing
 * job queue id when the agent is already scheduled, and otherwise the value
 * returned by doAgentAdd(). When the posted 'scm' field equals 1 the agent is
 * queued with the '-I' argument and depends on agent_mimetype; otherwise it
 * depends on agent_adj2nest.
 */
public function AgentAdd($jobId, $uploadId, &$errorMsg, $dependencies=array(), $arguments=null)
{
  // 'scm' may be missing from the request; reading it unguarded raises an
  // undefined-index notice, so a missing field is treated as "not set".
  $ignoreByMimeType = isset($_POST['scm']) && intval($_POST['scm']) == 1;

  if ($this->AgentHasResults($uploadId) == 1) {
    return 0;
  }

  $jobQueueId = \IsAlreadyScheduled($jobId, $this->AgentName, $uploadId);
  if ($jobQueueId != 0) {
    return $jobQueueId;
  }

  // NOTE(review): $dependencies and $arguments supplied by the caller are not
  // forwarded; the dependency list is fixed per branch - confirm this is intended.
  if ($ignoreByMimeType) {
    return $this->doAgentAdd($jobId, $uploadId, $errorMsg, array("agent_mimetype"), $uploadId, '-I');
  }
  return $this->doAgentAdd($jobId, $uploadId, $errorMsg, array("agent_adj2nest"), $uploadId);
}

/**
 * Tell whether an agent is already part of a dependency list.
 *
 * An entry matches when it loosely equals the agent name, or when it is an
 * array whose 'name' element loosely equals the agent name.
 * @param mixed $dependencies Array of job dependencies
 * @param string $agentName Name of the agent to be checked for
 * @return boolean true if agent already in dependency list else false
 */
protected function isAgentIncluded($dependencies, $agentName)
{
  foreach ($dependencies as $entry) {
    $matches = ($entry == $agentName)
      || (is_array($entry) && $agentName == $entry['name']);
    if ($matches) {
      return true;
    }
  }
  return false;
}
}

register_plugin(new KeywordAgentPlugin());
Loading

0 comments on commit 112e616

Please sign in to comment.