Skip to content

Commit

Permalink
Reorganize functions for checking Java identifiers (#859)
Browse files Browse the repository at this point in the history
Summary:


I made the misguided choice of refactoring and reusing some methods from ProguardLexer.cpp, and broke the build on Windows for a reason I cannot understand (`TokenType` enum values in ProguardLexer.h somehow were not parsed as valid anymore???? Dunno).

Revert that part of the change, and instead move the useful methods from ProguardLexer to the `java_names` namespace in DexUtil.h.

Differential Revision: D56921682
  • Loading branch information
wsanville authored and facebook-github-bot committed May 3, 2024
1 parent 8ca75d7 commit c8e8eaf
Show file tree
Hide file tree
Showing 6 changed files with 46 additions and 43 deletions.
26 changes: 26 additions & 0 deletions libredex/DexUtil.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -466,3 +466,29 @@ bool is_valid_identifier(std::string_view s) {
}
return true;
}

namespace java_names {

namespace {
bool is_not_idenfitier_character(char ch) {
return ch == '=' || ch == '+' || ch == '|' || ch == '@' || ch == '#' ||
ch == '^' || ch == '&' || ch == '"' || ch == '\'' || ch == '`' ||
ch == '~' || ch == '-';
}
} // namespace

// Differs from above "is_valid_identifier" function since this is for external
// names
bool is_identifier(const std::string_view& ident) {
for (const char& ch : ident) {
// java identifiers can be multi-lingual so membership testing is complex.
// much simpler to test for what is definitely not an identifier and then
// assume everything else is a legal identifier char, accepting that we
// will have false positives.
if (is_deliminator(ch) || is_not_idenfitier_character(ch)) {
return false;
}
}
return true;
}
} // namespace java_names
10 changes: 10 additions & 0 deletions libredex/DexUtil.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

#include <algorithm>
#include <boost/algorithm/string/predicate.hpp>
#include <cctype>
#include <functional>
#include <string_view>
#include <unordered_set>
Expand Down Expand Up @@ -387,4 +388,13 @@ inline std::string package_name(std::string_view type_name) {
return nice_name;
}
}

inline bool is_deliminator(char ch) {
return isspace(ch) || ch == '{' || ch == '}' || ch == '(' || ch == ')' ||
ch == ',' || ch == ';' || ch == ':' || ch == '#';
}

// An identifier can refer to a class name, a field name or a package name.
// https://docs.oracle.com/javase/specs/jls/se16/html/jls-3.html#jls-JavaLetter
bool is_identifier(const std::string_view& ident);
} // namespace java_names
36 changes: 6 additions & 30 deletions libredex/ProguardLexer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include <vector>

#include "Debug.h"
#include "DexUtil.h"
#include "Macros.h"
#include "ProguardLexer.h"

Expand All @@ -30,17 +31,6 @@ constexpr char kPathDelim =
':';
#endif

bool is_deliminator(char ch) {
return isspace(ch) || ch == '{' || ch == '}' || ch == '(' || ch == ')' ||
ch == ',' || ch == ';' || ch == ':' || ch == '#';
}

bool is_not_idenfitier_character(char ch) {
return ch == '=' || ch == '+' || ch == '|' || ch == '@' || ch == '#' ||
ch == '^' || ch == '&' || ch == '"' || ch == '\'' || ch == '`' ||
ch == '~' || ch == '-';
}

void skip_whitespace(std::string_view& data, unsigned int* line) {
size_t index = 0;
for (; index != data.size(); ++index) {
Expand Down Expand Up @@ -213,21 +203,6 @@ using UnorderedStringViewIndexableMap = multi_index_container<

} // namespace

// An identifier can refer to a class name, a field name or a package name.
// https://docs.oracle.com/javase/specs/jls/se16/html/jls-3.html#jls-JavaLetter
bool is_identifier(const std::string_view& ident) {
for (const char& ch : ident) {
// java identifiers can be multi-lingual so membership testing is complex.
// much simpler to test for what is definitely not an identifier and then
// assume everything else is a legal identifier char, accepting that we
// will have false positives.
if (is_deliminator(ch) || is_not_idenfitier_character(ch)) {
return false;
}
}
return true;
}

std::string Token::show() const {
switch (type) {
case TokenType::openCurlyBracket:
Expand Down Expand Up @@ -715,8 +690,8 @@ std::vector<Token> lex(const std::string_view& in) {
// Check for commands.
if (ch == '-') {
data = data.substr(1);
auto command =
parse_part_fn</*kSkipWs=*/false>(data, &line, is_deliminator);
auto command = parse_part_fn</*kSkipWs=*/false>(
data, &line, java_names::is_deliminator);

{
auto it = simple_commands.find(command);
Expand Down Expand Up @@ -789,7 +764,8 @@ std::vector<Token> lex(const std::string_view& in) {
// At this point:
// * data is not empty
// * it does not start with a deliminator
auto word = parse_part_fn</*kSkipWs=*/false>(data, &line, is_deliminator);
auto word = parse_part_fn</*kSkipWs=*/false>(
data, &line, java_names::is_deliminator);

{
auto it = word_tokens.find(word);
Expand All @@ -811,7 +787,7 @@ std::vector<Token> lex(const std::string_view& in) {
continue;
}

if (is_identifier(word)) {
if (java_names::is_identifier(word)) {
add_token_data(TokenType::identifier, word);
continue;
}
Expand Down
4 changes: 0 additions & 4 deletions libredex/ProguardLexer.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,6 @@
namespace keep_rules {
namespace proguard_parser {

// An identifier can refer to a class name, a field name or a package name.
// https://docs.oracle.com/javase/specs/jls/se16/html/jls-3.html#jls-JavaLetter
bool is_identifier(const std::string_view& ident);

enum class TokenType {
openCurlyBracket,
closeCurlyBracket,
Expand Down
8 changes: 2 additions & 6 deletions libredex/RedexResources.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
#include "GlobalConfig.h"
#include "IOUtil.h"
#include "Macros.h"
#include "ProguardLexer.h"
#include "ReadMaybeMapped.h"
#include "StringUtil.h"
#include "Trace.h"
Expand Down Expand Up @@ -490,11 +489,8 @@ void gather_r_classes(const Scope& scope, std::vector<DexClass*>* vec) {
}
}

bool valid_java_identifier(const std::string& ident) {
return !ident.empty() && keep_rules::proguard_parser::is_identifier(ident);
}

bool valid_xml_element(const std::string& ident) {
return valid_java_identifier(ident) && ident.find('.') != std::string::npos;
return java_names::is_identifier(ident) &&
ident.find('.') != std::string::npos;
}
} // namespace resources
5 changes: 2 additions & 3 deletions libredex/RedexResources.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include "androidfw/ResourceTypes.h"

#include "Debug.h"
#include "DexUtil.h"
#include "GlobalConfig.h"
#include "RedexMappedFile.h"

Expand Down Expand Up @@ -60,8 +61,6 @@ const inline std::set<std::string> POSSIBLE_CLASS_ATTRIBUTES = {
"actionViewClass", "class", "controller", "layout_behavior",
"layoutManager", "name", "targetClass",
};
// Returns false if there are any characters that are not valid Java identifier.
bool valid_java_identifier(const std::string& ident);
// Returns false if there is no dot or it's not a Java identifier.
bool valid_xml_element(const std::string& ident);

Expand All @@ -78,7 +77,7 @@ struct StringOrReference {
if (ref != 0) {
return true;
}
return valid_java_identifier(str);
return java_names::is_identifier(str);
}

bool operator==(const StringOrReference& that) const {
Expand Down

0 comments on commit c8e8eaf

Please sign in to comment.