Skip to content

Commit ee08aea

Browse files
committed
refactors: table drive cli + de-dupes + tag tokens once
1 parent a119dd6 commit ee08aea

File tree

3 files changed

+259
-423
lines changed

3 files changed

+259
-423
lines changed

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
Prism is a single-file transpiler that makes C safer without changing how you write it.
77

8-
- **1028 tests** — edge cases, control flow, nightmares, trying hard to break Prism
8+
- **1032 tests** — edge cases, control flow, nightmares, trying hard to break Prism
99
- **Building Real C** — OpenSSL, SQLite, Bash, GNU Coreutils, Make, Curl
1010
- **Proper transpiler** — tracks typedefs, respects scope, catches unsafe patterns
1111
- **Opt-out features** — disable parts of the transpiler, like zero-init, with CLI flags
@@ -208,7 +208,7 @@ Not:
208208
Prism uses a GCC-compatible interface — most flags pass through to the backend compiler.
209209

210210
```sh
211-
Prism v0.101.0 - Robust C transpiler
211+
Prism v0.102.0 - Robust C transpiler
212212

213213
Usage: prism [options] source.c... [-o output]
214214

parse.c

Lines changed: 157 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,19 @@ enum
9090
TF_IS_FLOAT = 1 << 2,
9191
};
9292

93+
// Token tags - bitmask classification stored in Token.tag, assigned once per token in convert_pp_tokens
94+
// Eliminates repeated string comparisons in the transpiler; values must stay below 0x80 (KW_MARKER uses that bit in the keyword map)
95+
enum
96+
{
97+
TT_TYPE = 1 << 0, // Type keyword (int, char, void, struct, typeof, _BitInt, etc.)
98+
TT_QUALIFIER = 1 << 1, // Qualifier-like keyword: const, volatile, restrict, _Atomic, static, auto, register, _Alignas, and the attribute keywords
99+
TT_SUE = 1 << 2, // struct/union/enum
100+
TT_SKIP_DECL = 1 << 3, // Keywords that can't start a zero-init declaration (return, if, sizeof, typedef, extern, asm, ...)
101+
TT_ATTR = 1 << 4, // Attribute keyword (__attribute__, __attribute, __declspec)
102+
TT_ASSIGN = 1 << 5, // Assignment-like punctuator: =, compound assignment (+=, <<=, ...), ++, --, and [
103+
TT_MEMBER = 1 << 6, // Member access operator (. or ->)
104+
};
105+
93106
struct Token
94107
{
95108
char *loc;
@@ -103,6 +116,7 @@ struct Token
103116
TokenKind kind;
104117
uint16_t file_idx;
105118
uint8_t flags;
119+
uint8_t tag; // TT_* bitmask - token classification
106120
};
107121

108122
// Token accessors
@@ -708,74 +722,114 @@ static inline bool equal(Token *tok, const char *op)
708722
return equiv && strlen(equiv) == len && !memcmp(equiv, op, len);
709723
}
710724

725+
// Internal marker bit for keyword map: stored values are (tag | KW_MARKER), cast to a pointer
726+
// This keeps tag=0 keywords distinct from "not found" (NULL); keyword_tag masks the bit off again, so TT_* tags must not use it
727+
#define KW_MARKER 0x80
728+
711729
// Register every recognized keyword in ctx->keyword_map; each stored value is the keyword's TT_* tag bitmask OR'ed with KW_MARKER
static void init_keyword_map(void)
712730
{
713-
static char *kw[] = {
714-
"return",
715-
"if",
716-
"else",
717-
"for",
718-
"while",
719-
"do",
720-
"switch",
721-
"case",
722-
"default",
723-
"break",
724-
"continue",
725-
"goto",
726-
"sizeof",
727-
"alignof",
728-
"struct",
729-
"union",
730-
"enum",
731-
"typedef",
732-
"static",
733-
"extern",
734-
"inline",
735-
"const",
736-
"volatile",
737-
"restrict",
738-
"_Atomic",
739-
"_Noreturn",
740-
"_Thread_local",
741-
"void",
742-
"char",
743-
"short",
744-
"int",
745-
"long",
746-
"float",
747-
"double",
748-
"signed",
749-
"unsigned",
750-
"_Bool",
751-
"auto",
752-
"register",
753-
"_Alignas",
754-
"_Static_assert",
755-
"_Generic",
756-
"typeof",
757-
"__typeof__",
758-
"asm",
759-
"__asm__",
760-
"__attribute__",
761-
"__extension__",
762-
"__builtin_va_list",
763-
"__builtin_va_arg",
764-
"__builtin_offsetof",
765-
"__builtin_types_compatible_p",
731+
// Each entry: {keyword spelling, TT_* tag bitmask}; a tag of 0 means the keyword carries no classification bits
732+
// Tags are defined once here, stored in the keyword map with KW_MARKER set, then copied onto tokens during convert_pp_tokens
733+
static struct
734+
{
735+
char *name;
736+
uint8_t tag;
737+
} kw[] = {
738+
// Control flow and operator-like keywords (skip-decl: can't start a zero-init declaration); _Static_assert carries no tag
739+
{"return", TT_SKIP_DECL},
740+
{"if", TT_SKIP_DECL},
741+
{"else", TT_SKIP_DECL},
742+
{"for", TT_SKIP_DECL},
743+
{"while", TT_SKIP_DECL},
744+
{"do", TT_SKIP_DECL},
745+
{"switch", TT_SKIP_DECL},
746+
{"case", TT_SKIP_DECL},
747+
{"default", TT_SKIP_DECL},
748+
{"break", TT_SKIP_DECL},
749+
{"continue", TT_SKIP_DECL},
750+
{"goto", TT_SKIP_DECL},
751+
{"sizeof", TT_SKIP_DECL},
752+
{"alignof", TT_SKIP_DECL},
753+
{"_Alignof", TT_SKIP_DECL},
754+
{"_Generic", TT_SKIP_DECL},
755+
{"_Static_assert", 0},
756+
// struct/union/enum (also type keywords)
757+
{"struct", TT_TYPE | TT_SUE},
758+
{"union", TT_TYPE | TT_SUE},
759+
{"enum", TT_TYPE | TT_SUE},
760+
// Storage-class and function specifiers: typedef/static/extern skip decl (static is also qualifier-tagged); inline carries no tag
761+
{"typedef", TT_SKIP_DECL},
762+
{"static", TT_QUALIFIER | TT_SKIP_DECL},
763+
{"extern", TT_SKIP_DECL},
764+
{"inline", 0},
765+
// Type qualifiers (_Atomic is also tagged as a type); _Noreturn and _Thread_local carry no tag
766+
{"const", TT_QUALIFIER},
767+
{"volatile", TT_QUALIFIER},
768+
{"restrict", TT_QUALIFIER},
769+
{"_Atomic", TT_QUALIFIER | TT_TYPE},
770+
{"_Noreturn", 0},
771+
{"_Thread_local", 0},
772+
// Type keywords, interleaved with auto/register/_Alignas which are tagged as qualifiers
773+
{"void", TT_TYPE},
774+
{"char", TT_TYPE},
775+
{"short", TT_TYPE},
776+
{"int", TT_TYPE},
777+
{"long", TT_TYPE},
778+
{"float", TT_TYPE},
779+
{"double", TT_TYPE},
780+
{"signed", TT_TYPE},
781+
{"unsigned", TT_TYPE},
782+
{"_Bool", TT_TYPE},
783+
{"bool", TT_TYPE},
784+
{"_Complex", TT_TYPE},
785+
{"_Imaginary", TT_TYPE},
786+
{"__int128", TT_TYPE},
787+
{"__int128_t", TT_TYPE},
788+
{"__uint128", TT_TYPE},
789+
{"__uint128_t", TT_TYPE},
790+
{"typeof_unqual", TT_TYPE},
791+
{"auto", TT_QUALIFIER},
792+
{"register", TT_QUALIFIER},
793+
{"_Alignas", TT_QUALIFIER},
794+
{"typeof", TT_TYPE},
795+
{"__typeof__", TT_TYPE},
796+
{"__typeof", TT_TYPE},
797+
{"_BitInt", TT_TYPE},
798+
// Asm (skip-decl: can't start a declaration)
799+
{"asm", TT_SKIP_DECL},
800+
{"__asm__", TT_SKIP_DECL},
801+
{"__asm", TT_SKIP_DECL},
802+
// Attributes (also tagged as qualifiers)
803+
{"__attribute__", TT_ATTR | TT_QUALIFIER},
804+
{"__attribute", TT_ATTR | TT_QUALIFIER},
805+
{"__declspec", TT_ATTR | TT_QUALIFIER},
806+
// Other builtins (no tags)
807+
{"__extension__", 0},
808+
{"__builtin_va_list", 0},
809+
{"__builtin_va_arg", 0},
810+
{"__builtin_offsetof", 0},
811+
{"__builtin_types_compatible_p", 0},
766812
// Prism keywords (no tags)
767-
"defer",
768-
"raw",
813+
{"defer", 0},
814+
{"raw", 0},
769815
};
770816
for (size_t i = 0; i < sizeof(kw) / sizeof(*kw); i++)
771-
hashmap_put(&ctx->keyword_map, kw[i], strlen(kw[i]), (void *)1);
817+
hashmap_put(&ctx->keyword_map, kw[i].name, strlen(kw[i].name),
818+
(void *)(uintptr_t)(kw[i].tag | KW_MARKER));
772819
}
773820

774821
// True when the token text (tok->loc, tok->len) has an entry in ctx->keyword_map.
// Entries are stored with KW_MARKER set, so keywords whose tag is 0 still yield a non-NULL value here.
static bool is_keyword(Token *tok)
775822
{
776823
return hashmap_get(&ctx->keyword_map, tok->loc, tok->len) != NULL;
777824
}
778825

826+
// Get the TT_* tag bits for a keyword, with the KW_MARKER bit stripped (0 if not a keyword, or if the keyword has no tags)
827+
static uint8_t keyword_tag(Token *tok)
828+
{
829+
void *val = hashmap_get(&ctx->keyword_map, tok->loc, tok->len);
830+
return val ? (uint8_t)((uintptr_t)val & ~KW_MARKER) : 0;
831+
}
832+
779833
// Forward declaration for hex digit conversion
780834
static int from_hex(char c);
781835

@@ -1427,9 +1481,46 @@ static void convert_pp_tokens(Token *tok)
14271481
for (Token *t = tok; t && t->kind != TK_EOF; t = t->next)
14281482
{
14291483
if (is_keyword(t))
1484+
{
14301485
t->kind = TK_KEYWORD;
1486+
t->tag = keyword_tag(t);
1487+
}
14311488
else if (t->kind == TK_PP_NUM)
14321489
convert_pp_number(t);
1490+
else if (t->kind == TK_PUNCT)
1491+
{
1492+
// Tag punctuators: assignment ops and member access
1493+
char c = t->loc[0];
1494+
if (t->len == 1)
1495+
{
1496+
if (c == '=' || c == '[')
1497+
t->tag = TT_ASSIGN;
1498+
else if (c == '.')
1499+
t->tag = TT_MEMBER;
1500+
}
1501+
else if (t->len == 2)
1502+
{
1503+
char c2 = t->loc[1];
1504+
if (c2 == '=')
1505+
{
1506+
// +=, -=, *=, /=, %=, &=, |=, ^= are assignment; ==, !=, <=, >= are comparisons and are not
1507+
if (c != '!' && c != '<' && c != '>' && c != '=')
1508+
t->tag = TT_ASSIGN;
1509+
}
1510+
else if (c == '+' && c2 == '+')
1511+
t->tag = TT_ASSIGN;
1512+
else if (c == '-' && c2 == '-')
1513+
t->tag = TT_ASSIGN;
1514+
else if (c == '-' && c2 == '>')
1515+
t->tag = TT_MEMBER;
1516+
}
1517+
else if (t->len == 3 && t->loc[2] == '=')
1518+
{
1519+
// <<= and >>=
1520+
if ((c == '<' || c == '>') && t->loc[1] == c)
1521+
t->tag = TT_ASSIGN;
1522+
}
1523+
}
14331524
}
14341525
}
14351526

@@ -1818,28 +1909,15 @@ Token *tokenize_file(char *path)
18181909
return tokenize(file);
18191910
}
18201911

1821-
// Reset state for reuse (keeps arena blocks for reuse)
1822-
void tokenizer_reset(void)
1823-
{
1824-
arena_reset(&ctx->main_arena);
1825-
// Free file view cache first (before freeing files)
1826-
free_file_view_cache();
1827-
for (int i = 0; i < ctx->input_file_count; i++)
1828-
free_file(ctx->input_files[i]);
1829-
free(ctx->input_files);
1830-
ctx->input_files = NULL;
1831-
ctx->input_file_count = 0;
1832-
ctx->input_file_capacity = 0;
1833-
ctx->current_file = NULL;
1834-
// Free interned filenames last (after all files are freed)
1835-
free_filename_intern_map();
1836-
}
1837-
1838-
// Full cleanup - frees all memory including arena blocks
1839-
void tokenizer_cleanup(void)
1912+
// Teardown tokenizer state
1913+
// full=false: reset for reuse (keeps arena blocks allocated)
1914+
// full=true: free all memory including arena blocks and keyword map
1915+
void tokenizer_teardown(bool full)
18401916
{
1841-
arena_free(&ctx->main_arena);
1842-
// Free file view cache first (before freeing files)
1917+
if (full)
1918+
arena_free(&ctx->main_arena);
1919+
else
1920+
arena_reset(&ctx->main_arena);
18431921
free_file_view_cache();
18441922
for (int i = 0; i < ctx->input_file_count; i++)
18451923
free_file(ctx->input_files[i]);
@@ -1848,7 +1926,7 @@ void tokenizer_cleanup(void)
18481926
ctx->input_file_count = 0;
18491927
ctx->input_file_capacity = 0;
18501928
ctx->current_file = NULL;
1851-
// Free interned filenames last (after all files are freed)
18521929
free_filename_intern_map();
1853-
hashmap_clear(&ctx->keyword_map);
1930+
if (full)
1931+
hashmap_clear(&ctx->keyword_map);
18541932
}

0 commit comments

Comments
 (0)