Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,11 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [0.1.15] - 2026-05-15

### Fixed
- **CodeQL call-graph edges silently dropped on `(file, start_line)` join miss** ([#25](https://github.com/codellm-devkit/codeanalyzer-python/issues/25)). CodeQL endpoints were matched back into Jedi's `PyCallable` signature space by an exact `(absolute_file_path, start_line)` key; when CodeQL and Jedi disagreed on a definition's start line (commonly with decorated functions), the caller lookup missed and the entire edge was discarded (callee misses degraded to ghost nodes). Replaced the exact-only index with a resolution ladder: exact `(file, start_line)` → candidates sharing `(file, short_name)` (single candidate taken directly, else nearest `start_line` among those whose parameter count matches the CodeQL positional arity) → no match (caller skipped / callee ghost, as before). The CodeQL query now emits `Function.getName()` and positional arity for both endpoints. Jedi's parameter count includes `*args`/`**kwargs`/keyword-only slots while CodeQL's arity is positional only, so the arity filter is exact for plain signatures and yields to the nearest-line tiebreak otherwise.

## [0.1.14] - 2026-05-13

### Added
Expand Down
136 changes: 109 additions & 27 deletions codeanalyzer/semantic_analysis/codeql/codeql_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,67 @@
from codeanalyzer.utils import logger


class _CallableResolver:
"""Maps a CodeQL endpoint ``(file, start_line, name, arity)`` to a Jedi
``PyCallable``.

Resolution ladder:
1. exact ``(abs_path, start_line)`` — the precise join;
2. on miss, candidates sharing ``(abs_path, short_name)``: a single
candidate is taken directly; otherwise prefer those whose
parameter count equals the CodeQL positional arity, then the
nearest ``start_line``;
3. no name match -> ``None`` (caller row skipped / callee becomes
a ghost node).

Step 2 recovers edges the ``(file, line)`` join silently drops when
CodeQL and Jedi disagree on a definition's start line (e.g. decorator
handling). Jedi's ``parameters`` counts every declared slot (incl.
``*args``/``**kwargs``/keyword-only) whereas CodeQL's arity is
positional only, so the arity filter is exact for plain signatures
and otherwise yields to the nearest-line tiebreak.
"""

def __init__(self) -> None:
self._by_loc: Dict[Tuple[str, int], Any] = {}
self._by_name: Dict[Tuple[str, str], List[Any]] = {}

@staticmethod
def _abs(path: str) -> str:
try:
return str(Path(path).resolve())
except (OSError, RuntimeError):
return path

@classmethod
def from_symbol_table(
cls, symbol_table: Dict[str, PyModule]
) -> "_CallableResolver":
resolver = cls()
for c in iter_callables_in_symbol_table(symbol_table):
abs_path = cls._abs(c.path)
resolver._by_loc[(abs_path, c.start_line)] = c
resolver._by_name.setdefault((abs_path, c.name), []).append(c)
return resolver

def resolve(
self, file: str, start_line: int, name: str, arity: int
) -> Any:
exact = self._by_loc.get((file, start_line))
if exact is not None:
return exact
if not name:
return None
candidates = self._by_name.get((file, name))
if not candidates:
return None
if len(candidates) == 1:
return candidates[0]
arity_matched = [c for c in candidates if len(c.parameters) == arity]
pool = arity_matched or candidates
return min(pool, key=lambda c: abs(c.start_line - start_line))


class CodeQL:
"""A class for building the application view of a Python application using CodeQL.

Expand Down Expand Up @@ -99,9 +160,14 @@ def _query_call_edges(self) -> DataFrame:
# codeql/python-all 7.x — it returns the ``CallNode`` (CFG)
# whose target was resolved to that ``Value``. Cleaner than
# poking at ``pointsTo`` directly.
"from CallNode call, Function caller, FunctionValue calleeVal",
# ``callee`` is bound to the FunctionValue's scope so the
# endpoint emits the same Function-level facts (name, arity,
# location) the post-processor needs for the name+arity
# fallback when the (file, start_line) join misses.
"from CallNode call, Function caller, FunctionValue calleeVal, Function callee",
"where",
" call.getScope() = caller and",
" callee = calleeVal.getScope() and",
" (",
# Direct function / bound-method call: foo() or obj.foo()
" call = calleeVal.getACall()",
Expand All @@ -115,15 +181,20 @@ def _query_call_edges(self) -> DataFrame:
" )",
" )",
"select",
# --- Caller endpoint --- (joins to PyCallable via file + start_line)
# --- Caller endpoint --- (joins to PyCallable: exact by
# (file, start_line), else by (file, name) + arity)
" caller.getLocation().getFile().getAbsolutePath(),",
" caller.getLocation().getStartLine(),",
" caller.getQualifiedName(),",
" caller.getName(),",
" count(caller.getArg(_)),",
# --- Callee endpoint --- (file/line may live in a library stub;
# post-processor classifies as in-source or ghost)
" calleeVal.getScope().getLocation().getFile().getAbsolutePath(),",
" calleeVal.getScope().getLocation().getStartLine(),",
" callee.getLocation().getFile().getAbsolutePath(),",
" callee.getLocation().getStartLine(),",
" calleeVal.getQualifiedName(),",
" callee.getName(),",
" count(callee.getArg(_)),",
# --- Call-site location --- (for PyCallsite augmentation)
" call.getLocation().getStartLine(),",
" call.getLocation().getStartColumn(),",
Expand All @@ -149,9 +220,13 @@ def _query_call_edges(self) -> DataFrame:
"caller_file",
"caller_start_line",
"caller_qname",
"caller_name",
"caller_arity",
"callee_file",
"callee_start_line",
"callee_qname",
"callee_name",
"callee_arity",
"call_start_line",
"call_start_column",
"call_end_line",
Expand All @@ -162,24 +237,15 @@ def _query_call_edges(self) -> DataFrame:
return df

@staticmethod
def _build_callable_location_index(
def _build_callable_resolver(
symbol_table: Dict[str, PyModule],
) -> Dict[Tuple[str, int], "PyCallable"]:
"""Build ``(absolute_file_path, start_line) -> PyCallable`` from Jedi.
) -> _CallableResolver:
"""Build the endpoint -> ``PyCallable`` resolver from Jedi.

Paths are resolved so they match CodeQL's ``getAbsolutePath()``
regardless of symlinks or the current working directory.
"""
from codeanalyzer.schema.py_schema import PyCallable # local to avoid cycle

index: Dict[Tuple[str, int], PyCallable] = {}
for c in iter_callables_in_symbol_table(symbol_table):
try:
abs_path = str(Path(c.path).resolve())
except (OSError, RuntimeError):
abs_path = c.path
index[(abs_path, c.start_line)] = c
return index
return _CallableResolver.from_symbol_table(symbol_table)

def _iter_resolved_rows(
self, symbol_table: Dict[str, PyModule]
Expand All @@ -194,19 +260,27 @@ def _iter_resolved_rows(
df = self._query_call_edges()
if df.empty:
return
location_index = self._build_callable_location_index(symbol_table)
resolver = self._build_callable_resolver(symbol_table)

skipped_unknown_caller = 0
ghost_callees = 0
for row in df.itertuples(index=False):
caller_key = (row.caller_file, int(row.caller_start_line))
caller = location_index.get(caller_key)
caller = resolver.resolve(
row.caller_file,
int(row.caller_start_line),
row.caller_name,
int(row.caller_arity),
)
if caller is None:
skipped_unknown_caller += 1
continue

callee_key = (row.callee_file, int(row.callee_start_line))
callee = location_index.get(callee_key)
callee = resolver.resolve(
row.callee_file,
int(row.callee_start_line),
row.callee_name,
int(row.callee_arity),
)
if callee is not None:
target_sig = callee.signature
else:
Expand Down Expand Up @@ -267,20 +341,28 @@ def augment_call_sites(self, symbol_table: Dict[str, PyModule]) -> int:
Returns:
Number of ``PyCallsite`` entries augmented.
"""
location_index = self._build_callable_location_index(symbol_table)
resolver = self._build_callable_resolver(symbol_table)
df = self._query_call_edges()
if df.empty:
return 0

augmented = 0
for row in df.itertuples(index=False):
caller_key = (row.caller_file, int(row.caller_start_line))
caller = location_index.get(caller_key)
caller = resolver.resolve(
row.caller_file,
int(row.caller_start_line),
row.caller_name,
int(row.caller_arity),
)
if caller is None:
continue

callee_key = (row.callee_file, int(row.callee_start_line))
callee = location_index.get(callee_key)
callee = resolver.resolve(
row.callee_file,
int(row.callee_start_line),
row.callee_name,
int(row.callee_arity),
)
resolved_sig = callee.signature if callee is not None else row.callee_qname

call_start = int(row.call_start_line)
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "codeanalyzer-python"
version = "0.1.14"
version = "0.1.15"
description = "Static Analysis on Python source code using Jedi, CodeQL and Treesitter."
readme = "README.md"
authors = [
Expand Down