Skip to content

Commit

Permalink
Adjective rules
Browse files Browse the repository at this point in the history
  • Loading branch information
frreiss committed Feb 18, 2020
1 parent 6937db8 commit f9eeeb2
Show file tree
Hide file tree
Showing 2 changed files with 67 additions and 18 deletions.
58 changes: 45 additions & 13 deletions pandas_text/gremlin.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,19 +216,25 @@ def as_(self, *names: str) -> "GraphTraversal":
"""
return AsTraversal(self, names)

def out(self) -> "GraphTraversal":
def out(self, *edge_types: str) -> "GraphTraversal":
"""
:param edge_types: 0 or more names of types of edges.
Zero types means "all edge types".
:returns: A GraphTraversal that adds the destination of any edges out
of the current traversal's last element.
"""
return OutTraversal(self)
return OutTraversal(self, edge_types)

def in_(self) -> "GraphTraversal":
def in_(self, *edge_types: str) -> "GraphTraversal":
"""
:param edge_types: 0 or more names of types of edges.
Zero types means "all edge types".
:returns: A GraphTraversal that adds the source of any edges into
the current traversal's last element.
"""
return InTraversal(self)
return InTraversal(self, edge_types)

def select(self, *args) -> "GraphTraversal":
"""
Expand Down Expand Up @@ -646,8 +652,9 @@ class OutTraversal(UnaryTraversal):
"""Result of calling GraphTraversal.out()"""
# TODO: This class ought to be combined with InTraversal, but currently
# they are separate as a workaround for some puzzling behavior of pd.merge
def __init__(self, parent: GraphTraversal):
def __init__(self, parent: GraphTraversal, edge_types: Tuple[str]):
UnaryTraversal.__init__(self, parent)
self._edge_types = edge_types

def compute_impl(self) -> None:
if self.parent.step_types[-1] != "v":
Expand All @@ -656,11 +663,16 @@ def compute_impl(self) -> None:
"vertex. Last element type is {}".format(
self.parent.step_types[-1]))

# Column of path is a list of vertices. Join with edges table.
# Last column of path is a list of vertices. Join with edges table.
p = self.parent.paths
edges = self.parent.edges
# Filter down to requested edge types if present
if len(self._edge_types) > 0:
edges = edges[edges["type"].isin(self._edge_types)]
edges = edges[["from", "to"]] # "type" col has served its purpose
new_paths = (
p
.merge(self.parent.edges, left_on=p.columns[-1], right_on="from")
.merge(edges, left_on=p.columns[-1], right_on="from")
.drop("from",
axis="columns") # merge keeps both sides of equijoin
.rename(columns={
Expand All @@ -671,8 +683,9 @@ def compute_impl(self) -> None:

class InTraversal(UnaryTraversal):
"""Result of calling GraphTraversal.in_()"""
def __init__(self, parent: GraphTraversal):
def __init__(self, parent: GraphTraversal, edge_types: Tuple[str]):
UnaryTraversal.__init__(self, parent)
self._edge_types = edge_types

def compute_impl(self) -> None:
if self.parent.step_types[-1] != "v":
Expand All @@ -682,11 +695,16 @@ def compute_impl(self) -> None:
self.parent.step_types[-1]))
# Last column of path is a list of vertices. Join with edges table.
merge_tmp = self.parent.paths.copy()
edges = self.parent.edges
# Filter down to requested edge types if present
if len(self._edge_types) > 0:
edges = edges[edges["type"].isin(self._edge_types)]
edges = edges[["from", "to"]] # "type" col has served its purpose
# pd.merge() doesn't like integer series names for join keys
merge_tmp["join_key"] = merge_tmp[merge_tmp.columns[-1]]
new_paths = (
merge_tmp
.merge(self.parent.edges, left_on="join_key", right_on="to")
.merge(edges, left_on="join_key", right_on="to")
.drop(["to", "join_key"], axis="columns")
.rename(columns={"from": len(self.parent.paths.columns)})
)
Expand Down Expand Up @@ -1542,7 +1560,9 @@ def lt(other):


def token_features_to_traversal(token_features: pd.DataFrame,
drop_self_links=True):
drop_self_links: bool = True,
link_cols: Iterable[str] = (
"head", "left", "right")):
"""
Turn a DataFrame of token features in the form returned by
`make_tokens_and_features` into an empty graph traversal.
Expand All @@ -1555,13 +1575,25 @@ def token_features_to_traversal(token_features: pd.DataFrame,
:param drop_self_links: If `True`, remove links from nodes to themselves
to simplify query logic.
:param link_cols: Names of the columns to treat as links, if present.
This function will ignore any name in this list that doesn't match a
column name in `token_features`.
:returns: A traversal containing a graph version of `token_features` and
an empty set of paths.
"""
valid_link_cols = set(link_cols).intersection(token_features.columns)
# Don't include token IDs in the vertex attributes
vertices = token_features.drop(["token_num", "head_token_num"], axis=1)
edges = pd.DataFrame(
{"from": token_features.index, "to": token_features["head_token_num"]})
vertices = token_features.drop(["token_num"] + list(valid_link_cols),
axis=1)
# Add edges for every column name in link_cols that is present.
edges_list = []
for name in valid_link_cols:
df = pd.DataFrame(
{"from": token_features.index, "to": token_features[name],
"type": name})
edges_list.append(df[~df["to"].isnull()])
edges = pd.concat(edges_list)
if drop_self_links:
edges = edges[edges["from"] != edges["to"]]
paths = pd.DataFrame()
Expand Down
27 changes: 22 additions & 5 deletions pandas_text/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,16 @@ def make_tokens(target_text: str,


def make_tokens_and_features(target_text: str,
language_model: spacy.language.Language
) -> pd.DataFrame:
language_model: spacy.language.Language,
add_left_and_right=False) -> pd.DataFrame:
"""
:param target_text: Text to analyze
:param language_model: Preconfigured spaCy language model object
:param add_left_and_right: If `True`, add columns "left" and "right"
containing references to previous and next tokens.
:return: The tokens of the text plus additional linguistic features that the
language model generates, represented as a `pd.DataFrame`.
"""
Expand Down Expand Up @@ -78,20 +83,32 @@ def categorical_hack(values):
# TODO: Replace references to categorical_hack with pd.Categorical when the
# bug is fixed.

return pd.DataFrame({
df_cols = {
"token_num": range(len(tok_begins)),
"char_span": tokens_series,
"token_span": token_spans,
"lemma": [t.lemma_ for t in spacy_doc],
"pos": categorical_hack([str(t.pos_) for t in spacy_doc]),
"tag": categorical_hack([str(t.tag_) for t in spacy_doc]),
"dep": categorical_hack([str(t.dep_) for t in spacy_doc]),
"head_token_num": np.array([idx_to_id[t.head.idx] for t in spacy_doc]),
"head": np.array([idx_to_id[t.head.idx] for t in spacy_doc]),
"shape": categorical_hack([t.shape_ for t in spacy_doc]),
"is_alpha": np.array([t.is_alpha for t in spacy_doc]),
"is_stop": np.array([t.is_stop for t in spacy_doc]),
"sentence": _make_sentences_series(spacy_doc, tokens_array)
})
}

if add_left_and_right:
# Use nullable int type because these columns contain nulls
df_cols["left"] = pd.array(
[None] + list(range(len(tok_begins) - 1)), dtype=pd.Int32Dtype()
)
df_cols["right"] = pd.array(
list(range(1, len(tok_begins))) + [None], dtype=pd.Int32Dtype()
)


return pd.DataFrame(df_cols)


def _make_sentences_series(spacy_doc: spacy.tokens.doc.Doc,
Expand Down

0 comments on commit f9eeeb2

Please sign in to comment.