Skip to content

Commit

Permalink
Adjective rules
Browse files Browse the repository at this point in the history
  • Loading branch information
frreiss committed Feb 18, 2020
1 parent 6937db8 commit f9eeeb2
Show file tree
Hide file tree
Showing 2 changed files with 67 additions and 18 deletions.
58 changes: 45 additions & 13 deletions pandas_text/gremlin.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,19 +216,25 @@ def as_(self, *names: str) -> "GraphTraversal":
"""
return AsTraversal(self, names)

def out(self) -> "GraphTraversal":
def out(self, *edge_types: str) -> "GraphTraversal":
"""
:param edge_types: 0 or more names of types of edges.
Zero types means "all edge types".
:returns: A GraphTraversal that adds the destination of any edges out
of the current traversal's last element.
"""
return OutTraversal(self)
return OutTraversal(self, edge_types)

def in_(self) -> "GraphTraversal":
def in_(self, *edge_types: str) -> "GraphTraversal":
"""
:param edge_types: 0 or more names of types of edges.
Zero types means "all edge types".
:returns: A GraphTraversal that adds the source of any edges into
the current traversal's last element.
"""
return InTraversal(self)
return InTraversal(self, edge_types)

def select(self, *args) -> "GraphTraversal":
"""
Expand Down Expand Up @@ -646,8 +652,9 @@ class OutTraversal(UnaryTraversal):
"""Result of calling GraphTraversal.out()"""
# TODO: This class ought to be combined with InTraversal, but currently
# they are separate as a workaround for some puzzling behavior of pd.merge
def __init__(self, parent: GraphTraversal):
def __init__(self, parent: GraphTraversal, edge_types: Tuple[str]):
UnaryTraversal.__init__(self, parent)
self._edge_types = edge_types

def compute_impl(self) -> None:
if self.parent.step_types[-1] != "v":
Expand All @@ -656,11 +663,16 @@ def compute_impl(self) -> None:
"vertex. Last element type is {}".format(
self.parent.step_types[-1]))

# Column of path is a list of vertices. Join with edges table.
# Last column of path is a list of vertices. Join with edges table.
p = self.parent.paths
edges = self.parent.edges
# Filter down to requested edge types if present
if len(self._edge_types) > 0:
edges = edges[edges["type"].isin(self._edge_types)]
edges = edges[["from", "to"]] # "type" col has served its purpose
new_paths = (
p
.merge(self.parent.edges, left_on=p.columns[-1], right_on="from")
.merge(edges, left_on=p.columns[-1], right_on="from")
.drop("from",
axis="columns") # merge keeps both sides of equijoin
.rename(columns={
Expand All @@ -671,8 +683,9 @@ def compute_impl(self) -> None:

class InTraversal(UnaryTraversal):
"""Result of calling GraphTraversal.in_()"""
def __init__(self, parent: GraphTraversal):
def __init__(self, parent: GraphTraversal, edge_types: Tuple[str]):
UnaryTraversal.__init__(self, parent)
self._edge_types = edge_types

def compute_impl(self) -> None:
if self.parent.step_types[-1] != "v":
Expand All @@ -682,11 +695,16 @@ def compute_impl(self) -> None:
self.parent.step_types[-1]))
# Last column of path is a list of vertices. Join with edges table.
merge_tmp = self.parent.paths.copy()
edges = self.parent.edges
# Filter down to requested edge types if present
if len(self._edge_types) > 0:
edges = edges[edges["type"].isin(self._edge_types)]
edges = edges[["from", "to"]] # "type" col has served its purpose
# pd.merge() doesn't like integer series names for join keys
merge_tmp["join_key"] = merge_tmp[merge_tmp.columns[-1]]
new_paths = (
merge_tmp
.merge(self.parent.edges, left_on="join_key", right_on="to")
.merge(edges, left_on="join_key", right_on="to")
.drop(["to", "join_key"], axis="columns")
.rename(columns={"from": len(self.parent.paths.columns)})
)
Expand Down Expand Up @@ -1542,7 +1560,9 @@ def lt(other):


def token_features_to_traversal(token_features: pd.DataFrame,
drop_self_links=True):
drop_self_links: bool = True,
link_cols: Iterable[str] = (
"head", "left", "right")):
"""
Turn a DataFrame of token features in the form returned by
`make_tokens_and_features` into an empty graph traversal.
Expand All @@ -1555,13 +1575,25 @@ def token_features_to_traversal(token_features: pd.DataFrame,
:param drop_self_links: If `True`, remove links from nodes to themselves
to simplify query logic.
:param link_cols: Names of the columns to treat as links, if present.
This function will ignore any name in this list that doesn't match a
column name in `token_features`.
:returns: A traversal containing a graph version of `token_features` and
an empty set of paths.
"""
valid_link_cols = set(link_cols).intersection(token_features.columns)
# Don't include token IDs in the vertex attributes
vertices = token_features.drop(["token_num", "head_token_num"], axis=1)
edges = pd.DataFrame(
{"from": token_features.index, "to": token_features["head_token_num"]})
vertices = token_features.drop(["token_num"] + list(valid_link_cols),
axis=1)
# Add edges for every column name in link_cols that is present.
edges_list = []
for name in valid_link_cols:
df = pd.DataFrame(
{"from": token_features.index, "to": token_features[name],
"type": name})
edges_list.append(df[~df["to"].isnull()])
edges = pd.concat(edges_list)
if drop_self_links:
edges = edges[edges["from"] != edges["to"]]
paths = pd.DataFrame()
Expand Down
27 changes: 22 additions & 5 deletions pandas_text/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,16 @@ def make_tokens(target_text: str,


def make_tokens_and_features(target_text: str,
language_model: spacy.language.Language
) -> pd.DataFrame:
language_model: spacy.language.Language,
add_left_and_right=False) -> pd.DataFrame:
"""
:param target_text: Text to analyze
:param language_model: Preconfigured spaCy language model object
:param add_left_and_right: If `True`, add columns "left" and "right"
containing references to previous and next tokens.
:return: The tokens of the text plus additional linguistic features that the
language model generates, represented as a `pd.DataFrame`.
"""
Expand Down Expand Up @@ -78,20 +83,32 @@ def categorical_hack(values):
# TODO: Replace references to categorical_hack with pd.Categorical when the
# bug is fixed.

return pd.DataFrame({
df_cols = {
"token_num": range(len(tok_begins)),
"char_span": tokens_series,
"token_span": token_spans,
"lemma": [t.lemma_ for t in spacy_doc],
"pos": categorical_hack([str(t.pos_) for t in spacy_doc]),
"tag": categorical_hack([str(t.tag_) for t in spacy_doc]),
"dep": categorical_hack([str(t.dep_) for t in spacy_doc]),
"head_token_num": np.array([idx_to_id[t.head.idx] for t in spacy_doc]),
"head": np.array([idx_to_id[t.head.idx] for t in spacy_doc]),
"shape": categorical_hack([t.shape_ for t in spacy_doc]),
"is_alpha": np.array([t.is_alpha for t in spacy_doc]),
"is_stop": np.array([t.is_stop for t in spacy_doc]),
"sentence": _make_sentences_series(spacy_doc, tokens_array)
})
}

if add_left_and_right:
# Use nullable int type because these columns contain nulls
df_cols["left"] = pd.array(
[None] + list(range(len(tok_begins) - 1)), dtype=pd.Int32Dtype()
)
df_cols["right"] = pd.array(
list(range(1, len(tok_begins))) + [None], dtype=pd.Int32Dtype()
)


return pd.DataFrame(df_cols)


def _make_sentences_series(spacy_doc: spacy.tokens.doc.Doc,
Expand Down

0 comments on commit f9eeeb2

Please sign in to comment.