Skip to content

Commit

Permalink
Merge pull request #44 from crawlab-team/develop
Browse files Browse the repository at this point in the history
Develop
  • Loading branch information
tikazyq committed Apr 22, 2023
2 parents df5254b + 8edcee3 commit 96ca673
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 14 deletions.
16 changes: 11 additions & 5 deletions webspot/detect/detectors/plain_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def __init__(
score_threshold: float = 1.,
sample_item_nodes: int = 10,
min_item_nodes: int = 5,
node2vec_ratio: float = 1.,
node2vec_ratio: float = 10.,
result_name_prefix: str = 'List',
text_length_discount: float = 0.1,
max_text_length: float = 2048,
Expand Down Expand Up @@ -129,14 +129,21 @@ def html_base64(self):
def results_base64(self):
return base64.b64encode(json.dumps(self.results).encode('utf-8')).decode('utf-8')

@property
def pruned_nodes_features(self):
    """Node-feature matrix with single-occurrence feature columns removed.

    Converts the graph loader's node-feature tensor to a NumPy array and
    keeps only those feature columns whose total count over all nodes is
    greater than 1.

    Returns:
        np.ndarray: the feature matrix restricted to the surviving columns.
    """
    feats = self.graph_loader.nodes_features_tensor.detach().numpy()
    # Column-wise totals; a column survives only when its feature occurs
    # in more than one node.
    keep_cols = np.argwhere(feats.sum(axis=0) > 1).T[0]
    return feats[:, keep_cols]

def _get_nodes_features_tags_attrs(self, nodes_idx: np.ndarray = None):
"""
nodes features (tags + attributes)
"""
if nodes_idx is None:
features = self.graph_loader.nodes_features_tensor.detach().numpy()
features = self.pruned_nodes_features
else:
features = self.graph_loader.nodes_features_tensor.detach().numpy()[nodes_idx].sum(axis=1)
features = self.pruned_nodes_features[nodes_idx].sum(axis=1)

return normalize(
features,
Expand All @@ -153,8 +160,7 @@ def _get_nodes_features_node2vec(self, nodes_idx: np.ndarray = None):
else:
embedded_tensor = self.graph_loader.nodes_embedded_tensor[nodes_idx.T[0]]

embedded_features_tensor = self.graph_loader.nodes_features_tensor[embedded_tensor].sum(dim=1)
embedded_features = embedded_features_tensor.detach().numpy()
embedded_features = self.pruned_nodes_features[embedded_tensor].sum(axis=1)

return normalize(
embedded_features,
Expand Down
16 changes: 8 additions & 8 deletions webspot/test/detect/test_plain_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,14 +55,14 @@
}
}
},
# {
# 'url': 'https://github.com/crawlab-team/crawlab/actions',
# 'result': {
# 'selectors': {
# 'list': '#partial-actions-workflow-runs'
# }
# }
# },
{
'url': 'https://github.com/crawlab-team/crawlab/actions',
'result': {
'selectors': {
'list': '#partial-actions-workflow-runs'
}
}
},
# {
# 'url': 'https://cuiqingcai.com',
# 'result': {
Expand Down
8 changes: 7 additions & 1 deletion webspot/web/routes/api/link.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,8 +78,11 @@ async def requests(payload: RequestPayload = Body(
link_list = []
text_length = 0

# items elements
el_items = graph_loader.soup.select(list_items_selector)

# iterate items
for el_item in graph_loader.soup.select(list_items_selector):
for el_item in el_items:
link_list_result_link = Link()
el_item_field = el_item.select_one(f['selector'])
if not el_item_field:
Expand All @@ -92,6 +95,9 @@ async def requests(payload: RequestPayload = Body(
link_list.append(link_list_result_link)
text_length += len(link_list_result_link.text)

# if text_length / len(el_items) > 80:
# continue

text_lengths.append(text_length)
link_list_result_list.append(link_list)

Expand Down

0 comments on commit 96ca673

Please sign in to comment.