Skip to content

Commit

Permalink
YR/Remove-fields-with-one-letter-DBotFindSimilarIncidents/XSUP-29299 (#…
Browse files Browse the repository at this point in the history
…31161)

* fixes

* code and test

* remove Json feed from this pr

* test

* note

* pre commit

* RN

* CR and Flake8

* format

* pre commit
  • Loading branch information
RosenbergYehuda committed Nov 30, 2023
1 parent e6f3251 commit a12b00a
Show file tree
Hide file tree
Showing 6 changed files with 97 additions and 47 deletions.
6 changes: 6 additions & 0 deletions Packs/Base/ReleaseNotes/1_32_51.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@

#### Scripts

##### DBotFindSimilarIncidents

Improved implementation to prevent error messages by excluding fields with fewer than two letters from entering the calculation model.
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import json
import pandas as pd
from scipy.spatial.distance import cdist
from typing import Any, List, Dict, Union
from typing import Any

warnings.simplefilter("ignore")
warnings.filterwarnings('ignore', category=UserWarning)
Expand Down Expand Up @@ -60,7 +60,7 @@
REPLACE_COMMAND_LINE = {"=": " = ", "\\": "/", "[": "", "]": "", '"': "", "'": "", }


def keep_high_level_field(incidents_field: list[str]) -> list[str]:
    """
    Keep only the first-level (top) part of each incident field name.

    For example, 'xdralert.commandline' becomes 'xdralert'; a name with no
    dot is returned unchanged.

    :param incidents_field: list of incident field names
    :return: list of field names truncated to their first level
    """
    # str.split('.')[0] is already the identity for names without a dot,
    # so the original `if '.' in x` conditional is redundant.
    return [field.split('.')[0] for field in incidents_field]


def wrapped_list(obj: List) -> List:
def wrapped_list(obj: list) -> list:
"""
Wrapped object into a list if not list
:param obj:
Expand All @@ -80,7 +80,7 @@ def wrapped_list(obj: List) -> List:
return obj


def preprocess_incidents_field(incidents_field: str, prefix_to_remove: List[str]) -> str:
def preprocess_incidents_field(incidents_field: str, prefix_to_remove: list[str]) -> str:
"""
Remove prefixes from incident fields
:param incidents_field: field
Expand All @@ -103,13 +103,13 @@ def check_list_of_dict(obj) -> bool: # type: ignore
return bool(obj) and all(isinstance(elem, dict) for elem in obj) # type: ignore


def remove_duplicates(seq: list[str]) -> list[str]:
    """
    Return *seq* with duplicates removed, preserving first-seen order.

    :param seq: list of strings, possibly containing duplicates
    :return: de-duplicated list in original order
    """
    # dict preserves insertion order (Python 3.7+), so this is an
    # order-stable O(n) de-duplication without the seen-set/seen_add trick.
    return list(dict.fromkeys(seq))


def recursive_filter(item: Union[List[Dict], Dict], regex_patterns: List, *fieldsToRemove):
def recursive_filter(item: list[dict] | dict, regex_patterns: list, *fieldsToRemove):
"""
:param item: Dict of list of Dict
Expand Down Expand Up @@ -157,7 +157,7 @@ def normalize_json(obj) -> str: # type: ignore
if isinstance(obj, str):
obj = json.loads(obj)
if check_list_of_dict(obj):
obj = {k: v for k, v in enumerate(obj)}
obj = dict(enumerate(obj))
if not isinstance(obj, dict):
return " "
my_dict = recursive_filter(obj, REGEX_DATE_PATTERN, "None", "N/A", None, "")
Expand Down Expand Up @@ -187,19 +187,19 @@ def normalize_command_line(command: str) -> str:
return ''


def fill_nested_fields(incidents_df: pd.DataFrame, incidents: pd.DataFrame, *list_of_field_list: List[str]) -> \
def fill_nested_fields(incidents_df: pd.DataFrame, incidents: pd.DataFrame, *list_of_field_list: list[str]) -> \
pd.DataFrame:
for field_type in list_of_field_list:
for field in field_type:
if '.' in field:
if isinstance(incidents, list):
value_list = [wrapped_list(demisto.dt(incident, field)) for incident in incidents]
value_list = [' '.join(set(list(filter(lambda x: x not in ['None', None, 'N/A'], x)))) for x in
value_list = [' '.join(set(filter(lambda x: x not in ['None', None, 'N/A'], x))) for x in
value_list]
else:
value_list = wrapped_list(demisto.dt(incidents, field))
value_list = ' '.join( # type: ignore
set(list(filter(lambda x: x not in ['None', None, 'N/A'], value_list)))) # type: ignore
set(filter(lambda x: x not in ['None', None, 'N/A'], value_list))) # type: ignore
incidents_df[field] = value_list
return incidents_df

Expand Down Expand Up @@ -392,15 +392,15 @@ def init_prediction(self, p_incident_to_match, p_incidents_df, p_field_for_comma
self.field_for_json = p_field_for_json

def predict(self):
    """
    Run the full similarity pipeline on the loaded incident.

    Drops empty/too-short/unusable fields first, then computes and
    aggregates the similarity scores.

    :return: tuple of (display-ready result from prepare_for_display,
             combined list of the fields that remained usable)
    """
    self.remove_empty_or_short_fields()
    self.get_score()
    self.compute_final_score()
    return self.prepare_for_display(), self.field_for_command_line + self.field_for_potential_exact_match + \
        self.field_for_json

def remove_empty_field(self):
def remove_empty_or_short_fields(self):
"""
Remove field where value if empty or unusable or does not exist in the incident...
Remove field where value is empty or is shorter than 2 characters or unusable or does not exist in the incident.
:return:
"""
remove_list = []
Expand All @@ -410,6 +410,7 @@ def remove_empty_field(self):
or (not isinstance(self.incident_to_match[field].values[0], str) and not isinstance(
self.incident_to_match[field].values[0], list)) \
or self.incident_to_match[field].values[0] == 'None' \
or len(self.incident_to_match[field].values[0]) < 2 \
or self.incident_to_match[field].values[0] == 'N/A':
remove_list.append(field)
self.field_for_command_line = [x for x in self.field_for_command_line if x not in remove_list]
Expand All @@ -418,6 +419,7 @@ def remove_empty_field(self):
for field in self.field_for_potential_exact_match:
if field not in self.incident_to_match.columns or not self.incident_to_match[field].values[
0] or not isinstance(self.incident_to_match[field].values[0], str) or \
len(self.incident_to_match[field].values[0]) < 2 or \
self.incident_to_match[field].values[0] == 'None' or self.incident_to_match[field].values[
0] == 'N/A':
remove_list.append(field)
Expand All @@ -426,10 +428,12 @@ def remove_empty_field(self):
remove_list = []
for field in self.field_for_json:
if field not in self.incident_to_match.columns or not self.incident_to_match[field].values[
0] or self.incident_to_match[field].values[0] == 'None' or self.incident_to_match[field].values[
0] == 'N/A' or all(not x for x in self.incident_to_match[field].values[0]):
0] or self.incident_to_match[field].values[0] == 'None' \
or len(self.incident_to_match[field].values[0]) < 2 \
or self.incident_to_match[field].values[0] == 'N/A' \
or all(not x for x in self.incident_to_match[field].values[0]):
remove_list.append(field)
self.field_for_json = [x for x in self.field_for_json if x not in remove_list]
self.field_for_json = [x for x in self.field_for_json if x not in remove_list]

def get_score(self):
"""
Expand Down Expand Up @@ -479,7 +483,7 @@ def return_clean_date(timestamp: str) -> str:


def prepare_incidents_for_display(similar_incidents: pd.DataFrame, confidence: float, show_distance: bool, max_incidents: int,
fields_used: List[str],
fields_used: list[str],
aggregate: str, include_indicators_similarity: bool) -> pd.DataFrame:
"""
Organize data
Expand All @@ -493,14 +497,14 @@ def prepare_incidents_for_display(similar_incidents: pd.DataFrame, confidence: f
:return: Clean Dataframe
"""
if 'id' in similar_incidents.columns.tolist():
similar_incidents[COLUMN_ID] = similar_incidents['id'].apply(lambda _id: "[%s](#/Details/%s)" % (_id, _id))
similar_incidents[COLUMN_ID] = similar_incidents['id'].apply(lambda _id: f"[{_id}](#/Details/{_id})")
if COLUMN_TIME in similar_incidents.columns:
similar_incidents[COLUMN_TIME] = similar_incidents[COLUMN_TIME].apply(lambda x: return_clean_date(x))
if aggregate == 'True':
agg_fields = [x for x in similar_incidents.columns if x not in FIELDS_NO_AGGREGATION]
similar_incidents = similar_incidents.groupby(agg_fields, as_index=False, dropna=False).agg(
{
COLUMN_TIME: lambda x: "%s -> %s" % (min(filter(None, x)), max(filter(None, x))) if len(x) > 1 else x,
COLUMN_TIME: lambda x: f"{min(filter(None, x))} -> {max(filter(None, x))}" if len(x) > 1 else x,
'id': lambda x: ' , '.join(x),
COLUMN_ID: lambda x: ' , '.join(x),
}
Expand All @@ -510,7 +514,7 @@ def prepare_incidents_for_display(similar_incidents: pd.DataFrame, confidence: f
similar_incidents = similar_incidents[similar_incidents[SIMILARITY_COLUNM_NAME] >= confidence]
if show_distance == 'False':
col_to_remove = ['similarity %s' % field for field in fields_used]
similar_incidents.drop(col_to_remove, axis=1, inplace=True)
similar_incidents = similar_incidents.drop(col_to_remove, axis=1)
if include_indicators_similarity == "True":
similar_incidents = similar_incidents.sort_values(by=ORDER_SCORE_WITH_INDICATORS, ascending=False)
else:
Expand All @@ -519,7 +523,7 @@ def prepare_incidents_for_display(similar_incidents: pd.DataFrame, confidence: f
return similar_incidents.head(max_incidents)


def get_incident_by_id(incident_id: str, populate_fields: List[str], from_date: str, to_date: str):
def get_incident_by_id(incident_id: str, populate_fields: list[str], from_date: str, to_date: str):
"""
Get incident according to incident ID
:param incident_id:
Expand All @@ -546,8 +550,8 @@ def get_incident_by_id(incident_id: str, populate_fields: List[str], from_date:
return incident[0]


def get_all_incidents_for_time_window_and_exact_match(exact_match_fields: List[str], populate_fields: List[str],
incident: Dict, from_date: str, to_date: str,
def get_all_incidents_for_time_window_and_exact_match(exact_match_fields: list[str], populate_fields: list[str],
incident: dict, from_date: str, to_date: str,
query_sup: str, limit: int):
"""
Get incidents for a time window and exact match for some fields
Expand All @@ -566,7 +570,7 @@ def get_all_incidents_for_time_window_and_exact_match(exact_match_fields: List[s
if exact_match_field not in incident.keys():
msg += "%s \n" % MESSAGE_NO_FIELD % exact_match_field
else:
exact_match_fields_list.append('%s: "%s"' % (exact_match_field, incident[exact_match_field]))
exact_match_fields_list.append(f'{exact_match_field}: "{incident[exact_match_field]}"')
query = " AND ".join(exact_match_fields_list)
query += " AND -id:%s " % incident['id']
if query_sup:
Expand All @@ -593,7 +597,7 @@ def get_all_incidents_for_time_window_and_exact_match(exact_match_fields: List[s
return incidents, msg


def extract_fields_from_args(arg: list[str]) -> list[str]:
    """
    Normalize a list of raw field names and drop duplicates.

    Each non-empty entry is stripped and has known prefixes removed via
    preprocess_incidents_field; first-seen order is preserved.

    :param arg: raw field names (may contain empty strings, which are skipped)
    :return: cleaned, de-duplicated field names
    """
    fields_list = [preprocess_incidents_field(x.strip(), PREFIXES_TO_REMOVE) for x in arg if x]
    # dict.fromkeys keeps insertion order while removing duplicates
    return list(dict.fromkeys(fields_list))

Expand Down Expand Up @@ -644,7 +648,7 @@ def get_args(): # type: ignore
show_actual_incident, incident_id, include_indicators_similarity


def load_current_incident(incident_id: str, populate_fields: List[str], from_date: str, to_date: str):
def load_current_incident(incident_id: str, populate_fields: list[str], from_date: str, to_date: str):
"""
Load current incident if incident_id given or load current incident investigated
:param incident_id: incident_id
Expand Down Expand Up @@ -676,7 +680,7 @@ def remove_fields_not_in_incident(*args, incorrect_fields):
return [[x for x in field_type if x not in incorrect_fields] for field_type in args]


def get_similar_incidents_by_indicators(args: Dict):
def get_similar_incidents_by_indicators(args: dict):
"""
Use DBotFindSimilarIncidentsByIndicators automation and return similar incidents from the automation
:param args: argument for DBotFindSimilarIncidentsByIndicators automation
Expand All @@ -698,21 +702,21 @@ def get_data_from_indicators_automation(res, TAG_SCRIPT_INDICATORS_VALUE):
return None


def dumps_json_field_in_incident(incident: dict):
    """
    JSON-serialize every dict-valued field of the incident, then convert
    the incident into a single-row DataFrame.

    :param incident: dict representing the incident
    :return: one-row pd.DataFrame whose columns are the incident's fields
    """
    for field in incident:
        if isinstance(incident[field], dict):
            # NOTE: mutates the input in place — nested dicts become JSON strings
            incident[field] = json.dumps(incident[field])
    incident_df = pd.DataFrame.from_dict(incident, orient='index').T
    return incident_df


def return_outputs_summary(confidence: float, number_incident_fetched: int, number_incidents_found: int,
fields_used: List[str], global_msg: str) -> None:
fields_used: list[str], global_msg: str) -> None:
"""
Return entry for summary of the automation - Give information about the automation run
:param confidence: confidence level given by the user
Expand Down Expand Up @@ -752,8 +756,8 @@ def create_context_for_incidents(similar_incidents=pd.DataFrame()):


def return_outputs_similar_incidents(show_actual_incident: bool, current_incident: pd.DataFrame,
similar_incidents: pd.DataFrame, context: Dict,
tag: Union[str, None] = None):
similar_incidents: pd.DataFrame, context: dict,
tag: str | None = None):
"""
Return entry and context for similar incidents
:param show_actual_incident: Boolean if showing the current incident
Expand Down Expand Up @@ -798,11 +802,11 @@ def return_outputs_similar_incidents(show_actual_incident: bool, current_inciden
"EntryContext": {'DBotFindSimilarIncidents': context},
}
if tag is not None:
return_entry["Tags"] = ['SimilarIncidents_{}'.format(tag)]
return_entry["Tags"] = [f'SimilarIncidents_{tag}']
demisto.results(return_entry)


def find_incorrect_fields(populate_fields: List[str], incidents_df: pd.DataFrame, global_msg: str):
def find_incorrect_fields(populate_fields: list[str], incidents_df: pd.DataFrame, global_msg: str):
"""
Check Field that appear in populate_fields but are not in the incidents_df and return message
:param populate_fields: List of fields
Expand Down Expand Up @@ -839,7 +843,7 @@ def return_outputs_similar_incidents_empty():
outputs={'DBotFindSimilarIncidents': create_context_for_incidents()})


def enriched_with_indicators_similarity(full_args_indicators_script: Dict, similar_incidents: pd.DataFrame):
def enriched_with_indicators_similarity(full_args_indicators_script: dict, similar_incidents: pd.DataFrame):
"""
Take DataFrame of similar_incidents and args for indicators script and add information about indicators
to similar_incidents
Expand All @@ -860,9 +864,9 @@ def enriched_with_indicators_similarity(full_args_indicators_script: Dict, simil
return similar_incidents


def prepare_current_incident(incident_df: pd.DataFrame, display_fields: List[str], similar_text_field: List[str],
similar_json_field: List[str], similar_categorical_field: List[str],
exact_match_fields: List[str]) -> pd.DataFrame:
def prepare_current_incident(incident_df: pd.DataFrame, display_fields: list[str], similar_text_field: list[str],
similar_json_field: list[str], similar_categorical_field: list[str],
exact_match_fields: list[str]) -> pd.DataFrame:
"""
Prepare current incident for visualization
:param incident_df: incident_df
Expand All @@ -880,7 +884,7 @@ def prepare_current_incident(incident_df: pd.DataFrame, display_fields: List[str
if COLUMN_TIME in incident_filter.columns.tolist():
incident_filter[COLUMN_TIME] = incident_filter[COLUMN_TIME].apply(lambda x: return_clean_date(x))
if 'id' in incident_filter.columns.tolist():
incident_filter[COLUMN_ID] = incident_filter['id'].apply(lambda _id: "[%s](#/Details/%s)" % (_id, _id))
incident_filter[COLUMN_ID] = incident_filter['id'].apply(lambda _id: f"[{_id}](#/Details/{_id})")
return incident_filter


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,13 @@ args:
- description: Incident ID to get the prediction of. If empty, predicts the current incident ID.
name: incidentId
- auto: PREDEFINED
description: 'Comma-separated list of incident text fields to take into account when computing similarity. For example: commandline, URL.'
description: 'Comma-separated list of incident text fields to take into account when computing similarity. For example: commandline, URL. Note: In order to calculate similarity, fields must consist of a minimum of 2 letters.'
name: similarTextField
- auto: PREDEFINED
description: 'Comma-separated list of incident categorical fields to take into account whe computing similarity. For example: IP, URL.'
description: 'Comma-separated list of incident categorical fields to take into account when computing similarity. For example: IP, URL. Note: In order to calculate similarity, fields must consist of a minimum of 2 letters.'
name: similarCategoricalField
- auto: PREDEFINED
description: 'Comma-separated list of incident JSON fields to take into account whe computing similarity. For example: CustomFields.'
description: 'Comma-separated list of incident JSON fields to take into account when computing similarity. For example: CustomFields. Note: In order to calculate similarity, fields must consist of a minimum of 2 letters.'
name: similarJsonField
- auto: PREDEFINED
description: Comma-separated list of additional incident fields to display, but which will not be taken into account when computing similarity.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,9 @@
preprocess_incidents_field, PREFIXES_TO_REMOVE, check_list_of_dict, REGEX_IP, match_one_regex, \
SIMILARITY_COLUNM_NAME_INDICATOR, SIMILARITY_COLUNM_NAME, euclidian_similarity_capped, find_incorrect_fields, \
MESSAGE_NO_INCIDENT_FETCHED, MESSAGE_INCORRECT_FIELD, MESSAGE_WARNING_TRUNCATED, COLUMN_ID, COLUMN_TIME, \
TAG_SCRIPT_INDICATORS
TAG_SCRIPT_INDICATORS, Model

import pytest
import json
import numpy as np
import pandas as pd
Expand Down Expand Up @@ -344,3 +345,42 @@ def test_build_message_of_values():

baz = ['baz1', 'baz2']
assert build_message_of_values([foo, bar, baz]) == "foo_value; bar_value; ['baz1', 'baz2']"


@pytest.fixture
def sample_data():
    """Single-row incident frame mixing one valid field ('created') with
    too-short, list, None, 'N/A' and empty values for filtering tests."""
    return pd.DataFrame({
        'created': ["2019-02-20T15:47:23.962164+02:00"],
        'Name': ["t"],
        'Id': [["123"]],
        'test': [None],
        'xdralerts': ['N/A'],
        "test2": [""],
    })


# Candidate fields fed to the model; per the sample_data fixture, only
# 'created' holds a string of 2+ characters, so it alone should survive.
fields_to_match = ['created', 'Name', 'test', 'Id', 'test2', 'xdralerts']
expected_results = ['created']


def test_remove_empty_or_short_fields(sample_data):
    """
    Given:
        - sample_data: a dataframe with a column of strings
    When:
        - calling remove_empty_or_short_fields function
    Then:
        - assert that the function removes empty or short or None or 'N/A' or list objects fields
    """
    # Create an instance of Model
    my_instance = Model({})
    my_instance.incident_to_match = sample_data

    # Seed all three field groups with the same candidate list so each
    # filtering pass can be checked independently below.
    my_instance.field_for_command_line = fields_to_match
    my_instance.field_for_potential_exact_match = fields_to_match
    my_instance.field_for_json = fields_to_match

    my_instance.remove_empty_or_short_fields()
    assert my_instance.field_for_command_line == expected_results
    assert my_instance.field_for_potential_exact_match == expected_results
    assert my_instance.field_for_json == expected_results

0 comments on commit a12b00a

Please sign in to comment.