## Evaluating String Comparison Methods

### Installs

In [46]:
!pip install spacy fuzzywuzzy scikit-learn



### Imports

In [32]:
import spacy
import re

from sklearn.feature_extraction.text import TfidfVectorizer

try:
    nlp = spacy.load("en_core_web_lg")
except OSError:
    spacy.cli.download("en_core_web_lg")
    nlp = spacy.load("en_core_web_lg")


### Constants

In [19]:
alert = "Increase in FCIs on /api/v1/azure/status over the past 5 minutes"
changes = [
    "CHNG559732: Recent deployment on azuredatabasenodeserv for manifest ID azuredatabasenodeserv-062723230148699022",
    "CHNG494690: Recent deployment on legacycassandradbnodeserv for manifest ID legacycassandradbnodeserv-062723230148700108",
    "CHNG336829: Recent deployment on oldpostgresadapternodeweb for manifest ID oldpostgresadapternodeweb-062723230148703462"
]

### A) Compute Similarity Score using Spacy

In [23]:
# Calculate similarity with each change string
alert_doc = nlp(alert)
for change in changes:
    change_doc = nlp(change)
    similarity = alert_doc.similarity(change_doc)
    print(alert_doc, "<->", change_doc)
    print("Similarity:", similarity)
    print()


Increase in FCIs on /api/v1/azure/status over the past 5 minutes <-> CHNG559732: Recent deployment on azuredatabasenodeserv for manifest ID azuredatabasenodeserv-062723230148699022
Similarity: 0.5555793900741783

Increase in FCIs on /api/v1/azure/status over the past 5 minutes <-> CHNG494690: Recent deployment on legacycassandradbnodeserv for manifest ID legacycassandradbnodeserv-062723230148700108
Similarity: 0.5555793900741783

Increase in FCIs on /api/v1/azure/status over the past 5 minutes <-> CHNG336829: Recent deployment on oldpostgresadapternodeweb for manifest ID oldpostgresadapternodeweb-062723230148703462
Similarity: 0.5555793900741783



### B) Extract Entity Relationships w/ Spacy & Fuzzywuzzy

In [29]:
import spacy
from fuzzywuzzy import fuzz

def extract_entities(text):
    nlp = spacy.load("en_core_web_lg")
    doc = nlp(text)
    entities = [ent.text for ent in doc.ents]
    print(f"Entities extracted from {text}:\n{entities}\n")
    return entities

def find_relationships(alert, change_tickets):
    alert_entities = extract_entities(alert)
    relationships = []

    for ticket in change_tickets:
        ticket_entities = extract_entities(ticket)

        for alert_entity in alert_entities:
            for ticket_entity in ticket_entities:
                similarity = fuzz.ratio(alert_entity.lower(), ticket_entity.lower())
                if similarity >= 10:  # Adjust the similarity threshold as needed
                    relationships.append((alert_entity, ticket_entity, ticket))

    return relationships

def main():
    relationships = find_relationships(alert, changes)

    if relationships:
        print("\nRelationships found:")
        for alert_entity, ticket_entity, ticket in relationships:
            print(f"Entity from Alert: {alert_entity}")
            print(f"Entity from Ticket: {ticket_entity}")
            print(f"Change Ticket: {ticket}")
            print()
    else:
        print("\nNo relationships found.")

if __name__ == "__main__":
    main()


Entities extracted from Increase in FCIs on /api/v1/azure/status over the past 5 minutes:
['/api', 'the past 5 minutes']

Entities extracted from CHNG559732: Recent deployment on azuredatabasenodeserv for manifest ID azuredatabasenodeserv-062723230148699022:
['CHNG559732']

Entities extracted from CHNG494690: Recent deployment on legacycassandradbnodeserv for manifest ID legacycassandradbnodeserv-062723230148700108:
[]

Entities extracted from CHNG336829: Recent deployment on oldpostgresadapternodeweb for manifest ID oldpostgresadapternodeweb-062723230148703462:
['CHNG336829', 'oldpostgresadapternodeweb-062723230148703462']


Relationships found:
Entity from Alert: the past 5 minutes
Entity from Ticket: CHNG559732
Change Ticket: CHNG559732: Recent deployment on azuredatabasenodeserv for manifest ID azuredatabasenodeserv-062723230148699022

Entity from Alert: the past 5 minutes
Entity from Ticket: CHNG336829
Change Ticket: CHNG336829: Recent deployment on oldpostgresadapternodeweb for m

### C) Using Spacy filters to identify tags

In [49]:
from spacy.lang.en import English

nlp = English()
ruler = nlp.add_pipe("entity_ruler")
patterns = [
    {
        "label": "ORG", 
        "pattern": [{
            "LOWER": { "REGEX": "[Pp]ay[Pp]al?" }
        }]
    },
    {
        "label": "APP", 
        "pattern": [{
            "TEXT" : { "REGEX": r"\b(serv|nodeserv|nodeweb)\b" } # This acts weird
        }]
    }
]

ruler.add_patterns(patterns)

doc = nlp(alert)
print([(ent.text, ent.label_) for ent in doc.ents])

[('Increase', 'APP'), ('in', 'APP'), ('FCIs', 'APP'), ('on', 'APP'), ('/api', 'ORG'), ('/', 'APP'), ('v1', 'APP'), ('/', 'APP'), ('azure', 'APP'), ('/', 'APP'), ('status', 'APP'), ('over', 'APP'), ('the past 5 minutes', 'TIME')]
