# Multilabel Labeling Functions (LFs)
In order to convert our single-labels to multilabels, we created labeling functions to assign multiple labels, rather than mutually exclusive single-labels.
These are rulebased however and need refinement. This turns our single-label dataset into a multi-label dataset, that can be used to finetune BERT to assign multiple labels at once (see next script for that)

In [1]:
import pandas as pd
import re
pd.set_option('display.max_colwidth', None) 
import numpy as np

from snorkel.labeling import labeling_function, PandasLFApplier, LFAnalysis
from snorkel.labeling.model import LabelModel
from snorkel.labeling import LabelingFunction, LFApplier

In [2]:
df_train = pd.read_csv("snorkel_iteration/train.tsv",sep="\t")

In [3]:
label_names = df_train["label"].unique().tolist()
label_names.sort()  # optional but ensures consistent order
#label_names

In [4]:
label_to_index = {}

In [5]:
LABEL_ABSTAIN = -1  # Snorkel's LABEL_ABSTAIN label

## Labeling Functions

In [6]:
'''
LABEL_REQUEST = label_to_index["request"]

@labeling_function()
def lf_request_query(x):
    # Regular expressions to capture requests to visit or stop by
    patterns = [
        r"would you",
        r"can you",
        r"could you",
        r"can you",
        r'can we',
    ]
    text = x if isinstance(x, str) else x.text
    text = text.lower()  # Ensure matching is case-insensitive
    if any(re.search(pattern, text) for pattern in patterns):
        return LABEL_REQUEST  # Positive label for visit request
    else:
        return LABEL_ABSTAIN  # Negative label if no visit request is found

'''

'\nLABEL_REQUEST = label_to_index["request"]\n\n@labeling_function()\ndef lf_request_query(x):\n    # Regular expressions to capture requests to visit or stop by\n    patterns = [\n        r"would you",\n        r"can you",\n        r"could you",\n        r"can you",\n        r\'can we\',\n    ]\n    text = x if isinstance(x, str) else x.text\n    text = text.lower()  # Ensure matching is case-insensitive\n    if any(re.search(pattern, text) for pattern in patterns):\n        return LABEL_REQUEST  # Positive label for visit request\n    else:\n        return LABEL_ABSTAIN  # Negative label if no visit request is found\n\n'

In [7]:
'''
LABEL_PROPOSAL = label_to_index["proposal"]

@labeling_function()
def lf_proposal_query(x):
    # Regular expressions to capture requests to visit or stop by
    patterns = [
        r"may I suggest",
        r'how about'
        r"let's"
    ]
    text = x if isinstance(x, str) else x.text
    text = text.lower()  # Ensure matching is case-insensitive
    if any(re.search(pattern, text) for pattern in patterns):
        return LABEL_PROPOSAL  # Positive label for visit request
    else:
        return LABEL_ABSTAIN  # Negative label if no visit request is found

'''

'\nLABEL_PROPOSAL = label_to_index["proposal"]\n\n@labeling_function()\ndef lf_proposal_query(x):\n    # Regular expressions to capture requests to visit or stop by\n    patterns = [\n        r"may I suggest",\n        r\'how about\'\n        r"let\'s"\n    ]\n    text = x if isinstance(x, str) else x.text\n    text = text.lower()  # Ensure matching is case-insensitive\n    if any(re.search(pattern, text) for pattern in patterns):\n        return LABEL_PROPOSAL  # Positive label for visit request\n    else:\n        return LABEL_ABSTAIN  # Negative label if no visit request is found\n\n'

In [8]:
LABEL_REQUEST_VISIT = len(label_to_index.items())
label_to_index['request_visit'] = LABEL_REQUEST_VISIT

@labeling_function()
def lf_request_visit(x):
    # Regular expressions to capture requests to visit or stop by
    patterns = [
        r"\bplease (come by|stop by|drop by)\b",
        r"\bcan you please (come|stop) by\b",
        r"\b(visit|see) me when you have a (minute|moment|chance)\b",
        r"\bplease visit\b",
        r"\bplease drop by\b",
        r"\bcome by as soon as possible\b",
        r"\bstop by my (office|desk)\b",
        r"\bcome by tomorrow\b"
    ]
    text = x if isinstance(x, str) else x.text
    text = text.lower()  # Ensure matching is case-insensitive
    if any(re.search(pattern, text) for pattern in patterns):
        return LABEL_REQUEST_VISIT  # Positive label for visit request
    else:
        return LABEL_ABSTAIN  # Negative label if no visit request is found



In [9]:
LABEL_REQUEST_VERIFY_INFORMATION = len(label_to_index.items())
label_to_index['request_verify_information'] = LABEL_REQUEST_VERIFY_INFORMATION


@labeling_function()
def lf_request_verify_information(x):
    # Regular expressions to capture verification requests
    patterns = [
        r"\bplease (check|verify|double check)\b.*",
        r"\b(check|verify|double check) if\b.*",
        r"\bmake sure\b.*is correct",
        r"\bcan you (please )?(check|verify)\b.*",
        r"\blet (us|me) know if\b.*is incorrect",
        r"\bcould you (please )?(check|verify)\b.*at your end\b"
    ]
    text = x if isinstance(x, str) else x.text
    text = text.lower()  # Ensure matching is case-insensitive
    if any(re.search(pattern, text) for pattern in patterns):
        return LABEL_REQUEST_VERIFY_INFORMATION  # Positive label for verification request
    else:
        return LABEL_ABSTAIN  # Negative label if no verification request is found



In [10]:
LABEL_REQUEST_URGENCY_WITH_DEADLINE = len(label_to_index.items())
label_to_index['request_urgency_with_deadline'] = LABEL_REQUEST_URGENCY_WITH_DEADLINE

@labeling_function()
def lf_request_urgency_with_deadline(x):
    # Regular expressions to capture urgency and deadline requests
    patterns = [
        r"\bplease (send|complete|have|update|reply|respond|get|present|pull) (this|it)? by \b.*?by\b.*\d{1,2}(:\d{2})? (am|pm)?",
        r"\b(can you|could you|I need this|I'd like) (to )?(have|get|see|receive|ensure|complete) (this)? by\b.*?\d{1,2}(:\d{2})? (am|pm|today|tomorrow|next week|Friday)?",
        r"\b(no later than|end of|before|latest by|deadline)\b.*?\d{1,2}(:\d{2})? (am|pm|o'clock)?",
        r"\b(get this done|finish|complete this task|resolve this|submit)\b.*?by\b.*?\d{1,2}(:\d{2})? (am|pm|today|tomorrow)",
        r"\bby\b.*?(\d{1,2}:\d{2}|\d{1,2}) (am|pm|o'clock)? on\b (Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)",
        r"deadline (is|of) \b.*?\d{1,2}(:\d{2})? (am|pm|today|tomorrow)"
    ]
    text = x if isinstance(x, str) else x.text
    text = text.lower()  # Ensure matching is case-insensitive
    if any(re.search(pattern, text) for pattern in patterns):
        return LABEL_REQUEST_URGENCY_WITH_DEADLINE # Positive label for urgency with deadline
    else:
        return LABEL_ABSTAIN  # Negative label if no urgency with deadline is found


In [11]:
LABEL_REQUEST_UPDATE_CONTACT_INFO = len(label_to_index.items())
label_to_index['request_update_contact_info'] = LABEL_REQUEST_UPDATE_CONTACT_INFO

@labeling_function()
def lf_request_update_contact_info(x):
    # Regular expressions to find phrases related to updating contact information
    patterns = [
        r"\bplease (update|change) (the )?(contact info|contact information|phone number|email|address|directory)",
        r"could you (please )?(update|change) (the )?(contact info|contact information|phone number|email|address|directory)",
        r"(updating|changing) (my )?(contact details|phone number|email|address)",
        r"please make sure (my )?(contact info|contact information|phone number|email|address) is updated",
        r"new contact (information|number|details) (is|are) [:a-zA-Z0-9._%-]+",
        r"\bplease add (the following )?(phone number|email address|contact) [:a-zA-Z0-9._%-]+"
    ]
    text = x if isinstance(x, str) else x.text 
    text = text.lower()  # Ensure matching is case-insensitive
    if any(re.search(pattern, text) for pattern in patterns):
        return LABEL_REQUEST_UPDATE_CONTACT_INFO  # Label for indicating request to update contact info
    else:
        return LABEL_ABSTAIN  # No request found

In [12]:
LABEL_REQUEST_STATUS_UPDATES = len(label_to_index.items())
label_to_index['request_status_updates'] = LABEL_REQUEST_STATUS_UPDATES

@labeling_function()
def lf_request_status_updates(x):
    phrases = [
        "please keep me posted",
        "keep me updated",
        "keep me in the loop",
        "please keep me up to date"
    ]
    text = x if isinstance(x, str) else x.text 
    text = text.lower()  # Assuming 'x' has a text field containing the sentence.
    return LABEL_REQUEST_STATUS_UPDATES if any(phrase in text for phrase in phrases) else LABEL_ABSTAIN


In [13]:
LABEL_START_BUILD = len(label_to_index.items())
label_to_index['request_start_build'] = LABEL_START_BUILD

@labeling_function()
def lf_start_build(x):
    text = x if isinstance(x, str) else x.text 
    text = text.lower()

    request = [
        "please","can you", "would you", "could you"
    ]
    action_terms = ["start", "restart",]
    context_terms = ["build"]

    mentions_request = any(term in text for term in request)
    mentions_action = any(term in text for term in action_terms)
    mentions_context = any(term in text for term in context_terms)

    return LABEL_START_BUILD if mentions_request and mentions_action and mentions_context else LABEL_ABSTAIN


In [14]:
LABEL_SPECIFIC_FORMAT = len(label_to_index.items())
label_to_index['request_specific_format'] = LABEL_SPECIFIC_FORMAT


@labeling_function()
def lf_specific_format(x):
    text = x if isinstance(x, str) else x.text 
    text = text.lower()

    format_terms = [
        "format", "formatted", "in word", "word document", "ppt", "powerpoint", 
        "pdf", "visio", "tiff", "eps", "plain text", "doc", "docx"
    ]

    action_terms = [
        "send", "email", "copy", "strip out", "use", "provide", "forward", "submit"
    ]

    mentions_format = any(term in text for term in format_terms)
    mentions_action = any(term in text for term in action_terms)

    return LABEL_SPECIFIC_FORMAT if mentions_format and mentions_action else LABEL_ABSTAIN


In [15]:
LABEL_SIGN_AND_APPROVAL = len(label_to_index.items())
label_to_index['sign_and_approval'] = LABEL_SIGN_AND_APPROVAL

@labeling_function()
def lf_sign_and_approval(x):
    text = x if isinstance(x, str) else x.text 
    text = text.lower()

    signature_terms = [
        "sign", "signature", "signed", "sign off", "approval", 
        "approve", "consent", "approve", "approval", "sign off", "sign-off", 
    ]

    has_signature = any(sig in text for sig in signature_terms)

    return LABEL_SIGN_AND_APPROVAL if has_signature else LABEL_ABSTAIN


In [16]:
LABEL_PRICING = len(label_to_index.items())
label_to_index['request_send_pricing_information'] = LABEL_PRICING

@labeling_function()
def lf_pricing_info_request(x):
    text = x if isinstance(x, str) else x.text 
    text = text.lower()

    request_terms = [
        "please send", "could you send", "can you provide", "let me know", "please inform", 
        "could you tell me", "do you have", "i was wondering", "please shoot me", 
        "could you find out", "please scope and provide"
    ]
    
    pricing_terms = [
        "price", "pricing", "quote", "quotes", "cost", "rates", 
        "rate", "expenses", "fees", "purchase details", 
        "upgrade options", "conditions"
    ]

    has_request = any(req in text for req in request_terms)
    has_pricing = any(p in text for p in pricing_terms)

    return LABEL_PRICING if has_request and has_pricing else LABEL_ABSTAIN


In [17]:
'''
LABEL_SEND_FILE = label_to_index["request_send_file"]

@labeling_function()
def lf_send_file_combo(x):
    text = x if isinstance(x, str) else x.text 
    text = text.lower()

    request_terms = ["can you send", "could you send", "please send", "would you send", "can you forward", "could you please send", "please find attached", "would you mind sending", "please forward"]
    file_terms = ["file", "attachment", "document", "jar", "xml", "html", "spreadsheet", "datasource", "schema", "pqa", "bmp", "logs", "image", "zip", "report", "chf", "palm", "cab", "altmon", "perl", "pdf"]

    has_request = any(req in text for req in request_terms)
    has_file = any(ft in text for ft in file_terms)

    return LABEL_SEND_FILE if has_request and has_file else LABEL_ABSTAIN
'''

'\nLABEL_SEND_FILE = label_to_index["request_send_file"]\n\n@labeling_function()\ndef lf_send_file_combo(x):\n    text = x if isinstance(x, str) else x.text \n    text = text.lower()\n\n    request_terms = ["can you send", "could you send", "please send", "would you send", "can you forward", "could you please send", "please find attached", "would you mind sending", "please forward"]\n    file_terms = ["file", "attachment", "document", "jar", "xml", "html", "spreadsheet", "datasource", "schema", "pqa", "bmp", "logs", "image", "zip", "report", "chf", "palm", "cab", "altmon", "perl", "pdf"]\n\n    has_request = any(req in text for req in request_terms)\n    has_file = any(ft in text for ft in file_terms)\n\n    return LABEL_SEND_FILE if has_request and has_file else LABEL_ABSTAIN\n'

In [18]:
LABEL_SEND_FEEDBACK = len(label_to_index.items())
label_to_index['request_send_feedback'] = LABEL_SEND_FEEDBACK


@labeling_function()
def lf_send_feedback_combo(x):
    text = x if isinstance(x, str) else x.text 
    text = text.lower()

    request_terms = ["please send", "can you give", "let me know", "please get back", "please review", "would you", "could you", "do you have", "share your", "send me"]
    feedback_terms = ["feedback", "suggestion", "comment", "thought", "idea", "opinion", "review", "how this sounds", "changes", "input"]

    has_request = any(req in text for req in request_terms)
    has_feedback = any(fb in text for fb in feedback_terms)

    return LABEL_SEND_FEEDBACK if has_request and has_feedback else LABEL_ABSTAIN


In [19]:
LABEL_SEND_DOCUMENT = len(label_to_index.items())
label_to_index['request_send_document'] = LABEL_SEND_DOCUMENT

@labeling_function()
def lf_send_document_combo(x):
    text = x if isinstance(x, str) else x.text
    text = text.lower()

    request_terms = ["please send", "can you send", "could you send", "would you send", "may i have", "email me", "provide me", "submit"]
    doc_terms = ["document", "presentation", "slides", "report", "resume", "cv", "list", "spreadsheet", "copy", "invoice", "sheet", "results", "attachment", "powerpoint"]

    has_request = any(req in text for req in request_terms)
    has_doc_term = any(term in text for term in doc_terms)

    return LABEL_SEND_DOCUMENT if has_request and has_doc_term else LABEL_ABSTAIN


In [20]:
LABEL_RUN_TEST = len(label_to_index.items())
label_to_index['request_run_test'] = LABEL_RUN_TEST

@labeling_function()
def lf_run_test_combo(x):
    text = x if isinstance(x, str) else x.text
    text = text.lower()

    request_words = ["please", "can you", "could you", "would you", "help me", "let me know"]
    test_terms = ["test", ]

    has_request = any(word in text for word in request_words)
    has_test_term = any(term in text for term in test_terms)

    return LABEL_RUN_TEST if has_request and has_test_term else LABEL_ABSTAIN


In [21]:
LABEL_RESCHEDULE = len(label_to_index.items())
label_to_index['request_reschedule'] = LABEL_RESCHEDULE

@labeling_function()
def lf_reschedule_combo(x):
    text = x if isinstance(x, str) else x.text 
    text = text.lower()

    action_terms = ["reschedule", "postpone", "push", "move", "delay", "change", "adjust", "shift"]
    time_terms = ["meeting", "call", "schedule", "time", "day", "slot", "session", "appointment"]

    has_action = any(term in text for term in action_terms)
    has_time = any(term in text for term in time_terms)

    return LABEL_RESCHEDULE if has_action and has_time else LABEL_ABSTAIN


In [22]:
LABEL_REPRO_STEPS = len(label_to_index.items())
label_to_index['request_reproduction_steps'] = LABEL_REPRO_STEPS

@labeling_function()
def lf_request_reproduction_steps(x):
    text = x if isinstance(x, str) else x.text 
    text = text.lower()

    group_a = ["reproduce", "reproduction", "replicate", "recreating", "simulate", "mimic"]
    group_b = ["error", "problem", "issue", "bug", "fault", "failure"]

    has_group_a = any(term in text for term in group_a)
    has_group_b = any(term in text for term in group_b)

    return LABEL_REPRO_STEPS if has_group_a and has_group_b else LABEL_ABSTAIN


In [23]:
LABEL_REMOVE = len(label_to_index.items())
label_to_index['request_removal_from_list'] = LABEL_REMOVE


@labeling_function()
def lf_request_removal_from_list(x):
    text = x if isinstance(x, str) else x.text
    text = text.lower()

    verbs = ["remove", "delete", "drop", "take me off", "unsubscribe"]
    targets = ["list", "mailing list", "email list", "distribution list", "contact list", "marketing list", "file"]

    has_verb = any(v in text for v in verbs)
    has_target = any(t in text for t in targets)

    return LABEL_REMOVE if has_verb and has_target else LABEL_ABSTAIN


In [24]:
LABEL_REMINDER = len(label_to_index.items())
label_to_index['request_reminder'] = LABEL_REMINDER


@labeling_function()
def lf_request_reminder(x):
    text = x if isinstance(x, str) else x.text
    text = text.lower()

    # Core requirement
    has_reminder = "remind" in text or "reminder" in text

    return LABEL_REMINDER if has_reminder else LABEL_ABSTAIN


In [25]:
LABEL_PRINT = len(label_to_index.items())
label_to_index['request_print'] = LABEL_PRINT

@labeling_function()
def lf_request_print(x):
    text = x if isinstance(x, str) else x.text 
    text = text.lower()
    request_terms = [
        "can you", "could you", "please", 
    ]
    print_terms = ["print"]

    has_request = any(term in text for term in request_terms)
    has_print = any(term in text for term in print_terms)
    return LABEL_PRINT if has_print and has_request else LABEL_ABSTAIN


In [None]:
LABEL_MEETING_TIME = len(label_to_index.items())
label_to_index['meeting_time'] = LABEL_MEETING_TIME


@labeling_function()
def lf_request_meeting_time(x):

    meeting_terms = ["meeting", "meet", "session", "call", "1:1", "discussion", "conference-call", "con call", "conference call"]
    time_location_terms = [
        "time", "date", "when", "what time", 
    ]

    text = x if isinstance(x, str) else x.text 
    text = text.lower()

    has_meeting = any(term in text for term in meeting_terms)
    has_time_location = any(term in text for term in time_location_terms)

    return LABEL_MEETING_TIME if has_meeting and has_time_location else LABEL_ABSTAIN


In [27]:
LABEL_MEETING_LOCATION = len(label_to_index.items())
label_to_index['meeting_location'] = LABEL_MEETING_LOCATION


@labeling_function()
def lf_request_meeting_location(x):

    meeting_terms = ["meeting", "meet", "session", "call", "1:1", "discussion", "conference-call", "con call", "conference call"]
    time_location_terms = [
        "where", "location", "place", "address", 
    ]

    text = x if isinstance(x, str) else x.text 
    text = text.lower()

    has_meeting = any(term in text for term in meeting_terms)
    has_time_location = any(term in text for term in time_location_terms)

    return LABEL_MEETING_LOCATION if has_meeting and has_time_location else LABEL_ABSTAIN


In [None]:
LABEL_REQUEST_MEETING = len(label_to_index.items())
label_to_index['request_meeting'] = LABEL_REQUEST_MEETING

@labeling_function()
def lf_request_meeting(x):
    request_words = ["can", "could ", "would ", ]
    arrangement_words = ["schedule", "set up", "arrange", "organize",]
    meeting_words = ["meeting", "meet", "get together", "appointment"]

    text = x if isinstance(x, str) else x.text
    text = text.lower()

    has_request = any(p in text for p in request_words)
    has_arrangement = any(a in text for a in arrangement_words)
    has_meeting = any(m in text for m in meeting_words)

    return LABEL_REQUEST_MEETING if has_request and has_arrangement and has_meeting else LABEL_ABSTAIN


In [29]:
LABEL_REQUEST_LOGIN_CREDENTIALS = len(label_to_index.items())
label_to_index['request_login_credentials'] = LABEL_REQUEST_LOGIN_CREDENTIALS

@labeling_function()
def lf_request_login_credentials(x):
    credentials = ["login", "username", "user id", "user name", "password", "credentials", "vpn"]
    verbs = ["send", "give", "provide", "reset", "change", "email", "supply"]
    
    text = x if isinstance(x, str) else x.text 
    text = text.lower()
    has_cred = any(c in text for c in credentials)
    has_verb = any(v in text for v in verbs)
    
    return LABEL_REQUEST_LOGIN_CREDENTIALS if has_cred and has_verb else LABEL_ABSTAIN


In [30]:
LABEL_REQUEST_LINK = len(label_to_index.items())
label_to_index['request_link'] = LABEL_REQUEST_LINK



@labeling_function()
def lf_request_link(x):
    url_words = ["url", "link", "webpage", "web page", "web address", "html url"]
    verbs = ["send", "provide", "email", "give", "tell", "point me to", "let me know"]
    
    text = x if isinstance(x, str) else x.text 
    text = text.lower()
    
    has_url_term = any(word in text for word in url_words)
    has_verb = any(v in text for v in verbs)
    
    return LABEL_REQUEST_LINK if has_url_term and has_verb else LABEL_ABSTAIN


In [31]:
LABEL_REQUEST_PROCEED = len(label_to_index.items())
label_to_index['request_instructions_on_how_to_proceed'] = LABEL_REQUEST_PROCEED

@labeling_function()
def lf_request_instructions_proceed(x):
    proceed_phrases = [
        "how to proceed", "how you would like to proceed", "how best to proceed",
        "what our next steps should be", "next steps", "how we should proceed",
        "how should i proceed", "how do you want me to proceed",
        "how we might proceed", "what should we do", "how to handle this"
    ]
    
    guidance_verbs = [
        "advise", "let me know", "tell me", "suggest", "your thoughts on how", 
        "could you advise", "can you advise"
    ]

    text = x if isinstance(x, str) else x.text
    text = text.lower()
    
    phrase_hit = any(p in text for p in proceed_phrases)
    verb_hit = any(v in text for v in guidance_verbs)

    return LABEL_REQUEST_PROCEED if phrase_hit and verb_hit else LABEL_ABSTAIN


In [32]:
LABEL_REQUEST_INCLUSION = len(label_to_index.items())
label_to_index['request_inclusion'] = LABEL_REQUEST_INCLUSION

@labeling_function()
def lf_request_inclusion(x):
    inclusion_verbs = [
        "include", "add","invite", "involve", "loop in", "stay in the loop",
        "make sure to include", "ensure to include", "also include"
    ]

    inclusion_targets = [
        "me", "on the invite", "in the invite", "on the list", "to the meeting", 
        "on all communications", "on any correspondence", "in this discussion", 
        "in the process", "in the review", "in your meetings", "in the distribution"
    ]

    text = x if isinstance(x, str) else x.text 
    text = text.lower()

    verb_hit = any(v in text for v in inclusion_verbs)
    target_hit = any(t in text for t in inclusion_targets)

    return LABEL_REQUEST_INCLUSION if verb_hit and target_hit else LABEL_ABSTAIN


In [33]:
LABEL_REQUEST_HOLDING_OFF = len(label_to_index.items())
label_to_index['request_holding_off'] = LABEL_REQUEST_HOLDING_OFF

@labeling_function()
def lf_request_holding_off(x):
    delay_verbs = [
        "wait", "hold off", "please delay", "pause", "freeze", 
        "don't use", "don't respond", "do not finalize", "do not proceed"
    ]

    timing_clues = [
        "until", "before", "after", "till", "once we", "once you", "we have", 
        "you receive", "you get", "we finalize", "we hear"
    ]

    text = x if isinstance(x, str) else x.text 
    text = text.lower()

    verb_hit = any(v in text for v in delay_verbs)
    timing_hit = any(t in text for t in timing_clues)

    return LABEL_REQUEST_HOLDING_OFF if verb_hit and timing_hit else LABEL_ABSTAIN


In [34]:
LABEL_REQUEST_FURTHER_INFO = len(label_to_index.items())
label_to_index['request_further_information'] = LABEL_REQUEST_FURTHER_INFO

@labeling_function()
def lf_request_further_information(x):
    request_phrases = [
        "let me know", "can you provide", "can you send", "please provide", 
        "give me more", "send me", "inform me", "any other information", 
        "what else is required", "what else do you need", "add more details", 
        "describe", "context", "more information", "more details"
    ]

    info_targets = [
        "information", "details", "data", "context", "output", "version", 
        "url", "names", "versions", "changes", "specs", "requirements"
    ]

    text = x if isinstance(x, str) else x.text 
    text = text.lower()

    phrase_hit = any(p in text for p in request_phrases)
    target_hit = any(t in text for t in info_targets)

    return LABEL_REQUEST_FURTHER_INFO if phrase_hit and target_hit else LABEL_ABSTAIN


In [None]:
LABEL_REQUEST_FOLLOW_UP = len(label_to_index.items())
label_to_index['request_follow_up'] = LABEL_REQUEST_FOLLOW_UP

@labeling_function()
def lf_request_follow_up(x):
    followup_phrases = [
        "follow up", "follow-up", "get back to me", "contact me again", 
        "please respond again", "please reply again", "reach back out", 
        "check in with", "please check in", "update me again"
    ]



    text = x if isinstance(x, str) else x.text 
    text = text.lower()

    phrase_hit = any(p in text for p in followup_phrases)

    return LABEL_REQUEST_FOLLOW_UP if phrase_hit else LABEL_ABSTAIN


In [36]:
LABEL_REQUEST_FAX = len(label_to_index.items())
label_to_index['request_fax'] = LABEL_REQUEST_FAX

@labeling_function()
def lf_request_fax(x):
    fax_verbs = [
        "fax", "fax back", "please fax", "send a fax", "fax me", "please fax in"
    ]

    fax_targets = [
        "nda", "document", "copy", "signature", "agreement", "form", "invoice", "letter", 
        "reseller certificate", "timesheet", "loi", "visio", "approval", "report"
    ]

    text = x if isinstance(x, str) else x.text 
    text = text.lower()

    verb_hit = any(v in text for v in fax_verbs)
    target_hit = any(t in text for t in fax_targets)
    fax_number_hit = "fax" in text and any(d in text for d in ["(", ")", "-", "fax to", "fax back to", "fax it to"])

    return LABEL_REQUEST_FAX if (verb_hit and target_hit) or fax_number_hit else LABEL_ABSTAIN


In [37]:
LABEL_REQUEST_ERROR_DETAILS = len(label_to_index.items())
label_to_index['request_error_details'] = LABEL_REQUEST_ERROR_DETAILS

@labeling_function()
def lf_request_error_details(x):
    request_verbs = [
        "can you send", "could you send", "could you give", "can you give", 
        "please send", "please provide", "describe", "tell me", "let us know"
    ]

    error_terms = [
        "error", "bug", "issue", "problem", "bug number", "bug #", "error message", 
        "stack trace", "what went wrong"
    ]

    text = x if isinstance(x, str) else x.text 
    text = text.lower()

    verb_hit = any(v in text for v in request_verbs)
    error_hit = any(t in text for t in error_terms)

    return LABEL_REQUEST_ERROR_DETAILS if verb_hit and error_hit else LABEL_ABSTAIN


In [38]:
LABEL_REQUEST_DISREGARD_PREV = len(label_to_index.items())
label_to_index['request_disregard_of_previous_request'] = LABEL_REQUEST_DISREGARD_PREV

@labeling_function()
def lf_request_disregard_previous(x):
    disregard_verbs = [
        "ignore", "disregard", 
    ]

    previous_refs = [
        "previous", "prior", "last", "earlier", "already sent", 
        "sent by mistake", "attached version", "earlier message"
    ]

    text = x if isinstance(x, str) else x.text 
    text = text.lower()

    verb_hit = any(v in text for v in disregard_verbs)
    prev_hit = any(p in text for p in previous_refs)

    return LABEL_REQUEST_DISREGARD_PREV if verb_hit and prev_hit else LABEL_ABSTAIN


In [39]:
LABEL_REQUEST_DELETION = len(label_to_index.items())
label_to_index['request_deletion'] = LABEL_REQUEST_DELETION
@labeling_function()
def lf_request_deletion(x):
    deletion_verbs = [
        "delete", "remove", "discard", "erase", "destroy", "clean up", 
        "empty your trash", "eliminate", "wipe"
    ]

    deletion_targets = [
        "file", "files", "folder", "directory", "document", "copy", "copies", 
        "email", "e-mail", "log", "entry", "distribution", "attachment", 
        "subdirectory", "resource", "path"
    ]

    text = x if isinstance(x, str) else x.text 
    text = text.lower()

    verb_hit = any(v in text for v in deletion_verbs)
    target_hit = any(t in text for t in deletion_targets)

    return LABEL_REQUEST_DELETION if verb_hit and target_hit else LABEL_ABSTAIN


In [40]:
LABEL_REQUEST_CREATE_BASELINE = len(label_to_index.items())
label_to_index['request_create_baseline'] = LABEL_REQUEST_CREATE_BASELINE
@labeling_function()
def lf_request_create_baseline(x):
    baseline_verbs = [
        "create", "make", "set", "establish", "baseline"  # baseline used directly as verb
    ]

    baseline_targets = [
        "baseline", "new baseline", "baseline for", "baseline of"
    ]

    text = x if isinstance(x, str) else x.text 
    text = text.lower()

    # Either 'baseline' used alone as a verb (e.g. "please baseline this") 
    # OR an action verb + target noun
    direct_baseline_use = "baseline" in text and any(v in text for v in ["please", "can you", "would you"])
    verb_target_combo = any(v in text for v in baseline_verbs) and any(t in text for t in baseline_targets)

    return LABEL_REQUEST_CREATE_BASELINE if direct_baseline_use and verb_target_combo else LABEL_ABSTAIN


In [41]:
LABEL_REQUEST_COORDINATION = len(label_to_index.items())
label_to_index['request_coordination'] = LABEL_REQUEST_COORDINATION

@labeling_function()
def lf_request_coordination(x):
    coordination_verbs = [
        "coordinate", "liaise", "sync with", "follow up with", "work with", 
        "take it up with", "collaborate with", "align with", "reach out to"
    ]

    text = x if isinstance(x, str) else x.text 
    text = text.lower()
    
    return LABEL_REQUEST_COORDINATION if any(v in text for v in coordination_verbs) else LABEL_ABSTAIN


In [42]:
LABEL_REQUEST_CONTACT_INFORMATION = len(label_to_index.items())
label_to_index['request_contact_information'] = LABEL_REQUEST_CONTACT_INFORMATION

@labeling_function()
def lf_request_contact_info(x):
    actions = [
        "please send me", "could you send", "can you provide", "please forward", 
        "please email me", "would you please send", "let me know", 
        "can you get me", "could you also email", "can you tell me", 
        "can i please get", "would you let me know", "give me", "mail me"
    ]

    contact_terms = [
        "phone number", "fax", "address", 
        "contact",

    ]
    text = x if isinstance(x, str) else x.text
    text = text.lower()

    action_hit = any(a in text for a in actions)
    term_hit = any(t in text for t in contact_terms)

    return LABEL_REQUEST_CONTACT_INFORMATION if action_hit and term_hit else LABEL_ABSTAIN


In [43]:
LABEL_REQUEST_CONFIDENTIALITY = len(label_to_index.items())
label_to_index['request_confidentiality'] = LABEL_REQUEST_CONFIDENTIALITY

@labeling_function()
def lf_request_confidentiality_composed(x):
    text = x if isinstance(x, str) else x.text 
    text = text.lower()

    # --- Combo 1: "treat"/"mark" + "confidential"
    confidentiality_actions = [
        "treat", "keep", "mark", "consider", "safeguard", "handle", "protect", "label"
    ]

    confidentiality_terms = [
        "confidential", "private", "sensitive", "internal", "restricted", 
        "classified", "under nda"
    ]

    combo1 = any(a in text for a in confidentiality_actions) and any(t in text for t in confidentiality_terms)

    # --- Combo 2: "do not"/"don't" + "distribute"/"share"/"communicate" + "externally"/"outside"
    negations = ["do not", "don't", "please don't", "please do not"]
    restrict_verbs = ["distribute", "share", "send", "communicate", "forward"]
    scope_terms = ["externally", "outside", "beyond the company", "outside the company"]

    combo2 = (
        any(n in text for n in negations) and 
        any(v in text for v in restrict_verbs) and 
        any(s in text for s in scope_terms)
    )

    return LABEL_REQUEST_CONFIDENTIALITY if combo1 or combo2 else LABEL_ABSTAIN


In [44]:
LABEL_REQUEST_CONFERENCE_CALL = len(label_to_index.items())
label_to_index['request_conference_call'] = LABEL_REQUEST_CONFERENCE_CALL

@labeling_function()
def lf_request_conference_call(x):
    call_actions = [
        "set", "schedule", "arrange", "coordinate", "plan", "make", "have", "hold", "available", "suggest"
    ]
    
    call_terms = [
        "conference call", "con call", "teleconference", "meeting call", "call bridge", "telephone conversation", "conference bridge", "conference-call", 
    ]
    
    text = x if isinstance(x, str) else x.text
    text = text.lower()
    
    action_hit = any(a in text for a in call_actions)
    term_hit = any(t in text for t in call_terms)
    
    return LABEL_REQUEST_CONFERENCE_CALL if action_hit and term_hit else LABEL_ABSTAIN


In [45]:
LABEL_REQUEST_CLOSE_BUG = len(label_to_index.items())
label_to_index['request_close_bug'] = LABEL_REQUEST_CLOSE_BUG

@labeling_function()
def lf_request_close_bug_combo(x):
    close_actions = [
        "close", "mark as closed", "mark this bug fixed", 
        "resolve", "verify and close", "verify/close", "mark it closed", "fixed"
    ]
    
    bug_terms = [
        "bug", "issue", "ticket", "bugs"
    ]
    
    text = x if isinstance(x, str) else x.text 
    text = text.lower()
    
    action_hit = any(c in text for c in close_actions)
    bug_hit = any(b in text for b in bug_terms)
    
    return LABEL_REQUEST_CLOSE_BUG if action_hit and bug_hit else LABEL_ABSTAIN


In [46]:
LABEL_REQUEST_CALL = len(label_to_index.items())
label_to_index['request_call'] = LABEL_REQUEST_CALL

@labeling_function()
def lf_request_call(x):

    call_phrases = [
        "please call me", "give me a call", "call me at", "call my cell", 
         "call on my mobile", "call me on", 
        "please call her", "please call him", "call my home phone", 
        "please give me a call"
    ]


    text = x if isinstance(x, str) else x.text 
    text = text.lower()
    phrase_hit = any(p in text for p in call_phrases)


    return LABEL_REQUEST_CALL if phrase_hit else LABEL_ABSTAIN


In [47]:
LABEL_REQUEST_BUG_REPORT = len(label_to_index.items())
label_to_index['request_bug_report'] = LABEL_REQUEST_BUG_REPORT

@labeling_function()
def lf_request_bug_report(x):
    bug_phrases = [
        "file a bug", "file bugs", "log a bug", "open a bug", 
        "report a bug", "report any bugs", "please file this as a bug", 
        "file it as a new bug", "submit a bug", "file a bug against", 
        "report all bugs", "please file all problems as bugs"
    ]
    
    text = x if isinstance(x, str) else x.text 
    text = text.lower()
    
    return LABEL_REQUEST_BUG_REPORT if any(p in text for p in bug_phrases) else LABEL_ABSTAIN


In [48]:
LABEL_REQUEST_AVAILABILITY = len(label_to_index.items())
label_to_index['request_availability'] = LABEL_REQUEST_AVAILABILITY

@labeling_function()
def lf_request_availability(x):
    availability_phrases = [
         "do you have time",
        "are you available", "would you be available", "available at",
         "suggest a time",
          "convenient time",
        
         "available time", "which days", "when is good for you"
    ]

    text = x if isinstance(x, str) else x.text 
    text = text.lower()
    return LABEL_REQUEST_AVAILABILITY if any(p in text for p in availability_phrases) else LABEL_ABSTAIN


In [49]:
LABEL_REQUEST_ATTENDANCE_INFO = len(label_to_index.items())
label_to_index['request_attendance_info'] = LABEL_REQUEST_ATTENDANCE_INFO

@labeling_function()
def lf_request_attendance_info(x):
    request_phrases = [
        "let me know", "please advise", "send me", "provide an estimate", 
        "find out", "can you confirm", "how many people", "could you follow up",
        "do you know who", "who will be attending"
    ]
    
    attendance_terms = [
        "attending", "attendees", "coming", "participating", "will be there", 
        "will attend", "list of attendees", "will be joining", "be present"
    ]
    
    text = x if isinstance(x, str) else x.text 
    text = text.lower()
    
    req_hit = any(p in text for p in request_phrases)
    attn_hit = any(a in text for a in attendance_terms)
    
    return LABEL_REQUEST_ATTENDANCE_INFO if req_hit and attn_hit else LABEL_ABSTAIN


In [50]:
LABEL_REQUEST_ATTENDANCE = len(label_to_index.items())
label_to_index['request_attendance'] = LABEL_REQUEST_ATTENDANCE

@labeling_function()
def lf_request_attendance(x):
    attendance_verbs = [
        "attend", "join", 
    ]
    
    meeting_contexts = [
        "meeting", "session", "event", "training", "conference", "call", "in", "at"
    ]
    
    text = x if isinstance(x, str) else x.text 
    text = text.lower()
    
    verb_hit = any(v in text for v in attendance_verbs)
    context_hit = any(c in text for c in meeting_contexts)
    
    return LABEL_REQUEST_ATTENDANCE if verb_hit and context_hit else LABEL_ABSTAIN


In [51]:
LABEL_REQUEST_ADDITION_TO_LIST = len(label_to_index.items())
label_to_index['request_addition_to_list'] = LABEL_REQUEST_ADDITION_TO_LIST

@labeling_function()
def lf_request_addition_to_list_combo(x):
    action_phrases = [
        "please add", "could you add", "please include", "add me to",
        "add them to", "add him to", "add her to", "include"
    ]
    
    list_targets = [
        "list", "mailing list", "email list", "distribution list", "alias", 
        "group", "mail alias", "team list", "attendee list"
    ]
    
    text = x if isinstance(x, str) else x.text
    text = text.lower()
    
    action_hit = any(p in text for p in action_phrases)
    target_hit = any(t in text for t in list_targets)
    
    return LABEL_REQUEST_ADDITION_TO_LIST if action_hit and target_hit else LABEL_ABSTAIN


In [52]:
LABEL_REQUEST_ADD_CC = len(label_to_index.items())
label_to_index['request_add_cc'] = LABEL_REQUEST_ADD_CC

@labeling_function()
def lf_request_add_cc(x):
    cc_phrases = [
        "please cc", "could you cc", "can you cc", "always cc", "cc me",
        "copy me on", "copy her on", "copy him on", 
        "please copy", "have her cc", "cc:", 
    ]
    
    text = x if isinstance(x, str) else x.text
    text = text.lower()
    
    return LABEL_REQUEST_ADD_CC if any(p in text for p in cc_phrases) else LABEL_ABSTAIN


In [53]:
LABEL_REQUEST_ACCESS = len(label_to_index.items())
label_to_index['request_access'] = LABEL_REQUEST_ACCESS

@labeling_function()
def lf_request_access(x):
    verbs = [
        "give", "provide", "grant", "get", "make sure", "ensure"
    ]
    
    access_terms = [
        "access to", "access", "permission to", 
    ]
    
    text = x if isinstance(x, str) else x.text 
    text = text.lower()
    
    verb_hit = any(v in text for v in verbs)
    access_hit = any(a in text for a in access_terms)
    
    return LABEL_REQUEST_ACCESS if verb_hit and access_hit else LABEL_ABSTAIN


In [54]:
LABEL_PURCHASE_ORDER = len(label_to_index.items())
label_to_index['purchase_order'] = LABEL_PURCHASE_ORDER


@labeling_function()
def lf_purchase_order_request(x):
    verbs = [
        "issue", "send", "create", "prepare", "fill out", 
        "get", "cut", "order", "put together", "draft"
    ]
    
    targets = [
        "po", "p.o.", "purchase order", "purchase req", "po template", "purchase"
    ]
    
    text = x if isinstance(x, str) else x.text 
    text = text.lower()
    
    verb_hit = any(v in text for v in verbs)
    target_hit = any(t in text for t in targets)
    
    return LABEL_PURCHASE_ORDER if verb_hit and target_hit else LABEL_ABSTAIN


In [55]:
LABEL_PROVIDE_LINK = len(label_to_index.items())
label_to_index['provide_link'] = LABEL_PROVIDE_LINK


@labeling_function()
def lf_provide_link_with_url(x):
    import re
    
    provision_phrases = [
        "please view", "please visit", "please use", "visit our website", 
        "go to", "view more", "see more", "for additional information",
        "for further information", "for more information", "for more details"
    ]
    
    # Regex to match typical URLs or domains
    url_regex = r"(https?://\S+|www\.\S+\.\S+)"
    
    text = x if isinstance(x, str) else x.text 
    text = text.lower()
    
    phrase_hit = any(p in text for p in provision_phrases)
    url_hit = re.search(url_regex, text) is not None
    
    return LABEL_PROVIDE_LINK if phrase_hit and url_hit else LABEL_ABSTAIN


In [57]:
LABEL_OFFER_ASSISTANCE = len(label_to_index.items())
label_to_index['offer_assistance'] = LABEL_OFFER_ASSISTANCE

@labeling_function()
def lf_offer_assistance(x):
    verbs = ["help", "assistance", "questions", "question", ]
    nouns = ["need", "require", "let us know", "give"]
    
    text = x if isinstance(x, str) else x.text 
    text = text.lower()

    # Check if any verb and any noun co-occur
    verb_hit = any(v in text for v in verbs)
    noun_hit = any(n in text for n in nouns)

    return LABEL_OFFER_ASSISTANCE if verb_hit and noun_hit else LABEL_ABSTAIN


In [58]:
LABEL_MARK_CALENDAR = len(label_to_index.items())
label_to_index['mark_calendar'] = LABEL_MARK_CALENDAR

@labeling_function()
def lf_mark_calendar(x):
    verbs = ["mark", "put", "note", "book", "make sure", ]
    nouns = ["schedule", "schedules", "calendar", "calendars"]
    
    text = x if isinstance(x, str) else x.text 
    text = text.lower()

    # Check if any verb and any noun co-occur
    verb_hit = any(v in text for v in verbs)
    noun_hit = any(n in text for n in nouns)

    return LABEL_MARK_CALENDAR if verb_hit and noun_hit else LABEL_ABSTAIN



In [59]:
LABEL_EXPRESS_APOLOGY = len(label_to_index.items())
label_to_index['express_apology'] = LABEL_EXPRESS_APOLOGY

@labeling_function()
def lf_express_apology(x):
    verbs = ["express", "accept", ]
    nouns = ["apology", "apologies", "apologize", "sorry"]
    
    text = x if isinstance(x, str) else x.text 
    text = text.lower()

    # Check if any verb and any noun co-occur
    verb_hit = any(v in text for v in verbs)
    noun_hit = any(n in text for n in nouns)

    return LABEL_EXPRESS_APOLOGY if verb_hit and noun_hit else LABEL_ABSTAIN


In [60]:
LABEL_EXPRESS_GREETINGS = len(label_to_index.items())
label_to_index['express_greetings'] = LABEL_EXPRESS_GREETINGS

@labeling_function()
def lf_express_greetings(x):
    verbs = ["give", "send", "convey", "extend"]
    nouns = ["regards", "best wishes", "get well", "take care", "good care", "greetings"]
    
    text = x if isinstance(x, str) else x.text 
    text = text.lower()

    # Check if any verb and any noun co-occur
    verb_hit = any(v in text for v in verbs)
    noun_hit = any(n in text for n in nouns)

    return LABEL_EXPRESS_GREETINGS if verb_hit and noun_hit else LABEL_ABSTAIN


In [61]:
LABEL_INFORM_ATTACHMENT = len(label_to_index.items())
label_to_index['inform_attachment'] = LABEL_INFORM_ATTACHMENT

@labeling_function()
def lf_inform_attachment(x):
    patterns = [
        "find attached",
        "find enclosed",
        "attached you'll find",
        "attached is",
        "i have attached",
        "see attached"
    ]
    
    text = x if isinstance(x, str) else x.text 
    text = text.lower()
    
    return LABEL_INFORM_ATTACHMENT if any(p in text for p in patterns) else LABEL_ABSTAIN


In [62]:
'''
LABEL_INTRODUCTION = len(label_to_index.items())
label_to_index['introduction'] = LABEL_INTRODUCTION


@labeling_function()
def lf_introduction_welcome(x):
    patterns = [
        "please join me in welcoming",
        "please welcome",
        "join us in welcoming",
        "say hello to",
        "welcome to the team",
        "welcome to our family",
        "help me welcome"
    ]
    
    text = x if isinstance(x, str) else x.text 
    text = text.lower()
    
    return LABEL_INTRODUCTION if any(p in text for p in patterns) else LABEL_ABSTAIN
'''

'\nLABEL_INTRODUCTION = len(label_to_index.items())\nlabel_to_index[\'introduction\'] = LABEL_INTRODUCTION\n\n\n@labeling_function()\ndef lf_introduction_welcome(x):\n    patterns = [\n        "please join me in welcoming",\n        "please welcome",\n        "join us in welcoming",\n        "say hello to",\n        "welcome to the team",\n        "welcome to our family",\n        "help me welcome"\n    ]\n    \n    text = x if isinstance(x, str) else x.text \n    text = text.lower()\n    \n    return LABEL_INTRODUCTION if any(p in text for p in patterns) else LABEL_ABSTAIN\n'

In [63]:
'''
LABEL_REQUEST_SUBMIT_TIMESHEET = label_to_index["request_submit_timesheet"]

@labeling_function()
def lf_submit_or_complete_timesheet(x):
    verbs = ["submit", "complete", "send", "turn in", "send in", "email", "e-mail"]
    nouns = ["timesheet", "timesheets"]
    
    text = x if isinstance(x, str) else x.text 
    text = text.lower()

    # Check if any verb and any noun co-occur
    verb_hit = any(v in text for v in verbs)
    noun_hit = any(n in text for n in nouns)

    return LABEL_REQUEST_SUBMIT_TIMESHEET if verb_hit and noun_hit else LABEL_ABSTAIN
'''

'\nLABEL_REQUEST_SUBMIT_TIMESHEET = label_to_index["request_submit_timesheet"]\n\n@labeling_function()\ndef lf_submit_or_complete_timesheet(x):\n    verbs = ["submit", "complete", "send", "turn in", "send in", "email", "e-mail"]\n    nouns = ["timesheet", "timesheets"]\n    \n    text = x if isinstance(x, str) else x.text \n    text = text.lower()\n\n    # Check if any verb and any noun co-occur\n    verb_hit = any(v in text for v in verbs)\n    noun_hit = any(n in text for n in nouns)\n\n    return LABEL_REQUEST_SUBMIT_TIMESHEET if verb_hit and noun_hit else LABEL_ABSTAIN\n'

In [64]:
index_to_label = {idx: label for label, idx in label_to_index.items()}

## Applying the Labeling Functions

In [65]:
lfs = [value for name, value in globals().items() if isinstance(value, LabelingFunction)]

In [66]:
applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=df_train)

100%|██████████| 4804/4804 [00:01<00:00, 3796.57it/s]


## multilabel attempts

In [67]:
def get_predictions(row):
    preds = list(set(row))
    preds.remove(-1)
    return [index_to_label[x] for x in preds]

In [68]:
df_train

Unnamed: 0,text,label
0,Please email me what date and time is good to have a conference call.,request_conference_call
1,"Would you be available for a conference call on Friday, 12.15.00 at 8am PST/11am EST?",request_conference_call
2,"Would you be available for a conference call on Tuesday, 5/23 at 10am PDT?",request_conference_call
3,Would you be available for a conference call at 9:30 am on 9/11?,request_conference_call
4,Would you be available this Wednesday at 10 am for an introductory conference call?,request_conference_call
...,...,...
4799,Please join me in my graduation party.,introduction
4800,Please join me in welcoming Todd.,introduction
4801,Please join me in welcoming Mark Porter to the Sitraka Mobility Professional Services team.,introduction
4802,Please join me in welcoming Bhagavan to AvocadoIT.,introduction


In [69]:
df_multilabel = df_train

In [70]:
df_multilabel['heuristic_labels'] = [get_predictions(row) for row in L_train]

In [72]:
df_multilabel.to_csv("multi_labels/train_heuristics.tsv",sep="\t", index=False)

In [None]:
from itertools import combinations
import pandas as pd

# Function to update co-occurrence matrix
def update_cooccurrence_matrix(co_occurrence_dict, label_list):
    for label1, label2 in combinations(label_list, 2):
        if label1 != label2:
            pair = tuple(sorted([label1, label2]))
            co_occurrence_dict[pair] = co_occurrence_dict.get(pair, 0) + 1

# Initialize dictionary to store co-occurrence counts
co_occurrence_dict = {}

# Loop through DataFrame rows and update co-occurrence counts
for labels in df_train['multi_label']:
    update_cooccurrence_matrix(co_occurrence_dict, labels)

# Convert dictionary to DataFrame
co_occurrence_df = pd.DataFrame(co_occurrence_dict.items(), columns=['label_pair', 'co_occurrence_count'])


In [None]:
df_sorted = co_occurrence_df.sort_values(by='co_occurrence_count', ascending=False)
df_sorted.head(20)

In [None]:
# Create a dictionary to store the sets of labels for each utterance
label_sets = [set(labels) for labels in df_multilabel['multi_label']]

# Compute pairwise Jaccard similarity
from sklearn.metrics import jaccard_score

label_list = sorted(set([label for sublist in df_multilabel['multi_label'] for label in sublist]))
label_index = {label: i for i, label in enumerate(label_list)}

# Create a matrix to store Jaccard similarity
jaccard_matrix = pd.DataFrame(0, index=label_list, columns=label_list)

for i in range(len(label_sets)):
    for label1 in label_sets[i]:
        for label2 in label_sets[i]:
            if label1 != label2:
                jaccard_matrix.loc[label1, label2] += 1

# Normalize the Jaccard Matrix
jaccard_matrix = jaccard_matrix / len(df_multilabel)
jaccard_matrix
#import ace_tools as tools; tools.display_dataframe_to_user(name="Jaccard Similarity Matrix", dataframe=jaccard_matrix)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Let's assume `jaccard_matrix` is the matrix you want to visualize.

# Plot the heatmap using seaborn
plt.figure(figsize=(10, 8))
sns.heatmap(jaccard_matrix, annot=True, cmap="YlGnBu", fmt='.2f', linewidths=0.5)

# Set labels and title
plt.title('Jaccard Similarity Heatmap')
plt.xlabel('Labels')
plt.ylabel('Labels')

# Show the plot
plt.show()


In [None]:
new_cols = pd.DataFrame(L_train, columns=label_names)
df_multilabel = pd.concat([df_train, new_cols], axis=1)
df_multilabel.drop(columns=['numeric_manual_label'],inplace=True)

## Other Experiments

In [72]:
# Analyzing labeling functions' coverage, overlaps, and conflicts
lf_summary = LFAnalysis(L=L_train, lfs=lfs).lf_summary()
lf_summary

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
lf_request_visit,0,[0],0.007702,0.002498,0.002498
lf_request_verify_information,1,[1],0.004371,0.001665,0.001665
lf_request_urgency_with_deadline,2,[2],0.000833,0.000624,0.000624
lf_request_update_contact_info,3,[3],0.000624,0.0,0.0
lf_request_status_updates,4,[4],0.004163,0.000416,0.000416
lf_start_build,5,[5],0.00458,0.000208,0.000208
lf_specific_format,6,[6],0.020192,0.015404,0.015404
lf_sign_and_approval,7,[7],0.036844,0.023106,0.023106
lf_pricing_info_request,8,[8],0.002914,0.001873,0.001873
lf_send_feedback_combo,9,[9],0.020192,0.0102,0.0102


In [73]:
lf_summary['Coverage'].sum()

0.7624895920066612

In [70]:

# Create the Label Model
label_model = LabelModel(cardinality=len(lfs), verbose=True)  # Set cardinality to the number of classes

# Fit the Label Model
label_model.fit(L_train=L_train, n_epochs=500, log_freq=50, seed=123)


INFO:root:Computing O...
INFO:root:Estimating \mu...
  0%|          | 0/500 [00:00<?, ?epoch/s]INFO:root:[0 epochs]: TRAIN:[loss=0.015]
 10%|▉         | 49/500 [00:05<00:44, 10.16epoch/s]INFO:root:[50 epochs]: TRAIN:[loss=0.011]
 20%|█▉        | 99/500 [00:09<00:36, 10.98epoch/s]INFO:root:[100 epochs]: TRAIN:[loss=0.008]
 30%|███       | 150/500 [00:14<00:46,  7.58epoch/s]INFO:root:[150 epochs]: TRAIN:[loss=0.006]
 40%|████      | 200/500 [00:20<00:32,  9.27epoch/s]INFO:root:[200 epochs]: TRAIN:[loss=0.004]
 50%|█████     | 250/500 [00:27<00:28,  8.89epoch/s]INFO:root:[250 epochs]: TRAIN:[loss=0.003]
 60%|██████    | 300/500 [00:31<00:18, 10.98epoch/s]INFO:root:[300 epochs]: TRAIN:[loss=0.003]
 70%|███████   | 350/500 [00:36<00:15,  9.61epoch/s]INFO:root:[350 epochs]: TRAIN:[loss=0.002]
 80%|████████  | 400/500 [00:41<00:08, 11.21epoch/s]INFO:root:[400 epochs]: TRAIN:[loss=0.002]
 90%|█████████ | 450/500 [00:46<00:04, 10.19epoch/s]INFO:root:[450 epochs]: TRAIN:[loss=0.002]
100%|███████

In [71]:
# Get the learned weights
weights = label_model.get_weights()
print(weights)


[0.50842262 0.51807751 0.61361059 0.65276857 0.51920866 0.51708204
 0.50065043 0.36985781 0.52922138 0.50053402 0.50075704 0.49748007
 0.50139539 0.55444428 0.51451089 0.56303014 0.50911206 0.46033903
 0.51707928 0.50827497 0.5130902  0.50949624 0.52509541 0.43896231
 0.51808366 0.40248585 0.35267715 0.48345141 0.50406381 0.50911816
 0.52181443 0.51706682 0.50845157 0.50103492 0.5074356  0.50841138
 0.52336418 0.15739836 0.52510862 0.50637324 0.51916777 0.50278245
 0.52511267 0.50120685 0.55454208 0.50388123 0.53487687 0.22902721
 0.40027965 0.51384092 0.55457344 0.53493473 0.28463409]


In [72]:
# Generate probabilistic labels
probs_train = label_model.predict_proba(L=L_train)

In [73]:
import numpy as np


# Define the threshold
threshold = 0.25

# Initialize an empty list to store the labels that meet the threshold for each instance
Y_train_pred = []

# Iterate through each row and find all labels above the threshold
for row in probs_train:
    labels_above_threshold = np.where(row >= threshold)[0]  # Get the indices where the condition is met
    Y_train_pred.append(labels_above_threshold)



In [74]:
probs_train[0]

array([0.00509971, 0.00513162, 0.00514609, 0.00514635, 0.00513281,
       0.00513006, 0.00509387, 0.00450164, 0.00513974, 0.00481854,
       0.00485768, 0.00457415, 0.00617124, 0.00514437, 0.00512504,
       0.00515544, 0.00510309, 0.40579689, 0.00535896, 0.00543759,
       0.00512141, 0.00510544, 0.0051378 , 0.00466974, 0.00513109,
       0.00511911, 0.00610815, 0.00454184, 0.00503865, 0.00510461,
       0.00513526, 0.00513   , 0.00533653, 0.00487883, 0.00509147,
       0.32553915, 0.00513666, 0.0111034 , 0.00513785, 0.00702178,
       0.00513265, 0.00509327, 0.00513773, 0.00490142, 0.00514443,
       0.00520057, 0.00514156, 0.00645705, 0.00464798, 0.00512349,
       0.00514446, 0.00514171, 0.00408004])

In [75]:
df_train['numeric_snorkel_label'] = Y_train_pred

In [76]:
df_train['snorkel_label'] = df_train['numeric_snorkel_label'].apply(lambda x: [index_to_label[i] for i in x])

In [81]:
df_filtered = df_train[df_train['snorkel_label'].apply(lambda x: len(x) > 0)]


In [None]:
df_filtered.to_csv("multi_labels/train.tsv",sep="\t")