# **SMS SPAM DETECTION** 

In [None]:
from labeling.lf import *
from labeling.lf_set import *
from labeling.preprocess import *
from labeling.continuous_scoring import *
from labeling.apply import *
from labeling.noisy_labels import *

from utils import load_data_to_numpy

from preprocessor import convert_to_lower
from labeling.lf_set import *
from con_scorer import word_similarity

import numpy as np
import re
import enum

model loading



### **Representation of class Labels**
All class labels for which we define labeling functions are encoded in an enum and reused in the tasks that follow.

In [1]:
import enum

# enum to hold the class labels
class ClassLabels(enum.Enum):
    """Class labels that the SMS labeling functions in this file can emit."""

    # Message is flagged as spam.
    SPAM = 1
    # Message is flagged as legitimate (non-spam).
    HAM = 0

# Cut-off compared against kwargs["continuous_score"] in the CLF* labeling
# functions: a score at or above it emits the label, a lower score abstains.
THRESHOLD = 0.8

While labeling the unlabelled data, we look for a few keywords to assign a class to each SMS.

In [2]:
# Trigger words whose presence votes for SPAM (used by LF1-LF3 / CLF1-CLF3).
trigWord1 = {
    "free", "credit", "cheap", "apply", "buy", "attention",
    "shop", "sex", "soon", "now", "spam",
}
trigWord2 = {
    "gift", "click", "new", "online", "discount",
    "earn", "miss", "hesitate", "exclusive", "urgent",
}
trigWord3 = {
    "cash", "refund", "insurance", "money", "guaranteed",
    "save", "win", "teen", "weight", "hair",
}

# Context words/substrings that, together with the token "free", vote for HAM
# (used by LF4-LF7 / CLF4-CLF7).
# NOTE(review): the LFs consuming these sets declare pre=[convert_to_lower], so
# the capitalised entries presumably never match a token - confirm before pruning.
notFreeWords = {
    "toll", "Toll", "freely", "call", "meet", "talk", "feedback",
}
notFreeSubstring = {
    "not free", "you are", "when", "wen",
}
firstAndSecondPersonWords = {
    "I", "i", "u", "you", "ur", "your", "our", "we", "us", "youre",
}
thirdPersonWords = {
    "He", "he", "She", "she", "they", "They", "Them", "them", "their", "Their",
}

### **Labeling Functions**

#### Labeling functions help users assign a class to an instance programmatically. ####

These labeling functions are heuristics which might yield very **noisy labels** (or) **abstain** on many datapoints.

Each labeling function is associated with a class, and each labeling function can trigger on its corresponding class given an instance.

We use the @labeling_function(name, resources, pre, label) decorator to declare a labeling function; `pre` takes a list of preprocessors.

Before labeling an instance, we can pass it through one or more preprocessors.

### **Declaration of simple labeling functions**

In [19]:
from labeling.lf import labeling_function, ABSTAIN

class sampleLabels(enum.Enum):
    """Toy two-class label set used by the sample labeling function below."""

    # Class emitted by sample_labeling when its input equals "red".
    red = 1
    green = 0

@labeling_function(label=1, name="SAMPLE_LABELING")
def sample_labeling(x):
    """Toy labeling function: vote ``sampleLabels.red`` when ``x`` is "red".

    The ``label=1`` decorator argument associates this function with the
    class whose enum value is 1 (``red``). Any other input abstains.
    """
    return sampleLabels.red if x == "red" else ABSTAIN

# Calling the decorated function yields a pair; only its first element (the
# emitted label) is kept and printed here.
label, _ = sample_labeling("red")
print(label)

sampleLabels.red


### **Declaration of simple preprocessor functions**

Preprocessor functions are used to preprocess an instance before labeling it. 

We use the @preprocessor(name,resources) decorator to declare a function as a preprocessor.

In [20]:
from labeling.preprocess import preprocessor

@preprocessor(name="LOWER_CASE")
def convert_to_lower(x):
    """Return ``x`` lower-cased, with leading and trailing whitespace removed."""
    lowered = x.lower()
    return lowered.strip()

# Quick check of the preprocessor on an upper-case input.
lower = convert_to_lower("RED")
print(lower)

red


### **Declaration of continuous scoring functions**

Along with labeling instances with hard labels, we also calculate soft labels ranging from 0 to 1 for an instance.

We use the @continuous_scorer(name,resources) decorator to declare a function as a continuous scorer.

In [24]:
from labeling.continuous_scoring import continuous_scorer

@continuous_scorer(name="INVERSE  SCORER")
def continious(x):
    """Toy continuous scorer: pass values below 1 through, invert the rest.

    Returns ``x`` unchanged when ``x < 1``; otherwise returns ``1 / x``.
    """
    return x if x < 1 else 1 / x
    

# 5 is not below 1, so the scorer takes the reciprocal branch (1/5).
score = continious(5)
print(score)

0.2


### **Few labeling functions to annotate SMS Dataset**

We use GloVe word embeddings as part of our continuous scorer to get the soft labels (similarity scores).

In [25]:
from labeling.lf import labeling_function, ABSTAIN
from labeling.lf_set import LFSet
from labeling.preprocess import preprocessor

from con_scorer import word_similarity
import re


@preprocessor()
def convert_to_lower(x):
    """Return ``x`` lower-cased, with leading and trailing whitespace removed.

    This rebinds the ``convert_to_lower`` name that is already imported and
    defined earlier in the file; the labeling functions below use this one.
    """
    lowered = x.lower()
    return lowered.strip()


@labeling_function(resources=dict(keywords=trigWord1), pre=[convert_to_lower], label=ClassLabels.SPAM)
def LF1(c, **kwargs):
    """Vote SPAM when the message shares at least one token with ``trigWord1``.

    Args:
        c: Message text (presumably already passed through the
            ``convert_to_lower`` preprocessor declared in the decorator).
        **kwargs: Must contain ``keywords``, expected to be the set given
            through the decorator's ``resources``.

    Returns:
        ``ClassLabels.SPAM`` on a keyword hit, otherwise ``ABSTAIN``.
    """
    tokens = c.split()
    has_trigger = not kwargs["keywords"].isdisjoint(tokens)
    return ClassLabels.SPAM if has_trigger else ABSTAIN

@labeling_function(resources=dict(keywords=trigWord2), pre=[convert_to_lower], label=ClassLabels.SPAM)
def LF2(c, **kwargs):
    """Vote SPAM when the message shares at least one token with ``trigWord2``.

    Args:
        c: Message text (presumably already passed through the
            ``convert_to_lower`` preprocessor declared in the decorator).
        **kwargs: Must contain ``keywords``, expected to be the set given
            through the decorator's ``resources``.

    Returns:
        ``ClassLabels.SPAM`` on a keyword hit, otherwise ``ABSTAIN``.
    """
    tokens = c.split()
    has_trigger = not kwargs["keywords"].isdisjoint(tokens)
    return ClassLabels.SPAM if has_trigger else ABSTAIN

@labeling_function(resources=dict(keywords=trigWord3), pre=[convert_to_lower], label=ClassLabels.SPAM)
def LF3(c, **kwargs):
    """Vote SPAM when the message shares at least one token with ``trigWord3``.

    Args:
        c: Message text (presumably already passed through the
            ``convert_to_lower`` preprocessor declared in the decorator).
        **kwargs: Must contain ``keywords``, expected to be the set given
            through the decorator's ``resources``.

    Returns:
        ``ClassLabels.SPAM`` on a keyword hit, otherwise ``ABSTAIN``.
    """
    tokens = c.split()
    has_trigger = not kwargs["keywords"].isdisjoint(tokens)
    return ClassLabels.SPAM if has_trigger else ABSTAIN

@labeling_function(resources=dict(keywords=notFreeWords), pre=[convert_to_lower], label=ClassLabels.HAM)
def LF4(c, **kwargs):
    """Vote HAM when "free" appears next to a benign context word.

    Fires only if the token "free" is present AND at least one token of the
    message is in ``kwargs["keywords"]`` (``notFreeWords`` per the decorator).

    Returns:
        ``ClassLabels.HAM`` when both conditions hold, otherwise ``ABSTAIN``.
    """
    tokens = c.split()
    # Without the token "free" this rule has nothing to say.
    if "free" not in tokens:
        return ABSTAIN
    if kwargs["keywords"].isdisjoint(tokens):
        return ABSTAIN
    return ClassLabels.HAM

@labeling_function(resources=dict(keywords=notFreeSubstring),pre=[convert_to_lower],label=ClassLabels.HAM)
def LF5(c,**kwargs):
    """Vote HAM when "free" appears together with a benign phrase.

    Fires only if the token "free" is present AND at least one entry of
    ``kwargs["keywords"]`` (``notFreeSubstring`` per the decorator) is found
    anywhere in the message by a case-insensitive regex search.

    Args:
        c: Message text (presumably already passed through the
            ``convert_to_lower`` preprocessor declared in the decorator).
        **kwargs: Must contain ``keywords``, an iterable of regex patterns.

    Returns:
        ``ClassLabels.HAM`` when both conditions hold, otherwise ``ABSTAIN``.
    """
    # The "free" check does not depend on the pattern, so evaluate it once
    # instead of re-splitting the message for every pattern.
    if "free" not in c.split():
        return ABSTAIN
    if any(re.search(pattern, c, flags=re.I) for pattern in kwargs["keywords"]):
        return ClassLabels.HAM
    return ABSTAIN

@labeling_function(resources=dict(keywords=firstAndSecondPersonWords), pre=[convert_to_lower], label=ClassLabels.HAM)
def LF6(c, **kwargs):
    """Vote HAM when "free" appears alongside a first/second-person word.

    Fires only if the token "free" is present AND at least one token of the
    message is in ``kwargs["keywords"]`` (``firstAndSecondPersonWords`` per
    the decorator).

    Returns:
        ``ClassLabels.HAM`` when both conditions hold, otherwise ``ABSTAIN``.
    """
    tokens = c.split()
    # Without the token "free" this rule has nothing to say.
    if "free" not in tokens:
        return ABSTAIN
    if kwargs["keywords"].isdisjoint(tokens):
        return ABSTAIN
    return ClassLabels.HAM


@labeling_function(resources=dict(keywords=thirdPersonWords), pre=[convert_to_lower], label=ClassLabels.HAM)
def LF7(c, **kwargs):
    """Vote HAM when "free" appears alongside a third-person word.

    Fires only if the token "free" is present AND at least one token of the
    message is in ``kwargs["keywords"]`` (``thirdPersonWords`` per the
    decorator).

    Returns:
        ``ClassLabels.HAM`` when both conditions hold, otherwise ``ABSTAIN``.
    """
    tokens = c.split()
    # Without the token "free" this rule has nothing to say.
    if "free" not in tokens:
        return ABSTAIN
    if kwargs["keywords"].isdisjoint(tokens):
        return ABSTAIN
    return ClassLabels.HAM

# NOTE(review): this decorator receives ClassLabels.SPAM.value (the int 1) while
# every other labeling function in this file passes the enum member itself -
# confirm which form labeling_function expects.
@labeling_function(label=ClassLabels.SPAM.value)
def LF8(c, **kwargs):
    """Vote SPAM when the message contains more than six upper-case characters.

    No preprocessor is attached, so the original casing of ``c`` is inspected.

    Returns:
        ``ClassLabels.SPAM`` when the upper-case count exceeds 6, otherwise
        ``ABSTAIN``.
    """
    uppercase_count = sum(ch.isupper() for ch in c)
    return ClassLabels.SPAM if uppercase_count > 6 else ABSTAIN

# @labeling_function()
# def LF9(c,**kwargs):
#     return ClassLabels.HAM.value

@labeling_function(cont_scorer=word_similarity, resources=dict(keywords=trigWord1), pre=[convert_to_lower], label=ClassLabels.SPAM)
def CLF1(c, **kwargs):
    """Continuous SPAM rule over ``trigWord1``.

    Reads ``kwargs["continuous_score"]`` (presumably produced by the
    ``word_similarity`` scorer named in the decorator) and votes
    ``ClassLabels.SPAM`` when it is at least ``THRESHOLD``; otherwise abstains.
    """
    score = kwargs["continuous_score"]
    return ClassLabels.SPAM if score >= THRESHOLD else ABSTAIN

@labeling_function(cont_scorer=word_similarity, resources=dict(keywords=trigWord2), pre=[convert_to_lower], label=ClassLabels.SPAM)
def CLF2(c, **kwargs):
    """Continuous SPAM rule over ``trigWord2``.

    Reads ``kwargs["continuous_score"]`` (presumably produced by the
    ``word_similarity`` scorer named in the decorator) and votes
    ``ClassLabels.SPAM`` when it is at least ``THRESHOLD``; otherwise abstains.
    """
    score = kwargs["continuous_score"]
    return ClassLabels.SPAM if score >= THRESHOLD else ABSTAIN

@labeling_function(cont_scorer=word_similarity, resources=dict(keywords=trigWord3), pre=[convert_to_lower], label=ClassLabels.SPAM)
def CLF3(c, **kwargs):
    """Continuous SPAM rule over ``trigWord3``.

    Reads ``kwargs["continuous_score"]`` (presumably produced by the
    ``word_similarity`` scorer named in the decorator) and votes
    ``ClassLabels.SPAM`` when it is at least ``THRESHOLD``; otherwise abstains.
    """
    score = kwargs["continuous_score"]
    return ClassLabels.SPAM if score >= THRESHOLD else ABSTAIN

@labeling_function(cont_scorer=word_similarity, resources=dict(keywords=notFreeWords), pre=[convert_to_lower], label=ClassLabels.HAM)
def CLF4(c, **kwargs):
    """Continuous HAM rule over ``notFreeWords``.

    Reads ``kwargs["continuous_score"]`` (presumably produced by the
    ``word_similarity`` scorer named in the decorator) and votes
    ``ClassLabels.HAM`` when it is at least ``THRESHOLD``; otherwise abstains.
    """
    score = kwargs["continuous_score"]
    return ClassLabels.HAM if score >= THRESHOLD else ABSTAIN

@labeling_function(cont_scorer=word_similarity, resources=dict(keywords=notFreeSubstring), pre=[convert_to_lower], label=ClassLabels.HAM)
def CLF5(c, **kwargs):
    """Continuous HAM rule over ``notFreeSubstring``.

    Reads ``kwargs["continuous_score"]`` (presumably produced by the
    ``word_similarity`` scorer named in the decorator) and votes
    ``ClassLabels.HAM`` when it is at least ``THRESHOLD``; otherwise abstains.
    """
    score = kwargs["continuous_score"]
    return ClassLabels.HAM if score >= THRESHOLD else ABSTAIN

@labeling_function(cont_scorer=word_similarity, resources=dict(keywords=firstAndSecondPersonWords), pre=[convert_to_lower], label=ClassLabels.HAM)
def CLF6(c, **kwargs):
    """Continuous HAM rule over ``firstAndSecondPersonWords``.

    Reads ``kwargs["continuous_score"]`` (presumably produced by the
    ``word_similarity`` scorer named in the decorator) and votes
    ``ClassLabels.HAM`` when it is at least ``THRESHOLD``; otherwise abstains.
    """
    score = kwargs["continuous_score"]
    return ClassLabels.HAM if score >= THRESHOLD else ABSTAIN

@labeling_function(cont_scorer=word_similarity, resources=dict(keywords=thirdPersonWords), pre=[convert_to_lower], label=ClassLabels.HAM)
def CLF7(c, **kwargs):
    """Continuous HAM rule over ``thirdPersonWords``.

    Reads ``kwargs["continuous_score"]`` (presumably produced by the
    ``word_similarity`` scorer named in the decorator) and votes
    ``ClassLabels.HAM`` when it is at least ``THRESHOLD``; otherwise abstains.
    """
    score = kwargs["continuous_score"]
    return ClassLabels.HAM if score >= THRESHOLD else ABSTAIN

# The inline scorer maps the number of upper-case characters n to
# 1 - exp(-n / 2): 0 for no capitals, approaching 1 as n grows.
@labeling_function(
    cont_scorer=lambda x: 1 - np.exp(float(-sum(ch.isupper() for ch in x) / 2)),
    label=ClassLabels.SPAM,
)
def CLF8(c, **kwargs):
    """Continuous SPAM rule based on how many upper-case characters appear.

    Reads ``kwargs["continuous_score"]`` (presumably the value of the lambda
    scorer given in the decorator) and votes ``ClassLabels.SPAM`` when it is
    at least ``THRESHOLD``; otherwise abstains.
    """
    score = kwargs["continuous_score"]
    return ClassLabels.SPAM if score >= THRESHOLD else ABSTAIN

# @labeling_function()
# def CLF9(c,**kwargs):
#     return ClassLabels.HAM.value


In [4]:
# Every labeling function defined in this file; the list order is preserved
# when the functions are registered below.
LFS = [
    # discrete (keyword / character-count) rules
    LF1, LF2, LF3, LF4, LF5, LF6, LF7, LF8,
    # continuous-score rules
    CLF1, CLF2, CLF3, CLF4, CLF5, CLF6, CLF7, CLF8,
]

# Collect the functions into one named set.
rules = LFSet("SPAM_LF")
rules.add_lf_list(LFS)

In [5]:
from utils import load_data_to_numpy
import numpy as np

# Load the dataset. NOTE(review): presumably X holds the SMS texts and Y their
# gold labels - confirm in utils.load_data_to_numpy.
X, Y = load_data_to_numpy()
# Zero matrix with one row per entry of X and one column per labeling function
# registered in `rules`; it is handed to NoisyLabels in a later cell.
R = np.zeros((X.shape[0],len(rules.get_lfs())))


In [6]:
from labeling.noisy_labels import NoisyLabels

# Bundle the data, the labeling-function set, the zero matrix R and the label
# enum under the name "sms".
sms_noisy_labels = NoisyLabels("sms",X,Y,rules,R,ClassLabels)
# NOTE(review): E and S are presumably the discrete labels and the continuous
# scores per (instance, labeling function) - confirm in NoisyLabels.get_labels.
E,S = sms_noisy_labels.get_labels()

100%|██████████| 5574/5574 [05:29<00:00, 16.92it/s]


In [None]:
# NOTE(review): presumably serialises the computed noisy labels to disk -
# confirm the output path and format in NoisyLabels.generate_pickle.
sms_noisy_labels.generate_pickle()