# ABOUT:
- this notebook:
    1. **append categorical features as a string** to the end of the "text" feature
        - rationale:
            - appending good predictors of spam as text can help BERT learn to predict spam better
    2. tokenize in preparation for input for transformers

In [1]:
import pandas as pd
df = pd.read_csv(r"C:\Users\tanch\Documents\GitHub\Spam Detection (local)\data\smsdata.txt", sep = "\t",names = ['label','text'])

## Extract categorical features

- based on past work and visualisation we found that:
    1. emails with large number of digits are very likely to be spam
    2. emails that contain pounds sign are very likely to be spam
    3. emails that contain links are very liekly to be spam
    4. emails that contain Ellipses are very liekly to be NOT spam
    5. emails that contain happy faces are very liekly to be NOT spam

In [3]:
def happy_face_feature(text):
    if ':)' in text or ':-)' in text:
        return "Contains happy face."
    return ""
df['happy_face']=df['text'].apply(lambda text:happy_face_feature(text))

In [5]:
def ellipses_feature(text):
    return "" if ".." not in text else "Contains ellipses."
df['ellipses']=df['text'].apply(lambda text:ellipses_feature(text))

In [6]:
def links_feature(text):
    if 'www.' in text.lower() or 'http' in text.lower():
        return "Contains links."
    return ""
df['links']=df['text'].apply(lambda text:links_feature(text))

In [8]:
def pounds_feature(text):
    if '£' in text:
        return "Contains pounds sign."
    return ""
df['pounds']=df['text'].apply(lambda text:pounds_feature(text))

In [9]:
def digits_feature(text):
    count = 0
    for c in text:
        if c.isdigit():
            count+=1
        if count>10:
            return "Contains many digits."
    return ""       
df['many_digits']=df['text'].apply(lambda text:digits_feature(text))

# add categorical features into text

In [12]:
text_features = ['text', 'happy_face', 'ellipses', 'links', 'pounds','many_digits']
df['amended_text'] = ""
for col in text_features:
    df['amended_text']+=df[col]

In [19]:
from datasets import ClassLabel, Sequence
import pandas as pd
from IPython.display import display, HTML
def show_elements(dataset, randomize = True, num_samples = 10):
    
    if isinstance(dataset,pd.DataFrame):                  # if DataFrame 
        if randomize:                                          # if random> shuffle
            dataset = dataset.sample(frac=1)
        display(HTML(dataset.iloc[:num_samples].to_html()))             # take first n rows
    
    else:                                                    # if not DataFrame
        if randomize:                                           # if random> shuffle
            dataset = dataset.shuffle()   
        dataset = pd.DataFrame(dataset.select(range(num_samples)))   # convert first n rows to dataframe
        display(HTML(dataset.to_html()))
    

In [22]:
show_elements(df)

Unnamed: 0,label,text,happy_face,ellipses,links,pounds,many_digits,amended_text
2649,0,"Hi, can i please get a &lt;#&gt; dollar loan from you. I.ll pay you back by mid february. Pls.",,,,,,"Hi, can i please get a &lt;#&gt; dollar loan from you. I.ll pay you back by mid february. Pls."
2003,1,TheMob>Yo yo yo-Here comes a new selection of hot downloads for our members to get for FREE! Just click & open the next link sent to ur fone...,,Contains ellipses.,,,,TheMob>Yo yo yo-Here comes a new selection of hot downloads for our members to get for FREE! Just click & open the next link sent to ur fone...Contains ellipses.
2094,1,"Final Chance! Claim ur £150 worth of discount vouchers today! Text YES to 85023 now! SavaMob, member offers mobile! T Cs SavaMob POBOX84, M263UZ. £3.00 Subs 16",,,,Contains pounds sign.,Contains many digits.,"Final Chance! Claim ur £150 worth of discount vouchers today! Text YES to 85023 now! SavaMob, member offers mobile! T Cs SavaMob POBOX84, M263UZ. £3.00 Subs 16Contains pounds sign.Contains many digits."
1303,0,"FRAN I DECIDED 2 GO N E WAY IM COMPLETELY BROKE AN KNACKERED I GOT UP BOUT 3 C U 2MRW LOVE JANX P.S THIS IS MY DADS FONE, -NO CREDIT",,,,,,"FRAN I DECIDED 2 GO N E WAY IM COMPLETELY BROKE AN KNACKERED I GOT UP BOUT 3 C U 2MRW LOVE JANX P.S THIS IS MY DADS FONE, -NO CREDIT"
226,0,Would really appreciate if you call me. Just need someone to talk to.,,,,,,Would really appreciate if you call me. Just need someone to talk to.
4012,1,Please call our customer service representative on FREEPHONE 0808 145 4742 between 9am-11pm as you have WON a guaranteed £1000 cash or £5000 prize!,,,,Contains pounds sign.,Contains many digits.,Please call our customer service representative on FREEPHONE 0808 145 4742 between 9am-11pm as you have WON a guaranteed £1000 cash or £5000 prize!Contains pounds sign.Contains many digits.
5412,0,Gr8. So how do you handle the victoria island traffic. Plus when's the album due,,,,,,Gr8. So how do you handle the victoria island traffic. Plus when's the album due
4095,0,how are you? I miss you!,,,,,,how are you? I miss you!
4077,1,87077: Kick off a new season with 2wks FREE goals & news to ur mobile! Txt ur club name to 87077 eg VILLA to 87077,,,,,Contains many digits.,87077: Kick off a new season with 2wks FREE goals & news to ur mobile! Txt ur club name to 87077 eg VILLA to 87077Contains many digits.
4606,0,"I need an 8th but I'm off campus atm, could I pick up in an hour or two?",,,,,,"I need an 8th but I'm off campus atm, could I pick up in an hour or two?"


### encode label

In [21]:
df.label = df.label.map({"spam":1,"good":0})

### tokenize

In [24]:
from datasets import Dataset
amended_Dataset = Dataset.from_pandas(df[["label","amended_text"]])
amended_Dataset = amended_Dataset.train_test_split(0.3,seed=1)
amended_Dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'amended_text'],
        num_rows: 3900
    })
    test: Dataset({
        features: ['label', 'amended_text'],
        num_rows: 1672
    })
})

In [25]:
max_length = 100

In [26]:
from transformers import AutoTokenizer
model_checkpoint = "distilroberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [27]:
def tokenize(examples):
    return tokenizer(examples['amended_text'],
                     max_length = max_length,
                     padding = "max_length",
                     truncation = True)

In [28]:
tokenized_amended_Dataset = amended_Dataset.map(tokenize)

HBox(children=(FloatProgress(value=0.0, max=3900.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1672.0), HTML(value='')))




In [30]:
show_elements(tokenized_amended_Dataset['train'])

Unnamed: 0,amended_text,attention_mask,input_ids,label
0,"Urgent! call 09066350750 from your landline. Your complimentary 4* Ibiza Holiday or 10,000 cash await collection SAE T&Cs PO BOX 434 SK3 8WP 150 ppm 18+Contains many digits.","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 38046, 13907, 328, 486, 321, 3248, 4280, 10056, 10569, 31, 110, 1212, 1902, 4, 2486, 22310, 204, 3226, 14643, 10071, 10824, 50, 158, 6, 151, 1055, 17396, 2783, 208, 16329, 255, 947, 31229, 17182, 40007, 204, 3079, 14795, 246, 290, 28435, 3982, 42805, 504, 2744, 26412, 5069, 171, 15769, 4, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",1
1,PRIVATE! Your 2004 Account Statement for 07742676969 shows 786 unredeemed Bonus Points. To claim call 08719180248 Identifier Code: 45239 ExpiresContains many digits.,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 4454, 6372, 8625, 328, 2486, 4482, 16036, 19356, 13, 321, 39786, 28497, 4563, 4563, 924, 262, 5334, 542, 2050, 242, 33578, 32844, 19378, 4, 598, 2026, 486, 321, 5677, 1646, 14515, 28654, 28763, 24072, 8302, 35, 2248, 29561, 12806, 7948, 26412, 5069, 171, 15769, 4, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",1
2,I'm so in love with you. I'm excited each day i spend with you. You make me so happy.,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 100, 437, 98, 11, 657, 19, 47, 4, 38, 437, 2283, 349, 183, 939, 1930, 19, 47, 4, 370, 146, 162, 98, 1372, 4, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",0
3,Then. You are eldest know.,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 12948, 4, 370, 32, 21023, 216, 4, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",0
4,Nope i'm not drivin... I neva develop da photos lei...Contains ellipses.,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 487, 9877, 939, 437, 45, 13911, 6320, 734, 38, 3087, 3952, 2179, 2955, 2356, 2084, 118, 734, 26412, 5069, 28041, 7418, 293, 4, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",0
5,wamma get laid?want real doggin locations sent direct to your mobile? join the UKs largest dogging network. txt dogs to 69696 now!nyt. ec2a. 3lp £1.50/msg.Contains pounds sign.,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 605, 424, 1916, 120, 4976, 116, 32835, 588, 109, 6149, 179, 3237, 1051, 2228, 7, 110, 1830, 116, 1962, 5, 987, 29, 1154, 109, 23941, 1546, 4, 326, 11483, 3678, 7, 5913, 36999, 122, 328, 2855, 90, 4, 20508, 176, 102, 4, 155, 39031, 984, 134, 4, 1096, 73, 48593, 4, 26412, 5069, 2697, 1203, 4, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",1
6,Cheers for the card ... Is it that time of year already?Contains ellipses.,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 16764, 268, 13, 5, 1886, 1666, 1534, 24, 14, 86, 9, 76, 416, 116, 26412, 5069, 28041, 7418, 293, 4, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",0
7,hi baby im sat on the bloody bus at the mo and i wont be home until about 7:30 wanna do somethin later? call me later ortxt back jess xx,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 3592, 1928, 4356, 4005, 15, 5, 13629, 2353, 23, 5, 7458, 8, 939, 40067, 28, 184, 454, 59, 262, 35, 541, 23126, 109, 45420, 11040, 423, 116, 486, 162, 423, 50, 46795, 124, 1236, 3361, 37863, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",0
8,I'm serious. You are in the money base,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 100, 437, 1473, 4, 370, 32, 11, 5, 418, 1542, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",0
9,SMS AUCTION - A BRAND NEW Nokia 7250 is up 4 auction today! Auction is FREE 2 join & take part! Txt NOKIA to 86021 now!Contains many digits.,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 104, 6222, 83, 28120, 7744, 111, 83, 6823, 5945, 5178, 10951, 4801, 1096, 16, 62, 204, 4912, 452, 328, 26342, 16, 5198, 132, 1962, 359, 185, 233, 328, 255, 11483, 8228, 530, 2889, 7, 290, 2466, 2146, 122, 328, 26412, 5069, 171, 15769, 4, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",1


## export

In [31]:
tokenized_amended_Dataset.save_to_disk(r"C:\Users\tanch\Documents\GitHub\Spam Detection (local)\data\tokenized_amended_Dataset")