# Classifier

## Imports

In [21]:
file_path = 'dataset.csv'

### I. Preprocessing
Tokenize each line in the dataset into words, ignore punctuation

In [95]:
import csv
import re


def tokenize_text(text):
    # Convert sentences to words
    words = text.lower().split()
    
    # Removing unnecessary punctuation and tags
    words = [re.sub(r"[^a-zA-Z0-9]", "", word) for word in words]
    
    # Removing stop words
    stop_words = set(['the', 'is', 'are', 'and', 'that', 'do', 'have'])
    words = [word for word in words if word not in stop_words]
    
    # Stemming (using a simple stemming rule)
    words = [word[:-1] if word.endswith('s') else word for word in words]
    
    # Lemmatization (not implemented in this example)
    
    return words


def tokenize_csv(file_path, type, cutt_off):
    tokenized_data = []
    
    with open(file_path, 'r') as csvfile:
        reader = csv.DictReader(csvfile)
        
        for i, row in enumerate(reader):
            if type == 'train' and i >= cutt_off:
                continue

            if type == 'test' and i < cutt_off:
                continue
            
            tokenized_row = {}
            
            # Tokenize the 'text' column
            tokenized_text = tokenize_text(row['text'])
            tokenized_row['text'] = tokenized_text
            
            # Preserve the 'type' column as is
            tokenized_row['type'] = row['type']
            
            tokenized_data.append(tokenized_row)
    
    return tokenized_data


### II. Model and Training
Split the data (consisting of 5559 messages) and train the model on the 5000 of the data

In [114]:
cutt_off = 500

In [115]:
from collections import Counter


training_data = tokenize_csv(file_path, 'train', cutt_off)
print(training_data[:5])


spam, ham = [], []
total_spam_messages, total_ham_messages = 0, 0

for message in training_data:
    text, type = message['text'], message['type']

    if type == 'spam':
        spam.extend(text)
        total_spam_messages += 1
    else:
        ham.extend(text)
        total_ham_messages += 1

total_words_in_ham = len(ham)
total_words_in_spam = len(spam)

count_ham = Counter(ham)
count_spam = Counter(spam)

likelyhood_ham = {word: count_ham[word] / total_words_in_ham for word in count_ham}
print(likelyhood_ham)
likelyhood_spam = {word: count_spam[word] / total_words_in_spam for word in count_spam}

probability_ham = total_ham_messages / (total_ham_messages + total_spam_messages)
probability_spam = total_spam_messages / (total_ham_messages + total_spam_messages)

print(probability_ham, probability_spam)

[{'text': ['hope', 'you', 'having', 'a', 'good', 'week', 'just', 'checking', 'in'], 'type': 'ham'}, {'text': ['kgive', 'back', 'my', 'thank'], 'type': 'ham'}, {'text': ['am', 'also', 'doing', 'in', 'cbe', 'only', 'but', 'to', 'pay'], 'type': 'ham'}, {'text': ['complimentary', '4', 'star', 'ibiza', 'holiday', 'or', '10000', 'cash', 'need', 'your', 'urgent', 'collection', '09066364349', 'now', 'from', 'landline', 'not', 'to', 'lose', 'out', 'box434sk38wp150ppm18'], 'type': 'spam'}, {'text': ['okmail', 'dear', 'dave', 'thi', 'your', 'final', 'notice', 'to', 'collect', 'your', '4', 'tenerife', 'holiday', 'or', '5000', 'cash', 'award', 'call', '09061743806', 'from', 'landline', 'tc', 'sae', 'box326', 'cw25wx', '150ppm'], 'type': 'spam'}]
{'hope': 0.0015060240963855422, 'you': 0.028279785809906293, 'having': 0.0008366800535475234, 'a': 0.01572958500669344, 'good': 0.0028447121820615795, 'week': 0.001004016064257028, 'just': 0.0048527443105756355, 'checking': 0.00033467202141900936, 'in': 0.0

### III. Testing

#### III.I. Prediction Function

In [116]:
def predict(message):
    probability_ham_message_given_message = 1
    words_given_ham = 1
    words_given_spam = 1

    for word in message:
        words_given_ham *= likelyhood_ham[word] if word in likelyhood_ham else 1 / total_words_in_ham
        words_given_spam *= likelyhood_spam[word] if word in likelyhood_spam else 1 / total_words_in_spam
    
    probability_ham_message_given_message *= words_given_ham * probability_ham
    probability_spam_message_given_message = words_given_spam * probability_spam

    if probability_ham_message_given_message > probability_spam_message_given_message:
        return 'ham'
    return 'spam'


#### III.II. Moment of Truth

In [117]:
testing_data = tokenize_csv(file_path, 'test', cutt_off)

count_correct = 0
count_incorrect = 0

for message in testing_data:
    if predict(message['text']) == message['type']:
        count_correct += 1
    else:
        count_incorrect += 1

print(count_correct / (count_correct + count_incorrect))

0.7404625420043487
