In [17]:
from nltk.tokenize import word_tokenize

In [18]:
text = "Cats are running faster than dogs! Dogs, however, are smarter."
tokens = word_tokenize(text)
tokens

['Cats',
 'are',
 'running',
 'faster',
 'than',
 'dogs',
 '!',
 'Dogs',
 ',',
 'however',
 ',',
 'are',
 'smarter',
 '.']

In [19]:
# vocab
vocab = sorted(set(tokens))
vocab

['!',
 ',',
 '.',
 'Cats',
 'Dogs',
 'are',
 'dogs',
 'faster',
 'however',
 'running',
 'smarter',
 'than']

In [20]:
# this is manual method
from collections import Counter

word_counts = Counter(tokens)
bow_vector = [word_counts[word] for word in vocab]
print(bow_vector)


[1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1]


In [21]:
from collections import Counter

word_counts = Counter(tokens)
bow_vector = [word_counts[word] for word in vocab]
print(bow_vector)

[1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1]


### using sklearn

In [23]:
from sklearn.feature_extraction.text import CountVectorizer

docs = [
    "Cats are running faster than dogs.",
    "Dogs are smarter than cats."
]

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(docs)

print(vectorizer.get_feature_names_out())
print(X.toarray())


['are' 'cats' 'dogs' 'faster' 'running' 'smarter' 'than']
[[1 1 1 1 1 0 1]
 [1 1 1 0 0 1 1]]


In [44]:
# to use this we had to downgrade numpy to 1.26
from sklearn.feature_extraction.text import CountVectorizer

docs = [
    "Cats are running faster than dogs.",
    "Dogs are smarter than cats."
]

vectorizer = CountVectorizer()
x = vectorizer.fit_transform(docs)

print(vectorizer.get_feature_names_out())
print(x.toarray())

['are' 'cats' 'dogs' 'faster' 'running' 'smarter' 'than']
[[1 1 1 1 1 0 1]
 [1 1 1 0 0 1 1]]


In [3]:
vectorizer.vocabulary_

{'cats': 1,
 'are': 0,
 'running': 4,
 'faster': 3,
 'than': 6,
 'dogs': 2,
 'smarter': 5}

## Spam filter

In [49]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re 
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

### Data Loading

In [21]:
import pandas as pd

df = pd.read_csv("./data/spam.csv", encoding="latin-1")[['v1', 'v2']]
df.columns = ['label', 'message']

df.head(5)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### Data preprocessing

In [26]:
def preprocessing(text):
    text = text.lower()                                         # lowercase
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)         # remove URLs
    text = re.sub(r'\S+@\S+', '', text)                         # remove emails
    text = re.sub(r'\+?\d[\d -]{8,}\d', '', text)               # remove phone numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # remove punctuation
    # stemming and stopwords
    return text

df['cleaned'] = df['message'].apply(preprocessing)
df.head()

Unnamed: 0,label,message,cleaned
0,ham,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...
3,ham,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah i dont think he goes to usf he lives aroun...


### Convert labels to binary

In [29]:
df['label_num'] = df['label'].map({'ham': 0, "spam": 1})

In [30]:
df.head()

Unnamed: 0,label,message,cleaned,label_num
0,ham,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...,0
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...,1
3,ham,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah i dont think he goes to usf he lives aroun...,0


### Train test split

In [41]:
x = df['cleaned']
y = df['label_num']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [42]:
print(f"x_train: {len(x_train)}, x_test: {len(x_test)}, y_train: {len(y_train)}, y_test: {len(y_test)}")

x_train: 4457, x_test: 1115, y_train: 4457, y_test: 1115


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(x_train)
X_test_vec = vectorizer.transform(x_test)


### Bag of words vectorization

In [None]:
vectorizer = CountVectorizer()
x_train_vec = vectorizer.fit_transform(x_train)
x_test_vec = vectorizer.transform(x_test)


<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 57296 stored elements and shape (4457, 8055)>

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

model = MultinomialNB()
model.fit(X_train_vec, y_train)

y_pred = model.predict(X_test_vec)


In [57]:
model = MultinomialNB()
model.fit(x_train_vec, y_train)

y_pred = model.predict(x_test_vec)

In [58]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9811659192825112

Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.98      0.88      0.93       150

    accuracy                           0.98      1115
   macro avg       0.98      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [65]:
sample = ["i won"]
sample_clean = [preprocessing(s) for s in sample]
sample_vec = vectorizer.transform(sample_clean)

prediction = model.predict(sample_vec)
print("Spam" if prediction[0] == 1 else "Ham")


Spam


In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Assume you have texts and labels
# X_train, X_test, y_train, y_test = train_test_split(text, labels, test_size=0.2)

vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(x_train)
X_test_vec = vectorizer.transform(x_test)

model = MultinomialNB()
model.fit(X_train_vec, y_train)

pred = model.predict(X_test_vec)
print("Accuracy:", accuracy_score(y_test, pred))


NameError: name 'texts' is not defined