In [1]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Lab | Natural Language Processing
### SMS: SPAM or HAM

### Let's prepare the environment

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

- Read Data for the Fraudulent Email Kaggle Challenge
- Reduce the training set to speead up development. 

In [3]:
## Read Data for the Fraudulent Email Kaggle Challenge
data = pd.read_csv("/Users/chandlershortlidge/Desktop/Ironhack/lab-natural-language-processing/data/kg_test.csv",encoding='latin-1')

# Reduce the training set to speed up development. 
# Modify for final system
data = data.head(1000)
print(data.shape)
data.fillna("",inplace=True)

(1000, 1)


In [4]:
data.head()

Unnamed: 0,text
0,usiness is for the fact that the deceased man ...
1,They are happy to adjust to the afternoon. I a...
2,Lael Brainard was confirmed 78-19 this afterno...
3,H <hrod17@clintonemail.com>Friday March 26 201...
4,"n;""> Dear Good Friend,<br><br><br>I am happy t..."


### Let's divide the training and test set into two partitions

In [5]:
# Your code

# Load training data
data_train = pd.read_csv("../data/kg_train.csv", encoding='latin-1')
data_train = data_train.head(1000)  # Reduce for faster development
data_train.fillna("", inplace=True)

# Load test/validation data
data_val = pd.read_csv("../data/kg_test.csv", encoding='latin-1')
data_val = data_val.head(1000)  # Reduce for faster development
data_val.fillna("", inplace=True)

print(f"Training set: {data_train.shape}")
print(f"Validation set: {data_val.shape}")

Training set: (1000, 2)
Validation set: (1000, 1)


## Data Preprocessing

In [6]:
import string
from nltk.corpus import stopwords
print(string.punctuation)
print(stopwords.words("english")[100:110])
from nltk.stem.snowball import SnowballStemmer
snowball = SnowballStemmer('english')

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
['needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on']


## Now, we have to clean the html code removing words

- First we remove inline JavaScript/CSS
- Then we remove html comments. This has to be done before removing regular tags since comments can contain '>' characters
- Next we can remove the remaining tags

- Remove all the special characters
    
- Remove numbers
    
- Remove all single characters
 
- Remove single characters from the start

- Substitute multiple spaces with single space

- Remove prefixed 'b'

- Convert to Lowercase

In [10]:
# Your code

import re

def preprocess_text(text):
    # Remove inline JavaScript/CSS
    text = re.sub(r'<script[^>]*>.*?</script>', '', text, flags=re.DOTALL)
    text = re.sub(r'<style[^>]*>.*?</style>', '', text, flags=re.DOTALL)
    
    # Remove HTML comments
    text = re.sub(r'<!--.*?-->', '', text, flags=re.DOTALL)
    
    # Remove HTML tags
    text = re.sub(r'<[^>]+>', '', text)
    
    # Remove special characters (keep only letters, numbers, spaces)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    
    # Remove all single characters (standalone letters)
    text = re.sub(r'\b[a-zA-Z]\b', '', text)
    
    # Remove single characters from the start
    text = re.sub(r'^[a-zA-Z]\s', '', text)
    
    # Substitute multiple spaces with single space
    text = re.sub(r'\s+', ' ', text).strip()
    
    # Remove prefixed 'b' (like b'string' from bytes)
    text = re.sub(r"^b'", '', text)
    
    # Convert to lowercase
    text = text.lower()
    
    return text

# Create preprocessed_text column by applying to original text
data_train['preprocessed_text'] = data_train['text'].apply(preprocess_text)
data_val['preprocessed_text'] = data_val['text'].apply(preprocess_text)

print("Preprocessing complete!")
print(data_train[['text', 'preprocessed_text']].head(2))

Preprocessing complete!
                                                text  \
0  DEAR SIR, STRICTLY A PRIVATE BUSINESS PROPOSAL...   
1                                           Will do.   

                                   preprocessed_text  
0  dear sir strictly private business proposal am...  
1                                            will do  


## Now let's work on removing stopwords
Remove the stopwords.

In [11]:
# Your code

from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

# Apply to your data
data_train['preprocessed_text'] = data_train['preprocessed_text'].apply(remove_stopwords)
data_val['preprocessed_text'] = data_val['preprocessed_text'].apply(remove_stopwords)

## Tame Your Text with Lemmatization
Break sentences into words, then use lemmatization to reduce them to their base form (e.g., "running" becomes "run"). See how this creates cleaner data for analysis!

In [12]:
# Your code

from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    words = text.split()
    lemmatized = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(lemmatized)

# Apply to your data
data_train['preprocessed_text'] = data_train['preprocessed_text'].apply(lemmatize_text)
data_val['preprocessed_text'] = data_val['preprocessed_text'].apply(lemmatize_text)

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/chandlershortlidge/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Bag Of Words
Let's get the 10 top words in ham and spam messages (**EXPLORATORY DATA ANALYSIS**)

In [15]:
# Your code
from collections import Counter

# Split by label
ham_texts = data_train[data_train['label'] == 0]['preprocessed_text']
spam_texts = data_train[data_train['label'] == 1]['preprocessed_text']

# Combine all words for each
ham_words = ' '.join(ham_texts).split()
spam_words = ' '.join(spam_texts).split()

# Count and get top 10
print("Top 10 ham words:")
print(Counter(ham_words).most_common(10))

print("\nTop 10 spam words:")
print(Counter(spam_words).most_common(10))

Top 10 ham words:
[('u', 115), ('pm', 115), ('would', 106), ('state', 103), ('president', 94), ('call', 91), ('time', 84), ('percent', 77), ('secretary', 76), ('work', 73)]

Top 10 spam words:
[('money', 920), ('account', 794), ('bank', 745), ('fund', 703), ('u', 550), ('business', 473), ('transaction', 416), ('country', 406), ('transfer', 392), ('million', 385)]


## Extra features

In [16]:
# We add to the original dataframe two additional indicators (money symbols and suspicious words).
money_simbol_list = "|".join(["euro","dollar","pound","€",r"\$"])
suspicious_words = "|".join(["free","cheap","sex","money","account","bank","fund","transfer","transaction","win","deposit","password"])

data_train['money_mark'] = data_train['preprocessed_text'].str.contains(money_simbol_list)*1
data_train['suspicious_words'] = data_train['preprocessed_text'].str.contains(suspicious_words)*1
data_train['text_len'] = data_train['preprocessed_text'].apply(lambda x: len(x)) 

data_val['money_mark'] = data_val['preprocessed_text'].str.contains(money_simbol_list)*1
data_val['suspicious_words'] = data_val['preprocessed_text'].str.contains(suspicious_words)*1
data_val['text_len'] = data_val['preprocessed_text'].apply(lambda x: len(x)) 

data_train.head()

Unnamed: 0,text,label,preprocessed_text,money_mark,suspicious_words,text_len
0,"DEAR SIR, STRICTLY A PRIVATE BUSINESS PROPOSAL...",1,dear sir strictly private business proposal mi...,1,1,1504
1,Will do.,0,,0,0,0
2,Nora--Cheryl has emailed dozens of memos about...,0,noracheryl emailed dozen memo haiti weekend pl...,0,0,110
3,Dear Sir=2FMadam=2C I know that this proposal ...,1,dear sirfmadamc know proposal might surprise e...,1,1,1382
4,fyi,0,fyi,0,0,3


## How would work the Bag of Words with Count Vectorizer concept?

In [17]:
# Your code
from sklearn.feature_extraction.text import CountVectorizer

# Create the vectorizer
count_vectorizer = CountVectorizer()

# Fit and transform the training data
bow_train = count_vectorizer.fit_transform(data_train['preprocessed_text'])

# Transform the validation data (using the same vocabulary)
bow_val = count_vectorizer.transform(data_val['preprocessed_text'])

print(f"Vocabulary size: {len(count_vectorizer.vocabulary_)}")
print(f"Training BoW shape: {bow_train.shape}")
print(f"Validation BoW shape: {bow_val.shape}")

# Show example: first 10 words in vocabulary
feature_names = count_vectorizer.get_feature_names_out()
print(f"\nFirst 10 words in vocabulary: {feature_names[:10]}")


Vocabulary size: 19352
Training BoW shape: (1000, 19352)
Validation BoW shape: (1000, 19352)

First 10 words in vocabulary: ['aac' 'aaclocated' 'aae' 'aag' 'aaronovitchon' 'abacha' 'abachabefore'
 'abachac' 'abachace' 'abachaco']


## TF-IDF

- Load the vectorizer

- Vectorize all dataset

- print the shape of the vetorized dataset

In [19]:
# Your code

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

X_train = vectorizer.fit_transform(data_train["preprocessed_text"])
X_val = vectorizer.transform(data_val["preprocessed_text"])

# Print shape
print("Train shape:", X_train.shape)
print("Val shape:", X_val.shape)



Train shape: (1000, 19352)
Val shape: (1000, 19352)


## And the Train a Classifier?

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Split training data into train and validation
X_train_split, X_val_split, y_train, y_val = train_test_split(
    X_train, data_train['label'], test_size=0.2, random_state=42
)

# Train
model = LogisticRegression(max_iter=1000)
model.fit(X_train_split, y_train)

# Predict
y_pred = model.predict(X_val_split)

# Evaluate
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.91      1.00      0.95       125
           1       1.00      0.84      0.91        75

    accuracy                           0.94       200
   macro avg       0.96      0.92      0.93       200
weighted avg       0.95      0.94      0.94       200



### Extra Task - Implement a SPAM/HAM classifier

https://www.kaggle.com/t/b384e34013d54d238490103bc3c360ce

The classifier can not be changed!!! It must be the MultinimialNB with default parameters!

Your task is to **find the most relevant features**.

For example, you can test the following options and check which of them performs better:
- Using "Bag of Words" only
- Using "TF-IDF" only
- Bag of Words + extra flags (money_mark, suspicious_words, text_len)
- TF-IDF + extra flags


You can work with teams of two persons (recommended).

In [29]:
# Your code

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(data_train['preprocessed_text'])

# Split training data into train and validation
X_train_split, X_val_split, y_train, y_val = train_test_split(
    X_train, data_train['label'], test_size=0.2, random_state=42
)

# Train
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(X_train_split, y_train)

# Predict
y_pred = model.predict(X_val_split)

# Evaluate
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.93      0.94       125
           1       0.89      0.93      0.91        75

    accuracy                           0.93       200
   macro avg       0.92      0.93      0.93       200
weighted avg       0.93      0.93      0.93       200



In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(data_train['preprocessed_text'])

# Split training data into train and validation
X_train_split, X_val_split, y_train, y_val = train_test_split(
    X_train, data_train['label'], test_size=0.2, random_state=42
)

# Train
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_split, y_train)

# Predict
y_pred = model.predict(X_val_split)

# Evaluate
print(classification_report(y_val, y_pred))


              precision    recall  f1-score   support

           0       0.98      0.92      0.95       125
           1       0.88      0.97      0.92        75

    accuracy                           0.94       200
   macro avg       0.93      0.95      0.94       200
weighted avg       0.94      0.94      0.94       200



In [33]:
from scipy.sparse import hstack
import numpy as np

# Create extra features (do this BEFORE the train/test split)
data_train['money_mark'] = data_train['text'].str.contains('\$|£|€|money|cash', regex=True).astype(int)
data_train['suspicious_words'] = data_train['text'].str.lower().str.contains('free|win|click|urgent|claim', regex=True).astype(int)

# Vectorize text
vectorizer = TfidfVectorizer()
X_text = vectorizer.fit_transform(data_train['preprocessed_text'])

# Get extra features as array
extra_features = data_train[['money_mark', 'suspicious_words']].values

# Combine them
X_combined = hstack([X_text, extra_features])

# Now do your train/test split on X_combined
X_train_split, X_val_split, y_train, y_val = train_test_split(
    X_combined, data_train['label'], test_size=0.2, random_state=42
)

# Train and evaluate
model = MultinomialNB()
model.fit(X_train_split, y_train)
y_pred = model.predict(X_val_split)
print(classification_report(y_val, y_pred))


              precision    recall  f1-score   support

           0       0.97      0.89      0.93       125
           1       0.84      0.96      0.89        75

    accuracy                           0.92       200
   macro avg       0.91      0.92      0.91       200
weighted avg       0.92      0.92      0.92       200

