In [7]:
# importing the Dataset

import pandas as pd

messages = pd.read_csv('SMSSpamCollection', sep='\t',
                           names=["label", "message"])


In [8]:
messages

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [9]:
messages['message'].loc[451]

'hanks lotsly!'

In [10]:
X=messages['message']

In [11]:
y=pd.get_dummies(messages['label'])
y=y.iloc[:,1].values

In [12]:
y

array([False, False,  True, ..., False, False, False])

In [13]:
X.shape

(5572,)

In [14]:
y.shape

(5572,)

In [15]:
# Train Test Split before processing to avoid data leakage and prevent overfitting.

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)


In [16]:
X_train[0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [17]:
print(type(X_train.iloc[0]))


<class 'str'>


In [18]:
#Data cleaning and preprocessing
import re
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [19]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [20]:
## doing stemming on both train and test data
corpus_train = []
corpus_test = []
for i in range(0, len(X_train)):
    review = re.sub('[^a-zA-Z]', ' ', X_train.iloc[i])
    review = review.lower()
    review = review.split()

    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus_train.append(review)
for i in range(0, len(X_test)):
    review = re.sub('[^a-zA-Z]', ' ', X_test.iloc[i])
    review = review.lower()
    review = review.split()

    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus_test.append(review)

In [21]:
corpus_train

['good movi ok leav hourish',
 'free give otherwis nalla adi entey nattil kittum',
 'emigr someth ok mayb bit hope',
 'got home babe still awak',
 'kay sinc alreadi',
 'workin get job',
 'said matter mind say matter',
 'oh yeah diet flew window',
 'sorri got thing may pub later',
 'ill call even ill idea',
 'dear reach room',
 'got look scrumptiou daddi want eat night long',
 'badrith chennai sure pick us competit',
 'da car park',
 'okay soon best',
 'yar els thk sort funni thing',
 'yup izzit still rain heavili co e mrt c outsid',
 'way ur home',
 'ok pop ask bout someth said around tonght wen girl come',
 'talk g x',
 'cool tyler take gonna buy drop place later tonight total order quarter got enough',
 'work time',
 'yup free',
 'mayb westshor hyde park villag place near hous',
 'said would woke',
 'drama pl enough famili struggl hot sun strang place reason ego go invit actual necess go wait seriou reppurcuss',
 'what come hill monster hope great day thing r go fine busi though',
 '

In [22]:
corpus_test

['storm msg wen u lift phne u say hello u knw wt real mean hello name girl ye u knw dat girl margaret hello girlfrnd f grahmbel invnt telphon moral one get name person bt girlfrnd g n g h',
 'forward pleas call immedi urgent messag wait',
 'also sorta blown coupl time recent id rather text blue look weed',
 'sir goodmorn free call',
 'come aliv better correct good look figur',
 'hous maid murder coz man murder lt gt th januari public holiday govt instituit close includ post offic understand',
 'hot n horni will live local text repli hear strt back p per msg netcollex ltdhelpdesk repli stop end',
 'sorri call later meet thing relat trade pleas call arul lt gt',
 'oh sorri pleas',
 'hey hun onbu goin meet want go meal donyt feel like cuz get last bu home he sweet latelyxxx',
 'stupid auto correct phone',
 'oh k k big hitter anyway good',
 'roger probabl go rem',
 'ya ok vikki vl c witin lt gt min il repli u',
 'well leav class babe never came back hope nice sleep love',
 'problem babi go

In [23]:
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2500)
X_train = cv.fit_transform(corpus_train).toarray()

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2500)
X_test = cv.fit_transform(corpus_test).toarray()

In [24]:
X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [25]:
# List of models to test
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier

In [26]:
# Dictionary to hold models and their names
models = {
    'Multinomial Naive Bayes': MultinomialNB(),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Support Vector Machine (SVM)': SVC(kernel='linear'),
    'Random Forest': RandomForestClassifier(n_estimators=100),
    'K-Nearest Neighbors (KNN)': KNeighborsClassifier(n_neighbors=3),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(),
    'AdaBoost': AdaBoostClassifier(n_estimators=100)
}


In [27]:
from sklearn.metrics import accuracy_score, classification_report

# Step 3: Train models, make predictions and evaluate
results = {}
for model_name, model in models.items():
    model.fit(X_train, y_train)  # Train the model
    y_pred = model.predict(X_test)  # Make predictions

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)

    # Generate classification report
    report = classification_report(y_test, y_pred)

    # Store the results
    results[model_name] = {
        'accuracy': accuracy,
        'classification_report': report
    }

# Step 4: Print all models' accuracy and classification report
for model_name, result in results.items():
    print(f"{model_name}: Accuracy = {result['accuracy']:.4f}")
    print(f"Classification Report:\n{result['classification_report']}")

# Step 5: Find the best model based on accuracy
best_model = max(results, key=lambda x: results[x]['accuracy'])
best_accuracy = results[best_model]['accuracy']
best_report = results[best_model]['classification_report']

# Print the best model and its accuracy and classification report
print(f"\nBest Model: {best_model}")
print(f"Best Accuracy: {best_accuracy:.4f}")
print(f"Best Classification Report:\n{best_report}")


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Multinomial Naive Bayes: Accuracy = 0.7283
Classification Report:
              precision    recall  f1-score   support

       False       0.87      0.81      0.84       955
        True       0.18      0.26      0.21       160

    accuracy                           0.73      1115
   macro avg       0.52      0.53      0.52      1115
weighted avg       0.77      0.73      0.75      1115

Logistic Regression: Accuracy = 0.8565
Classification Report:
              precision    recall  f1-score   support

       False       0.86      1.00      0.92       955
        True       0.00      0.00      0.00       160

    accuracy                           0.86      1115
   macro avg       0.43      0.50      0.46      1115
weighted avg       0.73      0.86      0.79      1115

Support Vector Machine (SVM): Accuracy = 0.8502
Classification Report:
              precision    recall  f1-score   support

       False       0.86      0.99      0.92       955
        True       0.27      0.03     

In [28]:
# Creating the TFIDF model
from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(max_features=2500)
X_train = tv.fit_transform(corpus_train).toarray()

from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(max_features=2500)
X_test = tv.fit_transform(corpus_test).toarray()

In [29]:
X_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [30]:
# List of models to test
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier

In [31]:
# Dictionary to hold models and their names
models = {
    'Multinomial Naive Bayes': MultinomialNB(),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Support Vector Machine (SVM)': SVC(kernel='linear'),
    'Random Forest': RandomForestClassifier(n_estimators=100),
    'K-Nearest Neighbors (KNN)': KNeighborsClassifier(n_neighbors=3),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(),
    'AdaBoost': AdaBoostClassifier(n_estimators=100)
}


In [32]:
from sklearn.metrics import accuracy_score, classification_report

# Step 3: Train models, make predictions and evaluate
results = {}
for model_name, model in models.items():
    model.fit(X_train, y_train)  # Train the model
    y_pred = model.predict(X_test)  # Make predictions

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)

    # Generate classification report
    report = classification_report(y_test, y_pred)

    # Store the results
    results[model_name] = {
        'accuracy': accuracy,
        'classification_report': report
    }

# Step 4: Print all models' accuracy and classification report
for model_name, result in results.items():
    print(f"{model_name}: Accuracy = {result['accuracy']:.4f}")
    print(f"Classification Report:\n{result['classification_report']}")

# Step 5: Find the best model based on accuracy
best_model = max(results, key=lambda x: results[x]['accuracy'])
best_accuracy = results[best_model]['accuracy']
best_report = results[best_model]['classification_report']

# Print the best model and its accuracy and classification report
print(f"\nBest Model: {best_model}")
print(f"Best Accuracy: {best_accuracy:.4f}")
print(f"Best Classification Report:\n{best_report}")


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Multinomial Naive Bayes: Accuracy = 0.8287
Classification Report:
              precision    recall  f1-score   support

       False       0.86      0.95      0.90       955
        True       0.25      0.09      0.14       160

    accuracy                           0.83      1115
   macro avg       0.55      0.52      0.52      1115
weighted avg       0.77      0.83      0.79      1115

Logistic Regression: Accuracy = 0.8565
Classification Report:
              precision    recall  f1-score   support

       False       0.86      1.00      0.92       955
        True       0.00      0.00      0.00       160

    accuracy                           0.86      1115
   macro avg       0.43      0.50      0.46      1115
weighted avg       0.73      0.86      0.79      1115

Support Vector Machine (SVM): Accuracy = 0.8502
Classification Report:
              precision    recall  f1-score   support

       False       0.86      0.99      0.92       955
        True       0.11      0.01     

#### We can see an improvement of around 10% in the accuracy of the Multinomial Naive Bayes model when using TF-IDF compared to Bag of Words (BoW)

# Word2Vec and AvgWord2Vec Implementation for Text Classification


In [33]:
!pip install gensim



In [35]:
import gensim

In [50]:
import nltk
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [39]:
# Download the necessary NLTK data package
nltk.download('punkt_tab')  # Download the punkt_tab data package

# Tokenize and preprocess the corpus
tokenized_corpus = [word_tokenize(sentence.lower()) for sentence in messages['message']]

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [40]:
# Step 2: Train Word2Vec model using the tokenized sentences
model = Word2Vec(sentences=tokenized_corpus, vector_size=100, window=5, min_count=2, sg=1)

In [41]:
# Step 3: Function to average word vectors for a sentence
def get_average_vector(sentence, model):
    # Get the vector for each word and average them
    word_vectors = [model.wv[word] for word in sentence if word in model.wv]
    if len(word_vectors) == 0:
        return np.zeros(model.vector_size)
    return np.mean(word_vectors, axis=0)

In [43]:
# Step 4: Convert each document into a vector by averaging the word vectors
X = np.array([get_average_vector(sentence, model) for sentence in tokenized_corpus])
y = np.array(messages['label'])

In [44]:
# Step 5: Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [53]:
# Train a logistic regression model using the averaged Word2Vec embeddings
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Display the results
print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)


Accuracy: 0.9695067264573991
Classification Report:
              precision    recall  f1-score   support

         ham       0.97      0.99      0.98       966
        spam       0.95      0.81      0.88       149

    accuracy                           0.97      1115
   macro avg       0.96      0.90      0.93      1115
weighted avg       0.97      0.97      0.97      1115



## See, we are getting almost 97% accuracy, which is significantly better than Bag of Words (BoW) and TF-IDF. This is because AvgWord2Vec captures the semantic meaning of words by representing them as dense vectors, whereas BoW and TF-IDF rely on word frequencies and do not account for the context or relationships between words.
