In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file in it
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns

In [None]:
# Read the SMS spam collection, decoding the file as ISO-8859-1, then display it
df = pd.read_csv(
    "/kaggle/input/sms-spam-collection-dataset/spam.csv",
    sep=",",
    encoding="ISO-8859-1",
)
df

In [None]:
## First we will remove the unnecessary columns.
# Reassign instead of mutating in place and ignore columns that are already gone,
# so re-running this cell does not raise a KeyError.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

In [None]:
## Give the columns descriptive names: v1 becomes "label", v2 becomes "message"
column_names = {"v1": "label", "v2": "message"}
df.rename(columns=column_names, inplace=True)
df

In [None]:
## Check whether the dataset is balanced or imbalanced:
## count the messages per label and draw the same counts as a bar plot
label_counts = df["label"].value_counts()
label_axes = sns.countplot(x=df["label"])
label_counts, label_axes

From the plot above we can see that the dataset is imbalanced: out of 5572 SMS messages, we have
1. 4825 ham SMS
2. 747 Spam SMS

In [None]:
## Import nltk library to do NLP tasks 
# (*) Will import all modules from that library
import nltk
## For stopwords
from nltk.corpus import *
# For Stemming and Lemmatization Preprocessing Models
from nltk.stem import *
## Regex module, used to find patterns while preprocessing the text
import re
## Install wordnet 
!pip install wordnet
## Import wordnet for Lemmatization process to find meaningful word
nltk.download('wordnet')
## To unzip the corpora wordnet
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

## Define a function for stemming and lemmatization text preprocessing


In [None]:
def preprocess_text(text, use_stemming=True, use_lemmatization=True):
    '''Clean a collection of messages and optionally stem or lemmatize them.

    Every message has its non-letter characters replaced by spaces, is
    lower-cased and split on whitespace. When a normalizer is selected,
    English stop words are dropped and each remaining word is normalized.

    Parameters
    ----------
    text : iterable of str
        The messages, e.g. a DataFrame column or a list. Entries are read by
        iteration, so a pandas Series with any index works.
    use_stemming : bool, default True
        Apply ``PorterStemmer``. Takes precedence over ``use_lemmatization``.
    use_lemmatization : bool, default True
        Apply ``WordNetLemmatizer``; only consulted when ``use_stemming`` is False.

    Returns
    -------
    list of str
        A fresh list with one space-joined string per message, in input order.
        With both flags False the words are only cleaned; stop words are kept.
    '''
    # Pick the word normalizer once. When tuning for accuracy, swap the
    # stemmer/lemmatizer class here to see how the results vary.
    if use_stemming:
        normalize = PorterStemmer().stem
    elif use_lemmatization:
        normalize = WordNetLemmatizer().lemmatize
    else:
        normalize = None

    # Build the stop-word set a single time: asking for stopwords.words() inside
    # the comprehension rebuilt the list for every word and tested membership
    # against a list instead of a set.
    stop_words = set(stopwords.words("english")) if normalize is not None else frozenset()

    corpus = []
    # Iterate over the values instead of text[i]: integer lookups on a Series
    # are label-based and fail when the index is not 0..n-1.
    for message in text:
        words = re.sub('[^a-zA-Z]', ' ', message).lower().split()
        if normalize is not None:
            words = [normalize(word) for word in words if word not in stop_words]
        corpus.append(" ".join(words))

    return corpus

In [None]:
## The message column is the first argument of preprocess_text
text = df['message']
## Stemmed version of every message
Stemmed_text = preprocess_text(
    text,
    use_stemming=True,
    use_lemmatization=False,
)
## Lemmatized version of every message
Lemmatized_text = preprocess_text(
    text,
    use_stemming=False,
    use_lemmatization=True,
)

## In total we now have 3 types of text
1.  Raw Text
2. Stemmed Text
3. Lemmatized Text

When checking the accuracy of a model, try each of these text types, since they can give different results.

In [None]:
## Print a few messages both ways to compare stemming with lemmatization:
## stemmed line, lemmatized line, then an empty separator line
for idx in range(2490, 2495):
    print(Stemmed_text[idx], Lemmatized_text[idx], "", sep="\n")

In [None]:
import sklearn
from sklearn.feature_extraction.text import *
## Import Word2vec
from gensim.models import Word2Vec


def Text_representation_models(text_corpus,use_Word2Vec=True) :
    """Turn each document into a fixed-length vector by averaging Word2Vec word vectors.

    Parameters
    ----------
    text_corpus : iterable of str
        Preprocessed documents whose words are separated by whitespace.
    use_Word2Vec : bool, default True
        Must be truthy; Word2Vec is the only representation implemented here.

    Returns
    -------
    numpy.ndarray
        Dense array with one row per document and one column per embedding
        dimension (100). A document without any in-vocabulary word gets an
        all-zero row.

    Raises
    ------
    ValueError
        If ``use_Word2Vec`` is falsy.
    """
    if not use_Word2Vec:
        raise ValueError("Invalid text representation flag.")

    # Word2Vec expects every sentence as a list of tokens. Passing the raw
    # strings makes it iterate over single characters, so the vocabulary ends
    # up holding letters instead of words and the word lookups below miss.
    tokenized_corpus = [entry.split() for entry in text_corpus]

    # Train the Word2Vec model on the tokenized documents
    model = Word2Vec(sentences=tokenized_corpus, min_count=1,vector_size=100,window=5,workers=3)
    # model.wv.vectors.shape gives (vocabulary size, number of dimensions per word vector)

    X = []  # one averaged vector per document
    for tokens in tokenized_corpus:
        # Keep only the words the model has a vector for
        word_vectors = [model.wv[word] for word in tokens if word in model.wv]

        if len(word_vectors) > 0:
            # Represent the document by the mean of its word vectors
            entry_vector = np.mean(word_vectors, axis=0)
        else:
            # No known word (e.g. an empty document): fall back to a zero vector
            entry_vector = np.zeros(model.vector_size)

        X.append(entry_vector)

    # Stack the per-document vectors into a single dense NumPy array
    return np.array(X)


vector_size: This parameter sets the dimensionality of the word vectors or embeddings. It determines the length of the feature vector representing each word in the corpus. A higher value for vector_size can capture more nuanced relationships between words, but it also increases the computational cost.

window: The window size determines the maximum distance between the target word and its surrounding context words. It specifies the number of words before and after the target word that the model considers as context. For example, a window of 5 means that the model looks at five words before and five words after the target word in the training samples.

min_count: This parameter sets the minimum frequency threshold for words to be included in the vocabulary. Words that occur less frequently than min_count are ignored and not considered in the training process. Setting a higher value for min_count can help filter out rare words, reducing noise in the data and improving the quality of the learned embeddings.

workers: The workers parameter determines the number of worker threads used to train the model. Using several workers can speed up training, especially for large corpora. The best value depends on your hardware: a higher value can shorten training time but consumes more system resources.

In [None]:
## Pick the text variant and the representation here when comparing accuracies
text_corpus = Stemmed_text
features = Text_representation_models(
    text_corpus,
    use_Word2Vec=True,
)
features, features.shape, type(features)

The Word2Vec vocabulary can turn out much smaller than expected. A vocabulary of only 27 entries is what results when the model sees single characters instead of words (the 26 lowercase letters plus the space): Word2Vec expects every sentence as a list of tokens, so a corpus of plain strings is iterated character by character and has to be split into words before training. Word2Vec is a context-based embedding model that learns word representations from the contexts in which words occur, so it needs a sufficient amount of properly tokenized training data to capture meaningful word embeddings.

In the given code, the Word2Vec model is trained on the preprocessed text corpus, and each message is represented by the mean of the vectors of its words. The 5572 rows of the resulting feature matrix correspond to the 5572 messages in the corpus (one vector per message), whereas the vocabulary size is the number of distinct tokens the model learned (27 when the messages are passed as unsplit strings).

In [None]:
## Encode the text labels numerically with pd.get_dummies;
## drop_first=True drops the first indicator column
target = pd.get_dummies(df["label"], drop_first=True)
# Show the encoded target column followed by its value counts
print(target, "\n\n", target.value_counts())

## Define a function to transform the text data and to return Train and test data

In [None]:
# import Necessary Libraries to build model and to check performance of Model
from sklearn.pipeline import Pipeline
# To import Logistic regression Models
from sklearn.linear_model import *
#for Metrics
from sklearn.metrics import *
## To train_test_split method
from sklearn.model_selection import *
# For Naive Bayes method
from sklearn.naive_bayes import *
# For Support vector Machine
from sklearn.svm import SVC

## Using Defined Functions get Train and Test data 

In [None]:
## Check that the data-preparation function works on the raw messages
# NOTE(review): fit_data is not defined in the cells visible here — confirm it is defined before this cell runs.
text=df["message"]
## Self-assignment: leaves the target variable unchanged
target=target
# Presumably returns train/test splits of the features and labels — verify against fit_data's definition
X_train, X_test, y_train, y_test=fit_data(text,target,use_stemming=True,use_lemmatization=False,use_Word2Vec=True)
X_train

In [None]:
def evaluate_model(model,X_test,y_test):
    """Score a classifier's predictions on a test set.

    The model is only asked to predict; it is not fitted here.

    Returns a dict holding the accuracy, the micro/macro/weighted variants of
    F1, recall and precision, and the confusion matrix.
    """
    y_pred = model.predict(X_test)

    # (result key, scoring function, averaging mode), in result-key order
    averaged_scores = (
        ('f1_micro', f1_score, 'micro'),
        ('f1_macro', f1_score, 'macro'),
        ('f1_weighted', f1_score, 'weighted'),
        ('recall_micro', recall_score, 'micro'),
        ('recall_macro', recall_score, 'macro'),
        ('recall_weighted', recall_score, 'weighted'),
        ('precision_micro', precision_score, 'micro'),
        ('precision_macro', precision_score, 'macro'),
        ('precision_weighted', precision_score, 'weighted'),
    )

    report = {'accuracy': accuracy_score(y_test, y_pred)}
    for key, scorer, averaging in averaged_scores:
        report[key] = scorer(y_test, y_pred, average=averaging)
    report['confusion_matrix'] = confusion_matrix(y_test, y_pred)

    return report


In [None]:
## import necessary libraries for models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.linear_model import RidgeClassifier, PassiveAggressiveClassifier


## List of classification models whose performance is compared on this task.
# Seed the estimators that rely on randomness so repeated runs give comparable scores.
RANDOM_STATE = 42
models = [
    LogisticRegression(),
    DecisionTreeClassifier(random_state=RANDOM_STATE),
    RandomForestClassifier(random_state=RANDOM_STATE),
    SVC(),
    #GaussianNB(),## Left disabled; unlike the two below it has no non-negativity requirement
    #MultinomialNB(),## It needs non-negative features
    #ComplementNB(),## It needs non-negative features
    BernoulliNB(),
    KNeighborsClassifier(),
    XGBClassifier(random_state=RANDOM_STATE),
    MLPClassifier(random_state=RANDOM_STATE),
    AdaBoostClassifier(random_state=RANDOM_STATE),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis(),
    GaussianProcessClassifier(),
    ExtraTreesClassifier(random_state=RANDOM_STATE),
    RidgeClassifier(),
    PassiveAggressiveClassifier(random_state=RANDOM_STATE)
]

In [None]:
# Train every candidate on the training split, then report its scores on the test split
for model in models:
    # The estimators in `models` are freshly constructed, so each one has to be
    # fitted before evaluate_model() calls predict(). np.ravel flattens a
    # single-column label frame into a 1-D array of labels.
    model.fit(X_train, np.ravel(y_train))
    results = evaluate_model(model, X_test, y_test)
    print("Model:", model)
    print("Accuracy:", results['accuracy'])
    print("F1 Score (Micro):", results['f1_micro'])
    print("F1 Score (Macro):", results['f1_macro'])
    print("F1 Score (Weighted):", results['f1_weighted'])
    print("Recall (Micro):", results['recall_micro'])
    print("Recall (Macro):", results['recall_macro'])
    print("Recall (Weighted):", results['recall_weighted'])
    print("Precision (Micro):", results['precision_micro'])
    print("Precision (Macro):", results['precision_macro'])
    print("Precision (Weighted):", results['precision_weighted'])
    print("Confusion Matrix:\n", results['confusion_matrix'])
    # Plain mean of the three F1 variants
    print("F1 Score (Average):", (results['f1_weighted']+results['f1_micro']+results['f1_macro'])/3)

    print()