# **DOWNLOADING DATASETS**

In [None]:
# Fetch the uciml/sms-spam-collection-dataset archive with the Kaggle CLI.
# NOTE(review): presumably needs Kaggle API credentials configured beforehand — confirm.
!kaggle datasets download -d "uciml/sms-spam-collection-dataset"

Dataset URL: https://www.kaggle.com/datasets/uciml/sms-spam-collection-dataset
License(s): unknown
Downloading sms-spam-collection-dataset.zip to /content
  0% 0.00/211k [00:00<?, ?B/s]
100% 211k/211k [00:00<00:00, 74.3MB/s]


# **EXTRACTING DATASETS**

In [None]:
# Extract the downloaded archive into the current working directory.
# -o overwrites existing files without asking, so re-running this cell does not
# stall on unzip's interactive "replace spam.csv?" prompt.
!unzip -o "/content/sms-spam-collection-dataset.zip"

Archive:  /content/sms-spam-collection-dataset.zip
  inflating: spam.csv                


# **READING CSV FILE**

In [303]:
import pandas as pd
# Load the extracted CSV into a DataFrame.
# latin-1 is passed explicitly; presumably the file is not valid UTF-8 — TODO confirm.
df = pd.read_csv("/content/spam.csv", encoding='latin-1')

In [304]:
df

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [None]:
### PREPROCESSING AND CLEANING THE DATASET

# **Drop the null, unused columns**

In [305]:
# Remove, in place, every column that holds at least one missing value;
# the next display shows that only v1 and v2 remain.
df.dropna(axis="columns", inplace=True)

In [306]:
df

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


# **Renaming column names**

In [307]:
# Give the two remaining columns descriptive names: v1 -> target (label), v2 -> text (message).
df.rename(columns=dict(v1="target", v2="text"), inplace=True)

In [308]:
df

Unnamed: 0,target,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


# **Checking for null values**


In [309]:
df.isna().any()

Unnamed: 0,0
target,False
text,False


# **Using LabelEncoder to transform the target strings into numerical values**

In [310]:
from sklearn.preprocessing import LabelEncoder

# Learn the integer code of each distinct label found in the target column.
le = LabelEncoder()
le.fit(df["target"])

In [311]:
# Overwrite the string labels with the integer codes learned by the fitted encoder.
df["target"] = le.transform(df["target"])

In [312]:
df

Unnamed: 0,target,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will Ì_ b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [313]:
le.classes_

array(['ham', 'spam'], dtype=object)

In [314]:
df.isna().any()

Unnamed: 0,0
target,False
text,False


# **Since the text is in the format of string objects, using NLP to understand the messages and transform them into vector format**

In [315]:
df['text'][2]

"Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"

In [316]:
import nltk

In [317]:
# Use the %pip magic rather than !pip so the package is installed into the
# environment of the running kernel.
%pip install nltk
# Tokenizer data used by nltk's word_tokenize in the tokenizing cell.
nltk.download('punkt_tab')



[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [318]:
# Tokenize every message: each string in the text column becomes a list of tokens.

from nltk.tokenize import word_tokenize
df['text'] = df['text'].map(word_tokenize)

In [319]:
df

Unnamed: 0,target,text
0,0,"[Go, until, jurong, point, ,, crazy, .., Avail..."
1,0,"[Ok, lar, ..., Joking, wif, u, oni, ...]"
2,1,"[Free, entry, in, 2, a, wkly, comp, to, win, F..."
3,0,"[U, dun, say, so, early, hor, ..., U, c, alrea..."
4,0,"[Nah, I, do, n't, think, he, goes, to, usf, ,,..."
...,...,...
5567,1,"[This, is, the, 2nd, time, we, have, tried, 2,..."
5568,0,"[Will, Ì_, b, going, to, esplanade, fr, home, ?]"
5569,0,"[Pity, ,, *, was, in, mood, for, that, ., So, ..."
5570,0,"[The, guy, did, some, bitching, but, I, acted,..."


In [320]:
df['text'][2]

['Free',
 'entry',
 'in',
 '2',
 'a',
 'wkly',
 'comp',
 'to',
 'win',
 'FA',
 'Cup',
 'final',
 'tkts',
 '21st',
 'May',
 '2005',
 '.',
 'Text',
 'FA',
 'to',
 '87121',
 'to',
 'receive',
 'entry',
 'question',
 '(',
 'std',
 'txt',
 'rate',
 ')',
 'T',
 '&',
 'C',
 "'s",
 'apply',
 '08452810075over18',
 "'s"]

In [321]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [322]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [323]:
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
# Build the English stemmer and the stop-word set once at cell level so that
# stem_text can reuse them for every row.
stemmer = SnowballStemmer("english")
stop_words = set(stopwords.words('english'))
def stem_text(text):
    """Stem the tokens of one message, skipping English stop words.

    Args:
        text: Iterable of string tokens.

    Returns:
        list: Stems of the kept tokens, in their original order.
    """
    stems = []
    for token in text:
        # The stop-word test is case-insensitive; the stemmer receives the token as-is.
        if token.lower() in stop_words:
            continue
        stems.append(stemmer.stem(token))
    return stems

# Replace each token list in the text column with its stop-word-filtered, stemmed version.
df['text'] = df['text'].map(stem_text)


'''from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
  """Lemmatizes each word in a list of tokens.
  """
  return [lemmatizer.lemmatize(word) for word in text]

df['text'] = df['text'].apply(lemmatize_text)'''

'from nltk.stem import WordNetLemmatizer\n\nlemmatizer = WordNetLemmatizer()\n\ndef lemmatize_text(text):\n  """Lemmatizes each word in a list of tokens.\n  """\n  return [lemmatizer.lemmatize(word) for word in text]\n\ndf[\'text\'] = df[\'text\'].apply(lemmatize_text)'

In [324]:
df

Unnamed: 0,target,text
0,0,"[go, jurong, point, ,, crazi, .., avail, bugi,..."
1,0,"[ok, lar, ..., joke, wif, u, oni, ...]"
2,1,"[free, entri, 2, wkli, comp, win, fa, cup, fin..."
3,0,"[u, dun, say, earli, hor, ..., u, c, alreadi, ..."
4,0,"[nah, n't, think, goe, usf, ,, live, around, t..."
...,...,...
5567,1,"[2nd, time, tri, 2, contact, u., u, å£750, pou..."
5568,0,"[ì_, b, go, esplanad, fr, home, ?]"
5569,0,"[piti, ,, *, mood, ., ..., suggest, ?]"
5570,0,"[guy, bitch, act, like, 'd, interest, buy, som..."


In [269]:
'''import gensim
from gensim.models import Word2Vec'''

In [325]:
df

Unnamed: 0,target,text
0,0,"[go, jurong, point, ,, crazi, .., avail, bugi,..."
1,0,"[ok, lar, ..., joke, wif, u, oni, ...]"
2,1,"[free, entri, 2, wkli, comp, win, fa, cup, fin..."
3,0,"[u, dun, say, earli, hor, ..., u, c, alreadi, ..."
4,0,"[nah, n't, think, goe, usf, ,, live, around, t..."
...,...,...
5567,1,"[2nd, time, tri, 2, contact, u., u, å£750, pou..."
5568,0,"[ì_, b, go, esplanad, fr, home, ?]"
5569,0,"[piti, ,, *, mood, ., ..., suggest, ?]"
5570,0,"[guy, bitch, act, like, 'd, interest, buy, som..."


In [237]:
'''import pandas as pd
from gensim.models import Word2Vec



# Train Word2Vec model
word2vec_model = Word2Vec(sentences=df['text'], vector_size=500, window=5, min_count=1)

# Function to generate sentence embeddings (average of word vectors)
def get_sentence_embedding(sentence, model):
    vectors = [model.wv[word] for word in sentence if word in model.wv]
    if vectors:  # Check if the sentence contains words in the Word2Vec vocabulary
        return sum(vectors) / len(vectors)  # Average of word vectors
    else:
        return [0] * model.vector_size  # Return zero vector if no words in vocabulary

# Apply the function to the 'text' column
df['sentence_embedding'] = df['text'].apply(lambda x: get_sentence_embedding(x, word2vec_model))

# Check the result
print(df[['target', 'sentence_embedding']])'''


      target                                 sentence_embedding
0          0  [0.14661013, 0.22315003, 0.23790945, 0.2500203...
1          0  [0.15416342, 0.23217803, 0.25472036, 0.2916861...
2          1  [0.16036883, 0.2568386, 0.2492064, 0.22946227,...
3          0  [0.19987339, 0.3012363, 0.33002788, 0.3599376,...
4          0  [0.17128257, 0.2587591, 0.32503897, 0.35387123...
...      ...                                                ...
5567       1  [0.19549264, 0.30328315, 0.32095817, 0.3188045...
5568       0  [0.1583334, 0.2381819, 0.27524248, 0.2894657, ...
5569       0  [0.22152296, 0.35037568, 0.351419, 0.40101498,...
5570       0  [0.18209597, 0.2757654, 0.31861553, 0.33101583...
5571       0  [0.17847538, 0.27770898, 0.30890206, 0.3111588...

[5572 rows x 2 columns]


In [326]:
df

Unnamed: 0,target,text
0,0,"[go, jurong, point, ,, crazi, .., avail, bugi,..."
1,0,"[ok, lar, ..., joke, wif, u, oni, ...]"
2,1,"[free, entri, 2, wkli, comp, win, fa, cup, fin..."
3,0,"[u, dun, say, earli, hor, ..., u, c, alreadi, ..."
4,0,"[nah, n't, think, goe, usf, ,, live, around, t..."
...,...,...
5567,1,"[2nd, time, tri, 2, contact, u., u, å£750, pou..."
5568,0,"[ì_, b, go, esplanad, fr, home, ?]"
5569,0,"[piti, ,, *, mood, ., ..., suggest, ?]"
5570,0,"[guy, bitch, act, like, 'd, interest, buy, som..."


# **Creating model for spam classification**




In [327]:
from sklearn.linear_model import LogisticRegression
# Logistic-regression classifier; class_weight='balanced' makes scikit-learn
# weight each class inversely to its frequency in the training labels.
model = LogisticRegression(class_weight='balanced')

In [328]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the messages for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'].tolist(),
    df['target'],
    test_size=0.2,
    random_state=42,
)

In [329]:
from sklearn.feature_extraction.text import TfidfVectorizer

# The vectorizer takes one string per document, so each token list is joined back with spaces.
X_train_str = list(map(' '.join, X_train))
X_test_str = list(map(' '.join, X_test))

# Fit on the training messages only; the test messages are transformed with that fitted vectorizer.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_str)
X_test = vectorizer.transform(X_test_str)



In [330]:
X_train_str

["'m boat . still mom . check yo . 'm half nake .",
 '( bank granit issu strong-buy ) explos pick member * * * * * 300 % * * * * * * * * * * * nasdaq symbol cdgt $ 5.00 per ..',
 'r give second chanc rahul dengra .',
 'play smash bros & lt ; # & gt ; religi .',
 'privat ! 2003 account statement 07973788240 show 800 un-redeem s. i. m. point . call 08715203649 identifi code : 40533 expir 31/10/04',
 'g say never answer text , confirm/deni',
 '88066 88066 lost 3pound help',
 'okey dokey , i\x89û÷ll bit sort stuff .',
 'come peopl',
 'wah lucki man ... save money ... hee ...',
 'much better thank lol',
 'madam , regret disturbance.might receiv refer check dlf premarica.kind informed.rgd , rakhesh , kerala .',
 "'m come home 4 dinner .",
 'ok ...',
 'ì_ decid faster cos sis go home liao ..',
 'hi mate rv u hav nice hol messag 3 say hello coz havenåõt sent u 1 age start drive stay road ! rvx',
 'amaz : rearrang letter give mean ... dormitori = dirti room astronom = moon starer eye = see elec

In [331]:
model.fit(X_train, y_train)

In [332]:
model.score(X_test, y_test)

0.97847533632287

In [333]:
model.predict(X_test[4])

array([1])

In [334]:
y_test

Unnamed: 0,target
3245,0
944,0
1044,1
2484,0
812,1
...,...
4264,0
2439,0
5556,0
4205,0


In [335]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report

# Hard class predictions feed the label-based metrics; the probability of
# class 1 (second column of predict_proba) is what the AUC is computed from.
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

# Scalar metrics and the confusion matrix, all measured against the held-out labels.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_prob)
cm = confusion_matrix(y_test, y_pred)

# Per-class precision, recall, F1 and support as a formatted text table.
report = classification_report(y_test, y_pred)

# Report everything: one "name: value" line per scalar, then the two multi-line blocks.
for label, value in [
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
    ("AUC", roc_auc),
]:
    print(f"{label}: {value}")
print("Confusion Matrix:", cm, sep="\n")
print("Classification Report:", report, sep="\n")


Accuracy: 0.97847533632287
Precision: 0.92
Recall: 0.92
F1 Score: 0.92
AUC: 0.9889464594127807
Confusion Matrix:
[[953  12]
 [ 12 138]]
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       965
           1       0.92      0.92      0.92       150

    accuracy                           0.98      1115
   macro avg       0.95      0.95      0.95      1115
weighted avg       0.98      0.98      0.98      1115



# **Now that the model is created for spam classification, let's apply the same preprocessing to a user's message, convert it to vector format, and classify whether it's spam or ham**

In [355]:

def tokenizatio(text):
    """Split a raw message string into a list of tokens using ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens
def stem_text(text):
    """Drop English stop words from a token sequence and stem what remains.

    The stemmer and the stop-word set are built locally on every call.

    Args:
        text: Iterable of string tokens.

    Returns:
        list: Stems of the kept tokens, in their original order.
    """
    english_stemmer = SnowballStemmer("english")
    ignored = set(stopwords.words('english'))

    # The stop-word test is case-insensitive; the stemmer receives the token as-is.
    kept = filter(lambda token: token.lower() not in ignored, text)
    return [english_stemmer.stem(token) for token in kept]

# The vectorizer and model fitted earlier in the notebook are reused here, so the
# message goes through the same tokenizing and stemming steps as the training text.
message = input("Enter any message:")
message = ' '.join(stem_text(tokenizatio(message)))  # one string per document for TF-IDF
message_vector = vectorizer.transform([message])

# Predict from the TF-IDF vector; the label encoder mapped 'ham' to 0 and 'spam' to 1.
prediction = model.predict(message_vector)
print(prediction)
if prediction[0] == 1:
    print("Spam")
else:
    print("ham")

Enter any message:Congratulations! You've won a free iPhone. Click here to claim your prize.
[1]
Spam
