In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('spam.csv',encoding='latin-1')

In [3]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
df.shape

(5572, 5)

In [5]:
# 1. Data cleaning
# 2. EDA
# 3. Text Preprocessing
# 4. Model building
# 5. Evaluation
# 6. Improvement
# 7. Website
# 8. Deploy

## Data cleaning

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [7]:
df.drop(['Unnamed: 2','Unnamed: 3','Unnamed: 4'],axis=1,inplace=True)

In [8]:
df.rename(columns={'v1':'label','v2':'text'},inplace=True)

In [9]:
df.sample(5)

Unnamed: 0,label,text
2012,ham,Beautiful Truth against Gravity.. Read careful...
3175,ham,Havent still waitin as usual... ÌÏ come back s...
2158,ham,I think you should go the honesty road. Call t...
1037,ham,"naughty little thought: 'its better to flirt, ..."
4972,ham,Oops I was in the shower when u called. Hey a ...


In [11]:
from sklearn.preprocessing import LabelEncoder
lb=LabelEncoder()
df['label']=lb.fit_transform(df['label'])

In [12]:
#drop duplicates value
df.drop_duplicates(inplace=True)

In [13]:
# Data pre processing
# lower case
#tokenization
#removing of special chracter
# removing of stop words and punction
#stemming

In [14]:
#lower the text
df['text']  = df['text'].str.lower()

In [15]:
#removal of html tags
import re
def remove_html_tags(data):
    pattern = re.compile('<.*?>')
    return pattern.sub(r'', data)

In [16]:
text = "<html><body><p> Movie 1</p><p> Actor - Aamir Khan</p><p> Click here to <a href='http://google.com'>download</a></p></body></html>"

In [17]:
remove_html_tags(text)

' Movie 1 Actor - Aamir Khan Click here to download'

In [18]:
df['text'] = df['text'].apply(lambda x: remove_html_tags(x)) 


In [19]:
df['text'][0]

'go until jurong point, crazy.. available only in bugis n great world la e buffet... cine there got amore wat...'

In [20]:
#remove stop words 
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
sw_lsit = stopwords.words('english')

[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


In [21]:
df['text'] = df['text'].apply(lambda x: [item for item in x.split() if item not in sw_lsit]).apply(lambda x:" ".join(x))

In [22]:
df['text'][0]

'go jurong point, crazy.. available bugis n great world la e buffet... cine got amore wat...'

In [23]:
# word tokenization k lya yah download krna zrori hoti ha
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [24]:
#remove the punctions and alpha numeric
from nltk.tokenize import word_tokenize

def remove_punct_alphanumeric(text):
    # Tokenize the text
    tokens = word_tokenize(text)
    # Filter out punctuation and alphanumeric tokens
    cleaned_tokens = [word for word in tokens if word.isalpha()]
    # Join tokens back to a single string
    return " ".join(cleaned_tokens)

In [25]:
df['text'] = df['text'].apply(lambda x: remove_punct_alphanumeric(x))

In [26]:
df['text'][0]

'go jurong point crazy available bugis n great world la e buffet cine got amore wat'

In [27]:
df['label'].unique()

array([0, 1])

In [28]:
from gensim.models import Word2Vec


In [29]:
tokenized_text = df['text'].apply(word_tokenize)

In [30]:
word2vec_model = Word2Vec( window=5, min_count=2, sorted_vocab=True)

In [31]:
word2vec_model.build_vocab(tokenized_text)

In [32]:
word2vec_model.train(tokenized_text, total_examples=len(tokenized_text), epochs=10)

(376442, 454610)

In [33]:
word2vec_model.wv.most_similar('free')

[('mobile', 0.9966954588890076),
 ('camcorder', 0.9958181977272034),
 ('nokia', 0.9927514791488647),
 ('call', 0.9907126426696777),
 ('txt', 0.9900854229927063),
 ('reply', 0.9890351295471191),
 ('urgent', 0.9867748022079468),
 ('awarded', 0.9861122369766235),
 ('bonus', 0.9838712811470032),
 ('caller', 0.9833742380142212)]

In [34]:
word2vec_model.wv.get_normed_vectors()

array([[-0.07717932,  0.13817424,  0.01980715, ..., -0.12675694,
         0.07149473, -0.01175104],
       [-0.07868521,  0.1451806 ,  0.00268184, ..., -0.13861053,
         0.05760023, -0.0028626 ],
       [-0.06277093,  0.10062246, -0.04851905, ..., -0.03144937,
         0.06673976, -0.04043418],
       ...,
       [-0.08459446,  0.12437933,  0.00636146, ..., -0.1057489 ,
         0.0360407 , -0.00572099],
       [-0.09371207,  0.13587268, -0.00604754, ..., -0.13311465,
         0.0539788 , -0.02461235],
       [ 0.01195649,  0.1801031 , -0.06017263, ..., -0.17108291,
         0.02745568,  0.03434882]], dtype=float32)

In [35]:
# Step 1: Generate Average Embedding for Each Document
def document_vector(doc):
    # Filter out words not in the model vocabulary
    doc = [word for word in doc if word in word2vec_model.wv]
    # Return the average of word vectors for words in the document
    return np.mean(word2vec_model.wv[doc], axis=0) if doc else np.zeros(word2vec_model.vector_size)


In [36]:
df['text'][0]

'go jurong point crazy available bugis n great world la e buffet cine got amore wat'

In [37]:
df['vector'] = df['text'].apply(lambda x: document_vector(word_tokenize(x)))

In [38]:
df['vector'][0]

array([-0.19312821,  0.34967312,  0.00156326,  0.05803084,  0.02221723,
       -0.55361015,  0.01770016,  0.6452265 , -0.24848506, -0.1365827 ,
       -0.07524671, -0.4936824 , -0.09975356,  0.22425131,  0.0799759 ,
       -0.20962784,  0.08364524, -0.3834261 , -0.02499807, -0.6342934 ,
        0.22141005,  0.07646465,  0.169377  , -0.10779257, -0.17806569,
        0.03337314, -0.2848181 , -0.23494625, -0.31284165,  0.0644025 ,
        0.35615495,  0.10046743,  0.22058637, -0.21175675, -0.16047491,
        0.31492043,  0.0328128 , -0.2610713 , -0.19873162, -0.59968835,
       -0.00135402, -0.3158462 , -0.03056359, -0.08386309,  0.25837964,
       -0.07437836, -0.32562682, -0.07913299,  0.15954177,  0.25466388,
        0.15314987, -0.30272555, -0.04332826, -0.04855994, -0.19888006,
        0.1079392 ,  0.22312436, -0.1080882 , -0.34962034,  0.08483746,
        0.16346918,  0.06331046, -0.08800558,  0.02430646, -0.4222074 ,
        0.2652118 ,  0.2692789 ,  0.38362473, -0.4032521 ,  0.40

In [39]:
words = word2vec_model.wv.index_to_key
print("Words in the vocabulary:")
print(words[:50])  

Words in the vocabulary:
['u', 'i', 'call', 'get', 'ur', 'gt', 'lt', 'go', 'free', 'know', 'got', 'like', 'good', 'ok', 'you', 'come', 'now', 'time', 'want', 'day', 'love', 'me', 'text', 'going', 'one', 'send', 'need', 'lor', 'home', 'see', 'still', 'back', 'r', 'it', 'txt', 'da', 'today', 'stop', 'think', 'reply', 'tell', 'dont', 'n', 'take', 'hi', 'new', 'sorry', 'please', 'mobile', 'phone']


In [40]:
X = df['vector']

In [41]:
type(X)

pandas.core.series.Series

In [42]:
# type of x is not a array so we need to convert it to array
X =X.to_list()

In [43]:
X = np.array(X)

In [44]:
type(X)

numpy.ndarray

In [45]:
y = df['label']

In [46]:
type(y)

pandas.core.series.Series

In [47]:
y = np.array(y.to_list())

In [48]:
from sklearn.model_selection import train_test_split


In [49]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [50]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize the RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Predict on the test set
y_pred = rf_model.predict(X_test)


In [51]:
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Print results
print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)
print("\nConfusion Matrix:\n", conf_matrix)


Accuracy: 0.965183752417795

Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.99      0.98       889
           1       0.92      0.82      0.87       145

    accuracy                           0.97      1034
   macro avg       0.95      0.90      0.92      1034
weighted avg       0.96      0.97      0.96      1034


Confusion Matrix:
 [[879  10]
 [ 26 119]]


In [52]:
import pickle

In [53]:
pickle.dump(word2vec_model,open('word2vec.pkl','wb'))
pickle.dump(rf_model,open('model.pkl','wb'))

In [54]:
import tensorflow

In [55]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam

In [56]:
X_train_pad = pad_sequences(X_train, maxlen=100)
X_test_pad = pad_sequences(X_test, maxlen=100)

In [57]:
vocab_size = len(word2vec_model.wv)
embedding_dim = word2vec_model.vector_size

In [58]:
vocab_size

3387

In [59]:
embedding_dim

100

In [60]:
model = Sequential()

# Add the Embedding layer using pre-trained Word2Vec embeddings
model.add(Embedding(input_dim=vocab_size, 
                    output_dim=embedding_dim, 
                    weights=[word2vec_model.wv.vectors], 
                    input_length=100,  # Set input length according to your data
                    trainable=False))  # Keep the embeddings fixed (non-trainable)

# Add LSTM layer
model.add(LSTM(128, return_sequences=False))  # Adjust units and return_sequences based on needs

# Add Dropout layer to prevent overfitting
model.add(Dropout(0.5))

# Add a Dense layer for classification (binary classification)
model.add(Dense(1, activation='sigmoid'))



In [61]:
# Compile the model
model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])

# Model summary
model.summary()

In [62]:
model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_test, y_test))


Epoch 1/5
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 107ms/step - accuracy: 0.8537 - loss: 0.4100 - val_accuracy: 0.8598 - val_loss: 0.4062
Epoch 2/5
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 98ms/step - accuracy: 0.8748 - loss: 0.3846 - val_accuracy: 0.8598 - val_loss: 0.4062
Epoch 3/5
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 100ms/step - accuracy: 0.8763 - loss: 0.3786 - val_accuracy: 0.8598 - val_loss: 0.4098
Epoch 4/5
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 126ms/step - accuracy: 0.8820 - loss: 0.3677 - val_accuracy: 0.8598 - val_loss: 0.4066
Epoch 5/5
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 108ms/step - accuracy: 0.8753 - loss: 0.3840 - val_accuracy: 0.8598 - val_loss: 0.4116


<keras.src.callbacks.history.History at 0x1f4f234f490>

In [63]:
y_pred = model.predict(X_test)
y_pred = (y_pred > 0.5).astype(int)  # Convert probabilities to binary values

# Evaluate accuracy and print classification report
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)


[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 37ms/step
Accuracy: 0.8597678916827853

Classification Report:
               precision    recall  f1-score   support

           0       0.86      1.00      0.92       889
           1       0.00      0.00      0.00       145

    accuracy                           0.86      1034
   macro avg       0.43      0.50      0.46      1034
weighted avg       0.74      0.86      0.79      1034



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [75]:
model.save('lstm_spam_model.h5')  # Save the trained LSTM model

# Optionally save the Word2Vec model for later use
pickle.dump(word2vec_model, open('word2vec_model.pkl', 'wb'))

