In [1]:
!pip install numpy pandas tensorflow nltk scikit-learn



In [3]:
import pandas as pd

# Load the dataset to examine its structure
file_path = '/content/train.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataset to understand its structure
data.head()

Unnamed: 0,sms,label
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...\n,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [4]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [5]:
# Ensure NLTK resources are available
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [6]:
# Load the dataset
file_path = "train.csv"  # Make sure this path is correct
data = pd.read_csv(file_path)


In [7]:
# Preprocess: Remove NaNs and clean the text
data.dropna(subset=['sms', 'label'], inplace=True)


In [8]:
# Text cleaning function
def preprocess_text(text):
    stop_words = set(stopwords.words('english')) - {'won', 'gift', 'call', 'free', 'prize'}
    text = re.sub(r'\W', ' ', text)
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\d+', '', text)
    return ' '.join(word for word in text.split() if word not in stop_words)

data['sms'] = data['sms'].apply(preprocess_text)

In [9]:
# Split for ML models
X_train_raw, X_test_raw, y_train, y_test = train_test_split(data['sms'], data['label'], test_size=0.2, random_state=42)

In [10]:
# TF-IDF Vectorization
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train_raw)
X_test_tfidf = tfidf.transform(X_test_raw)

In [11]:
# --------- Model 1: Naive Bayes ----------
nb = MultinomialNB()
nb.fit(X_train_tfidf, y_train)
nb_pred = nb.predict(X_test_tfidf)

In [12]:
# --------- Model 2: SVM ----------
svm = LinearSVC()
svm.fit(X_train_tfidf, y_train)
svm_pred = svm.predict(X_test_tfidf)

In [13]:
# --------- Model 3: Random Forest ----------
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_tfidf, y_train)
rf_pred = rf.predict(X_test_tfidf)

In [14]:
# --------- Model 4: LSTM ----------
# Tokenization and sequence padding
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['sms'])
X_seq = tokenizer.texts_to_sequences(data['sms'])
max_len = 50
X_pad = pad_sequences(X_seq, maxlen=max_len)


X_train_lstm, X_test_lstm, y_train_lstm, y_test_lstm = train_test_split(X_pad, data['label'].values, test_size=0.2, random_state=42)


In [15]:
vocab_size = len(tokenizer.word_index) + 1

model = Sequential([
    Embedding(vocab_size, 128, input_length=max_len),
    SpatialDropout1D(0.3),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid')
])



In [16]:
# Tokenization and sequence padding
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['sms'])
X_seq = tokenizer.texts_to_sequences(data['sms'])
max_len = 50
X_pad = pad_sequences(X_seq, maxlen=max_len)

# Split for LSTM model
X_train_lstm, X_test_lstm, y_train_lstm, y_test_lstm = train_test_split(X_pad, data['label'].values, test_size=0.2, random_state=42)

vocab_size = len(tokenizer.word_index) + 1

model = Sequential([
    Embedding(vocab_size, 128, input_length=max_len),
    SpatialDropout1D(0.3),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train_lstm, y_train_lstm, epochs=5, batch_size=32, validation_data=(X_test_lstm, y_test_lstm), verbose=1)

Epoch 1/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 186ms/step - accuracy: 0.8879 - loss: 0.3020 - val_accuracy: 0.9830 - val_loss: 0.0615
Epoch 2/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 206ms/step - accuracy: 0.9865 - loss: 0.0422 - val_accuracy: 0.9874 - val_loss: 0.0467
Epoch 3/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 166ms/step - accuracy: 0.9981 - loss: 0.0091 - val_accuracy: 0.9901 - val_loss: 0.0401
Epoch 4/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 165ms/step - accuracy: 0.9995 - loss: 0.0044 - val_accuracy: 0.9910 - val_loss: 0.0408
Epoch 5/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 164ms/step - accuracy: 0.9976 - loss: 0.0080 - val_accuracy: 0.9901 - val_loss: 0.0441


<keras.src.callbacks.history.History at 0x7c5c0512d550>

In [17]:
# LSTM Prediction
lstm_pred = (model.predict(X_test_lstm) > 0.5).astype("int32")

[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 36ms/step


In [18]:
# --------- Evaluation ---------
print("\n--- Naive Bayes ---")
print("Accuracy:", accuracy_score(y_test, nb_pred))
print(classification_report(y_test, nb_pred))

print("\n--- SVM ---")
print("Accuracy:", accuracy_score(y_test, svm_pred))
print(classification_report(y_test, svm_pred))

print("\n--- Random Forest ---")
print("Accuracy:", accuracy_score(y_test, rf_pred))
print(classification_report(y_test, rf_pred))

print("\n--- LSTM ---")
print("Accuracy:", accuracy_score(y_test_lstm, lstm_pred))
print(classification_report(y_test_lstm, lstm_pred))


--- Naive Bayes ---
Accuracy: 0.968609865470852
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       954
           1       1.00      0.78      0.88       161

    accuracy                           0.97      1115
   macro avg       0.98      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115


--- SVM ---
Accuracy: 0.9838565022421525
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       954
           1       0.98      0.91      0.94       161

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.97      1115
weighted avg       0.98      0.98      0.98      1115


--- Random Forest ---
Accuracy: 0.9766816143497757
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       954
           1       1.00      0.84      0.91       161

    accuracy                           

In [19]:
import pickle

# Save Naive Bayes model
with open('nb_model.pkl', 'wb') as f:
    pickle.dump(nb, f)

# Save SVM model
with open('svm_model.pkl', 'wb') as f:
    pickle.dump(svm, f)

# Save Random Forest model
with open('rf_model.pkl', 'wb') as f:
    pickle.dump(rf, f)

# Save TF-IDF vectorizer
with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf, f)

# Save Tokenizer
with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)


In [20]:
model.save('lstm_model.h5')

